Hadoop笔记 Help

102-数仓项目:模拟数据

本项目模拟生成企业数据:利用Python脚本生成大量不规范的结构化数据,用来模拟大数据工程项目中的数据。在实际生产环境中,数据量往往更大,也更不规范。

知识点

  • 掌握使用Python常用函数生成数据的方法;

  • 掌握使用Python的faker库快速生成模拟数据的方法。

实操

安装faker库

pip install faker
# Columns of a generated data record, in order:
# orchard_id, company, owner, phone, area, category, produce, gdp, date, province, city, county
import datetime
import os
import random

from faker import Faker
from tqdm import tqdm

fake = Faker('zh_CN')
# Fixed seeds for Faker and the `random` module, intended to make runs repeatable.
Faker.seed(9)
random.seed(9)

# Basic Faker usage demo.
print(fake.name())
print(fake.phone_number())
print(fake.date_between(datetime.date(2019, 1, 1), datetime.date(2024, 12, 31)))
print(fake.company())

# ---------------- Main program ----------------
counties = ['章贡区', '南康区', '赣县区', '瑞金市', '龙南市', '大余县',
            '上犹县', '崇义县', '信丰县', '定南县', '全南县', '安远县',
            '宁都县', '于都县', '兴国县', '会昌县', '石城县', '寻乌县']


def _open_output(path: str):
    """Open *path* for UTF-8 text writing, creating its parent directory first.

    Without this the script fails with FileNotFoundError when ``data/`` does
    not exist yet.
    """
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    return open(path, 'w', encoding='utf-8')


def _random_measure(unit: str, digits: int) -> str:
    """Return a random measurement string, or '' to simulate a missing value.

    The value is ``random.random()`` rounded to *digits* places with *unit*
    appended. An empty string is returned when the first draw is <= 0.02.
    """
    # Draw the "is it missing?" decision first and the value second: this is
    # the evaluation order of the original conditional expression, so seeded
    # output is unchanged.
    if random.random() > 0.02:
        return str(round(random.random(), digits)) + unit
    return ''


# Generate the orchard list and export it as a CSV file.
def gen_orchards(num: int) -> list:
    """Generate *num* orchard records and write them to ``data/orchards.csv``.

    Each record is a dict with the keys ``orchard_id`` (1..num), ``company``,
    ``owner``, ``phone``, ``province``, ``city`` and ``county``. The same
    fields are written one record per line, comma-separated, with no header.

    Returns:
        The list of orchard dicts, in generation order.
    """
    orchards = []
    print('开始生成果园数据..')
    # `with` guarantees the file is closed even if generation raises.
    with _open_output('data/orchards.csv') as fp:
        for i in tqdm(range(1, num + 1)):
            orchard = {
                'orchard_id': i,
                'company': fake.company(),
                'owner': fake.name(),
                'phone': fake.phone_number(),
                'province': '江西省',
                'city': '赣州市',
                'county': random.choice(counties),
            }
            orchards.append(orchard)
            fp.write('{},{},{},{},{},{},{}\n'.format(
                orchard['orchard_id'], orchard['company'], orchard['owner'],
                orchard['phone'], orchard['province'], orchard['city'],
                orchard['county'],
            ))
    print('生成果园数据完成..')
    return orchards


# Generate the simulated data.
# orchard_id, company, owner, phone, area, category, produce, gdp, date, province, city, county
def gen_data(orchards: list, num: int) -> None:
    """Write *num* simulated records to ``data/data.csv``.

    Every record picks a random orchard from *orchards* (dicts as produced by
    ``gen_orchards``) and adds randomly generated ``area``, ``category``,
    ``produce``, ``gdp`` and ``date`` fields. ``area``, ``produce`` and
    ``gdp`` carry a unit suffix and are occasionally left empty.

    Raises:
        IndexError: if *orchards* is empty and *num* > 0 (from ``random.choice``).
    """
    # Alternatives used when the category is not the default '脐橙'.
    other_categories = ['鹰嘴桃', '蜜桔', '葡萄', '荔枝', '柚子',
                        '苹果', '山竹', '桑葚', '西瓜', '柠檬']
    first_day = datetime.date(2019, 1, 1)
    last_day = datetime.date(2024, 12, 31)

    print('开始生成果业数据..')
    with _open_output('data/data.csv') as fp:
        for _ in tqdm(range(num)):
            orchard = random.choice(orchards)
            orchard_id = orchard['orchard_id']  # avoid shadowing builtin `id`
            company = orchard['company']
            owner = orchard['owner']
            phone = orchard['phone']
            province = orchard['province']
            city = orchard['city']
            county = orchard['county']

            area = _random_measure('亩', 3)
            if random.random() > 0.5:
                category = '脐橙'
            else:
                category = random.choice(other_categories)
            produce = _random_measure('吨', 3)
            # 20 decimal places is kept from the original script.
            gdp = _random_measure('万元', 20)
            date = fake.date_between(first_day, last_day)

            # Final record line.
            fp.write(
                f'{orchard_id},{company},{owner},{phone},{area},{category},'
                f'{produce},{gdp},{date},{province},{city},{county}\n'
            )
    # The original printed the orchard message here (copy-paste slip); this
    # now matches the start message of this function.
    print('生成果业数据完成')


if __name__ == '__main__':
    # Generate 500 orchards.
    orchards = gen_orchards(5 * 100)
    # Generate 50,000 records from the 500 orchards (1:100 ratio).
    gen_data(orchards, 500 * 100)
Last modified: 10 October 2024