102-数仓项目:模拟数据
本项目模拟生成企业数据:利用Python脚本生成大量不规范的结构化数据,用来模拟大数据工程项目中的数据。在实际生产环境中,数据量往往更大,也更不规范。
知识点
掌握使用Python中常用函数生成数据的方法;
掌握Python中faker库快速生成模拟数据的方法;
实操
安装faker库和tqdm库(脚本使用tqdm显示进度条)
pip install faker tqdm
# orchard_id, company, owner, phone, area, category, produce, gdp, date, province, city, county
from faker import Faker
import datetime
import random
from tqdm import tqdm
# Faker instance configured with the zh_CN locale; used by every generator below.
fake = Faker('zh_CN')
# Seed both Faker and the stdlib random module with a fixed value (9).
Faker.seed(9)
random.seed(9)
# Basic usage of faker: print one sample name, phone number, date and company.
# NOTE(review): these calls presumably advance the seeded generator, so
# removing or reordering them would likely change the data generated later
# in the script -- confirm before touching.
print(fake.name())
print(fake.phone_number())
print(fake.date_between(datetime.date(2019, 1, 1), datetime.date(2024, 12, 31)))
print(fake.company())
########### Main part of the script starts here ######################
# County-level divisions that a generated orchard can be assigned to.
# Order is significant: random.choice() indexes into this list, so changing
# the order changes the output produced for a given seed.
counties = [
    # districts
    '章贡区',
    '南康区',
    '赣县区',
    # county-level cities
    '瑞金市',
    '龙南市',
    # counties
    '大余县',
    '上犹县',
    '崇义县',
    '信丰县',
    '定南县',
    '全南县',
    '安远县',
    '宁都县',
    '于都县',
    '兴国县',
    '会昌县',
    '石城县',
    '寻乌县',
]
def gen_orchards(num: int) -> list:
    """Generate ``num`` orchard records and export them to ``data/orchards.csv``.

    Each record is a dict with the keys ``orchard_id`` (sequence number
    starting at 1), ``company``, ``owner``, ``phone``, ``province``, ``city``
    and ``county``. One comma-separated line per orchard is written to the
    CSV file in that column order; no header row and no quoting is applied.

    Args:
        num: Number of orchards to generate.

    Returns:
        The generated orchard dicts, in generation order.
    """
    import os  # local import: only needed to prepare the output directory

    columns = ('orchard_id', 'company', 'owner', 'phone',
               'province', 'city', 'county')

    # Create the output directory up front so a first run does not fail with
    # FileNotFoundError when the CSV file is opened.
    os.makedirs('data', exist_ok=True)

    orchards = []
    print('开始生成果园数据..')
    # The context manager closes the file even if generation raises midway.
    with open('data/orchards.csv', 'w', encoding='utf-8') as fp:
        for i in tqdm(range(1, num + 1)):
            # Keep this call order (company, owner, phone, county): the
            # generators are seeded, so the order determines the output.
            orchard = {
                'orchard_id': i,
                'company': fake.company(),
                'owner': fake.name(),
                'phone': fake.phone_number(),
                'province': '江西省',
                'city': '赣州市',
                'county': random.choice(counties),
            }
            orchards.append(orchard)
            fp.write(','.join(str(orchard[col]) for col in columns) + '\n')
    print('生成果园数据完成..')
    return orchards
def _random_measure(digits: int, unit: str) -> str:
    """Return a random value in [0, 1) with ``unit`` appended, or '' about 2% of the time."""
    # The "missing value" draw happens before the value draw; keeping that
    # order preserves the sequence produced by the seeded generator.
    if random.random() > 0.02:
        return str(round(random.random(), digits)) + unit
    return ''


def gen_data(orchards: list, num: int) -> None:
    """Generate ``num`` mock production records and write them to ``data/data.csv``.

    Every record copies the identity fields of a randomly chosen orchard and
    adds random measurements. Each output line has the columns:

        orchard_id, company, owner, phone, area, category, produce, gdp,
        date, province, city, county

    ``area``, ``produce`` and ``gdp`` carry a unit suffix and are left empty
    for roughly 2% of the records, which simulates missing values in the raw
    data. No header row is written and fields are not quoted.

    Args:
        orchards: Orchard dicts as returned by ``gen_orchards``.
        num: Number of records to generate.

    Raises:
        IndexError: If ``orchards`` is empty and ``num`` is positive
            (raised by ``random.choice``).
    """
    import os  # local import: only needed to prepare the output directory

    other_categories = ['鹰嘴桃', '蜜桔', '葡萄', '荔枝', '柚子', '苹果', '山竹', '桑葚', '西瓜', '柠檬']
    # Loop-invariant date range, built once instead of on every iteration.
    first_day = datetime.date(2019, 1, 1)
    last_day = datetime.date(2024, 12, 31)

    # Create the output directory up front so a first run does not fail with
    # FileNotFoundError when the CSV file is opened.
    os.makedirs('data', exist_ok=True)

    print('开始生成果业数据..')
    # The context manager closes the file even if generation raises midway.
    with open('data/data.csv', 'w', encoding='utf-8') as fp:
        for _ in tqdm(range(num)):
            orchard = random.choice(orchards)
            orchard_id = orchard['orchard_id']
            company = orchard['company']
            owner = orchard['owner']
            phone = orchard['phone']
            province = orchard['province']
            city = orchard['city']
            county = orchard['county']

            # The draws below stay in the order area -> category -> produce
            # -> gdp -> date so a given seed keeps producing the same file.
            area = _random_measure(3, '亩')
            # About half of the records are navel oranges; the rest are
            # spread over the other categories.
            category = '脐橙' if random.random() > 0.5 else random.choice(other_categories)
            produce = _random_measure(3, '吨')
            gdp = _random_measure(20, '万元')
            date = fake.date_between(first_day, last_day)

            # Final record line.
            fp.write(
                f'{orchard_id},{company},{owner},{phone},{area},{category},'
                f'{produce},{gdp},{date},{province},{city},{county}\n'
            )
    # The original completion message said 果园 (orchard), copy-pasted from
    # gen_orchards; this function generates the 果业 records announced above.
    print('生成果业数据完成')
if __name__ == '__main__':
    # Number of orchards to create, and number of records drawn from them
    # (500 orchards -> 50,000 records, a 1:100 ratio).
    orchard_total = 5 * 100
    record_total = 500 * 100

    # Build the orchard pool first; the records reference it.
    orchards = gen_orchards(orchard_total)
    gen_data(orchards, record_total)
Last modified: 10 October 2024