Hadoop笔记 Help

103-数仓项目:数据分析

大屏页面地址:124.221.116.22

基于果业数据和大屏需求进行数据分析构建数仓

create database orchard; create external table orchard_data ( orchard_id string, company string, owner string, phone string, area string, category string, produce string, gdp string, data_date date, province string, city string, county string ) row format delimited fields terminated by ',' location '/orchard_data' ; -- 数据抽样 select * from orchard_data tablesample ( 1 percent ); -- 数据清洗并导入新的分桶表 -- 筛选含有空值的数据, 将area, produce, gdp列转为double类型方便统计计算 create external table standard_data ( orchard_id string, company string, owner string, phone string, area string, category string, produce string, gdp string, data_date date, province string, city string, county string ) clustered by (orchard_id) into 5 buckets row format delimited fields terminated by ','; insert into table standard_data -- insert overwrite directory '/export' select orchard_id, company, owner, phone, cast(split(area, '亩')[0] as double) as area, category, cast(split(produce, '吨')[0] as double) as produce, cast(split(gdp, '万元')[0] as double) as gdp, data_date, province, city, county from orchard_data where area != '' and produce != '' and gdp != ''; select * from standard_data tablesample ( 1 percent ); --统计2024年各类水果产量并插入到新的数据表produce_by_year create table produce_by_year as select category, round(sum(produce), 2) as year_produce, '2024' as year from standard_data where data_date >= '2024-01-01' and data_date <= '2024-12-31' group by category order by year_produce desc ; --查看结果 select * from produce_by_year; --统计2024年各区县水果种植面积、产量和产值并插入到新的数据表area_by_year create table count_by_county as select county, round(sum(area), 2) as year_produce, round(sum(produce), 2) as year_area, round(sum(gdp), 2) as year_gdp, '2024' as year from standard_data where data_date >= '2024-01-01' and data_date <= '2024-12-31' group by county order by year_gdp desc ; --查看结果 select * from count_by_county; -- 查看2024年gdp top10企业 select company, owner, round(sum(gdp), 2) as year_gdp from standard_data where data_date >= '2024-01-01' and data_date <= '2024-12-31' group by company, owner order by year_gdp desc limit 10;
Last modified: 10 October 2024