103-数仓项目:数据分析
大屏页面地址:124.221.116.22
基于果业数据和大屏需求进行数据分析构建数仓
create database orchard;
create external table orchard_data
(
orchard_id string,
company string,
owner string,
phone string,
area string,
category string,
produce string,
gdp string,
data_date date,
province string,
city string,
county string
) row format delimited fields terminated by ','
location '/orchard_data' ;
-- 数据抽样
select * from orchard_data tablesample ( 1 percent );
-- 数据清洗并导入新的分桶表
-- 筛选含有空值的数据, 将area, produce, gdp列转为double类型方便统计计算
create external table standard_data
(
orchard_id string,
company string,
owner string,
phone string,
area string,
category string,
produce string,
gdp string,
data_date date,
province string,
city string,
county string
) clustered by (orchard_id) into 5 buckets
row format delimited fields terminated by ',';
insert into table standard_data
-- insert overwrite directory '/export'
select orchard_id, company, owner, phone, cast(split(area, '亩')[0] as double) as area,
category, cast(split(produce, '吨')[0] as double) as produce,
cast(split(gdp, '万元')[0] as double) as gdp, data_date, province, city, county
from orchard_data
where area != ''
and produce != ''
and gdp != '';
select * from standard_data tablesample ( 1 percent );
--统计2024年各类水果产量并插入到新的数据表produce_by_year
create table produce_by_year as
select category, round(sum(produce), 2) as year_produce, '2024' as year
from standard_data
where data_date >= '2024-01-01' and data_date <= '2024-12-31'
group by category
order by year_produce desc ;
--查看结果
select * from produce_by_year;
--统计2024年各区县水果种植面积、产量和产值并插入到新的数据表area_by_year
create table count_by_county as
select county,
round(sum(area), 2) as year_produce,
round(sum(produce), 2) as year_area,
round(sum(gdp), 2) as year_gdp,
'2024' as year
from standard_data
where data_date >= '2024-01-01' and data_date <= '2024-12-31'
group by county
order by year_gdp desc ;
--查看结果
select * from count_by_county;
-- 查看2024年gdp top10企业
select company, owner, round(sum(gdp), 2) as year_gdp
from standard_data
where data_date >= '2024-01-01' and data_date <= '2024-12-31'
group by company, owner
order by year_gdp desc
limit 10;
Last modified: 10 October 2024