您好,欢迎来到爱问旅游网。
搜索
您的当前位置:首页hive 生成拉链表&拉链表数据查询&拉链表回滚

hive 生成拉链表&拉链表数据查询&拉链表回滚

来源:爱问旅游网

生成拉链表思路分析

ods层

show databases;
create database test_zip;
-- ods
drop table if exists test_zip.ods_user_info;
create table test_zip.ods_user_info
(
    id   string comment 'id',
    name string comment '姓名'
) comment '用户信息表'
    partitioned by (load_date string comment '分区字段');

-- 装载语句
-- 将以下数据插入 ods_user_info
insert into table test_zip.ods_user_info values ("1", "张三", "2020-06-14");
insert into table test_zip.ods_user_info values ("2", "李四", "2020-06-14");
insert into table test_zip.ods_user_info values ("3", "王五", "2020-06-14");
insert into table test_zip.ods_user_info values ("4", "赵六", "2020-06-14");
insert into table test_zip.ods_user_info values ("5", "小明", "2020-06-14");
insert into table test_zip.ods_user_info values ("6", "小华", "2020-06-14");
insert into table test_zip.ods_user_info values ("7", "小强", "2020-06-14");
-- 第二天的新增和变化数据
insert into table test_zip.ods_user_info values ("6", "大华", "2020-06-15");
insert into table test_zip.ods_user_info values ("7", "大强", "2020-06-15");
insert into table test_zip.ods_user_info values ("8", "马克", "2020-06-15");
insert into table test_zip.ods_user_info values ("9", "威廉", "2020-06-15");

dim层 初始化拉链表

drop table if exists dim_user_info_zip;
create table if not exists dim_user_info_zip
(
    id         string comment 'id',
    name       string comment '姓名',
    start_date string comment '开始日期',
    end_date   string comment '日期'
) comment '用户拉链表' partitioned by (dt string comment '分区字段');

首日装载

-- 首日装载(全量数据载) 将 2020-06-14 作为数仓上线第一天
insert overwrite table dim_user_info_zip partition (dt)
select id,
       name,
       '2020-06-14' as start_date,
       '9999-12-31' as end_date,
       '9999-12-31' as dt
from ods_user_info
where dt = '2020-06-14';

select * from dim_user_info_zip;

每日装载(15号)

-- todo 2020-06-15
-- 每日装载
-- 每日装载 获取第二天新增和变化的数据
with tmp as (
            select old.id         as old_id,
                   old.name       as old_name,
                   old.start_date as old_start_date,
                   old.end_date   as old_end_date,
                   new.id         as new_id,
                   new.name       as new_name,
                   new.start_date as new_start_date,
                   new.end_date   as new_end_date
            from (
                 select id,
                        name,
                        start_date,
                        end_date
                 from dim_user_info_zip
                 where dt = '9999-12-31'
                 ) old
                     full outer join
                 ( -- 获取第二天新增和变化的数据
                 select id,
                        name,
                        '2020-06-15' as start_date,
                        '9999-12-31' as end_date
                 from ods_user_info
                 where dt = '2020-06-15'
                 ) new
                 on old.id = new.id
            )
insert overwrite table dim_user_info_zip partition (dt)
select if(new_id is not null, new_id, old_id)                 id,
       if(new_id is not null, new_name, old_name)             name,
       if(new_id is not null, new_start_date, old_start_date) start_date,
       if(new_id is not null, new_end_date, old_end_date)     end_date,
       if(new_id is not null, new_end_date, old_end_date)     dt
from tmp
union all
select old_id                                    as id,
       old_name                                  as name,
       old_start_date                            as start_date,
       cast(date_sub('2020-06-15', 1) as string) as end_date,
       cast(date_sub('2020-06-15', 1) as string) as dt
from tmp
where old_id is not null
  and new_id is not null;
select * from dim_user_info_zip;

select id,
       name,
       start_date,
       '9999-12-31' as end_date,
       '9999-12-31' as dt
from dim_user_info_zip
where end_date >= '2020-06-15'
  and start_date <= '2020-06-15';

-- 15号也是当天最新数据
select *
from dim_user_info_zip
where end_date='9999-12-31';

每日装载(16号)

-- todo 2020-06-16
-- 每日装载
-- 每日装载 获取第二天新增和变化的数据
with tmp as (
            select old.id         as old_id,
                   old.name       as old_name,
                   old.start_date as old_start_date,
                   old.end_date   as old_end_date,
                   new.id         as new_id,
                   new.name       as new_name,
                   new.start_date as new_start_date,
                   new.end_date   as new_end_date
            from (
                 select id,
                        name,
                        start_date,
                        end_date
                 from dim_user_info_zip
                 where dt = '9999-12-31'
                 ) old
                     full outer join
                 ( -- 获取第二天新增和变化的数据
                 select id,
                        name,
                        '2020-06-16' as start_date,
                        '9999-12-31' as end_date
                 from ods_user_info
                 where dt = '2020-06-16'
                 ) new
                 on old.id = new.id
            )
insert overwrite table dim_user_info_zip partition (dt)
select if(new_id is not null, new_id, old_id)                 id,
       if(new_id is not null, new_name, old_name)             name,
       if(new_id is not null, new_start_date, old_start_date) start_date,
       if(new_id is not null, new_end_date, old_end_date)     end_date,
       if(new_id is not null, new_end_date, old_end_date)     dt
from tmp
union all
select old_id                                    as id,
       old_name                                  as name,
       old_start_date                            as start_date,
       cast(date_sub('2020-06-16', 1) as string) as end_date,
       cast(date_sub('2020-06-16', 1) as string) as dt
from tmp
where old_id is not null
  and new_id is not null;
select * from dim_user_info_zip;

select id,
       name,
       start_date,
       '9999-12-31' as end_date,
       '9999-12-31' as dt
from dim_user_info_zip
where end_date >= '2020-06-16'
  and start_date <= '2020-06-16';

-- 16号也是当天最新数据
select *
from dim_user_info_zip
where end_date='9999-12-31';

每日装载(17号)

-- todo 2020-06-17
with tmp as (
            select old.id         as old_id,
                   old.name       as old_name,
                   old.start_date as old_start_date,
                   old.end_date   as old_end_date,
                   new.id         as new_id,
                   new.name       as new_name,
                   new.start_date as new_start_date,
                   new.end_date   as new_end_date
            from (
                 select id,
                        name,
                        start_date,
                        end_date
                 from dim_user_info_zip
                 where dt = '9999-12-31'
                 ) old
                     full outer join
                 ( -- 获取第二天新增和变化的数据
                 select id,
                        name,
                        '2020-06-17' as start_date,
                        '9999-12-31' as end_date
                 from ods_user_info
                 where dt = '2020-06-17'
                 ) new
                 on old.id = new.id
            )
insert overwrite table dim_user_info_zip partition (dt)
select if(new_id is not null, new_id, old_id)                 id,
       if(new_id is not null, new_name, old_name)             name,
       if(new_id is not null, new_start_date, old_start_date) start_date,
       if(new_id is not null, new_end_date, old_end_date)     end_date,
       if(new_id is not null, new_end_date, old_end_date)     dt
from tmp
union all
select old_id                                    as id,
       old_name                                  as name,
       old_start_date                            as start_date,
       cast(date_sub('2020-06-17', 1) as string) as end_date,
       cast(date_sub('2020-06-17', 1) as string) as dt
from tmp
where old_id is not null
  and new_id is not null;
select * from dim_user_info_zip;

select id,
       name,
       start_date,
       '9999-12-31' as end_date,
       '9999-12-31' as dt
from dim_user_info_zip
where end_date >= '2020-06-16'
  and start_date <= '2020-06-16';

-- 17号也是当天最新数据
select *
from dim_user_info_zip
where end_date='9999-12-31';

拉链表回滚

假设2020-06-16号的数据出错了,要进行回滚,要回滚到2020-06-15,然后重新跑16号、17号的数据

1、重修跑16、17号数据,前提是,16号、17号的数据已经修复正确
2、注意备份数据

-- 拉链表回滚

-- 1、回滚到2020-06-15
insert overwrite table dim_user_info_zip partition (dt)
select id,
       name,
       start_date,
       '9999-12-31' as end_date, -- 将15号,闭链的数据,重新开链,即《威廉》
       '9999-12-31' as dt -- 最新分区
from dim_user_info_zip
where end_date >= '2020-06-15'
  and start_date <= '2020-06-15';

-- 2、重新跑16号、17号的数据
-- 2.1 跑一下 每日装载(16号)
-- 2.2 跑一下 每日装载(17号)

因篇幅问题不能全部显示,请点此查看更多更全内容

Copyright © 2019- awee.cn 版权所有 湘ICP备2023022495号-5

违法及侵权请联系:TEL:199 1889 7713 E-MAIL:2724546146@qq.com

本站由北京市万商天勤律师事务所王兴未律师提供法律服务