Hive Syntax Used On-Site

Some Hive syntax used in the field.

1. Insert into a specified partition of a table

insert into table simba.t_res_phy_attribute_3d partition(action_time='202006', import_time='2020060315') select uuid,code_type,code,attribute_type,attribute,code_area,attribute_area,capture_time,longitude,latitude,location_type,location,location_area,action,origin_data,origin_data_uuid,start_time,end_time,count from simba.simba_phy_attribute where action_time='202006' and import_time>=2020060100 and import_time<2020060400 and origin_data not in ('lt_jizhu','yd_jizhu','dx_jizhu');

  1. Generate a UUID

    reflect("java.util.UUID","randomUUID")

    The underlying function is GenericUDFReflect(class, method[, arg1[, arg2]...]).
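    A minimal usage sketch (reusing the simba_phy_attribute table from the insert above; any table works as a row source):

    select reflect("java.util.UUID","randomUUID") as uuid, code, capture_time
    from simba.simba_phy_attribute
    limit 3;

    Each output row gets a freshly generated random UUID string.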
  2. Regular expression for a timestamp format

    ^[1-9]\d{3}-(0[1-9]|1[0-2])-(0[1-9]|[1-2][0-9]|3[0-1])\s+(20|21|22|23|[0-1]\d):[0-5]\d:[0-5]\d$

    regexp_extract('2020-06-12 12:37:37','^[1-9]\\d{3}-(0[1-9]|1[0-2])-(0[1-9]|[1-2][0-9]|3[0-1])\\s+(20|21|22|23|[0-1]\\d):[0-5]\\d:[0-5]\\d$',0)

    regexp_extract(offereffdate,'^[1-9]\\d{3}-(0[1-9]|1[0-2])-(0[1-9]|[1-2][0-9]|3[0-1])\\s+(20|21|22|23|[0-1]\\d):[0-5]\\d:[0-5]\\d$',0)

    • If there is no match, an empty string '' is returned. Note that inside a Hive string literal each backslash must be doubled, as in the calls above.
    • Note that the result is an empty string, not ' ', and also not NULL. To test for it, use x != '', or x != '' and x is not null, as in the sketch below.
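    A filtering sketch using this empty-string behavior (table and column borrowed from the top-1 example later in this note):

    select offereffdate
    from simba.tj_jizhudengji
    where regexp_extract(offereffdate,'^[1-9]\\d{3}-(0[1-9]|1[0-2])-(0[1-9]|[1-2][0-9]|3[0-1])\\s+(20|21|22|23|[0-1]\\d):[0-5]\\d:[0-5]\\d$',0) != '';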
  3. Drop a specified partition of a table

    alter table simba.simba_phy_attribute_temp drop if exists partition (action_time=2001-0) PURGE;
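    Hive also accepts comparison operators in the partition spec of a drop statement, which is convenient for clearing a range of partitions at once (the cutoff value here is illustrative):

    alter table simba.simba_phy_attribute_temp drop if exists partition (import_time<'2020060100') PURGE;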
  4. Convert a string to a timestamp

    from_utc_timestamp(date_format('2020-05-12 19:57:12','yyyy-MM-dd HH:mm:ss'),'UTC')
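    If the string is already in yyyy-MM-dd HH:mm:ss form, a plain cast is a simpler alternative:

    select cast('2020-05-12 19:57:12' as timestamp);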
  5. Rename a Hive table

    alter table <table_name> rename to <new_table_name>;

    Example:

    alter table titan_mass.t_20200613114054_rxwi rename to simba.t_res_jizhudengji;
  6. Export data from Hive to the local filesystem

    1. Save the table to be exported as a new table in text format.
      It can be CSV format; the syntax is as follows (placeholders in angle brackets):

      create table <new_table_name> row format serde 'org.apache.hadoop.hive.serde2.OpenCSVSerde' with SERDEPROPERTIES ('escapeChar'='\"', 'quoteChar'='\"','separatorChar'=',') stored as textfile as select * from <table_to_export>;

      For example:

      create table titan_mass.special_app_result01 row format serde 'org.apache.hadoop.hive.serde2.OpenCSVSerde' with SERDEPROPERTIES ('escapeChar'='\"', 'quoteChar'='\"','separatorChar'=',') stored as textfile as select * from titan_mass.aaaaa;

      It can also be a plain textfile format, as below with \t as the field separator and \n as the line separator (note that the hyphenated source table name must be quoted with backticks):

      create table titan_temp.express_yunda row format delimited fields terminated by '\t' lines terminated by '\n' stored as textfile as select * from simba.`express-yunda-waybill`;
    2. Confirm the HDFS path of the CSV table to be exported.
      For example:

      desc formatted special_app_result01;
      hdfs://ambari01/apps/hive/warehouse/titan_mass.db/special_app_result01
    3. On a machine with the Hadoop client installed, run the following to copy the CSV data to the local filesystem:

      hadoop fs -get hdfs://ambari01/apps/hive/warehouse/titan_mass.db/special_app_result01 /opt/install
    4. The files can then be copied to Windows and processed with the import tool, or you can first configure the import tool's model on Windows, copy the tool and the model to the machine holding the data, and run them on Linux. A query-level alternative is sketched after this list.
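    An alternative sketch that skips the intermediate table: insert overwrite local directory writes the query result straight to a local path (local to the machine executing the job, so check where the files actually land; the path and table reuse the example above):

    insert overwrite local directory '/opt/install/special_app_result01'
    row format delimited fields terminated by ','
    select * from titan_mass.special_app_result01;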

  7. Take the top-1 row per group

    Taking the top-1 row per group in Hive, via row_number() over a partition:

    create table titan_temp.jizhudengji_etl row format delimited fields terminated by '\t' lines terminated by '\n' stored as textfile as select customername,certtypeid,idcard,cardaddr,originphone,address,status,offerexpdate,offereffdate from (select customername,certtypeid,idcard,cardaddr,originphone,address,status,offerexpdate,offereffdate,row_number() over(partition by originphone order by offereffdate desc) as row_number from simba.tj_jizhudengji) t where row_number=1;
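    The same pattern in skeleton form (all names hypothetical); swapping row_number() for rank() would keep every row that ties on the ordering column:

    select * from (
      select t.*, row_number() over (partition by grp_col order by order_col desc) as rn
      from some_table t
    ) x where rn=1;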

Recovering dropped or truncated Hive tables and partitions

Steps

  1. Switch to the hive user:
    su - hive
  2. On HDFS, inspect the mistakenly deleted table or partitions under the .Trash directory:
    hdfs dfs -ls /user/hive/.Trash/Current/apps/hive/warehouse/titan_temp.db/t_email
  3. Use the hdfs mv or cp command to restore the data from the .Trash directory back to the table's original path:
    hdfs dfs -cp /user/hive/.Trash/Current/apps/hive/warehouse/titan_temp.db/t_email/* /apps/hive/warehouse/titan_temp.db/t_email
  4. Restore the meta information:
    msck repair table titan_temp.t_email;
  5. Restore the statistics:
    --see "existing tables - ANALYZE" in the StatsDev chapter of the official Hive docs; run ANALYZE over all the partitions
    analyze table titan_temp.t_email partition(action_time,import_time) compute statistics;

Recovery walkthrough for truncate and drop

1. Create a table and insert test data

--create the table and insert test data
create table t_test_recover row format delimited fields terminated by '\t' stored as orc tblproperties ('orc.compress'='SNAPPY') as select * from relate_attribute_stat_temp limit 10000;

create table if not exists titan_temp.t_email
(
cor_id string comment 'unique CDR identifier',
location_code decimal(6) comment 'data source location'

)
comment 't_email'
partitioned by (
action_time string comment 'cap_time',
import_time string comment 'import_time'
)
row format delimited
fields terminated by '\t'
stored as orc tblproperties ('orc.compress'='SNAPPY');

ALTER TABLE titan_temp.t_email SET SERDEPROPERTIES ('escape.delim' = '\\');

insert into titan_temp.t_email partition(action_time='202006',import_time='20200629') select cdrid,location_code from simba.data_email where import_time='2020062912' limit 200;
insert into titan_temp.t_email partition(action_time='202007',import_time='20200729') select cdrid,location_code from simba.data_email where import_time='2020072912' limit 200;

--check the row count of the test table
0: jdbc:hive2://192.168.10.27:2181,192.168.10> select count(1) from titan_temp.t_email;
+------+--+
| _c0 |
+------+--+
| 400 |
+------+--+

2. Recovery after truncate

--empty the table with truncate
0: jdbc:hive2://192.168.10.27:2181,192.168.10> truncate table titan_temp.t_email;
No rows affected (0.349 seconds)

--switch to the hive user
--inspect the truncated data under the .Trash directory on HDFS
-bash-4.1$ hdfs dfs -ls /user/hive/.Trash/Current/apps/hive/warehouse/titan_temp.db/t_email
Found 2 items
drwx------ - hive hdfs 0 2020-07-31 11:09 /user/hive/.Trash/Current/apps/hive/warehouse/titan_temp.db/t_email/action_time=202006
drwx------ - hive hdfs 0 2020-07-31 11:09 /user/hive/.Trash/Current/apps/hive/warehouse/titan_temp.db/t_email/action_time=202007

--restore the data under the HDFS table path
-bash-4.1$ hdfs dfs -cp /user/hive/.Trash/Current/apps/hive/warehouse/titan_temp.db/t_email/* /apps/hive/warehouse/titan_temp.db/t_email

-bash-4.1$ hdfs dfs -ls /apps/hive/warehouse/titan_temp.db/t_email
Found 2 items
drwxrwxrwx - hive hdfs 0 2020-07-31 11:09 /apps/hive/warehouse/titan_temp.db/t_email/action_time=202006
drwxrwxrwx - hive hdfs 0 2020-07-31 11:09 /apps/hive/warehouse/titan_temp.db/t_email/action_time=202007

0: jdbc:hive2://192.168.10.27:2181,192.168.10> select * from titan_temp.t_email limit 1;
+---------------------------------------+------------------------+----------------------+----------------------+--+
| t_email.cor_id | t_email.location_code | t_email.action_time | t_email.import_time |
+---------------------------------------+------------------------+----------------------+----------------------+--+
| e3b30b3d-35fc-400b-ae53-fce3ddfd9d6c | 120000 | 202006 | 20200629 |
+---------------------------------------+------------------------+----------------------+----------------------+--+
1 row selected (0.119 seconds)
0: jdbc:hive2://192.168.10.27:2181,192.168.10> select count(1) from titan_temp.t_email;
+------+--+
| _c0 |
+------+--+
| 0 |
+------+--+

--repair the meta information
0: jdbc:hive2://192.168.10.27:2181,192.168.10> msck repair table titan_temp.t_email;
--compute statistics
0: jdbc:hive2://192.168.10.27:2181,192.168.10> analyze table titan_temp.t_email compute statistics;
Error: Error while compiling statement: FAILED: SemanticException [Error 10115]: Table is partitioned and partition specification is needed (state=42000,code=10115)

--see "existing tables - ANALYZE" in the StatsDev chapter of the official Hive docs; run ANALYZE over all the partitions
0: jdbc:hive2://192.168.10.27:2181,192.168.10> analyze table titan_temp.t_email partition(action_time,import_time) compute statistics;
No rows affected (0.485 seconds)

0: jdbc:hive2://192.168.10.27:2181,192.168.10> select count(1) from titan_temp.t_email;
+------+--+
| _c0 |
+------+--+
| 400 |
+------+--+
1 row selected (0.085 seconds)

3. Recovery after drop

--DROP TABLE
0: jdbc:hive2://192.168.10.27:2181,192.168.10> drop table titan_temp.t_email;
No rows affected (0.315 seconds)

0: jdbc:hive2://192.168.10.27:2181,192.168.10> select count(1) from titan_temp.t_email;
Error: Error while compiling statement: FAILED: SemanticException [Error 10001]: Line 1:21 Table not found 't_email' (state=42S02,code=10001)

-bash-4.1$ hdfs dfs -ls /apps/hive/warehouse/titan_temp.db/t_email
ls: `/apps/hive/warehouse/titan_temp.db/t_email': No such file or directory
-bash-4.1$

--recreate the table, then inspect the new table
0: jdbc:hive2://192.168.10.27:2181,192.168.10> select count(1) from titan_temp.t_email;
+------+--+
| _c0 |
+------+--+
| 0 |
+------+--+
1 row selected (0.45 seconds)
0: jdbc:hive2://192.168.10.27:2181,192.168.10> show partitions titan_temp.t_email;
+------------+--+
| partition |
+------------+--+
+------------+--+
No rows selected (0.208 seconds)

--restore the data from .Trash
-bash-4.1$ hdfs dfs -cp /user/hive/.Trash/Current/apps/hive/warehouse/titan_temp.db/t_email /apps/hive/warehouse/titan_temp.db/

-bash-4.1$ hdfs dfs -ls /apps/hive/warehouse/titan_temp.db/t_email
Found 2 items
drwxr-xr-x - hive hdfs 0 2020-07-31 14:30 /apps/hive/warehouse/titan_temp.db/t_email/action_time=202006
drwxr-xr-x - hive hdfs 0 2020-07-31 14:30 /apps/hive/warehouse/titan_temp.db/t_email/action_time=202007

--the restored files are back, but the partitions are still missing from the metastore
0: jdbc:hive2://192.168.10.27:2181,192.168.10> show partitions titan_temp.t_email;
+------------+--+
| partition |
+------------+--+
+------------+--+
No rows selected (0.208 seconds)

--repair the meta information to restore the partitions
0: jdbc:hive2://192.168.10.27:2181,192.168.10> msck repair table titan_temp.t_email;
No rows affected (0.238 seconds)

0: jdbc:hive2://192.168.10.27:2181,192.168.10> show partitions titan_temp.t_email;
+------------------------------------------+--+
| partition |
+------------------------------------------+--+
| action_time=202006/import_time=20200629 |
| action_time=202007/import_time=20200729 |
+------------------------------------------+--+
2 rows selected (0.164 seconds)

--re-run count(1), or recompute the statistics
0: jdbc:hive2://192.168.10.27:2181,192.168.10> analyze table titan_temp.t_email partition(action_time,import_time) compute statistics;
No rows affected (0.422 seconds)

0: jdbc:hive2://192.168.10.27:2181,192.168.10> select count(1) from titan_temp.t_email;
+------+--+
| _c0 |
+------+--+
| 400 |
+------+--+
1 row selected (0.154 seconds)

4. Recovery after dropping a partition

0: jdbc:hive2://192.168.10.27:2181,192.168.10> alter table titan_temp.t_email drop if exists partition (action_time=202006);
INFO : Dropped the partition action_time=202006/import_time=20200629
No rows affected (0.272 seconds)
0: jdbc:hive2://192.168.10.27:2181,192.168.10> select count(1) from titan_temp.t_email;
+------+--+
| _c0 |
+------+--+
| 200 |
+------+--+
1 row selected (0.094 seconds)
0: jdbc:hive2://192.168.10.27:2181,192.168.10> show partitions titan_temp.t_email;
+------------------------------------------+--+
| partition |
+------------------------------------------+--+
| action_time=202007/import_time=20200729 |
+------------------------------------------+--+
1 row selected (0.167 seconds)


-bash-4.1$ hdfs dfs -ls /user/hive/.Trash/Current/apps/hive/warehouse/titan_temp.db/t_email
Found 2 items
drwx------ - hive hdfs 0 2020-07-31 14:44 /user/hive/.Trash/Current/apps/hive/warehouse/titan_temp.db/t_email/action_time=202006
drwxr-xr-x - hive hdfs 0 2020-07-31 14:44 /user/hive/.Trash/Current/apps/hive/warehouse/titan_temp.db/t_email/action_time=2020061596177856492
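--the entry with the numeric suffix is a name-collision copy: when a path already exists in the Trash, HDFS appends the current timestamp in milliseconds; only the plain action_time=202006 directory needs to be restored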


-bash-4.1$ hdfs dfs -mv /user/hive/.Trash/Current/apps/hive/warehouse/titan_temp.db/t_email/action_time=202006 /apps/hive/warehouse/titan_temp.db/t_email
-bash-4.1$ hdfs dfs -ls /apps/hive/warehouse/titan_temp.db/t_email
Found 2 items
drwx------ - hive hdfs 0 2020-07-31 14:44 /apps/hive/warehouse/titan_temp.db/t_email/action_time=202006
drwxr-xr-x - hive hdfs 0 2020-07-31 14:30 /apps/hive/warehouse/titan_temp.db/t_email/action_time=202007

0: jdbc:hive2://192.168.10.27:2181,192.168.10> msck repair table titan_temp.t_email;
No rows affected (0.229 seconds)
0: jdbc:hive2://192.168.10.27:2181,192.168.10> show partitions titan_temp.t_email;
+------------------------------------------+--+
| partition |
+------------------------------------------+--+
| action_time=202006/import_time=20200629 |
| action_time=202007/import_time=20200729 |
+------------------------------------------+--+
2 rows selected (0.161 seconds)
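
As in the truncate and drop cases, finish by refreshing the statistics so that count(1) reflects the restored partition:

analyze table titan_temp.t_email partition(action_time,import_time) compute statistics;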