Hive 常用的命令
- 创建外部表
CREATE EXTERNAL TABLE req_sfst_time_20241010_v3 (
cip STRING,
sip STRING,
rtime INT
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n'
NULL DEFINED AS ''
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
'viewfs://c9/user_ext/ad_engine/warehouse/req_sfst_time_20241010_v3'
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
2
3
4
5
6
7
8
9
10
11
12
13
14
15
- 将查询结果插入表中
INSERT INTO TABLE req_sfst_time_20241010_v3
select
get_json_object(json_log, '$.server_addr') AS cip,
get_json_object(tt.item, '$.ip') AS sip,
cast(get_json_object(tt.item, '$.rtime') as int) as rtime
from ods_ad_req_log_json_hour
lateral view explode(split(regexp_replace(regexp_replace(get_json_object(json_log, '$.products'), '\\[|\\]',''),'\\}\\,\\{','\\}\\@@\\{'),'\\@@')) tt as item
where
dt='20241010'
and get_json_object(item, '$.product')='Sfst'
and cast(get_json_object(item, '$.rc') as int)>=100
and cast(get_json_object(item, '$.rc') as int)<=600;
1
2
3
4
5
6
7
8
9
10
11
12
2
3
4
5
6
7
8
9
10
11
12
时间戳转换
from_unixtime(CAST(current_timestamp() AS bigint) - 86400, 'yyyyMMdd')
json 字段
get_json_object(extend, '$.launchid') is null
字符串分割
split(adid,',')
列转行
lateral view explode(split(adid,',')) tt as ad
- ad 是多列转换成一列后的列名
json 字符串列转行
- SELECT explode(split(regexp_replace(regexp_replace('[{"website":"www.baidu.com","name":"百度"},{"website":"google.com","name":"谷歌"}]', '\]',''),'\}\,\{','\}\;\{'),'\;'));
SELECT explode(split( regexp_replace( regexp_replace( '[ {"website":"www.baidu.com","name":"百度"}, {"website":"google.com","name":"谷歌"} ]', '\\[|\\]',''), --将 Json 数组两边的中括号去掉 '\\}\\,\\{' --将 Json 数组元素之间的逗号换成分号 ,'\\}\\;\\{'), '\\;')); --以分号作为分隔符
1
2
3
4
5
6
7
8
9
10
11
12
13- 原选项就是字符串,不是hive数组
字符串切割
substr(plat, 3, 3)
- 下标从 1 开始,开始位置,截取长度
字符串拼接
concat_ws('.', "aaa", "bbb")
- 通过拼接符将两个字符串拼接起来
条件语句
select adid, case when (length(uid)>9 and cast(substr(uid,-9,3) as int)<50) or uid="" then "new" else "old" end, count(1)
group by adid, o_url, if((length(uid) > 9 and cast(substr(uid, -9, 3) as int) < 50) or uid is null or uid="", "new", "old")
行编号
row_number() over (partition by uid, deliver_id order by record_time) as ranking
- row_number() over 对结果集中的行进行编号,partition by 子句用于指定分组的字段,order by 子句用于指定排序的字段
- row_number():即排序次序连续,但不会出现同一排名
- rank():当分组内会出现同一个排名,但是排名次序是不连续的
- dense_rank():分组内可能会出现同样的次序,且排序名次是连续的
分位值
select percentile(cast(duration as int), 0.99) as per
编辑 (opens new window)
上次更新: 2024/10/12, 19:03:12