impala 分析函数
impala_analytic_functions
临时表
作用:需要重复使用的查询结果不必重复书写和查询,降低语句复杂度,方便查看
-- Minimal CTE example: the WITH clause names an inline result set so the main
-- query can refer to it by name instead of repeating the subquery.
WITH table_name AS (
    SELECT 1 AS id, 2 AS num
    UNION ALL  -- the literal rows are already distinct, so no dedup pass is needed
    SELECT 2, 2
)
SELECT
    id,
    num
FROM table_name;
%% 取每个账号的第一个创角记录作为临时表
-- Keep only the earliest role-creation record of every account.
WITH role_unique AS (
    SELECT
        role_id,
        create_time,
        row_num
    FROM (
        SELECT
            role_id,
            create_time,
            ROW_NUMBER() OVER (
                PARTITION BY account_name
                -- role_id breaks ties between roles created in the same second,
                -- so the row that survives the row_num = 1 filter is deterministic.
                ORDER BY create_time ASC, role_id ASC
            ) AS row_num
        FROM t_log_role_create
    ) role_create
    WHERE row_num = 1
)
-- A WITH clause cannot stand alone; it has to be followed by the statement
-- that consumes the CTE.
SELECT
    role_id,
    create_time
FROM role_unique;
分析函数
function(args) OVER([partition_by_clause] [order_by_clause [window_clause]])
- FUNCTION 子句
- PARTITION 子句
- ORDER BY 子句
- WINDOWING 子句
PARTITION
类似GROUP BY,分组
ORDER BY
排序
窗口条件
ROWS BETWEEN [ { m | UNBOUNDED } PRECEDING | CURRENT ROW] [ AND [CURRENT ROW | { UNBOUNDED | n } FOLLOWING] ]
RANGE BETWEEN [ {m | UNBOUNDED } PRECEDING | CURRENT ROW] [ AND [CURRENT ROW | { UNBOUNDED | n } FOLLOWING] ]
关于ROWS和RANGE的区别
- ROWS 按物理行确定窗口边界:即使排序值相同,每一行也视为新的计算行,拥有各自的窗口
- RANGE 按排序值确定窗口边界:排序值相同的所有行视为同一计算行,共享同一个窗口,计算结果相同
Query:
-- Running total with a ROWS frame: the frame ends at the current physical row,
-- so rows that share the same num still get different prefix sums.
WITH num_table AS (
    -- ids are unique, so UNION ALL yields the same rows without a dedup pass.
    SELECT 1 AS id, 1 AS num UNION ALL
    SELECT 2, 2 UNION ALL
    SELECT 3, 3 UNION ALL
    SELECT 4, 6 UNION ALL
    SELECT 5, 4 UNION ALL
    SELECT 6, 5 UNION ALL
    SELECT 7, 5 UNION ALL
    SELECT 8, 4
)
SELECT
    num,
    SUM(num) OVER (
        ORDER BY num ASC
        ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS total
FROM num_table
-- Without an outer ORDER BY the display order of the result is not guaranteed.
ORDER BY num, total;
+-----+-------+
| num | total |
+-----+-------+
| 1 | 1 |
| 2 | 3 |
| 3 | 6 |
| 4 | 10 |
| 4 | 14 |
| 5 | 19 |
| 5 | 24 |
| 6 | 30 |
Query:
-- Running total with a RANGE frame: the frame ends at the last row whose num
-- equals the current row's num, so tied rows share one cumulative sum.
WITH num_table AS (
    -- ids are unique, so UNION ALL yields the same rows without a dedup pass.
    SELECT 1 AS id, 1 AS num UNION ALL
    SELECT 2, 2 UNION ALL
    SELECT 3, 3 UNION ALL
    SELECT 4, 6 UNION ALL
    SELECT 5, 4 UNION ALL
    SELECT 6, 5 UNION ALL
    SELECT 7, 5 UNION ALL
    SELECT 8, 4
)
SELECT
    num,
    SUM(num) OVER (
        ORDER BY num ASC
        RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS total
FROM num_table
-- Without an outer ORDER BY the display order of the result is not guaranteed.
ORDER BY num;
+-----+-------+
| num | total |
+-----+-------+
| 1 | 1 |
| 2 | 3 |
| 3 | 6 |
| 4 | 14 |
| 4 | 14 |
| 5 | 24 |
| 5 | 24 |
| 6 | 30 |
Row_Number,Rank,Dense_Rank
- Row_Number,整数的升序顺序,从1开始,逐行加1
- Rank,整数的升序顺序,从1开始,重复值生成相同的整数,之后的序号按重复行的数量跳号(如 1,2,2,4)
- Dense_Rank,整数的升序顺序,从1开始,重复值生成相同的整数,之后的序号连续递增、不跳号(如 1,2,2,3)
使用:
- 获取最新数据
- 获取topN数据
-- Side-by-side comparison of ROW_NUMBER, RANK and DENSE_RANK over the same
-- ordering column, using inline sample data that contains ties on account_name.
WITH create_table AS (
    SELECT '2018-10-01' AS create_date, 'account1' AS account_name UNION ALL
    SELECT '2018-10-02', 'account2' UNION ALL
    SELECT '2018-10-03', 'account3' UNION ALL
    SELECT '2018-10-04', 'account3' UNION ALL
    SELECT '2018-10-05', 'account2' UNION ALL
    SELECT '2018-10-06', 'account4' UNION ALL
    SELECT '2018-10-07', 'account5'
)
SELECT
    create_date,
    account_name,
    -- create_date breaks ties on account_name; without it the numbering of the
    -- tied rows is arbitrary and can change between runs.
    ROW_NUMBER() OVER (ORDER BY account_name, create_date) AS row_num,
    -- RANK and DENSE_RANK deliberately order by account_name only, so the
    -- tied rows receive the same value.
    RANK() OVER (ORDER BY account_name) AS rank_id,
    DENSE_RANK() OVER (ORDER BY account_name) AS dense_id
FROM create_table
-- Same tie-breaker on the final sort so the rows appear in row_num order.
ORDER BY
    account_name,
    create_date;
+-------------+--------------+---------+---------+----------+
| create_date | account_name | row_num | rank_id | dense_id |
+-------------+--------------+---------+---------+----------+
| 2018-10-01 | account1 | 1 | 1 | 1 |
| 2018-10-02 | account2 | 2 | 2 | 2 |
| 2018-10-05 | account2 | 3 | 2 | 2 |
| 2018-10-03 | account3 | 4 | 4 | 3 |
| 2018-10-04 | account3 | 5 | 4 | 3 |
| 2018-10-06 | account4 | 6 | 6 | 4 |
| 2018-10-07 | account5 | 7 | 7 | 5 |
LAG,LEAD
- LAG(col, n, DEFAULT) 用于统计窗口内往上第n行值
- LEAD(col, n, DEFAULT) 用于统计窗口内往下第n行值, 与LAG相反
-- LAG / LEAD demo on inline sample data: for each day show the previous day's
-- pay, the day's own pay, the next day's pay, and a moving average over the
-- previous, current and next rows.
WITH pay_date AS (
    SELECT '2018-10-01' dt, 5000 pay
    UNION ALL SELECT '2018-10-02', 6000
    UNION ALL SELECT '2018-10-03', 7000
    UNION ALL SELECT '2018-10-04', 8000
    UNION ALL SELECT '2018-10-05', 9000
    UNION ALL SELECT '2018-10-06', 10000
    UNION ALL SELECT '2018-10-07', 11000
)
SELECT
    dt,
    -- NULL on the first row: there is no earlier row in the window.
    LAG(pay, 1) OVER (ORDER BY dt) AS pre_day,
    pay,
    -- NULL on the last row: there is no later row in the window.
    LEAD(pay, 1) OVER (ORDER BY dt) AS next_day,
    -- At either edge the frame holds only two rows, so the average covers two values.
    AVG(pay) OVER (
        ORDER BY dt
        ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING
    ) AS pay_average
FROM pay_date
ORDER BY dt;
前一日,当天,后一天,三天平均值(可根据需要调整窗口)
+------------+---------+-------+----------+-------------+
| dt | pre_day | pay | next_day | pay_average |
+------------+---------+-------+----------+-------------+
| 2018-10-01 | NULL | 5000 | 6000 | 5500 |
| 2018-10-02 | 5000 | 6000 | 7000 | 6000 |
| 2018-10-03 | 6000 | 7000 | 8000 | 7000 |
| 2018-10-04 | 7000 | 8000 | 9000 | 8000 |
| 2018-10-05 | 8000 | 9000 | 10000 | 9000 |
| 2018-10-06 | 9000 | 10000 | 11000 | 10000 |
| 2018-10-07 | 10000 | 11000 | NULL | 10500 |
-- Same previous / current / next / moving-average report as the inline demo,
-- but built from t_log_pay: pay_money is first summed per formatted day.
WITH pay_date AS (
    SELECT
        FROM_UNIXTIME(pay_time, 'yyyy-MM-dd') AS dt,
        SUM(pay_money) AS pay
    FROM t_log_pay
    -- Inclusive epoch-second bounds spanning 7 * 86400 seconds.
    -- NOTE(review): the lower bound is not a UTC midnight (it is one at UTC+8);
    -- confirm FROM_UNIXTIME formats in the time zone these bounds assume.
    WHERE pay_time >= 1538323200
      AND pay_time <= 1538927999
    GROUP BY dt
)
SELECT
    dt,
    -- LAG / LEAD step by rows, not by calendar days: a day with no payments has
    -- no row here, so "previous" then means the previous day that has data.
    LAG(pay, 1) OVER (ORDER BY dt) AS pre_day,
    pay,
    LEAD(pay, 1) OVER (ORDER BY dt) AS next_day,
    AVG(pay) OVER (
        ORDER BY dt
        ROWS BETWEEN 1 PRECEDING AND 1 FOLLOWING
    ) AS pay_average
FROM pay_date
ORDER BY dt;
FIRST_VALUE,LAST_VALUE
- FIRST_VALUE 取分组内排序后,截止到当前行,第一个值
- LAST_VALUE 取分组内排序后,截止到当前行,最后一个值
- FIRST_VALUE(DESC) 获得组内全局的最后一个值
-- FIRST_VALUE / LAST_VALUE demo on inline sample data with two partitions.
-- Every frame is written out in full; "RANGE UNBOUNDED PRECEDING" is shorthand
-- for "RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW".
WITH test_table AS (
    SELECT 1 id, 'test1' test
    UNION ALL SELECT 2, 'test1'
    UNION ALL SELECT 3, 'test1'
    UNION ALL SELECT 4, 'test2'
    UNION ALL SELECT 5, 'test2'
    UNION ALL SELECT 6, 'test2'
)
SELECT
    id,
    test,
    -- Ascending order: the first row of the frame is the smallest id of the partition.
    FIRST_VALUE(id) OVER (
        PARTITION BY test
        ORDER BY id
        RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS first_val,
    -- The frame ends at the current row, so LAST_VALUE returns the current id.
    LAST_VALUE(id) OVER (
        PARTITION BY test
        ORDER BY id DESC
        RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS last_val,
    -- Descending order: the first row of the frame is the largest id of the partition.
    FIRST_VALUE(id) OVER (
        PARTITION BY test
        ORDER BY id DESC
        RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
    ) AS first_desc
FROM test_table
ORDER BY id;
+----+-------+-----------+----------+------------+
| id | test | first_val | last_val | first_desc |
+----+-------+-----------+----------+------------+
| 1 | test1 | 1 | 1 | 3 |
| 2 | test1 | 1 | 2 | 3 |
| 3 | test1 | 1 | 3 | 3 |
| 4 | test2 | 4 | 4 | 6 |
| 5 | test2 | 4 | 5 | 6 |
| 6 | test2 | 4 | 6 | 6 |
使用示例
-- Sample tables used by the retention and LTV queries.
-- String literals must use ASCII single quotes; typographic quotes are not
-- valid SQL string delimiters.

-- One row per role creation; create_time is compared against epoch-second
-- literals in the queries that read this table.
CREATE TABLE t_log_role_create (
    account_name STRING COMMENT '账号',
    create_time  INT    COMMENT '注册时间'
)
COMMENT '玩家注册表'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;

-- One row per login event.
CREATE TABLE t_log_login (
    account_name STRING COMMENT '账号',
    login_time   INT    COMMENT '登录时间'
)
COMMENT '玩家登录表'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;

-- One row per payment.
-- NOTE(review): pay_money is a FLOAT, so sums of money are inexact; consider
-- DECIMAL if exact amounts matter.
CREATE TABLE t_log_pay (
    account_name STRING COMMENT '账号',
    pay_time     INT    COMMENT '充值时间',
    pay_money    FLOAT  COMMENT '充值金额'
)
COMMENT '玩家充值表'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;
统计留存,用户在某段时间内开始使用游戏,经过一段时间后,仍然继续使用游戏的被认作是留存用户,这部分用户占当日新增用户的比例即是用户留存率
-- Retention: take the accounts created inside the cohort window, count how many
-- distinct ones log in on each day, and divide by the size of the cohort.
WITH role_unique_a AS (
    -- First role-creation record per account inside the cohort window.
    SELECT
        account_name,
        create_time,
        row_num
    FROM (
        SELECT
            account_name,
            create_time,
            ROW_NUMBER() OVER (
                PARTITION BY account_name
                ORDER BY create_time ASC
            ) AS row_num
        FROM t_log_role_create
        -- Inclusive epoch-second bounds spanning 86400 seconds.
        -- NOTE(review): the lower bound is not a UTC midnight (it is one at UTC+8);
        -- confirm FROM_UNIXTIME formats in the time zone these bounds assume.
        WHERE create_time BETWEEN 1533657600 AND 1533743999
    ) role_create
    WHERE row_num = 1
)
SELECT
    e.create_date,
    f.day,
    e.account_num,
    f.total,
    (f.total / e.account_num) AS rate
FROM (
    -- Cohort size per creation date.
    SELECT
        COUNT(DISTINCT account_name) AS account_num,
        FROM_UNIXTIME(create_time, 'yyyy-MM-dd') AS create_date
    FROM role_unique_a
    GROUP BY create_date
) e
INNER JOIN (
    -- Distinct cohort accounts that logged in, per login day and creation date.
    SELECT
        COUNT(DISTINCT b.account_name) AS total,
        FROM_UNIXTIME(a.login_time, 'yyyy-MM-dd') AS day,
        FROM_UNIXTIME(b.create_time, 'yyyy-MM-dd') AS create_date
    FROM t_log_login a
    INNER JOIN role_unique_a b
        ON a.account_name = b.account_name
    -- ">=" matches the inclusive lower bound of the cohort window, so a login
    -- in the window's very first second is not dropped.
    WHERE a.login_time >= 1533657600
    GROUP BY
        day,
        create_date
) f
    ON e.create_date = f.create_date
ORDER BY
    e.create_date,
    f.day;
统计LTV,(Lifetime-Value):生命周期价值,即平均一个用户在首次登录游戏到最后一次登录游戏内,为该游戏创造的收入总计
-- LTV: for the accounts created inside the cohort window, accumulate their
-- payments day by day and divide the running total by the size of the cohort.
WITH role_unique_a AS (
    -- First role-creation record per account inside the cohort window.
    SELECT
        account_name,
        create_time,
        row_num
    FROM (
        SELECT
            account_name,
            create_time,
            ROW_NUMBER() OVER (
                -- Partition by account_name only: t_log_role_create has no
                -- "upf" column, and the joins below match on account_name alone,
                -- so any extra partition key could leave several rows per
                -- account and count that account's payments more than once.
                PARTITION BY account_name
                ORDER BY create_time ASC
            ) AS row_num
        FROM t_log_role_create
        -- Inclusive epoch-second bounds spanning 86400 seconds.
        -- NOTE(review): the lower bound is not a UTC midnight (it is one at UTC+8);
        -- confirm FROM_UNIXTIME formats in the time zone these bounds assume.
        WHERE create_time BETWEEN 1533657600 AND 1533743999
    ) role_create
    WHERE row_num = 1
)
SELECT
    e.create_date,
    f.day,
    e.account_num,
    f.total,
    f.total / e.account_num AS ltv
FROM (
    -- Cohort size per creation date.
    SELECT
        COUNT(DISTINCT account_name) AS account_num,
        FROM_UNIXTIME(create_time, 'yyyy-MM-dd') AS create_date
    FROM role_unique_a
    GROUP BY create_date
) e
INNER JOIN (
    -- Running total of the cohort's payments up to and including each day.
    SELECT
        create_date,
        day,
        SUM(pay_money) OVER (
            PARTITION BY create_date
            ORDER BY day
            ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
        ) AS total
    FROM (
        -- Payments of cohort accounts summed per payment day and creation date.
        SELECT
            SUM(a.pay_money) AS pay_money,
            FROM_UNIXTIME(a.pay_time, 'yyyy-MM-dd') AS day,
            FROM_UNIXTIME(b.create_time, 'yyyy-MM-dd') AS create_date
        FROM t_log_pay a
        INNER JOIN role_unique_a b
            ON a.account_name = b.account_name
        -- ">=" matches the inclusive lower bound of the cohort window, so a
        -- payment in the window's very first second is not dropped.
        WHERE a.pay_time >= 1533657600
        GROUP BY
            day,
            create_date
    ) c
) f
    ON e.create_date = f.create_date
ORDER BY
    e.create_date,
    f.day;