本文共 4645 字,大约阅读时间需要 15 分钟。
根据用户行为以及文章标签筛选出**用户最感兴趣(阅读最多)**的标签
11,101,2018-12-01 06:01:1022,102,2018-12-01 07:28:1233,103,2018-12-01 07:50:1411,104,2018-12-01 09:08:1222,103,2018-12-01 13:37:1233,102,2018-12-02 07:09:1211,101,2018-12-02 18:42:1235,105,2018-12-03 09:21:1222,104,2018-12-03 16:42:1277,103,2018-12-03 18:31:1299,102,2018-12-04 00:04:1233,101,2018-12-04 19:10:1211,101,2018-12-05 09:07:1235,102,2018-12-05 11:00:1222,103,2018-12-05 12:11:1277,104,2018-12-05 18:02:0299,105,2018-12-05 20:09:11
101,http://www.itcast.cn/1.html,kw8|kw1102,http://www.itcast.cn/2.html,kw6|kw3103,http://www.itcast.cn/3.html,kw7104,http://www.itcast.cn/4.html,kw5|kw1|kw4|kw9105,http://www.itcast.cn/5.html,
hadoop fs -mkdir /tmp/demo hadoop fs -mkdir /tmp/demo/user_action hadoop fs -mkdir /tmp/demo/article_keywords hadoop fs -put user_action.txt /tmp/demo/user_action hadoop fs -put article_keywords.txt /tmp/demo/article_keywords
create external table if not exists user_actions( user_id string, article_id string, time_stamp string ) row format delimited fields terminated by ',' location '/tmp/demo/user_action';
create external table if not exists articles( article_id string, url string, key_words array) row format delimited fields terminated by ',' collection items terminated by '|' location '/tmp/demo/article_keywords'; /* key_words array 数组的数据类型 collection items terminated by '|' 数组的元素之间用'|'分割 */
select user_id,collect_set(article_id) from user_actions group by user_id;
11 ["101","104"] 22 ["102","103","104"] 33 ["103","102","101"] 35 ["105","102"] 77 ["103","104"] 99 ["102","105"]
select article_id,kw from articles laternal view explode(key_words) t as kw;
101 kw8 101 kw1 102 kw6 102 kw3 103 kw7 104 kw5 104 kw1 104 kw4 104 kw9
select a.user_id,b.kw,count(1) as weight from user_actoions as a left outer join (select article_id,kw from articles laternal view explode(key_words) t as kw) b on (a.article_id = b.article_id) group by a.user_id,b.kw order by a.user_id,weight desc
11 kw1 4 11 kw8 3 11 kw5 1 11 kw9 1 11 kw4 1 22 kw7 2 22 kw9 1 22 kw1 1 22 kw3 1 22 kw4 1 22 kw5 1 22 kw6 1 33 kw3 1 33 kw8 1 33 kw7 1 33 kw6 1 33 kw1 1 35 NULL 1 35 kw3 1 35 kw6 1 77 kw1 1 77 kw4 1 77 kw5 1 77 kw7 1 77 kw9 1 99 NULL 1 99 kw3 1 99 kw6 1
select c.user_id,concat_ws(',',collect_set(c.kw_w))from(-- select a.user_id,concat_ws(':',b.kw,cast(count(1) as string)) as kw_w select a.user_id,concat(b.kw,':',count(1)) as kw_w from user_action as a left join (select article_id,kw from article lateral view explode(key_words) t as kw) b on a.article_id = b.article_id group by a.user_id,b.kw)as cgroup by c.user_id;
11 kw1:4,kw4:1,kw5:1,kw8:3,kw9:122 kw1:1,kw3:1,kw4:1,kw5:1,kw6:1,kw7:2,kw9:133 kw1:1,kw3:1,kw6:1,kw7:1,kw8:135 kw3:1,kw6:177 kw1:1,kw4:1,kw5:1,kw7:1,kw9:199 kw3:1,kw6:1
create table user_kws as select c.user_id,str_to_map(concat_ws(',',collect_set(c.kw_w))) as wm from( select a.user_id, concat_ws(':',b.kw,cast (count(1) as string)) as kw_w from user_actions as a left outer join (select article_id,kw from articles lateral view outer explode(key_words) t as kw) b on (a.article_id = b.article_id) group by a.user_id,b.kw ) as c group by c.user_id;
select user_id, wm['kw1'] from user_kws;
11 4 22 1 33 1 35 NULL 77 1 99 NULL
select user_id,map_keys(wm),map_values(wm) from user_kws;
11 ["kw1","kw4","kw5","kw8","kw9"] ["4","1","1","3","1"]22 ["kw1","kw3","kw4","kw5","kw6","kw7","kw9"] ["1","1","1","1","1","2","1"]33 ["kw1","kw3","kw6","kw7","kw8"] ["1","1","1","1","1"]35 ["kw3","kw6"] ["1","1"]77 ["kw1","kw4","kw5","kw7","kw9"] ["1","1","1","1","1"]99 ["kw3","kw6"] ["1","1"]
转载地址:http://fmdsn.baihongyu.com/