hive数据操作
3,154 阅读
hive是基于Hadoop的一个数据仓库工具,可以将结构化的数据文件映射为一张数据库表,并提供简单的sql查询功能,可以将sql语句转换为MapReduce任务进行运行。 其优点是学习成本低,可以通过类SQL语句快速实现简单的MapReduce统计,不必开发专门的MapReduce应用,十分适合数据仓库的统计分析。 下面是我以前写过的hive操作语句,供大家参考:
1.hive查看表结构
desc formatted table_name;
desc table_name;
2.hive建立表
create table `reviews_comments_qian`(
`comm_id` bigint,
`post_id` bigint,
`user_id` string,
`comm_time` date,
`comm_content` string,
`auto_brand` string,
`auto_name` string)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\u0001'
LINES TERMINATED BY '\n';
3.hive操作表
INSERT INTO reviews_comments_qian SELECT DISTINCT c.comm_id,c.post_id,c.user_id,c.comm_time,c.comm_content,aa.auto_brand,aa.auto_name FROM reviews_comments c JOIN (SELECT a.auto_brand,a.auto_name,b.post_id FROM forum_addresses_s a JOIN forum_posts_s b ON a.addr_id=b.addr_id) aa ON aa.post_id=c.post_id;
4.hive删除表
DROP TABLE IF EXISTS reviews_comments_qianyang;
5.hive更改列名
ALTER TABLE reviews_comments_qianyang CHANGE `buying goal` buying_goal STRING
create table `reviews_comments_qianyang`(
`comm_id` bigint,
`post_id` bigint,
`user_id` string,
`comm_time` date,
`comm_content` string,
`auto_brand` string,
`auto_name` string,
`buying_car` string,
`buying_place` string,
`buying_time` string,
`buying_price` string,
`buying_goal` string,
`car_id` string
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\u0001'
LINES TERMINATED BY '\n';
INSERT INTO reviews_comments_qianyang select DISTINCT c.comm_id,c.post_id,c.user_id,c.comm_time,c.comm_content,c.auto_brand,c.auto_name,b.buying_car,b.buying_place,b.buying_time,b.buying_price,b.buying_goal,b.car_id from reviews_comments_qian c JOIN (select author_id,buying_car,buying_place,buying_time,buying_price,buying_goal,car_id from autohome_reviews) b ON c.user_id=b.author_id;
create table `reviews_comments_qianyang1`(
`comm_id` bigint,
`post_id` bigint,
`user_id` string,
`comm_time` date,
`comm_content` string,
`auto_brand` string,
`auto_name` string,
`buying_car` string,
`buying_place` string,
`buying_time` string,
`buying_price` string,
`buying_goal` string,
`car_id` string,
`user_birth` string,
`user_gender` string,
`user_province` string,
`user_city` string
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\u0001'
LINES TERMINATED BY '\n';
INSERT INTO reviews_comments_qianyang1 select DISTINCT c.comm_id,c.post_id,c.user_id,c.comm_time,c.comm_content,c.auto_brand,c.auto_name,c.buying_car,c.buying_place,c.buying_time,c.buying_price,c.buying_goal,c.car_id,b.user_birth,b.user_gender,b.user_province,b.user_city from reviews_comments_qianyang c JOIN (select user_birth,user_id,user_gender,user_province,user_city from forum_users_s) b ON c.user_id=b.user_id;
6.针对同一用户合并数据
create table user_content as select user_id,concat_ws('|',collect_set(comm_content)) from reviews_comments_qianyanguse group by user_id;
7.hive向数据库插数据
sqoop export --connect jdbc:mysql://mysql服务器ip:3306/mysql数据库名 --username root --password 112233 --table autohome_posturl2015 --export-dir /user/hive/warehouse/autohome_posturl2015 --input-fields-terminated-by '\001'
8.mysql向hive导入数据
sqoop import --connect jdbc:mysql://114.213.234.104:3306/brandetl_db --username root --password 112233 --table auto_forum_users_s --hive-import --hive-overwrite --create-hive-table --hive-table users --delete-target-dir
10.如何查看一张表里存在另一张表里不存在的记录
SELECT * FROM user_info_sample_qianyang
WHERE user_id NOT IN ( SELECT user_id FROM user_info_sample_qianyang1);
