Hive示例整理
update: 2017-01-16
by: 金凌,发才
常用操作
-
查看表的属性
show tblproperties yourTableName; show partitions tableName;
-
建表
CREATE EXTERNAL TABLE IF NOT EXISTS push_result_get_push_uid_tmp (is_default string, mid string, push_feed_uid string) PARTITIONED BY (dt STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' STORED AS INPUTFORMAT 'org.apache.hadoop.mapred.TextInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' LOCATION '/dw/obja/push_stat/push_result_get_push_uid_tmp';
-
插表
insert overwrite table tmp_uid partition (dt=20160515) select a.push_uid from (select distinct push_uid from push_result_get_push_uid where dt=20160515 and length(push_uid)<=10)a left outer join (select distinct push_uid from push_result_get_push_uid where dt=20160418 and length(push_uid)<=10)b on a.push_uid=b.push_uid where b.push_uid is null;
-
增加栏
ALTER TABLE test1 ADD COLUMNS (access_count1 int,access_count2 date,access_count3 string, ...);
-
修复表
msck repair table push_data;
-
创建多级目录(动态分区)
INSERT OVERWRITE TABLE T PARTITION (ds='2010-03-03', hr) SELECT key, value, ds, hr FROM srcpart WHERE ds is not null and hr>10;
基本设置
-
设置队列
set mapreduce.job.queuename=normal_production;
-
设置内存
set mapreduce.map.memory.mb=5120; set mapreduce.map.java.opts=-Xmx4608m; set mapreduce.reduce.memory.mb=5120; set mapreduce.reduce.java.opts=-Xmx4608m; set mapred.child.java.opts=-Xmx2048m; set mapred.reduce.tasks=50; set mapred.max.split.size=600000000; set mapred.min.split.size.per.node=600000000; set mapred.min.split.size.per.rack=600000000;
惯例:
mapreduce.{map|reduce}.java.opts = mapreduce.{map|reduce}.memory.mb x 0.9
-
SET hive.exec.compress.output=true; -- SET mapred.output.compression.codec=org.apache.hadoop.io.compress.SnappyCodec; -- SET mapred.output.compression.type=BLOCK; -- SET io.seqfile.compression.type=BLOCK;
mapred.output.compression.type: I use block. This will make the compression slittable even for all compression formats (gzip, snappy, and bzip2) just make sure your using a splitable file format like sequence, RCFile, or Avro.
Common input format Compression format | Tool | Algorithm | File extention | Splittable –|–|–|–|– gzip | gzip | DEFLATE | .gz | No bzip2 | bizp2 | bzip2 | .bz2 | Yes LZO | lzop | LZO | .lzo | Yes if indexed Snappy | N/A | Snappy | .snappy | No
-
合并结果中的小文件
set hive.merge.mapfiles=true; set hive.merge.mapredfiles=true; set hive.merge.size.per.task=256000000; set hive.merge.smallfiles.avgsize=256000000;
-
自定义UDF
add jar hive_udf.jar; create temporary function getJsonArray as 'com.weibo.hive.udtf.json.UDFGenerateJsonArray';