海明距离计算
Google 提出的 simhash 去重算法里关于“汉明距离”计算最关键的步骤就是“位计数”(计算位数组中 1 的个数)。
方案 A
最慢
直接数
void count(int x){
int count = 0;
for (int i = 0; i < 32; i++) {
int y = (x >> i) & 1;
if (y == 1) {
count ++;
}
}
}
方案 B
稍好
位计数算法最常见的优化方法就是基于分治策略来统计“1”的个数,先计算每两位中1的个数、然后再计算每4位中1的个数,一步一步最后计算32位中1的个数。
long hamming_dist(long a, long b) {
long x = a ^ b;
x = (x & (0x5555555555555555L)) + ((x >> 1) & (0x5555555555555555L));
x = (x & (0x3333333333333333L)) + ((x >> 2) & (0x3333333333333333L));
x = (x & (0x0f0f0f0f0f0f0f0fL)) + ((x >> 4) & (0x0f0f0f0f0f0f0f0fL));
x = (x & (0x00ff00ff00ff00ffL)) + ((x >> 8) & (0x00ff00ff00ff00ffL));
x = (x & (0x0000ffff0000ffffL)) + ((x >> 16) & (0x0000ffff0000ffffL));
x = (x & (0x00000000FFFFFFFFL)) + ((x >> 32) & (0x00000000FFFFFFFFL));
return x;
}
方案 C
最快
https://code.google.com/archive/p/deduplication-detecting/
int hamming_dist(long a1, long a2) {
int v1 = (int) (a1 ^ a2);
int v2 = (int) ((a1 ^ a2) >> 32);
v1 = v1 - ((v1 >> 1) & 0x55555555);
v2 = v2 - ((v2 >> 1) & 0x55555555);
v1 = (v1 & 0x33333333) + ((v1 >> 2) & 0x33333333);
v2 = (v2 & 0x33333333) + ((v2 >> 2) & 0x33333333);
int c1 = (((v1 + (v1 >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24;
int c2 = (((v2 + (v2 >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24;
return c1 + c2;
}
SimHash 64位分4份,每份16位。
rowkey 18位,由2位分区标志 + 16位 SimHash 截取组成
如 01_1111111111111111,01是分区标志,代表截取的第一部分,1111111111111111是16位 SimHash 截取。
假定数据总量:$2^{37}$
(1375亿数据)
每行 $2^{37}$
/ $2^{16}$
= $2^{21}$
(2097152)个候选结果
一条数据插入4行,每行 2097152,需要并行查询 2097152 * 4 =8388608 个候选结果
插入一行数据测试,即生成最高16位相同的 2097152 数据
测试也是查一条最高16位相同的数据
测试数据,存储内容:
simhash:pkid “1111111111111111000000000000000000000000000001101111111110101100”:“4339066868”
//hdfs
hadoop fs -put /opt/wujun_apps/hbase.jar /wujun_apps/hbase_coprocessor_v17.jar
cd /data/apps/hbase-1.2.0/bin/
./hbase shell
//hdfs大小
hadoop fs -du -h /hbase-1.2.0/data/default/test_sim
//Coprocessor
disable 'test_sim'
alter 'test_sim', METHOD => 'table_att_unset', NAME => 'coprocessor$1'
alter 'test_sim', METHOD => 'table_att', 'Coprocessor'=>'hdfs://satest-hadoop/wujun_apps/hbase_coprocessor_v20.jar|com.test.hbase.HBaseSimHashPreGetOpV20||'
enable 'test_sim'
desc 'test_sim'
//操作hbase
get 'test_sim','01_1111111111111111',{COLUMN => 'f:s0'}
//加上 LZO 压缩
alter 'test_sim', NAME => 'f', COMPRESSION => 'LZO'
major_compact 'test_sim'
//加上 Fast-Diff 编码
alter 'test_sim',{NAME => 'f', DATA_BLOCK_ENCODING => 'FAST_DIFF'}
alter 'test_sim',{NAME => 'f', DATA_BLOCK_ENCODING => 'NONE'}
删除列族
alter 'test_sim', {NAME=>'f', METHOD=>'delete'}
删除行
deleteall 'test_sim','01_1111111111111111'
/data/apps/hbase-1.2.0/bin/hbase-daemon.sh restart regionserver
i:1111111111111111111111111111111111111111111111111111111111111111
i >>> 1:111111111111111111111111111111111111111111111111111111111111111
0x5555555555555555L:0101010101010101010101010101010101010101010101010101010101010101
(i >>> 1) & 0x5555555555555555L:101010101010101010101010101010101010101010101010101010101010101
i = i - ((i >>> 1) & 0x5555555555555555L):1010101010101010101010101010101010101010101010101010101010101010
0x3333333333333333L:0011001100110011001100110011001100110011001100110011001100110011
i & 0x3333333333333333L:10001000100010001000100010001000100010001000100010001000100010
i >>> 2:10101010101010101010101010101010101010101010101010101010101010
i = (i >>> 2) & 0x3333333333333333L:10001000100010001000100010001000100010001000100010001000100010
(i & 0x3333333333333333L) + ((i >>> 2) & 0x3333333333333333L):100010001000100010001000100010001000100010001000100010001000100
i >>> 4:10001000100010001000100010001000100010001000100010001000100
i + (i >>> 4:100100010001000100010001000100010001000100010001000100010001000
(i + (i >>> 4)) & 0x0f0f0f0f0f0f0f0fL:100000001000000010000000100000001000000010000000100000001000
i >>> 8:1000000010000000100000001000000010000000100000001000
i + (i >>> 8):100000010000000100000001000000010000000100000001000000010000
i >>> 16:10000001000000010000000100000001000000010000
i + (i >>> 16):100000010000000110000010000000100000001000000010000000100000
i >>> 32:1000000100000001100000100000
i + (i >>> 32):100000010000000110000010000000101000001100000011100001000000
0x7f:01111111
i & 0x7f:1000000
1000000b = 64