海明距离计算

Wu Jun 2020-01-09 11:43:49
Categories: > Tags:

海明距离计算

Google 提出的 simhash 去重算法里关于“汉明距离”计算最关键的步骤就是“位计数”(计算位数组中 1 的个数)。

方案 A

最慢

直接数

void count(int x){
    int count = 0;
    for (int i = 0; i < 32; i++) {
        int y = (x >> i) & 1;
        if (y == 1) {
            count ++;
        }
    }
}

方案 B

稍好

位计数算法最常见的优化方法就是基于分治策略来统计“1”的个数,先计算每两位中1的个数、然后再计算每4位中1的个数,一步一步最后计算32位中1的个数。

long hamming_dist(long a, long b) {
    long x = a ^ b;
    x = (x & (0x5555555555555555L)) + ((x >> 1) & (0x5555555555555555L));
    x = (x & (0x3333333333333333L)) + ((x >> 2) & (0x3333333333333333L));
    x = (x & (0x0f0f0f0f0f0f0f0fL)) + ((x >> 4) & (0x0f0f0f0f0f0f0f0fL));
    x = (x & (0x00ff00ff00ff00ffL)) + ((x >> 8) & (0x00ff00ff00ff00ffL));
    x = (x & (0x0000ffff0000ffffL)) + ((x >> 16) & (0x0000ffff0000ffffL));
    x = (x & (0x00000000FFFFFFFFL)) + ((x >> 32) & (0x00000000FFFFFFFFL));
    return x;
}

方案 C

最快

https://code.google.com/archive/p/deduplication-detecting/

int hamming_dist(long a1, long a2) {
    int v1 = (int) (a1 ^ a2);
    int v2 = (int) ((a1 ^ a2) >> 32);

    v1 = v1 - ((v1 >> 1) & 0x55555555);
    v2 = v2 - ((v2 >> 1) & 0x55555555);
    v1 = (v1 & 0x33333333) + ((v1 >> 2) & 0x33333333);
    v2 = (v2 & 0x33333333) + ((v2 >> 2) & 0x33333333);
    int c1 = (((v1 + (v1 >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24;
    int c2 = (((v2 + (v2 >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24;

    return c1 + c2;
}

SimHash 64位分4份,每份16位。

rowkey 18位,由2位分区标志 + 16位 SimHash 截取组成

如 01_1111111111111111,01是分区标志,代表截取的第一部分,1111111111111111是16位 SimHash 截取。

假定数据总量:$2^{37}$(1375亿数据)

每行 $2^{37}$ / $2^{16}$ = $2^{21}$(2097152)个候选结果

一条数据插入4行,每行 2097152,需要并行查询 2097152 * 4 =8388608 个候选结果

插入一行数据测试,即生成最高16位相同的 2097152 数据

测试也是查一条最高16位相同的数据

测试数据,存储内容:

simhash:pkid “1111111111111111000000000000000000000000000001101111111110101100”:“4339066868”

//hdfs
hadoop fs -put /opt/wujun_apps/hbase.jar /wujun_apps/hbase_coprocessor_v17.jar

cd /data/apps/hbase-1.2.0/bin/

./hbase shell


//hdfs大小
hadoop fs -du -h /hbase-1.2.0/data/default/test_sim

//Coprocessor
disable 'test_sim'

alter 'test_sim', METHOD => 'table_att_unset', NAME => 'coprocessor$1'

alter 'test_sim', METHOD => 'table_att', 'Coprocessor'=>'hdfs://satest-hadoop/wujun_apps/hbase_coprocessor_v20.jar|com.test.hbase.HBaseSimHashPreGetOpV20||'

enable 'test_sim'

desc 'test_sim'

//操作hbase
get 'test_sim','01_1111111111111111',{COLUMN => 'f:s0'}

//加上 LZO 压缩
alter 'test_sim', NAME => 'f', COMPRESSION => 'LZO'

major_compact 'test_sim'

//加上 Fast-Diff 编码
alter 'test_sim',{NAME => 'f', DATA_BLOCK_ENCODING => 'FAST_DIFF'}

alter 'test_sim',{NAME => 'f', DATA_BLOCK_ENCODING => 'NONE'}

删除列族
alter 'test_sim', {NAME=>'f', METHOD=>'delete'}

删除行
deleteall 'test_sim','01_1111111111111111'

/data/apps/hbase-1.2.0/bin/hbase-daemon.sh restart regionserver
i:1111111111111111111111111111111111111111111111111111111111111111

i >>> 1:111111111111111111111111111111111111111111111111111111111111111

0x5555555555555555L:‭0101010101010101010101010101010101010101010101010101010101010101

(i >>> 1) & 0x5555555555555555L:101010101010101010101010101010101010101010101010101010101010101

i = i - ((i >>> 1) & 0x5555555555555555L):1010101010101010101010101010101010101010101010101010101010101010

0x3333333333333333L:‭0011001100110011001100110011001100110011001100110011001100110011‬

i & 0x3333333333333333L:10001000100010001000100010001000100010001000100010001000100010

i >>> 2:10101010101010101010101010101010101010101010101010101010101010

i = (i >>> 2) & 0x3333333333333333L:10001000100010001000100010001000100010001000100010001000100010


(i & 0x3333333333333333L) + ((i >>> 2) & 0x3333333333333333L):100010001000100010001000100010001000100010001000100010001000100

i >>> 4:10001000100010001000100010001000100010001000100010001000100

i + (i >>> 4:100100010001000100010001000100010001000100010001000100010001000

(i + (i >>> 4)) & 0x0f0f0f0f0f0f0f0fL:100000001000000010000000100000001000000010000000100000001000

i >>> 8:1000000010000000100000001000000010000000100000001000

 i + (i >>> 8):100000010000000100000001000000010000000100000001000000010000
 
i >>> 16:10000001000000010000000100000001000000010000

 i + (i >>> 16):100000010000000110000010000000100000001000000010000000100000
 
i >>> 32:1000000100000001100000100000

 i + (i >>> 32):100000010000000110000010000000101000001100000011100001000000
 
0x7f:‭01111111‬

i & 0x7f:1000000

1000000b = 64