File size: 2,690 Bytes
daea7f9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#include "sim_hash.h"

#include <bitset>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <iostream>
#include <numeric>
#include <stdexcept>
#include <string>

// ================== Constructor / Destructor ==================
/// Creates a hasher that produces `bit` output bits.
/// The value is only stored here; hashFunction() later uses it as the width
/// of the fingerprint it builds.
SimHash::SimHash(size_t bit)
    : bits(bit)
{
}

/// Nothing to release explicitly; members clean up after themselves.
SimHash::~SimHash() = default;

// ================== Helper functions ==================
/**
 * @brief Build the text token that gets hashed for one feature.
 *
 * The token has the form "<idx>:<value>", where <value> is `x` rounded to the
 * nearest integer with std::round and printed with nine decimal places.
 * For example, encode_double(2.4, 3) yields "3:2.000000000".
 *
 * @param x   Feature value; it is rounded before being formatted.
 * @param idx Feature index, written in decimal before the colon.
 * @return The formatted token, or an empty string if snprintf reports an
 *         encoding error.
 */
std::string SimHash::encode_double(double x, int idx)
{
    const double rounded = std::round(x);

    // Fast path: ordinary values fit comfortably in a small stack buffer.
    char buf[64];
    const int needed = std::snprintf(buf, sizeof(buf), "%d:%.9f", idx, rounded);
    if (needed < 0)
        return std::string(); // snprintf failed; buf contents are unspecified

    const size_t length = static_cast<size_t>(needed);
    if (length < sizeof(buf))
        return std::string(buf, length);

    // "%f" expands very large magnitudes to hundreds of digits. Format again
    // into an exactly sized buffer instead of silently truncating the token,
    // which would make distinct values share the same text.
    std::string token(length + 1, '\0'); // +1 for snprintf's terminating NUL
    std::snprintf(&token[0], token.size(), "%d:%.9f", idx, rounded);
    token.resize(length);
    return token;
}

/**
 * @brief Map one feature to a vector of +1/-1 values derived from its hash.
 *
 * The feature is turned into a text token by encode_double(), the token is
 * hashed with MurmurHash3_x64_128 (seed 0), and the first `bits_count` bits of
 * the 128-bit result are read least-significant-bit first: entries 0..63 come
 * from the first 64-bit word, entries 64..127 from the second. A set bit maps
 * to +1, a clear bit to -1.
 *
 * @param x          Feature value.
 * @param idx        Feature index.
 * @param bits_count Number of entries to produce; must be in [0, 128].
 * @return Vector of `bits_count` entries, each either 1 or -1.
 * @throws std::invalid_argument if `bits_count` is negative or above 128.
 */
std::vector<int> SimHash::hashify_double_by_murmur128(double x, int idx, int bits_count)
{
    // Reject bad widths before hashing or allocating anything: a negative
    // count would otherwise be converted to a huge unsigned vector size.
    if (bits_count < 0)
        throw std::invalid_argument("Bits >= 0");
    if (bits_count > 128)
        throw std::invalid_argument("Bits <= 128");

    const std::string token = encode_double(x, idx);

    uint64_t hash_output[2] = {0, 0};
    MurmurHash3_x64_128(token.c_str(), token.size(), 0, hash_output);

    std::vector<int> signs(static_cast<size_t>(bits_count));
    for (int i = 0; i < bits_count; ++i)
    {
        // Pick the 64-bit word that holds bit i, then the bit inside it.
        const uint64_t word = hash_output[i / 64];
        signs[static_cast<size_t>(i)] = ((word >> (i % 64)) & 1) ? 1 : -1;
    }

    return signs;
}

/**
 * @brief Compute one weight per feature dimension and store it in idfWeights.
 *
 * For every dimension j the function counts how many input vectors have a
 * value greater than 1e-9 at j (docFreq) and sets
 *     idfWeights[j] = log(N / (1 + docFreq)),
 * where N is the number of input vectors. The weight is zero when
 * docFreq == N - 1 and negative when docFreq == N.
 *
 * The dimension is taken from the first vector; extra trailing entries in
 * later vectors are ignored.
 *
 * @param allFeatures Feature vectors to analyse. If empty, idfWeights is
 *                    cleared and the function returns.
 * @throws std::invalid_argument if any vector is shorter than the first one.
 *         idfWeights is left unchanged in that case.
 */
void SimHash::IDF(const std::vector<std::vector<double>> &allFeatures)
{
    // No data means no dimensions; indexing allFeatures[0] would be invalid.
    if (allFeatures.empty())
    {
        idfWeights.clear();
        return;
    }

    const size_t dim = allFeatures.front().size();
    std::vector<size_t> docFreq(dim, 0);

    for (const auto &img : allFeatures)
    {
        // A short row would be read past its end in the loop below.
        if (img.size() < dim)
            throw std::invalid_argument("IDF: feature vector shorter than the first one");

        for (size_t j = 0; j < dim; ++j)
        {
            if (img[j] > 1e-9)
                ++docFreq[j];
        }
    }

    const double N = static_cast<double>(allFeatures.size());
    idfWeights.assign(dim, 0.0);
    for (size_t j = 0; j < dim; ++j)
    {
        idfWeights[j] = std::log(N / (1.0 + static_cast<double>(docFreq[j])));
    }
}

/**
 * @brief Compute the fingerprint of one feature vector.
 *
 * Each feature i contributes weight = featureVector[i] * idfWeights[i].
 * Features whose |weight| is below 1e-9 are skipped. For the rest, the +1/-1
 * vector from hashify_double_by_murmur128() is scaled by the weight and added
 * to an accumulator with `bits` entries. Finally, accumulator entry j becomes
 * a 1 bit when it is >= 0 and a 0 bit otherwise; entry 0 is shifted in first
 * and therefore ends up as the most significant of the produced bits.
 *
 * The bits are packed into a 64-bit value, so when `bits` exceeds 64 the
 * earliest entries are shifted out and only the last 64 remain.
 *
 * @param featureVector Feature values; must not be longer than idfWeights
 *                      (which IDF() fills).
 * @return The packed fingerprint, converted to size_t.
 * @throws std::invalid_argument if featureVector has more entries than
 *         idfWeights, or (from hashify_double_by_murmur128) if `bits` is
 *         outside the range that function accepts and at least one feature
 *         is not skipped.
 */
size_t SimHash::hashFunction(const std::vector<double> &featureVector)
{
    const size_t dim = featureVector.size();

    // Without this check idfWeights[i] below would be read out of bounds,
    // e.g. when IDF() has not been called yet.
    if (dim > idfWeights.size())
        throw std::invalid_argument("hashFunction: feature vector longer than IDF weights");

    std::vector<double> V(bits, 0.0);

    for (size_t i = 0; i < dim; ++i)
    {
        const double tf = featureVector[i];
        const double weight = tf * idfWeights[i];
        if (std::abs(weight) < 1e-9)
            continue;

        const std::vector<int> phi =
            hashify_double_by_murmur128(tf, static_cast<int>(i), static_cast<int>(bits));
        for (size_t j = 0; j < bits; ++j)
        {
            V[j] += phi[j] * weight;
        }
    }

    // Collapse the accumulator to one bit per entry, first entry first.
    uint64_t hashValue = 0;
    for (size_t j = 0; j < bits; ++j)
    {
        const uint64_t bit = (V[j] >= 0.0) ? 1 : 0;
        hashValue = (hashValue << 1) | bit;
    }

    return static_cast<size_t>(hashValue);
}