#include "min_hash.h" MinHash::MinHash(size_t numPlanes, size_t dimension) : numPlanes(numPlanes), dimension(dimension), mean(dimension, 0.0), median(dimension, 0.0), stddev(dimension, 1.0) { randomHyperplanes.resize(numPlanes, std::vector(dimension)); generateRandomHyperplanes(); } void MinHash::generateRandomHyperplanes() { std::random_device rd; std::mt19937 gen(rd()); std::normal_distribution dist(0.0, 1.0); for (size_t i = 0; i < numPlanes; ++i) { double norm = 0.0; for (size_t j = 0; j < dimension; ++j) { randomHyperplanes[i][j] = dist(gen); norm += randomHyperplanes[i][j] * randomHyperplanes[i][j]; } norm = std::sqrt(norm); if (norm == 0.0) norm = 1e-12; for (size_t j = 0; j < dimension; ++j) randomHyperplanes[i][j] /= norm; } } std::vector> MinHash::computeSignatures(const std::vector> &matrix, bool useMedianThreshold) { size_t n = matrix.size(); if (n == 0) return {}; std::vector> v(n, std::vector(dimension)); for (size_t i = 0; i < n; ++i) { const auto &row = matrix[i]; for (size_t j = 0; j < dimension; ++j) { double minval = std::numeric_limits::infinity(); for (size_t p = 0; p < numPlanes; ++p) { double prod = row[j] * randomHyperplanes[p][j]; if (prod < minval) minval = prod; } v[i][j] = minval; } } std::vector tmp; tmp.reserve(n); for (size_t j = 0; j < dimension; ++j) { tmp.clear(); for (size_t i = 0; i < n; ++i) tmp.push_back(v[i][j]); // mean double s = 0.0; for (double x : tmp) s += x; mean[j] = s / static_cast(n); // median std::nth_element(tmp.begin(), tmp.begin() + tmp.size() / 2, tmp.end()); double med = tmp[tmp.size() / 2]; if (tmp.size() % 2 == 0) { double other = *std::max_element(tmp.begin(), tmp.begin() + tmp.size() / 2); med = 0.5 * (med + other); } median[j] = med; // stddev double var = 0.0; for (double x : tmp) { double d = x - mean[j]; var += d * d; } var /= static_cast(n); stddev[j] = std::sqrt(var); if (stddev[j] < 1e-12) stddev[j] = 1e-12; } std::vector> signatures(n, std::vector(dimension, 0)); for (size_t i = 0; i < n; ++i) { for (size_t j = 0; j < dimension; ++j) { bool bit; if (useMedianThreshold) bit = (v[i][j] >= median[j]); else { double z = (v[i][j] - mean[j]) / stddev[j]; bit = (z > 0.0); } signatures[i][j] = static_cast(bit); } } return signatures; } std::vector MinHash::hashFunction(const std::vector &vec, bool useMedianThreshold) const { if (median.empty() || mean.empty() || stddev.empty()) throw std::runtime_error("Must call computeSignatures() before hashFunction()"); std::vector minVals(dimension); for (size_t j = 0; j < dimension; ++j) { double minval = std::numeric_limits::infinity(); for (size_t p = 0; p < numPlanes; ++p) { double prod = vec[j] * randomHyperplanes[p][j]; if (prod < minval) minval = prod; } minVals[j] = minval; } std::vector bits(dimension, 0); for (size_t j = 0; j < dimension; ++j) { bool bit; if (useMedianThreshold) bit = (minVals[j] >= median[j]); else { double z = (minVals[j] - mean[j]) / stddev[j]; bit = (z > 0.0); } bits[j] = static_cast(bit); } return bits; } // Getters const std::vector> &MinHash::getHyperplanes() const { return randomHyperplanes; } const std::vector &MinHash::getMean() const { return mean; } const std::vector &MinHash::getMedian() const { return median; } const std::vector &MinHash::getStddev() const { return stddev; }