File size: 6,645 Bytes
e92be04
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
/**
 * P2PCLAW Sparse Memory — Veselov Hierarchical Representation
 * ===========================================================
 * Implements hierarchical sparse number representation (§2.3, §3.5).
 * Level weights grow super-exponentially: w_l = 10^(3·2^(l-1))
 * Memory savings: 100-1000x for sparse embeddings vs dense arrays.
 *
 * Classes:
 *   SparseHierarchicalNumber  — BigInt-based sparse number
 *   SparseEmbeddingStore      — semantic similarity without external model
 */

// Level weights: w0=1, w1=1000, w2=10^6, w3=10^12, w4=10^24, ...
const LEVEL_WEIGHTS = [1n];
for (let i = 1; i <= 20; i++) {
    LEVEL_WEIGHTS.push(LEVEL_WEIGHTS[i - 1] * 1000n);
}

export class SparseHierarchicalNumber {
    constructor() {
        this.levels = new Map(); // level → BigInt value
    }

    set(level, value) {
        if (value === 0n) this.levels.delete(level);
        else this.levels.set(level, value);
    }

    get(level) {
        return this.levels.get(level) || 0n;
    }

    add(other) {
        const result = new SparseHierarchicalNumber();
        const allLevels = new Set([...this.levels.keys(), ...other.levels.keys()]);
        let carry = 0n;
        for (const lvl of [...allLevels].sort((a, b) => a - b)) {
            const total = this.get(lvl) + other.get(lvl) + carry;
            const w = LEVEL_WEIGHTS[lvl + 1] || LEVEL_WEIGHTS[LEVEL_WEIGHTS.length - 1];
            result.set(lvl, total % w);
            carry = total / w;
        }
        if (carry > 0n) {
            const maxLevel = [...result.levels.keys()].length;
            result.set(maxLevel, carry);
        }
        return result;
    }

    get density() {
        return this.levels.size / Math.max(this.maxLevel + 1, 1);
    }

    get maxLevel() {
        return this.levels.size > 0 ? Math.max(...this.levels.keys()) : 0;
    }

    /** Approximate memory in bytes (8B level key + ~16B BigInt) */
    memoryBytes() {
        return this.levels.size * 24;
    }

    toJSON() {
        const obj = {};
        for (const [k, v] of this.levels) obj[k] = v.toString();
        return obj;
    }

    static fromJSON(obj) {
        const n = new SparseHierarchicalNumber();
        for (const [k, v] of Object.entries(obj)) n.set(Number(k), BigInt(v));
        return n;
    }
}

/**
 * Sparse embedding store for papers — O(1) per non-zero dimension.
 * Cosine similarity uses only non-zero dims (fast for sparse vectors).
 */
export class SparseEmbeddingStore {
    constructor() {
        this.embeddings = new Map(); // paperId → { dims: Map<idx,float>, total: number }
    }

    /**
     * Store a dense embedding as sparse (drops dims below threshold).
     * Returns the density ratio (smaller = more memory savings).
     */
    store(paperId, embedding, threshold = 0.01) {
        const sparse = new Map();
        for (let i = 0; i < embedding.length; i++) {
            if (Math.abs(embedding[i]) > threshold) {
                sparse.set(i, embedding[i]);
            }
        }
        this.embeddings.set(paperId, { dims: sparse, total: embedding.length });
        return sparse.size / embedding.length; // density
    }

    /**
     * Store a text-derived sparse embedding using TF-IDF style hashing.
     * No external model needed — uses character n-gram hashing.
     */
    storeText(paperId, text, dimensions = 512) {
        const embedding = new Float32Array(dimensions);
        const words = text.toLowerCase().split(/\W+/).filter(w => w.length > 2);
        for (const word of words) {
            // Simple hash to dimension index
            let h = 0;
            for (let i = 0; i < word.length; i++) h = (h * 31 + word.charCodeAt(i)) % dimensions;
            embedding[h] += 1;
            // Bigram
            if (word.length > 3) {
                let h2 = 0;
                for (let i = 0; i < word.length - 1; i++) {
                    const bigram = word.slice(i, i + 2);
                    for (let j = 0; j < bigram.length; j++) h2 = (h2 * 31 + bigram.charCodeAt(j)) % dimensions;
                }
                embedding[h2 % dimensions] += 0.5;
            }
        }
        // L2 normalize
        let norm = 0;
        for (let i = 0; i < dimensions; i++) norm += embedding[i] * embedding[i];
        norm = Math.sqrt(norm) || 1;
        for (let i = 0; i < dimensions; i++) embedding[i] /= norm;

        return this.store(paperId, embedding);
    }

    cosineSimilarity(paperId1, paperId2) {
        const e1 = this.embeddings.get(paperId1)?.dims;
        const e2 = this.embeddings.get(paperId2)?.dims;
        if (!e1 || !e2) return 0;
        let dot = 0, norm1 = 0, norm2 = 0;
        for (const [i, v] of e1) { norm1 += v * v; if (e2.has(i)) dot += v * e2.get(i); }
        for (const [, v] of e2) norm2 += v * v;
        return dot / (Math.sqrt(norm1) * Math.sqrt(norm2) + 1e-9);
    }

    searchSimilar(queryEmbedding, topK = 5, threshold = 0.01) {
        const querySparse = new Map();
        for (let i = 0; i < queryEmbedding.length; i++) {
            if (Math.abs(queryEmbedding[i]) > threshold) querySparse.set(i, queryEmbedding[i]);
        }
        const results = [];
        for (const [pid, emb] of this.embeddings) {
            let dot = 0, norm1 = 0, norm2 = 0;
            for (const [i, v] of querySparse) { norm1 += v * v; if (emb.dims.has(i)) dot += v * emb.dims.get(i); }
            for (const [, v] of emb.dims) norm2 += v * v;
            const sim = dot / (Math.sqrt(norm1) * Math.sqrt(norm2) + 1e-9);
            results.push({ paperId: pid, similarity: sim });
        }
        return results.sort((a, b) => b.similarity - a.similarity).slice(0, topK);
    }

    searchSimilarText(queryText, topK = 5) {
        const tempId = '__query__';
        this.storeText(tempId, queryText);
        const results = this.searchSimilar(
            [...(this.embeddings.get(tempId)?.dims || new Map())].reduce((arr, [i, v]) => {
                arr[i] = v; return arr;
            }, new Float32Array(512)),
            topK + 1
        ).filter(r => r.paperId !== tempId).slice(0, topK);
        this.embeddings.delete(tempId);
        return results;
    }

    get size() { return this.embeddings.size; }

    memoryStats() {
        let total = 0;
        for (const emb of this.embeddings.values()) total += emb.dims.size * 12; // 4B idx + 8B float
        return { papers: this.embeddings.size, bytes: total, kb: (total / 1024).toFixed(1) };
    }
}

// Singleton store for papers — shared across the API process
export const globalEmbeddingStore = new SparseEmbeddingStore();