File size: 5,910 Bytes
cf6b8d4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
// Submodular optimization utilities for string deduplication

export function cosineSimilarity(a: number[], b: number[]): number {
    if (a.length !== b.length) return 0;

    let dotProduct = 0;
    let normA = 0;
    let normB = 0;

    for (let i = 0; i < a.length; i++) {
        dotProduct += a[i] * b[i];
        normA += a[i] * a[i];
        normB += b[i] * b[i];
    }

    if (normA === 0 || normB === 0) return 0;
    return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));
}

export function computeMarginalGainDiversity(
    newIdx: number,
    currentCoverage: number[],
    similarityMatrix: number[][]
): number {
    const n = similarityMatrix.length;
    let marginalGain = 0;
    const row = similarityMatrix[newIdx];
    for (let i = 0; i < n; i++) {
        const newCoverage = row[i] > currentCoverage[i] ? row[i] : currentCoverage[i];
        marginalGain += newCoverage - currentCoverage[i];
    }
    return marginalGain;
}

export function lazyGreedySelection(embeddings: number[][], k: number): number[] {
    const n = embeddings.length;
    if (k >= n) return Array.from({ length: n }, (_, i) => i);

    const selected: number[] = [];
    const remaining = new Set(Array.from({ length: n }, (_, i) => i));

    // Pre-compute similarity matrix
    const similarityMatrix: number[][] = [];
    for (let i = 0; i < n; i++) {
        similarityMatrix[i] = [];
        for (let j = 0; j < n; j++) {
            // Clamp to non-negative to ensure monotone submodularity of facility-location objective
            const sim = cosineSimilarity(embeddings[i], embeddings[j]);
            similarityMatrix[i][j] = sim > 0 ? sim : 0;
        }
    }

    // Maintain current coverage vector (max similarity to selected set for each element)
    const currentCoverage = new Array(n).fill(0);

    // Priority queue implementation using array (simplified)
    const pq: Array<[number, number, number]> = [];

    // Initialize priority queue
    for (let i = 0; i < n; i++) {
        const gain = computeMarginalGainDiversity(i, currentCoverage, similarityMatrix);
        pq.push([-gain, 0, i]);
    }

    // Sort by gain (descending)
    pq.sort((a, b) => a[0] - b[0]);

    for (let iteration = 0; iteration < k; iteration++) {
        while (pq.length > 0) {
            const [negGain, lastUpdated, bestIdx] = pq.shift()!;

            if (!remaining.has(bestIdx)) continue;

            if (lastUpdated === iteration) {
                selected.push(bestIdx);
                remaining.delete(bestIdx);
                // Update coverage in O(n)
                const row = similarityMatrix[bestIdx];
                for (let i = 0; i < n; i++) {
                    if (row[i] > currentCoverage[i]) currentCoverage[i] = row[i];
                }
                break;
            }

            const currentGain = computeMarginalGainDiversity(bestIdx, currentCoverage, similarityMatrix);
            pq.push([-currentGain, iteration, bestIdx]);
            pq.sort((a, b) => a[0] - b[0]);
        }
    }

    return selected;
}

export function lazyGreedySelectionWithSaturation(
    embeddings: number[][],
    threshold: number = 1e-2
): { selected: number[], optimalK: number, values: number[] } {
    const n = embeddings.length;

    const selected: number[] = [];
    const remaining = new Set(Array.from({ length: n }, (_, i) => i));
    const values: number[] = [];

    // Pre-compute similarity matrix
    const similarityMatrix: number[][] = [];
    for (let i = 0; i < n; i++) {
        similarityMatrix[i] = [];
        for (let j = 0; j < n; j++) {
            const sim = cosineSimilarity(embeddings[i], embeddings[j]);
            similarityMatrix[i][j] = sim > 0 ? sim : 0;
        }
    }

    const currentCoverage = new Array(n).fill(0);

    // Priority queue implementation using array (simplified)
    const pq: Array<[number, number, number]> = [];

    // Initialize priority queue
    for (let i = 0; i < n; i++) {
        const gain = computeMarginalGainDiversity(i, currentCoverage, similarityMatrix);
        pq.push([-gain, 0, i]);
    }

    // Sort by gain (descending)
    pq.sort((a, b) => a[0] - b[0]);

    let earlyStopK: number | null = null;
    for (let iteration = 0; iteration < n; iteration++) {
        while (pq.length > 0) {
            const [negGain, lastUpdated, bestIdx] = pq.shift()!;

            if (!remaining.has(bestIdx)) continue;

            if (lastUpdated === iteration) {
                selected.push(bestIdx);
                remaining.delete(bestIdx);

                // Compute current function value (coverage)
                const row = similarityMatrix[bestIdx];
                for (let i = 0; i < n; i++) {
                    if (row[i] > currentCoverage[i]) currentCoverage[i] = row[i];
                }
                const functionValue = currentCoverage.reduce((sum, val) => sum + val, 0) / n;
                values.push(functionValue);

                // Early stop when the marginal gain (delta of normalized objective) falls below threshold
                if (values.length >= 2) {
                    const delta = values[values.length - 1] - values[values.length - 2];
                    if (delta < threshold) {
                        earlyStopK = values.length; // k is count of selected items
                    }
                }

                break;
            }

            const currentGain = computeMarginalGainDiversity(bestIdx, currentCoverage, similarityMatrix);
            pq.push([-currentGain, iteration, bestIdx]);
            pq.sort((a, b) => a[0] - b[0]);
        }
        if (earlyStopK !== null) break;
    }

    // Choose k: prefer early stop detection; otherwise, use all collected values
    const optimalK = earlyStopK ?? values.length;
    const finalSelected = selected.slice(0, optimalK);

    return { selected: finalSelected, optimalK, values };
}