File size: 5,910 Bytes
cf6b8d4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 | // Submodular optimization utilities for string deduplication
export function cosineSimilarity(a: number[], b: number[]): number {
if (a.length !== b.length) return 0;
let dotProduct = 0;
let normA = 0;
let normB = 0;
for (let i = 0; i < a.length; i++) {
dotProduct += a[i] * b[i];
normA += a[i] * a[i];
normB += b[i] * b[i];
}
if (normA === 0 || normB === 0) return 0;
return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));
}
export function computeMarginalGainDiversity(
newIdx: number,
currentCoverage: number[],
similarityMatrix: number[][]
): number {
const n = similarityMatrix.length;
let marginalGain = 0;
const row = similarityMatrix[newIdx];
for (let i = 0; i < n; i++) {
const newCoverage = row[i] > currentCoverage[i] ? row[i] : currentCoverage[i];
marginalGain += newCoverage - currentCoverage[i];
}
return marginalGain;
}
export function lazyGreedySelection(embeddings: number[][], k: number): number[] {
const n = embeddings.length;
if (k >= n) return Array.from({ length: n }, (_, i) => i);
const selected: number[] = [];
const remaining = new Set(Array.from({ length: n }, (_, i) => i));
// Pre-compute similarity matrix
const similarityMatrix: number[][] = [];
for (let i = 0; i < n; i++) {
similarityMatrix[i] = [];
for (let j = 0; j < n; j++) {
// Clamp to non-negative to ensure monotone submodularity of facility-location objective
const sim = cosineSimilarity(embeddings[i], embeddings[j]);
similarityMatrix[i][j] = sim > 0 ? sim : 0;
}
}
// Maintain current coverage vector (max similarity to selected set for each element)
const currentCoverage = new Array(n).fill(0);
// Priority queue implementation using array (simplified)
const pq: Array<[number, number, number]> = [];
// Initialize priority queue
for (let i = 0; i < n; i++) {
const gain = computeMarginalGainDiversity(i, currentCoverage, similarityMatrix);
pq.push([-gain, 0, i]);
}
// Sort by gain (descending)
pq.sort((a, b) => a[0] - b[0]);
for (let iteration = 0; iteration < k; iteration++) {
while (pq.length > 0) {
const [negGain, lastUpdated, bestIdx] = pq.shift()!;
if (!remaining.has(bestIdx)) continue;
if (lastUpdated === iteration) {
selected.push(bestIdx);
remaining.delete(bestIdx);
// Update coverage in O(n)
const row = similarityMatrix[bestIdx];
for (let i = 0; i < n; i++) {
if (row[i] > currentCoverage[i]) currentCoverage[i] = row[i];
}
break;
}
const currentGain = computeMarginalGainDiversity(bestIdx, currentCoverage, similarityMatrix);
pq.push([-currentGain, iteration, bestIdx]);
pq.sort((a, b) => a[0] - b[0]);
}
}
return selected;
}
export function lazyGreedySelectionWithSaturation(
embeddings: number[][],
threshold: number = 1e-2
): { selected: number[], optimalK: number, values: number[] } {
const n = embeddings.length;
const selected: number[] = [];
const remaining = new Set(Array.from({ length: n }, (_, i) => i));
const values: number[] = [];
// Pre-compute similarity matrix
const similarityMatrix: number[][] = [];
for (let i = 0; i < n; i++) {
similarityMatrix[i] = [];
for (let j = 0; j < n; j++) {
const sim = cosineSimilarity(embeddings[i], embeddings[j]);
similarityMatrix[i][j] = sim > 0 ? sim : 0;
}
}
const currentCoverage = new Array(n).fill(0);
// Priority queue implementation using array (simplified)
const pq: Array<[number, number, number]> = [];
// Initialize priority queue
for (let i = 0; i < n; i++) {
const gain = computeMarginalGainDiversity(i, currentCoverage, similarityMatrix);
pq.push([-gain, 0, i]);
}
// Sort by gain (descending)
pq.sort((a, b) => a[0] - b[0]);
let earlyStopK: number | null = null;
for (let iteration = 0; iteration < n; iteration++) {
while (pq.length > 0) {
const [negGain, lastUpdated, bestIdx] = pq.shift()!;
if (!remaining.has(bestIdx)) continue;
if (lastUpdated === iteration) {
selected.push(bestIdx);
remaining.delete(bestIdx);
// Compute current function value (coverage)
const row = similarityMatrix[bestIdx];
for (let i = 0; i < n; i++) {
if (row[i] > currentCoverage[i]) currentCoverage[i] = row[i];
}
const functionValue = currentCoverage.reduce((sum, val) => sum + val, 0) / n;
values.push(functionValue);
// Early stop when the marginal gain (delta of normalized objective) falls below threshold
if (values.length >= 2) {
const delta = values[values.length - 1] - values[values.length - 2];
if (delta < threshold) {
earlyStopK = values.length; // k is count of selected items
}
}
break;
}
const currentGain = computeMarginalGainDiversity(bestIdx, currentCoverage, similarityMatrix);
pq.push([-currentGain, iteration, bestIdx]);
pq.sort((a, b) => a[0] - b[0]);
}
if (earlyStopK !== null) break;
}
// Choose k: prefer early stop detection; otherwise, use all collected values
const optimalK = earlyStopK ?? values.length;
const finalSelected = selected.slice(0, optimalK);
return { selected: finalSelected, optimalK, values };
}
|