// Source: memory-improvements/src/deduplication.ts
// Commit c966a8e (verified), author loudiman: "Add deduplication module"
// ============================================================
// IMPROVEMENT 1: Memory Deduplication
// Based on Mem0 (arxiv 2504.19413) ADD/UPDATE/DELETE/NOOP logic
// See full source: https://huggingface.co/spaces/loudiman/sandbox-cbb9aab0
// ============================================================
import { MemoryRecord, DeduplicationResult, MemoryConfig, DEFAULT_MEMORY_CONFIG } from './types';
/**
 * Cosine similarity between two numeric vectors.
 *
 * @param a - First vector.
 * @param b - Second vector.
 * @returns The dot product divided by the product of the two Euclidean norms.
 *   Returns 0 when the vectors differ in length or when either norm is zero
 *   (which includes two empty vectors).
 */
export function cosineSimilarity(a: number[], b: number[]): number {
  const dimensions = a.length;
  if (dimensions !== b.length) {
    return 0;
  }

  let dot = 0;
  let squaredA = 0;
  let squaredB = 0;
  let idx = 0;
  while (idx < dimensions) {
    const x = a[idx];
    const y = b[idx];
    dot += x * y;
    squaredA += x * x;
    squaredB += y * y;
    idx += 1;
  }

  // Norms are rooted separately and then multiplied.
  const magnitude = Math.sqrt(squaredA) * Math.sqrt(squaredB);
  if (magnitude === 0) {
    return 0;
  }
  return dot / magnitude;
}
/**
 * Decides how a new memory relates to the memories already stored.
 *
 * The existing memory with the highest cosine similarity to `newEmbedding`
 * is selected (ties keep the earliest one; only similarities strictly above 0
 * are considered). The decision is then:
 *  - similarity >= `config.deduplicationThreshold` -> `NOOP` (treated as a duplicate)
 *  - similarity >= `config.mergeThreshold`         -> `UPDATE` with merged text
 *  - otherwise, or when nothing qualifies          -> `ADD`
 *
 * @param newEmbedding - Embedding of the incoming memory.
 * @param newText - Text of the incoming memory; only used when merging.
 * @param existingMemories - Candidate memories to compare against.
 * @param config - Supplies `deduplicationThreshold` and `mergeThreshold`.
 * @returns The operation to perform, plus the matched memory id (and merged
 *   text for `UPDATE`) when an existing memory was matched.
 */
export function checkDeduplication(
  newEmbedding: number[], newText: string, existingMemories: MemoryRecord[],
  config: MemoryConfig = DEFAULT_MEMORY_CONFIG
): DeduplicationResult {
  type Closest = { memory: MemoryRecord | null; score: number };

  // Single pass keeping the best-scoring candidate; strict ">" keeps the first of equals.
  const closest = existingMemories.reduce<Closest>(
    (best, memory) => {
      const score = cosineSimilarity(newEmbedding, memory.embedding);
      return score > best.score ? { memory, score } : best;
    },
    { memory: null, score: 0 },
  );

  const match = closest.memory;
  if (!match) {
    return { operation: 'ADD' };
  }

  if (closest.score >= config.deduplicationThreshold) {
    return { operation: 'NOOP', existingMemoryId: match.id };
  }

  if (closest.score >= config.mergeThreshold) {
    return {
      operation: 'UPDATE',
      existingMemoryId: match.id,
      mergedText: mergeMemoryTexts(match.text, newText),
    };
  }

  return { operation: 'ADD' };
}
/**
 * Combines the text of an existing memory with the text of a similar new one.
 *
 * Rules, in order:
 *  1. If one text contains the other (compared case-insensitively, ignoring
 *     leading/trailing whitespace), the longer of the two original strings is
 *     returned unchanged; the existing text wins a length tie.
 *  2. Otherwise the texts are joined as `"<existing>; <new>"` when that fits
 *     within `maxLength`.
 *  3. Otherwise the existing text is dropped and the new text is returned,
 *     truncated to at most `maxLength` UTF-16 code units.
 *
 * Note that rule 1 does not apply `maxLength`.
 *
 * @param existingText - Text currently stored for the memory.
 * @param newText - Text of the incoming memory.
 * @param maxLength - Upper bound (in UTF-16 code units) for rules 2 and 3.
 * @returns The merged text.
 */
function mergeMemoryTexts(existingText: string, newText: string, maxLength = 200): string {
  const existingNormalized = existingText.toLowerCase().trim();
  const newNormalized = newText.toLowerCase().trim();

  if (existingNormalized.includes(newNormalized) || newNormalized.includes(existingNormalized)) {
    return existingText.length >= newText.length ? existingText : newText;
  }

  const merged = `${existingText}; ${newText}`;
  if (merged.length <= maxLength) {
    return merged;
  }

  // Too long to keep both: the newer text takes precedence.
  if (newText.length <= maxLength) {
    return newText;
  }

  const truncated = newText.substring(0, maxLength);
  // Cutting on a code-unit index can split a surrogate pair (e.g. an emoji),
  // leaving a lone high surrogate that is not valid text. Drop that dangling half.
  const lastUnit = truncated.charCodeAt(truncated.length - 1);
  const endsOnHighSurrogate = lastUnit >= 0xd800 && lastUnit <= 0xdbff;
  return endsOnHighSurrogate ? truncated.slice(0, -1) : truncated;
}
/**
 * Builds the parameterized SQL that fetches stored memories most similar to
 * `embedding`, using the `vec_distance_cosine` / `vec_f32` SQL functions.
 *
 * Similarity is computed as `1 - vec_distance_cosine(...)`; rows below
 * `minSimilarity` are filtered out and the rest are returned best-first.
 *
 * @param embedding - Query embedding; sent to the database as JSON text.
 * @param minSimilarity - Lowest similarity a row may have to be returned.
 *   Defaults to 0.65, the value that was previously hard-coded.
 * @param limit - Maximum number of rows to return. Defaults to 5, the value
 *   that was previously hard-coded.
 * @returns The SQL text and its positional parameters
 *   `[embeddingJson, embeddingJson, minSimilarity]`.
 * @throws RangeError if `minSimilarity` is not finite or `limit` is not a
 *   positive safe integer.
 */
export function buildDeduplicationQuery(
  embedding: number[],
  minSimilarity = 0.65,
  limit = 5,
): { sql: string; params: (string | number)[] } {
  if (!Number.isFinite(minSimilarity)) {
    throw new RangeError(`minSimilarity must be a finite number, got ${minSimilarity}`);
  }
  // `limit` is interpolated into the SQL text, so it must be a plain positive integer.
  if (!Number.isSafeInteger(limit) || limit <= 0) {
    throw new RangeError(`limit must be a positive integer, got ${limit}`);
  }

  // Serialize once; the same JSON is bound to both vec_f32(?) placeholders.
  const embeddingJson = JSON.stringify(embedding);

  const sql = [
    'SELECT id, text, embedding, type, source, created_at, last_accessed_at, access_count, importance,',
    '(1 - vec_distance_cosine(embedding, vec_f32(?))) AS similarity',
    `FROM memories WHERE (1 - vec_distance_cosine(embedding, vec_f32(?))) >= ? ORDER BY similarity DESC LIMIT ${limit}`,
  ].join('\n');

  return { sql, params: [embeddingJson, embeddingJson, minSimilarity] };
}
/**
 * Converts an `embedding` column value read from the database into a number array.
 *
 * Embeddings are written with `vec_f32(?)`, which stores a float32 BLOB, so a
 * driver may return binary data rather than JSON text. JSON text, ArrayBuffer,
 * typed-array/Buffer views and plain arrays are all accepted.
 *
 * @throws TypeError if the value is none of the supported representations.
 */
function decodeStoredEmbedding(raw: unknown): number[] {
  if (Array.isArray(raw)) {
    return raw as number[];
  }
  if (typeof raw === 'string') {
    return JSON.parse(raw) as number[];
  }
  if (raw instanceof ArrayBuffer) {
    return Array.from(new Float32Array(raw));
  }
  if (ArrayBuffer.isView(raw)) {
    // Copy the exact byte range first: a view may start at an offset that is
    // not 4-byte aligned, which Float32Array cannot wrap directly.
    const bytes = raw.buffer.slice(raw.byteOffset, raw.byteOffset + raw.byteLength);
    return Array.from(new Float32Array(bytes));
  }
  throw new TypeError(`Unsupported embedding column value of type ${typeof raw}`);
}

/**
 * Stores a memory, first checking it against similar existing memories.
 *
 * Steps:
 *  1. Fetch candidate rows with the query from `buildDeduplicationQuery`.
 *  2. Let `checkDeduplication` pick ADD / UPDATE / NOOP.
 *  3. ADD inserts a new row with a generated id and a computed importance.
 *     UPDATE rewrites the matched row's text and embedding and bumps its
 *     access stats. NOOP only bumps the matched row's access stats.
 *
 * Any other operation value is reported as NOOP without touching the database.
 *
 * NOTE(review): the candidate query's similarity floor is fixed by
 * `buildDeduplicationQuery` and does not follow `config.mergeThreshold`, so a
 * config with a lower merge threshold will never see candidates below that floor.
 * NOTE(review): on UPDATE the stored embedding becomes the new text's embedding,
 * not an embedding of the merged text — confirm this is intended.
 *
 * @param db - Database handle; assumed to expose awaitable `getAll(sql, params)`
 *   and `run(sql, params)` — confirm against the caller.
 * @param text - Text of the memory to store.
 * @param embedding - Embedding of `text`.
 * @param type - Memory type written on insert.
 * @param source - Memory source written on insert.
 * @param config - Thresholds forwarded to `checkDeduplication`.
 * @returns The operation performed and the id of the affected memory, if any.
 * @throws TypeError if a candidate row's embedding cannot be decoded.
 */
export async function storeMemoryWithDeduplication(
  db: any, text: string, embedding: number[], type: MemoryType, source: MemorySource,
  config: MemoryConfig = DEFAULT_MEMORY_CONFIG
): Promise<{ operation: MemoryOperation; memoryId?: string }> {
  const { sql, params } = buildDeduplicationQuery(embedding);
  const candidates = await db.getAll(sql, params);

  const existingMemories: MemoryRecord[] = candidates.map((row: any) => ({
    id: row.id,
    text: row.text,
    embedding: decodeStoredEmbedding(row.embedding),
    type: row.type,
    source: row.source,
    createdAt: row.created_at,
    lastAccessedAt: row.last_accessed_at,
    accessCount: row.access_count,
    importance: row.importance,
  }));

  const result = checkDeduplication(embedding, text, existingMemories, config);
  const embeddingJson = JSON.stringify(embedding);
  const now = Date.now();

  switch (result.operation) {
    case 'ADD': {
      const id = `mem_${now}_${Math.random().toString(36).substring(2, 9)}`;
      await db.run(
        `INSERT INTO memories (id, text, embedding, type, source, created_at, last_accessed_at, access_count, importance) VALUES (?, ?, vec_f32(?), ?, ?, ?, ?, 0, ?)`,
        [id, text, embeddingJson, type, source, now, now, computeImportance(text)],
      );
      return { operation: 'ADD', memoryId: id };
    }
    case 'UPDATE':
      await db.run(
        `UPDATE memories SET text = ?, embedding = vec_f32(?), last_accessed_at = ?, access_count = access_count + 1 WHERE id = ?`,
        // Fall back to the new text so a missing mergedText can never blank the row.
        [result.mergedText ?? text, embeddingJson, now, result.existingMemoryId],
      );
      return { operation: 'UPDATE', memoryId: result.existingMemoryId };
    case 'NOOP':
      await db.run(
        `UPDATE memories SET last_accessed_at = ?, access_count = access_count + 1 WHERE id = ?`,
        [now, result.existingMemoryId],
      );
      return { operation: 'NOOP', memoryId: result.existingMemoryId };
    default:
      return { operation: 'NOOP' };
  }
}
/**
 * Heuristic importance score for a memory text.
 *
 * Starts from 0.5 and adds a fixed bonus for each rule that applies: longer
 * text (two length tiers), strong-preference/necessity words, self-describing
 * phrases, date-like event words, and the presence of any digit. The total is
 * capped at 1.0.
 *
 * @param text - Memory text to score.
 * @returns A score between 0.5 and 1.0 inclusive.
 */
function computeImportance(text: string): number {
  const size = text.length;

  // [rule applies, bonus] pairs, applied in order.
  const rules: ReadonlyArray<readonly [boolean, number]> = [
    [size > 50, 0.1],
    [size > 100, 0.1],
    [/\b(allergic|allergy|hate|love|always|never|important|must|need)\b/i.test(text), 0.15],
    [/\b(my name|i am|i'm|i live|my job|my work|i work)\b/i.test(text), 0.1],
    [/\b(birthday|anniversary|deadline|appointment)\b/i.test(text), 0.1],
    [/\d+/.test(text), 0.05],
  ];

  const total = rules.reduce(
    (running, [applies, bonus]) => (applies ? running + bonus : running),
    0.5,
  );
  return Math.min(total, 1.0);
}
import { MemoryType, MemorySource, MemoryOperation } from './types';