/**
 * guide.ts — persistent feedback-guided n-gram table + best-of-N reranker.
 *
 * Upvoted answers are tokenized into bigram/trigram counts that are persisted
 * as JSON under `data/`; the same answers are appended to a plain-text corpus
 * file. `rerankSamples` scores candidate generations using those counts plus
 * a few cheap heuristics and picks the highest-scoring one.
 */
import fs from "fs";
import path from "path";
import { logger } from "./logger";

const DATA_DIR = path.resolve(process.cwd(), "data");
const GUIDE_PATH = path.join(DATA_DIR, "guide_ngrams.json");
const CORPUS_PATH = path.join(DATA_DIR, "finetune_corpus.txt");

/** Maximum number of distinct bigrams kept when the table is persisted. */
const MAX_BIGRAMS = 50_000;
/** Maximum number of distinct trigrams kept when the table is persisted. */
const MAX_TRIGRAMS = 30_000;
/** Corpus file size above which it is cut down to roughly half this value. */
const MAX_CORPUS_BYTES = 8 * 1024 * 1024;

/** On-disk and in-memory shape of the n-gram guide table. */
type GuideTable = {
  bigrams: Record<string, number>;
  trigrams: Record<string, number>;
  upvoted_answers: number;
};

/** Returns a fresh, empty guide table. */
function emptyGuide(): GuideTable {
  return { bigrams: {}, trigrams: {}, upvoted_answers: 0 };
}

let guide: GuideTable = emptyGuide();
let dirty = false;

/**
 * Creates the data directory if needed.
 * `recursive: true` makes the call a no-op when the directory already exists.
 */
function ensureDir(): void {
  fs.mkdirSync(DATA_DIR, { recursive: true });
}

/**
 * Copies the finite, positive numeric entries of `value` into a new count map.
 * Anything that is not a plain object yields an empty map, so a hand-edited or
 * corrupted file cannot inject non-numeric counts into the table.
 */
function sanitizeCounts(value: unknown): Record<string, number> {
  if (typeof value !== "object" || value === null || Array.isArray(value)) {
    return {};
  }
  return Object.fromEntries(
    Object.entries(value).filter(
      (entry): entry is [string, number] =>
        typeof entry[1] === "number" &&
        Number.isFinite(entry[1]) &&
        entry[1] > 0,
    ),
  );
}

/**
 * Loads the guide table from GUIDE_PATH if the file exists.
 * Missing fields default to empty/zero; any read, parse or shape error is
 * logged and the table is reset to empty.
 */
function loadGuide(): void {
  try {
    if (!fs.existsSync(GUIDE_PATH)) return;
    const parsed: unknown = JSON.parse(fs.readFileSync(GUIDE_PATH, "utf-8"));
    if (typeof parsed !== "object" || parsed === null) {
      throw new TypeError("guide file does not contain a JSON object");
    }
    const raw = parsed as Partial<Record<keyof GuideTable, unknown>>;
    const upvoted = raw.upvoted_answers;
    guide = {
      bigrams: sanitizeCounts(raw.bigrams),
      trigrams: sanitizeCounts(raw.trigrams),
      upvoted_answers:
        typeof upvoted === "number" && Number.isFinite(upvoted) && upvoted > 0
          ? upvoted
          : 0,
    };
  } catch (e) {
    logger.warn({ err: e }, "guide: load failed, starting fresh");
    guide = emptyGuide();
  }
}

loadGuide();

/**
 * Writes the guide table to disk if it has unsaved changes.
 * Both tables are trimmed to their size caps first. The JSON is written to a
 * temporary file and then renamed over GUIDE_PATH. Failures (including failure
 * to create the data directory) are logged and leave `dirty` set, so the next
 * call tries again.
 */
function persist(): void {
  if (!dirty) return;
  guide.bigrams = trimTopK(guide.bigrams, MAX_BIGRAMS);
  guide.trigrams = trimTopK(guide.trigrams, MAX_TRIGRAMS);
  const tmp = GUIDE_PATH + ".tmp";
  try {
    ensureDir();
    fs.writeFileSync(tmp, JSON.stringify(guide));
    fs.renameSync(tmp, GUIDE_PATH);
    dirty = false;
  } catch (e) {
    logger.error({ err: e }, "guide: persist failed");
  }
}

/**
 * Returns a map holding at most the `k` highest-count entries of `map`.
 * When `map` already has `k` or fewer entries it is returned as-is (same
 * object); otherwise a new object is built in descending count order.
 */
function trimTopK(map: Record<string, number>, k: number): Record<string, number> {
  const entries = Object.entries(map);
  if (entries.length <= k) return map;
  entries.sort((a, b) => b[1] - a[1]);
  return Object.fromEntries(entries.slice(0, k));
}

/** Tokens treated as stop words when counting bigrams. */
const STOP = new Set<string>([
  "the","a","an","is","are","was","were","of","in","to","and","or","it","its",
  "this","that","with","for","on","as","by","be","at","from","but","not","if",
  "then","so","do","does","did","has","have","had","i","you","he","she","we",
  "they","me","him","her","us","them","my","your","his","our","their",
]);

/**
 * Lower-cases `text`, replaces every run of characters outside `a-z`, `0-9`
 * and whitespace with a space, splits on whitespace and drops tokens shorter
 * than two characters.
 */
function tokenize(text: string): string[] {
  return text
    .toLowerCase()
    .replace(/[^a-z0-9\s]+/g, " ")
    .split(/\s+/)
    .filter(t => t.length > 1);
}

/**
 * Folds an upvoted answer into the guide table and the corpus file.
 *
 * Texts shorter than 20 characters or with fewer than 4 tokens are ignored.
 * Bigrams made of two stop words are skipped; trigrams are always counted.
 * The table is persisted immediately and the raw text is passed to
 * `appendToCorpus`.
 *
 * @param text - The answer text that received positive feedback.
 */
export function ingestUpvotedAnswer(text: string): void {
  if (!text || text.length < 20) return;
  const toks = tokenize(text);
  if (toks.length < 4) return;

  for (let i = 0; i < toks.length - 1; i++) {
    if (STOP.has(toks[i]) && STOP.has(toks[i + 1])) continue;
    const bg = `${toks[i]} ${toks[i + 1]}`;
    guide.bigrams[bg] = (guide.bigrams[bg] ?? 0) + 1;
  }
  for (let i = 0; i < toks.length - 2; i++) {
    const tg = `${toks[i]} ${toks[i + 1]} ${toks[i + 2]}`;
    guide.trigrams[tg] = (guide.trigrams[tg] ?? 0) + 1;
  }

  guide.upvoted_answers += 1;
  dirty = true;
  persist();
  appendToCorpus(text);
}

/**
 * Appends `text` (trimmed, followed by a blank line) to the corpus file.
 *
 * Texts shorter than 50 characters are ignored. When the file grows past
 * MAX_CORPUS_BYTES it is rewritten with roughly its last MAX_CORPUS_BYTES / 2
 * bytes. The cut is moved forward to just after the next blank-line separator
 * so the retained tail does not begin mid-entry or mid-character. All
 * filesystem errors are logged and swallowed.
 *
 * @param text - Text to append to the corpus.
 */
export function appendToCorpus(text: string): void {
  if (!text || text.length < 50) return;
  try {
    ensureDir();
    fs.appendFileSync(CORPUS_PATH, text.trim() + "\n\n");
    const stat = fs.statSync(CORPUS_PATH);
    if (stat.size <= MAX_CORPUS_BYTES) return;

    const buf = fs.readFileSync(CORPUS_PATH);
    let start = Math.max(0, buf.length - Math.floor(MAX_CORPUS_BYTES / 2));
    const sep = buf.indexOf("\n\n", start);
    if (sep !== -1 && sep + 2 < buf.length) {
      start = sep + 2;
    } else {
      // No usable separator in the tail: at least skip UTF-8 continuation
      // bytes (10xxxxxx) so the file does not start with a broken character.
      while (start < buf.length && (buf.readUInt8(start) & 0xc0) === 0x80) {
        start += 1;
      }
    }
    fs.writeFileSync(CORPUS_PATH, buf.subarray(start));
  } catch (e) {
    logger.warn({ err: e }, "guide: corpus append failed");
  }
}

/** Returns the absolute path of the corpus file. */
export function getCorpusPath(): string {
  return CORPUS_PATH;
}

/**
 * Returns the current table sizes, the number of ingested upvoted answers and
 * the corpus file size in bytes (0 when the file is missing or cannot be
 * stat'ed).
 */
export function getGuideStats(): {
  bigrams: number;
  trigrams: number;
  upvoted_answers: number;
  corpus_bytes: number;
} {
  let corpusBytes = 0;
  try {
    corpusBytes = fs.statSync(CORPUS_PATH).size;
  } catch {
    // Missing or unreadable corpus file: report it as empty.
  }
  return {
    bigrams: Object.keys(guide.bigrams).length,
    trigrams: Object.keys(guide.trigrams).length,
    upvoted_answers: guide.upvoted_answers,
    corpus_bytes: corpusBytes,
  };
}

/**
 * Context-overlap score: the number of distinct candidate tokens that appear
 * in `contextTerms`, divided by the square root of the candidate's token count.
 * Returns 0 when either side is empty.
 */
function bm25ish(candidate: string, contextTerms: Set<string>): number {
  if (!contextTerms.size) return 0;
  const tokens = tokenize(candidate);
  if (!tokens.length) return 0;
  let hits = 0;
  for (const t of new Set(tokens)) {
    if (contextTerms.has(t)) hits += 1;
  }
  return hits / Math.sqrt(tokens.length);
}

/**
 * Ratio of distinct tokens to total tokens (1 means no token repeats; lower
 * values mean more repetition). Texts with fewer than 4 tokens return 1.
 */
function repetitionRatio(text: string): number {
  const toks = tokenize(text);
  if (toks.length < 4) return 1;
  return new Set(toks).size / toks.length;
}

/**
 * Measures how much `text` overlaps with the guide table.
 * Sums `log(1 + count)` over the text's bigrams and trigrams found in the
 * table, weights them 0.4 / 0.6, and divides by the number of bigrams in the
 * text. Texts with fewer than 3 tokens return 0.
 */
function guideOverlap(text: string): number {
  const toks = tokenize(text);
  if (toks.length < 3) return 0;

  let bg = 0;
  let tg = 0;
  let denom = 0;
  for (let i = 0; i < toks.length - 1; i++) {
    const count = guide.bigrams[`${toks[i]} ${toks[i + 1]}`];
    if (count) bg += Math.log(1 + count);
    denom += 1;
  }
  for (let i = 0; i < toks.length - 2; i++) {
    const count = guide.trigrams[`${toks[i]} ${toks[i + 1]} ${toks[i + 2]}`];
    if (count) tg += Math.log(1 + count);
  }
  return (bg * 0.4 + tg * 0.6) / Math.max(denom, 1);
}

/** A candidate generation together with its log-probability information. */
export type Sample = { text: string; logprob: number; avg_logprob?: number };

/** Outcome of `rerankSamples`: the winning sample plus per-sample scores. */
export type RerankResult = {
  best: Sample;
  bestScore: number;
  scores: Array<{
    text: string;
    score: number;
    breakdown: Partial<Record<string, number>>;
  }>;
};

/**
 * Scores every sample and returns the highest-scoring one.
 *
 * The score is a weighted sum of: overlap with the context/query terms (1.6),
 * a length score that is 1 at 350 characters and falls off linearly, floored
 * at 0 (0.6), the distinct-token ratio (0.8), overlap with the guide table
 * (1.2), and a log-probability score that maps [-4, 0] onto [0, 1] with
 * clamping (0.4). When `avg_logprob` is absent, `logprob` is divided by
 * `max(1, text.length / 4)` (presumably a ~4-characters-per-token estimate;
 * confirm against the sampler). Samples whose trimmed text is shorter than 5
 * characters get a score of -1e6. On ties the earliest sample wins.
 *
 * NOTE: when `samples` is empty, `best` is `undefined` at runtime and
 * `bestScore` is `-Infinity`; callers should pass at least one sample.
 *
 * @param samples - Candidate generations to rank.
 * @param context - Context text whose tokens count as relevant terms.
 * @param query - Query text whose tokens count as relevant terms.
 * @returns The best sample, its score, and the score breakdown of every sample
 *   in input order.
 */
export function rerankSamples(
  samples: Sample[],
  context: string,
  query: string,
): RerankResult {
  const ctxTerms = new Set<string>([...tokenize(context), ...tokenize(query)]);

  const scores: RerankResult["scores"] = samples.map(s => {
    const text = (s.text || "").trim();
    if (text.length < 5) {
      return { text, score: -1e6, breakdown: { empty: 1 } };
    }
    const len = text.length;
    const lenScore = Math.max(0, 1 - Math.abs(len - 350) / 700);
    const bm25 = bm25ish(text, ctxTerms);
    const rep = repetitionRatio(text);
    const guideS = guideOverlap(text);
    const lp = s.avg_logprob ?? (s.logprob / Math.max(1, text.length / 4));
    const lpScore = Math.max(0, Math.min(1, (lp + 4) / 4));
    const score =
      bm25 * 1.6 +
      lenScore * 0.6 +
      rep * 0.8 +
      guideS * 1.2 +
      lpScore * 0.4;
    return {
      text,
      score,
      breakdown: { bm25, lenScore, rep, guideS, lpScore, len, lp },
    };
  });

  let bestIdx = 0;
  let bestScore = -Infinity;
  for (let i = 0; i < scores.length; i++) {
    if (scores[i].score > bestScore) {
      bestScore = scores[i].score;
      bestIdx = i;
    }
  }
  return { best: samples[bestIdx], bestScore, scores };
}