qmd-web / src /pipeline /rrf.ts
shreyask's picture
feat: add RRF fusion and position-aware score blending
997db9a verified
import type { ScoredChunk, RRFResult, RRFContribution } from "../types";
import {
RRF_K,
RRF_PRIMARY_WEIGHT,
RRF_SECONDARY_WEIGHT,
RRF_RANK1_BONUS,
RRF_RANK2_BONUS,
RERANK_CANDIDATE_LIMIT,
} from "../constants";
interface RankedList {
results: ScoredChunk[];
queryType: "original" | "lex" | "vec" | "hyde";
query: string;
}
// Merge multiple ranked lists using Reciprocal Rank Fusion
export function reciprocalRankFusion(
lists: RankedList[],
candidateLimit: number = RERANK_CANDIDATE_LIMIT,
): RRFResult[] {
// For each document across all lists, compute RRF score:
// score = Σ (weight / (k + rank))
//
// Weight rules (from QMD):
// - First 2 lists: weight = RRF_PRIMARY_WEIGHT (2.0)
// - Remaining lists: weight = RRF_SECONDARY_WEIGHT (1.0)
//
// Rank bonuses:
// - Rank 1 in any list: +RRF_RANK1_BONUS (0.05)
// - Rank 1 or 2 in any list: +RRF_RANK2_BONUS (0.02)
//
// Group by docId (not chunk), keep the best chunk per doc.
// Sort by total score descending, return top candidateLimit.
const docScores = new Map<
string,
{
docId: string;
filepath: string;
title: string;
bestChunk: string;
bestChunkScore: number;
totalScore: number;
topRank: number;
contributions: RRFContribution[];
}
>();
lists.forEach((list, listIndex) => {
const weight = listIndex < 2 ? RRF_PRIMARY_WEIGHT : RRF_SECONDARY_WEIGHT;
list.results.forEach((result, rankIndex) => {
const rank = rankIndex + 1; // 1-indexed
const rrfContribution = weight / (RRF_K + rank);
const existing = docScores.get(result.chunk.docId);
if (existing) {
existing.totalScore += rrfContribution;
existing.topRank = Math.min(existing.topRank, rank);
existing.contributions.push({
source: result.source,
queryType: list.queryType,
query: list.query,
rank,
weight,
rrfContribution,
});
// Keep the chunk with the highest individual score
if (result.score > existing.bestChunkScore) {
existing.bestChunk = result.chunk.text;
existing.bestChunkScore = result.score;
}
} else {
docScores.set(result.chunk.docId, {
docId: result.chunk.docId,
filepath: result.chunk.docId, // In browser demo, docId = filepath
title: result.chunk.title,
bestChunk: result.chunk.text,
bestChunkScore: result.score,
totalScore: rrfContribution,
topRank: rank,
contributions: [
{
source: result.source,
queryType: list.queryType,
query: list.query,
rank,
weight,
rrfContribution,
},
],
});
}
});
});
// Apply rank bonuses
for (const doc of docScores.values()) {
if (doc.topRank === 1) doc.totalScore += RRF_RANK1_BONUS;
if (doc.topRank <= 2) doc.totalScore += RRF_RANK2_BONUS;
}
// Sort and slice
const results = Array.from(docScores.values())
.sort((a, b) => b.totalScore - a.totalScore)
.slice(0, candidateLimit)
.map((doc) => ({
docId: doc.docId,
filepath: doc.filepath,
title: doc.title,
bestChunk: doc.bestChunk,
score: doc.totalScore,
contributions: doc.contributions,
}));
return results;
}