Spaces:

shreyask
/

qmd-web

Running

App Files Files Community

qmd-web / src /pipeline /rrf.ts

shreyask

feat: add RRF fusion and position-aware score blending

997db9a verified about 1 month ago

raw

history blame contribute delete

3.43 kB

	import type { ScoredChunk, RRFResult, RRFContribution } from "../types";
	import {
	RRF_K,
	RRF_PRIMARY_WEIGHT,
	RRF_SECONDARY_WEIGHT,
	RRF_RANK1_BONUS,
	RRF_RANK2_BONUS,
	RERANK_CANDIDATE_LIMIT,
	} from "../constants";

	interface RankedList {
	results: ScoredChunk[];
	queryType: "original" \| "lex" \| "vec" \| "hyde";
	query: string;
	}

	// Merge multiple ranked lists using Reciprocal Rank Fusion
	export function reciprocalRankFusion(
	lists: RankedList[],
	candidateLimit: number = RERANK_CANDIDATE_LIMIT,
	): RRFResult[] {
	// For each document across all lists, compute RRF score:
	// score = Σ (weight / (k + rank))
	//
	// Weight rules (from QMD):
	// - First 2 lists: weight = RRF_PRIMARY_WEIGHT (2.0)
	// - Remaining lists: weight = RRF_SECONDARY_WEIGHT (1.0)
	//
	// Rank bonuses:
	// - Rank 1 in any list: +RRF_RANK1_BONUS (0.05)
	// - Rank 1 or 2 in any list: +RRF_RANK2_BONUS (0.02)
	//
	// Group by docId (not chunk), keep the best chunk per doc.
	// Sort by total score descending, return top candidateLimit.

	const docScores = new Map<
	string,
	{
	docId: string;
	filepath: string;
	title: string;
	bestChunk: string;
	bestChunkScore: number;
	totalScore: number;
	topRank: number;
	contributions: RRFContribution[];
	}
	>();

	lists.forEach((list, listIndex) => {
	const weight = listIndex < 2 ? RRF_PRIMARY_WEIGHT : RRF_SECONDARY_WEIGHT;

	list.results.forEach((result, rankIndex) => {
	const rank = rankIndex + 1; // 1-indexed
	const rrfContribution = weight / (RRF_K + rank);

	const existing = docScores.get(result.chunk.docId);
	if (existing) {
	existing.totalScore += rrfContribution;
	existing.topRank = Math.min(existing.topRank, rank);
	existing.contributions.push({
	source: result.source,
	queryType: list.queryType,
	query: list.query,
	rank,
	weight,
	rrfContribution,
	});
	// Keep the chunk with the highest individual score
	if (result.score > existing.bestChunkScore) {
	existing.bestChunk = result.chunk.text;
	existing.bestChunkScore = result.score;
	}
	} else {
	docScores.set(result.chunk.docId, {
	docId: result.chunk.docId,
	filepath: result.chunk.docId, // In browser demo, docId = filepath
	title: result.chunk.title,
	bestChunk: result.chunk.text,
	bestChunkScore: result.score,
	totalScore: rrfContribution,
	topRank: rank,
	contributions: [
	{
	source: result.source,
	queryType: list.queryType,
	query: list.query,
	rank,
	weight,
	rrfContribution,
	},
	],
	});
	}
	});
	});

	// Apply rank bonuses
	for (const doc of docScores.values()) {
	if (doc.topRank === 1) doc.totalScore += RRF_RANK1_BONUS;
	if (doc.topRank <= 2) doc.totalScore += RRF_RANK2_BONUS;
	}

	// Sort and slice
	const results = Array.from(docScores.values())
	.sort((a, b) => b.totalScore - a.totalScore)
	.slice(0, candidateLimit)
	.map((doc) => ({
	docId: doc.docId,
	filepath: doc.filepath,
	title: doc.title,
	bestChunk: doc.bestChunk,
	score: doc.totalScore,
	contributions: doc.contributions,
	}));

	return results;
	}