feat: add RRF fusion and position-aware score blending
Browse files
Implement reciprocal rank fusion (rrf.ts) to merge BM25 and vector
search ranked lists with weighted contributions and rank bonuses.
Add position-aware score blending (blend.ts) that combines RRF scores
with reranker scores using tiered weights (top3/mid/tail).
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
- src/pipeline/blend.test.ts +189 -0
- src/pipeline/blend.ts +57 -0
- src/pipeline/rrf.test.ts +223 -0
- src/pipeline/rrf.ts +118 -0
src/pipeline/blend.test.ts
ADDED
|
@@ -0,0 +1,189 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { describe, it, expect } from "vitest";
|
| 2 |
+
import { blendScores } from "./blend";
|
| 3 |
+
import type { RRFResult } from "../types";
|
| 4 |
+
import { BLEND_TOP3_RRF, BLEND_MID_RRF, BLEND_TAIL_RRF } from "../constants";
|
| 5 |
+
|
| 6 |
+
// ---------------------------------------------------------------------------
|
| 7 |
+
// Helpers
|
| 8 |
+
// ---------------------------------------------------------------------------
|
| 9 |
+
/**
 * Test fixture: builds an RRFResult for `docId` with the given RRF `score`.
 *
 * `filepath` mirrors `docId`, `bestChunk` is a synthetic string derived from
 * `docId`, and `contributions` is always empty.
 *
 * @param docId - Document identifier (also used as the filepath).
 * @param score - RRF score to place on the fixture.
 * @param title - Optional title; defaults to `Title <docId>`.
 */
function makeRRFResult(
  docId: string,
  score: number,
  title = `Title ${docId}`,
): RRFResult {
  const bestChunk = `chunk from ${docId}`;
  const fixture: RRFResult = {
    docId,
    filepath: docId,
    title,
    bestChunk,
    score,
    contributions: [],
  };
  return fixture;
}
|
| 23 |
+
|
| 24 |
+
// ---------------------------------------------------------------------------
|
| 25 |
+
// blendScores
|
| 26 |
+
// ---------------------------------------------------------------------------
|
| 27 |
+
// Unit tests for blendScores. "Rank" below always means the 1-indexed position
// of a document in the rrfResults input array, which is what selects the blend
// tier (top3 / mid / tail).
describe("blendScores", () => {
  // Expected blended score for a given RRF weight; mirrors the formula under test.
  const blend = (rrfWeight: number, rrfScore: number, rerankScore: number) =>
    rrfWeight * rrfScore + (1 - rrfWeight) * rerankScore;

  it("returns empty array for empty input", () => {
    expect(blendScores([], new Map())).toEqual([]);
  });

  it("uses top3 weight for rank 1-3", () => {
    const rrfResults = [
      makeRRFResult("doc1", 0.5),
      makeRRFResult("doc2", 0.4),
      makeRRFResult("doc3", 0.3),
    ];
    const rerankScores = new Map([
      ["doc1", 0.9],
      ["doc2", 0.8],
      ["doc3", 0.7],
    ]);
    const results = blendScores(rrfResults, rerankScores);
    // Every input sits in ranks 1-3, so each one must use BLEND_TOP3_RRF.
    for (const input of rrfResults) {
      const blended = results.find((r) => r.docId === input.docId)!;
      const rerank = rerankScores.get(input.docId)!;
      expect(blended.score).toBeCloseTo(
        blend(BLEND_TOP3_RRF, input.score, rerank),
        10,
      );
    }
  });

  it("uses mid weight for rank 4-10", () => {
    // Create 5 results; rank 4 and 5 should use mid weight
    const rrfResults = Array.from({ length: 5 }, (_, i) =>
      makeRRFResult(`doc${i}`, 0.5 - i * 0.05),
    );
    const rerankScores = new Map<string, number>([["doc3", 0.9]]);

    const results = blendScores(rrfResults, rerankScores);
    // doc3 is at rank 4 in RRF ordering -> uses BLEND_MID_RRF
    const doc3 = results.find((r) => r.docId === "doc3")!;
    expect(doc3.score).toBeCloseTo(blend(BLEND_MID_RRF, 0.35, 0.9), 10);
  });

  it("uses tail weight for rank 11+", () => {
    const rrfResults = Array.from({ length: 12 }, (_, i) =>
      makeRRFResult(`doc${i}`, 0.5 - i * 0.03),
    );
    const rerankScores = new Map<string, number>([["doc11", 0.95]]);

    const results = blendScores(rrfResults, rerankScores);
    // doc11 is at rank 12 -> uses BLEND_TAIL_RRF
    const doc11 = results.find((r) => r.docId === "doc11")!;
    const rrfScore = 0.5 - 11 * 0.03; // 0.17
    expect(doc11.score).toBeCloseTo(blend(BLEND_TAIL_RRF, rrfScore, 0.95), 10);
  });

  it("defaults missing rerank scores to 0", () => {
    const rrfResults = [makeRRFResult("doc1", 0.5)];
    const rerankScores = new Map<string, number>(); // no rerank score for doc1
    const results = blendScores(rrfResults, rerankScores);
    // With a rerank score of 0 only the RRF term remains.
    expect(results[0].score).toBeCloseTo(BLEND_TOP3_RRF * 0.5, 10);
  });

  it("sorts by blended score descending", () => {
    const rrfResults = [
      makeRRFResult("doc1", 0.5),
      makeRRFResult("doc2", 0.4),
      makeRRFResult("doc3", 0.3),
    ];
    // High rerank score on doc3 should push it up
    const rerankScores = new Map([
      ["doc1", 0.1],
      ["doc2", 0.2],
      ["doc3", 0.99],
    ]);
    const results = blendScores(rrfResults, rerankScores);
    for (let i = 1; i < results.length; i++) {
      expect(results[i].score).toBeLessThanOrEqual(results[i - 1].score);
    }
  });

  it("can reorder results when reranker disagrees with RRF", () => {
    const rrfResults = [
      makeRRFResult("doc1", 0.5),
      makeRRFResult("doc2", 0.49),
    ];
    // doc2 gets much higher rerank score
    const rerankScores = new Map([
      ["doc1", 0.0],
      ["doc2", 1.0],
    ]);
    const results = blendScores(rrfResults, rerankScores);
    // Both docs share the top3 weight, so doc2's rerank advantage outweighs
    // its slightly lower RRF score.
    expect(results[0].docId).toBe("doc2");
  });

  it("preserves filepath, title, bestChunk in final results", () => {
    const rrfResults = [makeRRFResult("doc1", 0.5, "My Title")];
    const results = blendScores(rrfResults, new Map());
    expect(results[0].filepath).toBe("doc1");
    expect(results[0].title).toBe("My Title");
    expect(results[0].bestChunk).toBe("chunk from doc1");
    expect(results[0].docId).toBe("doc1");
  });

  it("deduplicates by docId, keeping highest blended score", () => {
    // This shouldn't normally happen since RRF already deduplicates,
    // but the function handles it defensively
    const rrfResults = [
      makeRRFResult("doc1", 0.5),
      makeRRFResult("doc1", 0.3), // duplicate
    ];
    const rerankScores = new Map([["doc1", 0.8]]);
    const results = blendScores(rrfResults, rerankScores);
    expect(results).toHaveLength(1);
    expect(results[0].docId).toBe("doc1");
    // Both entries are in the top3 tier; the survivor must be the 0.5 entry,
    // which has the larger blended score.
    expect(results[0].score).toBeCloseTo(blend(BLEND_TOP3_RRF, 0.5, 0.8), 10);
  });

  it("handles single result correctly", () => {
    const rrfResults = [makeRRFResult("doc1", 0.42)];
    const rerankScores = new Map([["doc1", 0.88]]);
    const results = blendScores(rrfResults, rerankScores);
    expect(results).toHaveLength(1);
    expect(results[0].score).toBeCloseTo(blend(BLEND_TOP3_RRF, 0.42, 0.88), 10);
  });

  it("weight transitions are correct at boundaries", () => {
    // 11 results with identical inputs (RRF=0.5, rerank=1.0) so the only thing
    // that can differ between documents is the tier weight. Check ranks
    // 3, 4, 10, 11, i.e. both sides of each tier boundary.
    const rrfResults = Array.from({ length: 11 }, (_, i) =>
      makeRRFResult(`doc${i}`, 0.5),
    );
    const rerankScores = new Map<string, number>();
    for (let i = 0; i < 11; i++) {
      rerankScores.set(`doc${i}`, 1.0);
    }

    const results = blendScores(rrfResults, rerankScores);
    const scoreOf = (docId: string) =>
      results.find((r) => r.docId === docId)!.score;

    // doc{i} sits at rank i + 1.
    expect(scoreOf("doc2")).toBeCloseTo(blend(BLEND_TOP3_RRF, 0.5, 1.0), 10); // rank 3
    expect(scoreOf("doc3")).toBeCloseTo(blend(BLEND_MID_RRF, 0.5, 1.0), 10); // rank 4
    expect(scoreOf("doc9")).toBeCloseTo(blend(BLEND_MID_RRF, 0.5, 1.0), 10); // rank 10
    expect(scoreOf("doc10")).toBeCloseTo(blend(BLEND_TAIL_RRF, 0.5, 1.0), 10); // rank 11
  });

  it("correctly applies different weights when scores differ", () => {
    // 11 results with identical RRF=0.5, rerank=1.0
    const rrfResults = Array.from({ length: 11 }, (_, i) =>
      makeRRFResult(`doc${i}`, 0.5),
    );
    const rerankScores = new Map<string, number>();
    for (let i = 0; i < 11; i++) {
      rerankScores.set(`doc${i}`, 1.0);
    }

    const results = blendScores(rrfResults, rerankScores);
    // All three tiers are populated, so the top result must carry the largest
    // of the three tier scores. Deriving it from the constants keeps the test
    // valid if the weights are retuned.
    const bestTierScore = Math.max(
      blend(BLEND_TOP3_RRF, 0.5, 1.0),
      blend(BLEND_MID_RRF, 0.5, 1.0),
      blend(BLEND_TAIL_RRF, 0.5, 1.0),
    );
    expect(results[0].score).toBeCloseTo(bestTierScore, 10);
  });
});
|
src/pipeline/blend.ts
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import type { RRFResult, RerankedResult, FinalResult } from "../types";
|
| 2 |
+
import { BLEND_TOP3_RRF, BLEND_MID_RRF, BLEND_TAIL_RRF } from "../constants";
|
| 3 |
+
|
| 4 |
+
// Picks the RRF share of the blend for a 1-indexed rank: ranks 1-3 use
// BLEND_TOP3_RRF, ranks 4-10 use BLEND_MID_RRF, everything after uses
// BLEND_TAIL_RRF. The reranker share is always (1 - returned weight).
function rrfWeightForRank(rank: number): number {
  if (rank <= 3) return BLEND_TOP3_RRF;
  if (rank <= 10) return BLEND_MID_RRF;
  return BLEND_TAIL_RRF;
}

/**
 * Blends each document's RRF score with its reranker score using
 * position-aware weights, then returns the documents ordered by the blended
 * score (highest first) with at most one entry per `docId`.
 *
 * The weight tier is chosen from the document's position in `rrfResults`
 * (index + 1), not from its score, so the input order matters.
 *
 * @param rrfResults - Fused results; array order defines the rank used to pick
 *   the blend tier. The array is not mutated.
 * @param rerankScores - Map of docId to rerank score. Documents without an
 *   entry are blended with a rerank score of 0.
 * @returns Final results carrying the blended value in `score`. When a docId
 *   occurs more than once in the input, only its highest-blended entry is kept.
 */
export function blendScores(
  rrfResults: readonly RRFResult[],
  rerankScores: ReadonlyMap<string, number>, // docId -> rerank score
): FinalResult[] {
  const blended: RerankedResult[] = rrfResults.map((result, index) => {
    const rank = index + 1; // 1-indexed position in RRF ordering
    const rerankScore = rerankScores.get(result.docId) ?? 0;
    const rrfWeight = rrfWeightForRank(rank);

    // The RRF score is blended as-is: nothing here rescales it. If the two
    // scores need to be on a comparable scale, that has to happen upstream.
    // NOTE(review): presumably rerank scores are in [0,1] — confirm with caller.
    const blendedScore =
      rrfWeight * result.score + (1 - rrfWeight) * rerankScore;

    return {
      ...result,
      rerankScore,
      blendedScore,
    };
  });

  // Sort by blended score descending (sorts the local copy, not the input).
  blended.sort((a, b) => b.blendedScore - a.blendedScore);

  // Dedup by docId. Because the list is already sorted, the first occurrence
  // of a docId is the one with the highest blended score.
  const seen = new Set<string>();
  const final: FinalResult[] = [];
  for (const result of blended) {
    if (seen.has(result.docId)) continue;
    seen.add(result.docId);
    final.push({
      filepath: result.filepath,
      title: result.title,
      bestChunk: result.bestChunk,
      score: result.blendedScore,
      docId: result.docId,
    });
  }

  return final;
}
|
src/pipeline/rrf.test.ts
ADDED
|
@@ -0,0 +1,223 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { describe, it, expect } from "vitest";
|
| 2 |
+
import { reciprocalRankFusion } from "./rrf";
|
| 3 |
+
import type { ScoredChunk } from "../types";
|
| 4 |
+
import {
|
| 5 |
+
RRF_K,
|
| 6 |
+
RRF_PRIMARY_WEIGHT,
|
| 7 |
+
RRF_SECONDARY_WEIGHT,
|
| 8 |
+
RRF_RANK1_BONUS,
|
| 9 |
+
RRF_RANK2_BONUS,
|
| 10 |
+
} from "../constants";
|
| 11 |
+
|
| 12 |
+
// ---------------------------------------------------------------------------
|
| 13 |
+
// Helpers
|
| 14 |
+
// ---------------------------------------------------------------------------
|
| 15 |
+
/**
 * Test fixture: wraps a synthetic chunk for `docId` in a ScoredChunk.
 *
 * The chunk always has `chunkIndex` 0, `startChar` 0 and a title of
 * `Title <docId>`.
 *
 * @param docId - Document the chunk belongs to.
 * @param score - Retrieval score attached to the chunk.
 * @param source - Retriever label; defaults to "bm25".
 * @param text - Chunk text; defaults to `chunk from <docId>`.
 */
function makeScoredChunk(
  docId: string,
  score: number,
  source: "bm25" | "vector" = "bm25",
  text = `chunk from ${docId}`,
): ScoredChunk {
  const title = `Title ${docId}`;
  const chunk = { docId, chunkIndex: 0, text, startChar: 0, title };
  return { chunk, score, source };
}
|
| 27 |
+
|
| 28 |
+
/**
 * Test fixture: bundles scored chunks with the query metadata that
 * reciprocalRankFusion expects for one ranked list.
 *
 * @param results - Chunks in rank order.
 * @param queryType - Kind of query that produced the list; defaults to "original".
 * @param query - Query text; defaults to "test query".
 */
function makeList(
  results: ScoredChunk[],
  queryType: "original" | "lex" | "vec" | "hyde" = "original",
  query = "test query",
) {
  const rankedList = { results, queryType, query };
  return rankedList;
}
|
| 35 |
+
|
| 36 |
+
// ---------------------------------------------------------------------------
|
| 37 |
+
// reciprocalRankFusion
|
| 38 |
+
// ---------------------------------------------------------------------------
|
| 39 |
+
// Test suite for reciprocalRankFusion: base scoring, list weighting, rank
// bonuses, per-document merging, best-chunk selection, ordering and limits.
describe("reciprocalRankFusion", () => {
  // Weighted reciprocal-rank term for one hit at a 1-indexed rank.
  const term = (weight: number, rank: number) => weight / (RRF_K + rank);

  it("returns empty array for empty input", () => {
    const fused = reciprocalRankFusion([]);
    expect(fused).toEqual([]);
  });

  it("returns empty array for lists with no results", () => {
    const emptyLists = [makeList([]), makeList([])];
    const fused = reciprocalRankFusion(emptyLists);
    expect(fused).toEqual([]);
  });

  it("computes correct RRF score for a single list with one result", () => {
    const onlyHit = makeScoredChunk("doc1", 0.9);
    const fused = reciprocalRankFusion([makeList([onlyHit])]);
    expect(fused).toHaveLength(1);
    // A rank-1 hit in a primary-weighted list earns both rank bonuses.
    const want = term(RRF_PRIMARY_WEIGHT, 1) + RRF_RANK1_BONUS + RRF_RANK2_BONUS;
    expect(fused[0].score).toBeCloseTo(want, 10);
    expect(fused[0].docId).toBe("doc1");
  });

  it("uses primary weight for first two lists and secondary for rest", () => {
    // doc1 is the rank-1 hit of all three lists.
    const listA = makeList([makeScoredChunk("doc1", 0.9)], "original");
    const listB = makeList([makeScoredChunk("doc1", 0.8)], "lex");
    const listC = makeList([makeScoredChunk("doc1", 0.7)], "vec");
    const fused = reciprocalRankFusion([listA, listB, listC]);
    expect(fused).toHaveLength(1);
    // Lists at index 0 and 1 are primary-weighted; index 2 is secondary.
    const base =
      term(RRF_PRIMARY_WEIGHT, 1) +
      term(RRF_PRIMARY_WEIGHT, 1) +
      term(RRF_SECONDARY_WEIGHT, 1);
    const want = base + RRF_RANK1_BONUS + RRF_RANK2_BONUS;
    expect(fused[0].score).toBeCloseTo(want, 10);
  });

  it("merges results from different lists for the same docId", () => {
    const fused = reciprocalRankFusion([
      makeList([makeScoredChunk("doc1", 0.9)], "original"),
      makeList([makeScoredChunk("doc1", 0.8, "vector")], "vec"),
    ]);
    expect(fused).toHaveLength(1);
    const merged = fused[0].contributions;
    expect(merged).toHaveLength(2);
    expect(merged[0].source).toBe("bm25");
    expect(merged[1].source).toBe("vector");
  });

  it("keeps the best chunk (highest individual score) per doc", () => {
    const weaker = makeScoredChunk("doc1", 0.5, "bm25", "low score chunk");
    const stronger = makeScoredChunk("doc1", 0.9, "vector", "high score chunk");
    const fused = reciprocalRankFusion([
      makeList([weaker], "original"),
      makeList([stronger], "vec"),
    ]);
    expect(fused[0].bestChunk).toBe("high score chunk");
  });

  it("ranks documents by total RRF score descending", () => {
    // doc1 is present in both lists; doc2 only in the first.
    const firstList = makeList(
      [makeScoredChunk("doc1", 0.9), makeScoredChunk("doc2", 0.8)],
      "original",
    );
    const secondList = makeList([makeScoredChunk("doc1", 0.7, "vector")], "vec");
    const fused = reciprocalRankFusion([firstList, secondList]);
    expect(fused[0].docId).toBe("doc1");
    expect(fused[1].docId).toBe("doc2");
    expect(fused[0].score).toBeGreaterThan(fused[1].score);
  });

  it("applies rank 1 bonus only to docs that are rank 1 in some list", () => {
    const hits = [makeScoredChunk("doc1", 0.9), makeScoredChunk("doc2", 0.8)];
    const fused = reciprocalRankFusion([makeList(hits, "original")]);
    // Rank 1 collects both bonuses; rank 2 collects only the rank-2 bonus.
    const wantFirst =
      term(RRF_PRIMARY_WEIGHT, 1) + RRF_RANK1_BONUS + RRF_RANK2_BONUS;
    const wantSecond = term(RRF_PRIMARY_WEIGHT, 2) + RRF_RANK2_BONUS;
    expect(fused[0].score).toBeCloseTo(wantFirst, 10);
    expect(fused[1].score).toBeCloseTo(wantSecond, 10);
  });

  it("does not apply rank bonuses to docs ranked 3 or lower", () => {
    const hits = [
      makeScoredChunk("doc1", 0.9),
      makeScoredChunk("doc2", 0.8),
      makeScoredChunk("doc3", 0.7),
    ];
    const fused = reciprocalRankFusion([makeList(hits, "original")]);
    // Third place gets the bare reciprocal-rank term and nothing else.
    const wantThird = term(RRF_PRIMARY_WEIGHT, 3);
    expect(fused[2].score).toBeCloseTo(wantThird, 10);
  });

  it("respects candidateLimit", () => {
    const tenHits = Array.from({ length: 10 }, (_unused, position) =>
      makeScoredChunk(`doc${position}`, 1 - position * 0.1),
    );
    const fused = reciprocalRankFusion([makeList(tenHits)], 3);
    expect(fused).toHaveLength(3);
  });

  it("tracks contributions correctly with queryType and query", () => {
    const fused = reciprocalRankFusion([
      makeList([makeScoredChunk("doc1", 0.9)], "original", "hello world"),
      makeList([makeScoredChunk("doc1", 0.8, "vector")], "hyde", "hypothetical doc"),
    ]);
    const [fromOriginal, fromHyde] = fused[0].contributions;
    expect(fromOriginal.queryType).toBe("original");
    expect(fromOriginal.query).toBe("hello world");
    expect(fromHyde.queryType).toBe("hyde");
    expect(fromHyde.query).toBe("hypothetical doc");
  });

  it("uses correct rank within each list independently", () => {
    // doc1 is second in list 0 and first in list 1.
    const fused = reciprocalRankFusion([
      makeList(
        [makeScoredChunk("doc0", 0.9), makeScoredChunk("doc1", 0.8)],
        "original",
      ),
      makeList([makeScoredChunk("doc1", 0.95, "vector")], "vec"),
    ]);
    const target = fused.find((entry) => entry.docId === "doc1")!;
    // One contribution per list, each carrying that list's own rank.
    expect(target.contributions).toHaveLength(2);
    expect(target.contributions[0].rank).toBe(2);
    expect(target.contributions[1].rank).toBe(1);
    // Best rank across lists is 1, so both bonuses apply.
    const want =
      term(RRF_PRIMARY_WEIGHT, 2) +
      term(RRF_PRIMARY_WEIGHT, 1) +
      RRF_RANK1_BONUS +
      RRF_RANK2_BONUS;
    expect(target.score).toBeCloseTo(want, 10);
  });

  it("sets filepath equal to docId", () => {
    const hit = makeScoredChunk("my-file.md", 0.9);
    const fused = reciprocalRankFusion([makeList([hit])]);
    expect(fused[0].filepath).toBe("my-file.md");
  });

  it("preserves title from the chunk", () => {
    const hit = makeScoredChunk("doc1", 0.9);
    const fused = reciprocalRankFusion([makeList([hit])]);
    expect(fused[0].title).toBe("Title doc1");
  });

  it("handles many lists (>2) with correct weight assignment", () => {
    const fused = reciprocalRankFusion([
      makeList([makeScoredChunk("doc1", 0.9)], "original"),
      makeList([makeScoredChunk("doc1", 0.8)], "lex"),
      makeList([makeScoredChunk("doc1", 0.7)], "vec"),
      makeList([makeScoredChunk("doc1", 0.6)], "hyde"),
    ]);
    const perList = fused[0].contributions;
    expect(perList).toHaveLength(4);
    // Lists 0 and 1 carry the primary weight...
    expect(perList[0].weight).toBe(RRF_PRIMARY_WEIGHT);
    expect(perList[1].weight).toBe(RRF_PRIMARY_WEIGHT);
    // ...and every later list carries the secondary weight.
    expect(perList[2].weight).toBe(RRF_SECONDARY_WEIGHT);
    expect(perList[3].weight).toBe(RRF_SECONDARY_WEIGHT);
  });

  it("default candidateLimit is RERANK_CANDIDATE_LIMIT (40)", () => {
    const fiftyHits = Array.from({ length: 50 }, (_unused, position) =>
      makeScoredChunk(`doc${position}`, 1 - position * 0.01),
    );
    const fused = reciprocalRankFusion([makeList(fiftyHits)]);
    expect(fused).toHaveLength(40);
  });
});
|
src/pipeline/rrf.ts
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import type { ScoredChunk, RRFResult, RRFContribution } from "../types";
|
| 2 |
+
import {
|
| 3 |
+
RRF_K,
|
| 4 |
+
RRF_PRIMARY_WEIGHT,
|
| 5 |
+
RRF_SECONDARY_WEIGHT,
|
| 6 |
+
RRF_RANK1_BONUS,
|
| 7 |
+
RRF_RANK2_BONUS,
|
| 8 |
+
RERANK_CANDIDATE_LIMIT,
|
| 9 |
+
} from "../constants";
|
| 10 |
+
|
| 11 |
+
/** One ranked retrieval list handed to reciprocalRankFusion. */
interface RankedList {
  /** Hits in rank order; a hit's array index + 1 is the rank used for scoring. */
  results: ScoredChunk[];
  /** Kind of query that produced this list; copied onto each contribution. */
  queryType: "original" | "lex" | "vec" | "hyde";
  /** Query text that produced this list; copied onto each contribution. */
  query: string;
}
|
| 16 |
+
|
| 17 |
+
// Running per-document state while fusing; used only inside this module.
interface DocAccumulator {
  docId: string;
  filepath: string;
  title: string;
  bestChunk: string;
  bestChunkScore: number;
  totalScore: number;
  topRank: number;
  contributions: RRFContribution[];
}

/**
 * Merges multiple ranked lists into one per-document ranking using weighted
 * Reciprocal Rank Fusion.
 *
 * Scoring:
 *   - Every hit adds `weight / (RRF_K + rank)` to its document, where `rank`
 *     is the hit's 1-indexed position in its own list.
 *   - The first two lists use RRF_PRIMARY_WEIGHT; all later lists use
 *     RRF_SECONDARY_WEIGHT.
 *   - After summing, a document whose best rank in any list is 1 gets
 *     RRF_RANK1_BONUS, and a document whose best rank is 1 or 2 gets
 *     RRF_RANK2_BONUS. The bonuses are cumulative: a rank-1 document
 *     receives both.
 *
 * Grouping is by `chunk.docId`. If one list contains several chunks of the
 * same document, each of them contributes to that document's score. The
 * reported `bestChunk` is the text of the chunk with the highest individual
 * `score`; `title` comes from the first chunk seen for the document.
 *
 * @param lists - Ranked lists to fuse; list order decides primary vs.
 *   secondary weight. Not mutated.
 * @param candidateLimit - Maximum number of documents to return. Defaults to
 *   RERANK_CANDIDATE_LIMIT. Negative values are treated as 0.
 * @returns Documents sorted by fused score, highest first, truncated to
 *   `candidateLimit`.
 */
export function reciprocalRankFusion(
  lists: readonly RankedList[],
  candidateLimit: number = RERANK_CANDIDATE_LIMIT,
): RRFResult[] {
  const docScores = new Map<string, DocAccumulator>();

  lists.forEach((list, listIndex) => {
    const weight = listIndex < 2 ? RRF_PRIMARY_WEIGHT : RRF_SECONDARY_WEIGHT;

    list.results.forEach((result, rankIndex) => {
      const rank = rankIndex + 1; // 1-indexed
      const rrfContribution = weight / (RRF_K + rank);
      const contribution: RRFContribution = {
        source: result.source,
        queryType: list.queryType,
        query: list.query,
        rank,
        weight,
        rrfContribution,
      };

      const { docId } = result.chunk;
      const existing = docScores.get(docId);
      if (existing === undefined) {
        docScores.set(docId, {
          docId,
          filepath: docId, // In browser demo, docId = filepath
          title: result.chunk.title,
          bestChunk: result.chunk.text,
          bestChunkScore: result.score,
          totalScore: rrfContribution,
          topRank: rank,
          contributions: [contribution],
        });
        return;
      }

      existing.totalScore += rrfContribution;
      existing.topRank = Math.min(existing.topRank, rank);
      existing.contributions.push(contribution);
      // Keep the chunk with the highest individual score
      if (result.score > existing.bestChunkScore) {
        existing.bestChunk = result.chunk.text;
        existing.bestChunkScore = result.score;
      }
    });
  });

  // Apply rank bonuses (both apply when topRank is 1).
  for (const doc of docScores.values()) {
    if (doc.topRank === 1) doc.totalScore += RRF_RANK1_BONUS;
    if (doc.topRank <= 2) doc.totalScore += RRF_RANK2_BONUS;
  }

  // A negative end index would make slice() count from the tail and silently
  // return "all but the last n" documents; clamp so a negative limit means none.
  const limit = Math.max(0, candidateLimit);

  return Array.from(docScores.values())
    .sort((a, b) => b.totalScore - a.totalScore)
    .slice(0, limit)
    .map((doc) => ({
      docId: doc.docId,
      filepath: doc.filepath,
      title: doc.title,
      bestChunk: doc.bestChunk,
      score: doc.totalScore,
      contributions: doc.contributions,
    }));
}
|