Spaces:
Running
Running
siddhm11 committed on
Commit ·
61d5f0d
1
Parent(s): 10fbe3b
Phase 4 complete + Phase 4.5 instrumentation foundation
Browse files
Phase 4 (Recommendation Pipeline Fixes) - all implemented:
- 4.1: Importance-weighted quota fusion (fusion.py, 20 tests)
- 4.2: Turso metadata (done in Phase 3.5)
- 4.3: Hungarian matching for cluster stability (10 tests)
- 4.4: Category-level negative suppression (8 tests)
Phase 4.5 (Instrumentation Foundation) - NEW:
- Added ranker_version, candidate_source, cluster_id to interactions table
- ALTER TABLE migration for existing DBs (idempotent)
- Pipeline tagging: Tier 1 papers tagged by cluster/exploration
- End-to-end flow: recommendations.py -> templates -> events.py -> db.py
- 5 dedicated instrumentation tests
TASK-TRACKER: Phase 4 marked COMPLETE, Phase 4.5 added, Phase 8 expanded
Test count: 123 -> 176 (175 passing, 1 pre-existing flaky)
- app/db.py +104 -11
- app/recommend/clustering.py +90 -0
- app/recommend/fusion.py +103 -0
- app/routers/events.py +12 -0
- app/routers/recommendations.py +198 -60
- app/routers/saved.py +14 -3
- app/routers/search.py +4 -1
- app/templates/index.html +11 -8
- app/templates/partials/action_buttons.html +9 -4
- app/templates/partials/paper_card.html +4 -1
- app/turso_svc.py +5 -3
- docs/TASK-TRACKER.md +102 -30
- docs/phases/PHASE3-Hybrid-Semantic-Search.md +1 -1
- docs/phases/PHASE4-Recommendation-Pipeline-Fixes.md +603 -0
- docs/research/03-MultiInterest-Recommender-Architecture.md +1 -1
- docs/research/07-LLM-Summaries-Reranker-and-Scaling-Research.md +426 -0
- docs/walkthroughs/02-Phase2-MultiInterest-Recommender.md +1 -1
- docs/walkthroughs/03-Code-Summary-and-Test-Plan.md +1 -1
- docs/walkthroughs/04-Next-Steps-and-Phase-Plan.md +55 -52
- tests/test_clustering.py +233 -0
- tests/test_db.py +316 -0
- tests/test_fusion.py +231 -0
- tests/test_integration.py +112 -3
- tests/test_search_router.py +22 -6
app/db.py
CHANGED
|
@@ -6,6 +6,11 @@ Tables
|
|
| 6 |
interactions – every user action (save, not_interested, click, view)
|
| 7 |
paper_qdrant_map – arxiv_id → integer Qdrant point ID (cached lazily)
|
| 8 |
paper_metadata – arXiv API response cache (title, abstract, …)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
"""
|
| 10 |
import aiosqlite
|
| 11 |
from app.config import DB_PATH
|
|
@@ -17,14 +22,17 @@ PRAGMA journal_mode=WAL;
|
|
| 17 |
PRAGMA synchronous=NORMAL;
|
| 18 |
|
| 19 |
CREATE TABLE IF NOT EXISTS interactions (
|
| 20 |
-
id
|
| 21 |
-
user_id
|
| 22 |
-
paper_id
|
| 23 |
-
event_type
|
| 24 |
-
source
|
| 25 |
-
position
|
| 26 |
-
query_id
|
| 27 |
-
|
|
|
|
|
|
|
|
|
|
| 28 |
);
|
| 29 |
|
| 30 |
CREATE INDEX IF NOT EXISTS idx_ui_user_ts
|
|
@@ -73,10 +81,25 @@ CREATE TABLE IF NOT EXISTS user_clusters (
|
|
| 73 |
"""
|
| 74 |
|
| 75 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
async def init_db() -> None:
|
| 77 |
"""Create tables if they don't exist. Called once at startup."""
|
| 78 |
async with aiosqlite.connect(DB_PATH) as db:
|
| 79 |
await db.executescript(_SCHEMA)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
await db.commit()
|
| 81 |
|
| 82 |
|
|
@@ -89,13 +112,18 @@ async def log_interaction(
|
|
| 89 |
source: str | None = None,
|
| 90 |
position: int | None = None,
|
| 91 |
query_id: str | None = None,
|
|
|
|
|
|
|
|
|
|
| 92 |
) -> None:
|
| 93 |
async with aiosqlite.connect(DB_PATH) as db:
|
| 94 |
await db.execute(
|
| 95 |
"""INSERT INTO interactions
|
| 96 |
-
(user_id, paper_id, event_type, source, position, query_id
|
| 97 |
-
|
| 98 |
-
|
|
|
|
|
|
|
| 99 |
)
|
| 100 |
await db.commit()
|
| 101 |
|
|
@@ -273,3 +301,68 @@ async def get_user_clusters(user_id: str) -> list[dict]:
|
|
| 273 |
)
|
| 274 |
rows = await cur.fetchall()
|
| 275 |
return [dict(r) for r in rows]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
interactions – every user action (save, not_interested, click, view)
|
| 7 |
paper_qdrant_map – arxiv_id → integer Qdrant point ID (cached lazily)
|
| 8 |
paper_metadata – arXiv API response cache (title, abstract, …)
|
| 9 |
+
|
| 10 |
+
Phase 4.5 instrumentation columns (interactions table):
|
| 11 |
+
ranker_version – identifies which pipeline version served the paper
|
| 12 |
+
candidate_source – granular origin: 'cluster_0', 'exploration', 'ewma', etc.
|
| 13 |
+
cluster_id – which interest cluster served this paper (NULL if N/A)
|
| 14 |
"""
|
| 15 |
import aiosqlite
|
| 16 |
from app.config import DB_PATH
|
|
|
|
| 22 |
PRAGMA synchronous=NORMAL;
|
| 23 |
|
| 24 |
CREATE TABLE IF NOT EXISTS interactions (
|
| 25 |
+
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
| 26 |
+
user_id TEXT NOT NULL,
|
| 27 |
+
paper_id TEXT NOT NULL,
|
| 28 |
+
event_type TEXT NOT NULL, -- save | not_interested | click | view
|
| 29 |
+
source TEXT, -- search | recommendation
|
| 30 |
+
position INTEGER,
|
| 31 |
+
query_id TEXT,
|
| 32 |
+
ranker_version TEXT, -- Phase 4.5: pipeline version tag
|
| 33 |
+
candidate_source TEXT, -- Phase 4.5: 'cluster_0' | 'exploration' | 'ewma' | 'qdrant_recommend'
|
| 34 |
+
cluster_id INTEGER, -- Phase 4.5: interest cluster index (NULL if N/A)
|
| 35 |
+
timestamp TEXT NOT NULL DEFAULT (datetime('now'))
|
| 36 |
);
|
| 37 |
|
| 38 |
CREATE INDEX IF NOT EXISTS idx_ui_user_ts
|
|
|
|
| 81 |
"""
|
| 82 |
|
| 83 |
|
| 84 |
+
# ── Phase 4.5: ALTER TABLE migration for existing DBs ─────────────────────────
|
| 85 |
+
# SQLite does not support IF NOT EXISTS for columns, so we try/except.
|
| 86 |
+
_MIGRATION_4_5 = [
|
| 87 |
+
"ALTER TABLE interactions ADD COLUMN ranker_version TEXT",
|
| 88 |
+
"ALTER TABLE interactions ADD COLUMN candidate_source TEXT",
|
| 89 |
+
"ALTER TABLE interactions ADD COLUMN cluster_id INTEGER",
|
| 90 |
+
]
|
| 91 |
+
|
| 92 |
+
|
| 93 |
async def init_db() -> None:
    """Create tables if they don't exist, then apply the Phase 4.5 migration.

    Called once at startup. After running ``_SCHEMA``, every statement in
    ``_MIGRATION_4_5`` is executed so that databases created before Phase 4.5
    gain the instrumentation columns. The migration is idempotent: a
    statement that fails only because its column already exists is ignored.

    Raises:
        sqlite3.OperationalError: if a migration statement fails for any
            reason other than the column already being present.
    """
    # Local import: aiosqlite surfaces the standard sqlite3 exception types.
    import sqlite3

    async with aiosqlite.connect(DB_PATH) as db:
        await db.executescript(_SCHEMA)
        # Phase 4.5: add instrumentation columns to existing DBs
        for stmt in _MIGRATION_4_5:
            try:
                await db.execute(stmt)
            except sqlite3.OperationalError as exc:
                # SQLite has no "ADD COLUMN IF NOT EXISTS"; an existing column
                # is reported as "duplicate column name: <col>". Anything else
                # (locked database, malformed statement, ...) is a real
                # failure and must not be hidden, otherwise later INSERTs
                # break on the missing columns.
                if "duplicate column name" not in str(exc).lower():
                    raise
        await db.commit()
|
| 104 |
|
| 105 |
|
|
|
|
| 112 |
source: str | None = None,
|
| 113 |
position: int | None = None,
|
| 114 |
query_id: str | None = None,
|
| 115 |
+
ranker_version: str | None = None,
|
| 116 |
+
candidate_source: str | None = None,
|
| 117 |
+
cluster_id: int | None = None,
|
| 118 |
) -> None:
|
| 119 |
async with aiosqlite.connect(DB_PATH) as db:
|
| 120 |
await db.execute(
|
| 121 |
"""INSERT INTO interactions
|
| 122 |
+
(user_id, paper_id, event_type, source, position, query_id,
|
| 123 |
+
ranker_version, candidate_source, cluster_id)
|
| 124 |
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
|
| 125 |
+
(user_id, paper_id, event_type, source, position, query_id,
|
| 126 |
+
ranker_version, candidate_source, cluster_id),
|
| 127 |
)
|
| 128 |
await db.commit()
|
| 129 |
|
|
|
|
| 301 |
)
|
| 302 |
rows = await cur.fetchall()
|
| 303 |
return [dict(r) for r in rows]
|
| 304 |
+
|
| 305 |
+
|
| 306 |
+
# ── Phase 4.3: Category suppression helpers ───────────────────────────────────
|
| 307 |
+
|
| 308 |
+
async def cache_turso_metadata_batch(papers: list[dict]) -> None:
    """
    Write Turso paper dicts to the paper_metadata SQLite cache.

    Called after every Turso fetch so dismissal-category JOINs work.
    Rows without an ``arxiv_id`` are skipped, and a row that cannot be
    stored is dropped without aborting the rest of the batch (the cache is
    best-effort).

    Args:
        papers: paper dicts. Keys read: arxiv_id, title, abstract, authors,
            category, published. Missing keys default to "" ("[]" for
            authors).
    """
    # Local imports keep this block self-contained.
    import json
    import sqlite3

    if not papers:
        return
    async with aiosqlite.connect(DB_PATH) as conn:
        for paper in papers:
            if not paper.get("arxiv_id"):
                continue
            try:
                authors = paper.get("authors", "[]")
                if isinstance(authors, (list, tuple)):
                    # SQLite cannot bind a Python sequence, so such a row used
                    # to be rejected and silently never cached. The "[]"
                    # default suggests the column holds JSON text -- TODO
                    # confirm against the readers of paper_metadata.
                    authors = json.dumps(list(authors))
                await conn.execute(
                    """INSERT OR REPLACE INTO paper_metadata
                       (arxiv_id, title, abstract, authors, category, published)
                       VALUES (:arxiv_id, :title, :abstract, :authors, :category, :published)""",
                    {
                        "arxiv_id": paper.get("arxiv_id", ""),
                        "title": paper.get("title", ""),
                        "abstract": paper.get("abstract", ""),
                        "authors": authors,
                        "category": paper.get("category", ""),
                        "published": paper.get("published", ""),
                    },
                )
            except (sqlite3.Error, TypeError, ValueError):
                # Best-effort: skip a row with unbindable/unserialisable
                # values, but let unrelated programming errors propagate.
                continue
        await conn.commit()
|
| 338 |
+
|
| 339 |
+
|
| 340 |
+
async def get_suppressed_categories(
    user_id: str,
    threshold: int = 3,
    window_days: int = 14,
) -> set[str]:
    """
    Return the categories this user has dismissed often enough to suppress.

    A category qualifies when at least ``threshold`` 'not_interested'
    interactions by ``user_id`` within the last ``window_days`` days join to
    paper_metadata rows of that (non-empty) category.

    Only papers present in paper_metadata are counted, so that table must be
    populated (via cache_turso_metadata_batch) for anything to be returned.
    The result is an empty set when no category reaches the threshold.
    """
    query = """SELECT pm.category, COUNT(*) AS cnt
               FROM interactions i
               JOIN paper_metadata pm ON i.paper_id = pm.arxiv_id
               WHERE i.user_id = ?
                 AND i.event_type = 'not_interested'
                 AND i.timestamp >= datetime('now', ? || ' days')
                 AND pm.category != ''
               GROUP BY pm.category
               HAVING COUNT(*) >= ?"""
    # The second parameter becomes the SQLite modifier "-<window_days> days".
    params = (user_id, f"-{window_days}", threshold)

    suppressed: set[str] = set()
    async with aiosqlite.connect(DB_PATH) as conn:
        cursor = await conn.execute(query, params)
        for record in await cursor.fetchall():
            suppressed.add(record[0])
    return suppressed
|
app/recommend/clustering.py
CHANGED
|
@@ -20,6 +20,7 @@ import json
|
|
| 20 |
from dataclasses import dataclass, field
|
| 21 |
import numpy as np
|
| 22 |
from scipy.cluster.hierarchy import ward, fcluster
|
|
|
|
| 23 |
from scipy.spatial.distance import pdist
|
| 24 |
|
| 25 |
from app import db
|
|
@@ -183,6 +184,95 @@ def _find_medoid(embeddings: np.ndarray, centroid: np.ndarray) -> int:
|
|
| 183 |
return int(np.argmin(distances))
|
| 184 |
|
| 185 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
# ── Persistence ───────────────────────────────────────────────────────────────
|
| 187 |
|
| 188 |
async def save_clusters_to_db(user_id: str, clusters: list[InterestCluster]) -> None:
|
|
|
|
| 20 |
from dataclasses import dataclass, field
|
| 21 |
import numpy as np
|
| 22 |
from scipy.cluster.hierarchy import ward, fcluster
|
| 23 |
+
from scipy.optimize import linear_sum_assignment
|
| 24 |
from scipy.spatial.distance import pdist
|
| 25 |
|
| 26 |
from app import db
|
|
|
|
| 184 |
return int(np.argmin(distances))
|
| 185 |
|
| 186 |
|
| 187 |
+
# ── Cluster ID stabilisation (Phase 4.2) ─────────────────────────────────────
|
| 188 |
+
|
| 189 |
+
# Hungarian matches below this cosine similarity are rejected as "unrelated".
|
| 190 |
+
# Doc 06 §"Clustering specifics": a genuinely new interest must not steal an
|
| 191 |
+
# old cluster's identity just because Hungarian found the least-bad assignment.
|
| 192 |
+
CLUSTER_MATCH_MIN_COSINE = 0.5
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
def stabilize_cluster_ids(
    new_clusters: list[InterestCluster],
    old_clusters: list[InterestCluster],
    min_cosine_sim: float = CLUSTER_MATCH_MIN_COSINE,
) -> list[InterestCluster]:
    """
    Preserve cluster identity across reclusters using the Hungarian algorithm.

    Every time the user saves a paper we recluster from scratch. Without
    stabilisation, cluster indices shuffle (NLP was 0, now it's 2), breaking
    future analytics and UI labels.

    Algorithm:
      1. Build cost matrix: cost[i][j] = 1 - cosine_sim(new_medoid_i, old_medoid_j)
      2. Solve with scipy linear_sum_assignment (handles rectangular matrices)
      3. Matched pairs with cosine_sim >= min_cosine_sim inherit the old idx
      4. Weak matches (cosine_sim < min_cosine_sim) and unmatched new clusters
         get the lowest index that was NOT used by any cluster in
         old_clusters and has not been handed out in this call

    Step 4 deliberately skips every old index, matched or not. Otherwise a
    new cluster whose match was rejected as "unrelated" could be handed the
    very index of the old cluster it was rejected against, taking over that
    identity anyway.

    Args:
        new_clusters:   freshly computed clusters (cluster_idx values ignored)
        old_clusters:   clusters from the previous recluster (stable reference)
        min_cosine_sim: reject matches below this cosine similarity

    Returns:
        New InterestCluster objects, in the order of new_clusters, carrying
        stable cluster_idx values. If either input list is empty,
        new_clusters is returned unchanged.
    """
    if not old_clusters or not new_clusters:
        return new_clusters

    new_embs = np.array([c.medoid_embedding for c in new_clusters], dtype=np.float32)
    old_embs = np.array([c.medoid_embedding for c in old_clusters], dtype=np.float32)

    # L2-normalise before cosine similarity; near-zero rows are left as-is
    # instead of being divided by ~0.
    def _safe_norm(embs: np.ndarray) -> np.ndarray:
        norms = np.linalg.norm(embs, axis=1, keepdims=True)
        return embs / np.where(norms < 1e-10, 1.0, norms)

    new_embs = _safe_norm(new_embs)
    old_embs = _safe_norm(old_embs)

    # Cosine similarity -> cost matrix (n_new x n_old)
    sim = new_embs @ old_embs.T
    cost = 1.0 - sim

    # Hungarian assignment -- works on rectangular matrices
    row_ind, col_ind = linear_sum_assignment(cost)

    # Accept only pairs whose cosine similarity clears the threshold.
    # Weak matches would steal an old cluster's identity for an unrelated topic.
    new_to_stable: dict[int, int] = {
        int(r): old_clusters[int(c)].cluster_idx
        for r, c in zip(row_ind, col_ind)
        if float(sim[r, c]) >= min_cosine_sim
    }

    # Reserve ALL previous indices (not only the re-matched ones) so a fresh
    # cluster can never reuse the index of an old cluster that was dropped or
    # rejected in this round.
    used_ids: set[int] = {c.cluster_idx for c in old_clusters}
    used_ids.update(new_to_stable.values())
    next_id = 0

    result: list[InterestCluster] = []
    for i, cluster in enumerate(new_clusters):
        if i in new_to_stable:
            stable_idx = new_to_stable[i]
        else:
            # No strong match -- assign the next index nobody owns
            while next_id in used_ids:
                next_id += 1
            stable_idx = next_id
            used_ids.add(stable_idx)
            next_id += 1

        result.append(InterestCluster(
            cluster_idx=stable_idx,
            medoid_paper_id=cluster.medoid_paper_id,
            medoid_embedding=cluster.medoid_embedding,
            paper_ids=cluster.paper_ids,
            importance=cluster.importance,
        ))

    return result
|
| 274 |
+
|
| 275 |
+
|
| 276 |
# ── Persistence ───────────────────────────────────────────────────────────────
|
| 277 |
|
| 278 |
async def save_clusters_to_db(user_id: str, clusters: list[InterestCluster]) -> None:
|
app/recommend/fusion.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Importance-weighted quota fusion for multi-interest recommendations.
|
| 3 |
+
|
| 4 |
+
Replaces RRF for the recommendation pipeline (not search).
|
| 5 |
+
|
| 6 |
+
RRF is correct for search (different retrievers, same query).
|
| 7 |
+
For recommendations (different cluster queries, same user), RRF lets
|
| 8 |
+
the dominant cluster drown minority interests. Quota ensures every
|
| 9 |
+
interest cluster gets a guaranteed floor of slots.
|
| 10 |
+
|
| 11 |
+
Reference: doc 06 §3.1 — "importance-weighted quota with a floor"
|
| 12 |
+
w_k = importance_k / sum(importance_k)
|
| 13 |
+
slot_k = max(floor(F * w_k), F_min) # F = total, F_min = 3
|
| 14 |
+
# distribute remainder by largest fractional part
|
| 15 |
+
"""
|
| 16 |
+
from __future__ import annotations
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def allocate_quotas(
    importances: list[float],
    total_slots: int,
    min_slots: int = 3,
) -> list[int]:
    """
    Split ``total_slots`` across clusters in proportion to their importance,
    never giving any cluster fewer than ``min_slots``.

    Args:
        importances: importance score per cluster, same order as clusters
        total_slots: total candidate slots to distribute (e.g. 100)
        min_slots:   minimum slots guaranteed to every cluster (default 3)

    Returns:
        Slot counts, same length and order as importances. The sum is at
        least total_slots and exceeds it when the per-cluster minimum
        forces extra slots.
    """
    cluster_count = len(importances)
    if cluster_count == 0:
        return []
    if cluster_count == 1:
        return [max(total_slots, min_slots)]

    importance_sum = sum(importances)

    if importance_sum <= 0:
        # No usable weights: share evenly, earliest clusters absorb the
        # leftover, then enforce the minimum.
        base, extra = divmod(total_slots, cluster_count)
        even = [base + 1 if pos < extra else base for pos in range(cluster_count)]
        return [max(share, min_slots) for share in even]

    # Exact (fractional) proportional share of each cluster.
    shares = [weight / importance_sum * total_slots for weight in importances]

    # Truncate each share, then lift it to the guaranteed minimum.
    quotas = [max(int(share), min_slots) for share in shares]

    leftover = total_slots - sum(quotas)
    if leftover <= 0:
        # The minimums already consumed every slot (or more).
        return quotas

    # Hand out the leftover one slot at a time, largest fractional part
    # first, wrapping around if the leftover exceeds the cluster count.
    by_fraction = sorted(
        range(cluster_count),
        key=lambda pos: shares[pos] % 1.0,
        reverse=True,
    )
    full_rounds, partial = divmod(leftover, cluster_count)
    for rank, pos in enumerate(by_fraction):
        quotas[pos] += full_rounds + (1 if rank < partial else 0)

    return quotas
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def merge_quota_results(
    per_cluster_ids: list[list[str]],
    quotas: list[int],
) -> list[str]:
    """
    Merge per-cluster search results respecting quota allocations.

    Clusters are processed one after another in the order given. From each
    cluster, ids are taken in list order until its quota of ids not already
    taken by an earlier cluster is reached; duplicates are skipped and do
    not count against the quota.

    Args:
        per_cluster_ids: list of arxiv_id lists, one per cluster
        quotas:          slot count for each cluster (same order)

    Returns:
        Merged list of arxiv_ids, deduplicated, quota-bounded per cluster.
        Clusters are paired with quotas via zip, so surplus entries in the
        longer argument are ignored.
    """
    merged: list[str] = []
    taken: set[str] = set()

    for ids, cap in zip(per_cluster_ids, quotas):
        remaining = cap
        for arxiv_id in ids:
            if remaining <= 0:
                break
            if arxiv_id in taken:
                continue
            taken.add(arxiv_id)
            merged.append(arxiv_id)
            remaining -= 1

    return merged
|
app/routers/events.py
CHANGED
|
@@ -24,6 +24,9 @@ async def save_paper(
|
|
| 24 |
source: str = Form(default="search"),
|
| 25 |
position: int = Form(default=0),
|
| 26 |
query_id: str = Form(default=""),
|
|
|
|
|
|
|
|
|
|
| 27 |
user_id: str | None = Cookie(default=None, alias=COOKIE_NAME),
|
| 28 |
):
|
| 29 |
user_id = user_id or str(uuid.uuid4())
|
|
@@ -35,6 +38,9 @@ async def save_paper(
|
|
| 35 |
source=source,
|
| 36 |
position=position or None,
|
| 37 |
query_id=query_id or None,
|
|
|
|
|
|
|
|
|
|
| 38 |
)
|
| 39 |
|
| 40 |
us.record_positive(user_id, paper_id)
|
|
@@ -57,6 +63,9 @@ async def not_interested(
|
|
| 57 |
source: str = Form(default="search"),
|
| 58 |
position: int = Form(default=0),
|
| 59 |
query_id: str = Form(default=""),
|
|
|
|
|
|
|
|
|
|
| 60 |
user_id: str | None = Cookie(default=None, alias=COOKIE_NAME),
|
| 61 |
):
|
| 62 |
user_id = user_id or str(uuid.uuid4())
|
|
@@ -68,6 +77,9 @@ async def not_interested(
|
|
| 68 |
source=source,
|
| 69 |
position=position or None,
|
| 70 |
query_id=query_id or None,
|
|
|
|
|
|
|
|
|
|
| 71 |
)
|
| 72 |
|
| 73 |
us.record_negative(user_id, paper_id)
|
|
|
|
| 24 |
source: str = Form(default="search"),
|
| 25 |
position: int = Form(default=0),
|
| 26 |
query_id: str = Form(default=""),
|
| 27 |
+
ranker_version: str = Form(default=""),
|
| 28 |
+
candidate_source: str = Form(default=""),
|
| 29 |
+
cluster_id: str = Form(default=""),
|
| 30 |
user_id: str | None = Cookie(default=None, alias=COOKIE_NAME),
|
| 31 |
):
|
| 32 |
user_id = user_id or str(uuid.uuid4())
|
|
|
|
| 38 |
source=source,
|
| 39 |
position=position or None,
|
| 40 |
query_id=query_id or None,
|
| 41 |
+
ranker_version=ranker_version or None,
|
| 42 |
+
candidate_source=candidate_source or None,
|
| 43 |
+
cluster_id=int(cluster_id) if cluster_id else None,
|
| 44 |
)
|
| 45 |
|
| 46 |
us.record_positive(user_id, paper_id)
|
|
|
|
| 63 |
source: str = Form(default="search"),
|
| 64 |
position: int = Form(default=0),
|
| 65 |
query_id: str = Form(default=""),
|
| 66 |
+
ranker_version: str = Form(default=""),
|
| 67 |
+
candidate_source: str = Form(default=""),
|
| 68 |
+
cluster_id: str = Form(default=""),
|
| 69 |
user_id: str | None = Cookie(default=None, alias=COOKIE_NAME),
|
| 70 |
):
|
| 71 |
user_id = user_id or str(uuid.uuid4())
|
|
|
|
| 77 |
source=source,
|
| 78 |
position=position or None,
|
| 79 |
query_id=query_id or None,
|
| 80 |
+
ranker_version=ranker_version or None,
|
| 81 |
+
candidate_source=candidate_source or None,
|
| 82 |
+
cluster_id=int(cluster_id) if cluster_id else None,
|
| 83 |
)
|
| 84 |
|
| 85 |
us.record_negative(user_id, paper_id)
|
app/routers/recommendations.py
CHANGED
|
@@ -6,16 +6,21 @@ GET /api/recommendations
|
|
| 6 |
– Returns the recommendations partial HTML
|
| 7 |
|
| 8 |
Recommendation pipeline (cascading fallback):
|
| 9 |
-
Phase 2b: Multi-interest clustering →
|
| 10 |
-
Phase 2a:
|
| 11 |
-
Phase 1:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
"""
|
| 13 |
-
import
|
| 14 |
import uuid
|
| 15 |
import numpy as np
|
| 16 |
from fastapi import APIRouter, Request, Cookie
|
| 17 |
from fastapi.responses import HTMLResponse
|
| 18 |
-
from app import qdrant_svc, arxiv_svc, user_state as us
|
| 19 |
from app.config import COOKIE_NAME, REC_LIMIT, REC_MIN_POSITIVES
|
| 20 |
from app.templates_env import templates
|
| 21 |
from app.recommend import profiles
|
|
@@ -23,16 +28,28 @@ from app.recommend.clustering import (
|
|
| 23 |
compute_clusters,
|
| 24 |
save_clusters_to_db,
|
| 25 |
load_clusters_from_db,
|
|
|
|
| 26 |
MIN_PAPERS_FOR_CLUSTERING,
|
| 27 |
)
|
|
|
|
| 28 |
from app.recommend.reranker import rerank_candidates
|
| 29 |
from app.recommend.diversity import mmr_rerank, inject_exploration
|
| 30 |
|
| 31 |
router = APIRouter(prefix="/api")
|
| 32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
# Minimum EWMA interactions before switching from ID-based to vector-based recs
|
| 34 |
_MIN_EWMA_INTERACTIONS = 3
|
| 35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
@router.get("/recommendations", response_class=HTMLResponse)
|
| 38 |
async def get_recommendations(
|
|
@@ -56,14 +73,27 @@ async def get_recommendations(
|
|
| 56 |
|
| 57 |
seen = us.all_seen(user_id)
|
| 58 |
|
| 59 |
-
#
|
| 60 |
-
|
|
|
|
|
|
|
| 61 |
|
| 62 |
-
# ── Tier
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
if not rec_arxiv_ids:
|
| 64 |
rec_arxiv_ids = await _ewma_recommend(user_id, seen, REC_LIMIT)
|
| 65 |
-
|
| 66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
if not rec_arxiv_ids:
|
| 68 |
rec_arxiv_ids = await qdrant_svc.recommend(
|
| 69 |
positive_arxiv_ids=state.positive_list,
|
|
@@ -71,16 +101,43 @@ async def get_recommendations(
|
|
| 71 |
seen_arxiv_ids=seen,
|
| 72 |
limit=REC_LIMIT,
|
| 73 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
|
| 75 |
if not rec_arxiv_ids:
|
| 76 |
return _empty_resp()
|
| 77 |
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
|
| 85 |
resp = templates.TemplateResponse(
|
| 86 |
request,
|
|
@@ -91,35 +148,34 @@ async def get_recommendations(
|
|
| 91 |
return resp
|
| 92 |
|
| 93 |
|
| 94 |
-
# ── Tier 1: Multi-interest clustering +
|
| 95 |
-
|
| 96 |
-
# Per-cluster candidate limits (descending by importance)
|
| 97 |
-
_CLUSTER_LIMITS = [40, 30, 25, 20, 15, 15, 15]
|
| 98 |
-
|
| 99 |
|
| 100 |
async def _multi_interest_recommend(
|
| 101 |
user_id: str, state, seen: set[str], limit: int
|
| 102 |
-
) -> list[str]:
|
| 103 |
"""
|
| 104 |
-
Full recommendation pipeline (Phase 2b +
|
| 105 |
1. Ward clustering → identify distinct interests
|
| 106 |
-
2.
|
| 107 |
-
3.
|
| 108 |
-
4.
|
| 109 |
-
5.
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
|
|
|
|
|
|
|
|
|
| 113 |
"""
|
| 114 |
positives = state.positive_list
|
| 115 |
if len(positives) < MIN_PAPERS_FOR_CLUSTERING:
|
| 116 |
-
return []
|
| 117 |
|
| 118 |
try:
|
| 119 |
# Fetch embeddings for all saved papers
|
| 120 |
vectors = await qdrant_svc.get_paper_vectors(positives)
|
| 121 |
if len(vectors) < MIN_PAPERS_FOR_CLUSTERING:
|
| 122 |
-
return []
|
| 123 |
|
| 124 |
# Build aligned arrays (only papers we got vectors for)
|
| 125 |
aligned_ids = [pid for pid in positives if pid in vectors]
|
|
@@ -129,38 +185,89 @@ async def _multi_interest_recommend(
|
|
| 129 |
|
| 130 |
# ── Step 1: Compute interest clusters ─────────────────────────────
|
| 131 |
clusters = compute_clusters(aligned_ids, aligned_embs)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
await save_clusters_to_db(user_id, clusters)
|
| 133 |
|
| 134 |
-
# ── Step 2:
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
per_cluster_limit = _CLUSTER_LIMITS[i] if i < len(_CLUSTER_LIMITS) else 15
|
| 138 |
-
interest_vectors.append(
|
| 139 |
-
(cluster.medoid_embedding.tolist(), per_cluster_limit)
|
| 140 |
-
)
|
| 141 |
|
|
|
|
| 142 |
st_vec = await profiles.load_profile(user_id, "short_term")
|
| 143 |
-
st_list = st_vec.tolist() if st_vec is not None else None
|
| 144 |
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
|
| 152 |
if not candidate_ids:
|
| 153 |
-
return []
|
| 154 |
|
| 155 |
-
# ── Step
|
| 156 |
-
# Fetch embeddings + metadata for candidates
|
| 157 |
cand_vectors = await qdrant_svc.get_paper_vectors(candidate_ids)
|
| 158 |
-
cand_meta = await
|
| 159 |
-
|
| 160 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 161 |
valid_ids = [cid for cid in candidate_ids if cid in cand_vectors and cid in cand_meta]
|
| 162 |
if not valid_ids:
|
| 163 |
-
return candidate_ids[:limit]
|
| 164 |
|
| 165 |
valid_embs = np.array([cand_vectors[cid] for cid in valid_ids], dtype=np.float32)
|
| 166 |
valid_meta = [cand_meta[cid] for cid in valid_ids]
|
|
@@ -168,6 +275,7 @@ async def _multi_interest_recommend(
|
|
| 168 |
lt_vec = await profiles.load_profile(user_id, "long_term")
|
| 169 |
neg_vec = await profiles.load_profile(user_id, "negative")
|
| 170 |
|
|
|
|
| 171 |
reranked_ids, reranked_scores, reranked_embs = rerank_candidates(
|
| 172 |
candidate_ids=valid_ids,
|
| 173 |
candidate_embeddings=valid_embs,
|
|
@@ -177,7 +285,19 @@ async def _multi_interest_recommend(
|
|
| 177 |
negative_vec=neg_vec,
|
| 178 |
)
|
| 179 |
|
| 180 |
-
# ── Step 4:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 181 |
query_vec = lt_vec if lt_vec is not None else aligned_embs.mean(axis=0)
|
| 182 |
mmr_selected = mmr_rerank(
|
| 183 |
query_embedding=query_vec,
|
|
@@ -188,18 +308,38 @@ async def _multi_interest_recommend(
|
|
| 188 |
top_k=limit,
|
| 189 |
)
|
| 190 |
|
| 191 |
-
# ── Step
|
| 192 |
final = inject_exploration(
|
| 193 |
selected_ids=mmr_selected,
|
| 194 |
all_candidate_ids=reranked_ids,
|
| 195 |
n_explore=2,
|
| 196 |
)
|
| 197 |
-
|
| 198 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 199 |
|
| 200 |
except Exception as e:
|
| 201 |
print(f"[recommendations] multi-interest search failed: {e}")
|
| 202 |
-
return []
|
| 203 |
|
| 204 |
|
| 205 |
# ── Tier 2: EWMA single-vector search ────────────────────────────────────────
|
|
@@ -227,5 +367,3 @@ async def _ewma_recommend(
|
|
| 227 |
limit=limit,
|
| 228 |
exclude_ids=seen,
|
| 229 |
)
|
| 230 |
-
|
| 231 |
-
|
|
|
|
| 6 |
– Returns the recommendations partial HTML
|
| 7 |
|
| 8 |
Recommendation pipeline (cascading fallback):
|
| 9 |
+
Phase 2b / 4.1: Multi-interest clustering → quota fusion (≥5 saves)
|
| 10 |
+
Phase 2a: EWMA long-term vector → single vector search (≥3 saves)
|
| 11 |
+
Phase 1: Qdrant BEST_SCORE Recommend API with raw IDs (≥1 save)
|
| 12 |
+
|
| 13 |
+
Phase 4 changes vs Phase 2b:
|
| 14 |
+
- RRF replaced with importance-weighted quota fusion (doc 06 §3.1)
|
| 15 |
+
- Hungarian matching stabilises cluster IDs across reclusters (4.2)
|
| 16 |
+
- Category-level suppression filters strongly disliked topics (4.3)
|
| 17 |
"""
|
| 18 |
+
import asyncio
|
| 19 |
import uuid
|
| 20 |
import numpy as np
|
| 21 |
from fastapi import APIRouter, Request, Cookie
|
| 22 |
from fastapi.responses import HTMLResponse
|
| 23 |
+
from app import db, qdrant_svc, arxiv_svc, turso_svc, user_state as us
|
| 24 |
from app.config import COOKIE_NAME, REC_LIMIT, REC_MIN_POSITIVES
|
| 25 |
from app.templates_env import templates
|
| 26 |
from app.recommend import profiles
|
|
|
|
| 28 |
compute_clusters,
|
| 29 |
save_clusters_to_db,
|
| 30 |
load_clusters_from_db,
|
| 31 |
+
stabilize_cluster_ids,
|
| 32 |
MIN_PAPERS_FOR_CLUSTERING,
|
| 33 |
)
|
| 34 |
+
from app.recommend.fusion import allocate_quotas, merge_quota_results
|
| 35 |
from app.recommend.reranker import rerank_candidates
|
| 36 |
from app.recommend.diversity import mmr_rerank, inject_exploration
|
| 37 |
|
| 38 |
router = APIRouter(prefix="/api")
|
| 39 |
|
| 40 |
+
# Phase 4.5: Pipeline version tag for instrumentation. Bump this on any
|
| 41 |
+
# change to the ranking logic so A/B attribution is possible.
|
| 42 |
+
_RANKER_VERSION = "v4.1_quota_hungarian_suppression"
|
| 43 |
+
|
| 44 |
# Minimum EWMA interactions before switching from ID-based to vector-based recs
|
| 45 |
_MIN_EWMA_INTERACTIONS = 3
|
| 46 |
|
| 47 |
+
# Candidate oversampling factor per cluster (fetch more than quota to handle dedup)
|
| 48 |
+
_OVERSAMPLE = 3
|
| 49 |
+
|
| 50 |
+
# Short-term session context: fixed supplementary pool size
|
| 51 |
+
_ST_SUPPLEMENT = 20
|
| 52 |
+
|
| 53 |
|
| 54 |
@router.get("/recommendations", response_class=HTMLResponse)
|
| 55 |
async def get_recommendations(
|
|
|
|
| 73 |
|
| 74 |
seen = us.all_seen(user_id)
|
| 75 |
|
| 76 |
+
# Phase 4.5: paper_tags maps arxiv_id → instrumentation metadata
|
| 77 |
+
# populated by whichever tier serves the result.
|
| 78 |
+
paper_tags: dict[str, dict] = {}
|
| 79 |
+
rec_arxiv_ids: list[str] = []
|
| 80 |
|
| 81 |
+
# ── Tier 1: Multi-interest clustering + quota fusion (≥5 saves) ──────
|
| 82 |
+
rec_arxiv_ids, paper_tags = await _multi_interest_recommend(
|
| 83 |
+
user_id, state, seen, REC_LIMIT,
|
| 84 |
+
)
|
| 85 |
+
|
| 86 |
+
# ── Tier 2: EWMA single-vector search (≥3 saves) ──────────────────────
|
| 87 |
if not rec_arxiv_ids:
|
| 88 |
rec_arxiv_ids = await _ewma_recommend(user_id, seen, REC_LIMIT)
|
| 89 |
+
for aid in rec_arxiv_ids:
|
| 90 |
+
paper_tags[aid] = {
|
| 91 |
+
"ranker_version": _RANKER_VERSION,
|
| 92 |
+
"candidate_source": "ewma_longterm",
|
| 93 |
+
"cluster_id": "",
|
| 94 |
+
}
|
| 95 |
+
|
| 96 |
+
# ── Tier 3: Qdrant Recommend API (≥1 save fallback) ───────────────────
|
| 97 |
if not rec_arxiv_ids:
|
| 98 |
rec_arxiv_ids = await qdrant_svc.recommend(
|
| 99 |
positive_arxiv_ids=state.positive_list,
|
|
|
|
| 101 |
seen_arxiv_ids=seen,
|
| 102 |
limit=REC_LIMIT,
|
| 103 |
)
|
| 104 |
+
for aid in rec_arxiv_ids:
|
| 105 |
+
paper_tags[aid] = {
|
| 106 |
+
"ranker_version": _RANKER_VERSION,
|
| 107 |
+
"candidate_source": "qdrant_recommend",
|
| 108 |
+
"cluster_id": "",
|
| 109 |
+
}
|
| 110 |
|
| 111 |
if not rec_arxiv_ids:
|
| 112 |
return _empty_resp()
|
| 113 |
|
| 114 |
+
# Phase 3.5: Turso primary, arXiv API fallback
|
| 115 |
+
meta = await turso_svc.fetch_metadata_batch(rec_arxiv_ids)
|
| 116 |
+
missing = [aid for aid in rec_arxiv_ids if aid not in meta]
|
| 117 |
+
if missing:
|
| 118 |
+
try:
|
| 119 |
+
arxiv_meta = await arxiv_svc.fetch_metadata_batch(missing)
|
| 120 |
+
meta.update(arxiv_meta)
|
| 121 |
+
except Exception as e:
|
| 122 |
+
print(f"[recommendations] arXiv fallback for {len(missing)} IDs failed: {e}")
|
| 123 |
+
|
| 124 |
+
# Cache to SQLite so category suppression JOINs work (Phase 4.3)
|
| 125 |
+
await db.cache_turso_metadata_batch(list(meta.values()))
|
| 126 |
+
|
| 127 |
+
papers = []
|
| 128 |
+
for aid in rec_arxiv_ids:
|
| 129 |
+
if aid not in meta:
|
| 130 |
+
continue
|
| 131 |
+
tags = paper_tags.get(aid, {})
|
| 132 |
+
papers.append({
|
| 133 |
+
**meta[aid],
|
| 134 |
+
"saved": False,
|
| 135 |
+
"dismissed": False,
|
| 136 |
+
# Phase 4.5 instrumentation — embedded in card, flows back via HTMX
|
| 137 |
+
"ranker_version": tags.get("ranker_version", _RANKER_VERSION),
|
| 138 |
+
"candidate_source": tags.get("candidate_source", ""),
|
| 139 |
+
"cluster_id": tags.get("cluster_id", ""),
|
| 140 |
+
})
|
| 141 |
|
| 142 |
resp = templates.TemplateResponse(
|
| 143 |
request,
|
|
|
|
| 148 |
return resp
|
| 149 |
|
| 150 |
|
| 151 |
+
# ── Tier 1: Multi-interest clustering + quota fusion ─────────────────────────
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
|
| 153 |
async def _multi_interest_recommend(
|
| 154 |
user_id: str, state, seen: set[str], limit: int
|
| 155 |
+
) -> tuple[list[str], dict[str, dict]]:
|
| 156 |
"""
|
| 157 |
+
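Full recommendation pipeline (Phase 2b + Phase 4 corrections). Numbers are stage labels, not strict execution order — stage 4 runs right after stage 1, and stage 5 runs after stage 6: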
|
| 158 |
1. Ward clustering → identify distinct interests
|
| 159 |
+
2. Quota allocation → per-cluster slot budgets (replaces RRF)
|
| 160 |
+
3. Parallel per-cluster ANN searches → retrieve candidates
|
| 161 |
+
4. Hungarian matching → stabilise cluster IDs across reclusters
|
| 162 |
+
5. Category suppression → remove strongly disliked topics
|
| 163 |
+
6. Heuristic re-ranking → score candidates
|
| 164 |
+
7. MMR diversity → select top-k with diversity
|
| 165 |
+
8. Exploration injection → serendipitous papers
|
| 166 |
+
|
| 167 |
+
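Returns ([], {}) when there are too few saved papers or vectors, no candidates are retrieved, or any step raises; the caller then falls back to Tier 2.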
|
| 168 |
+
Phase 4.5: second element is {arxiv_id: {ranker_version, candidate_source, cluster_id}}.
|
| 169 |
"""
|
| 170 |
positives = state.positive_list
|
| 171 |
if len(positives) < MIN_PAPERS_FOR_CLUSTERING:
|
| 172 |
+
return [], {}
|
| 173 |
|
| 174 |
try:
|
| 175 |
# Fetch embeddings for all saved papers
|
| 176 |
vectors = await qdrant_svc.get_paper_vectors(positives)
|
| 177 |
if len(vectors) < MIN_PAPERS_FOR_CLUSTERING:
|
| 178 |
+
return [], {}
|
| 179 |
|
| 180 |
# Build aligned arrays (only papers we got vectors for)
|
| 181 |
aligned_ids = [pid for pid in positives if pid in vectors]
|
|
|
|
| 185 |
|
| 186 |
# ── Step 1: Compute interest clusters ─────────────────────────────
|
| 187 |
clusters = compute_clusters(aligned_ids, aligned_embs)
|
| 188 |
+
|
| 189 |
+
# ── Step 4.2: Stabilise cluster IDs with Hungarian matching ───────
|
| 190 |
+
old_clusters_data = await load_clusters_from_db(user_id)
|
| 191 |
+
if old_clusters_data:
|
| 192 |
+
from app.recommend.clustering import InterestCluster
|
| 193 |
+
old_clusters = [
|
| 194 |
+
InterestCluster(
|
| 195 |
+
cluster_idx=row["cluster_idx"],
|
| 196 |
+
medoid_paper_id=row["medoid_paper_id"],
|
| 197 |
+
medoid_embedding=np.array(
|
| 198 |
+
vectors[row["medoid_paper_id"]], dtype=np.float32
|
| 199 |
+
) if row["medoid_paper_id"] in vectors else np.zeros(1024, dtype=np.float32),
|
| 200 |
+
paper_ids=[],
|
| 201 |
+
importance=row["importance"],
|
| 202 |
+
)
|
| 203 |
+
for row in old_clusters_data
|
| 204 |
+
]
|
| 205 |
+
clusters = stabilize_cluster_ids(clusters, old_clusters)
|
| 206 |
+
|
| 207 |
await save_clusters_to_db(user_id, clusters)
|
| 208 |
|
| 209 |
+
# ── Step 2: Quota allocation ───────────────────────────────────────
|
| 210 |
+
importances = [c.importance for c in clusters]
|
| 211 |
+
quotas = allocate_quotas(importances, total_slots=100, min_slots=3)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 212 |
|
| 213 |
+
# ── Step 3: Parallel per-cluster ANN searches ─────────────────────
|
| 214 |
st_vec = await profiles.load_profile(user_id, "short_term")
|
|
|
|
| 215 |
|
| 216 |
+
search_tasks = [
|
| 217 |
+
qdrant_svc.search_by_vector(
|
| 218 |
+
query_vector=c.medoid_embedding.tolist(),
|
| 219 |
+
limit=quota * _OVERSAMPLE,
|
| 220 |
+
exclude_ids=seen,
|
| 221 |
+
)
|
| 222 |
+
for c, quota in zip(clusters, quotas)
|
| 223 |
+
]
|
| 224 |
+
per_cluster_results = await asyncio.gather(*search_tasks)
|
| 225 |
+
|
| 226 |
+
# Phase 4.5: Build paper → cluster mapping BEFORE merge (so we know
|
| 227 |
+
# which cluster each paper was retrieved from).
|
| 228 |
+
paper_cluster_map: dict[str, int] = {}
|
| 229 |
+
for cluster, result_ids in zip(clusters, per_cluster_results):
|
| 230 |
+
for aid in result_ids:
|
| 231 |
+
if aid not in paper_cluster_map: # first-occurrence wins
|
| 232 |
+
paper_cluster_map[aid] = cluster.cluster_idx
|
| 233 |
+
|
| 234 |
+
# Apply quota merge (dedup globally, respect per-cluster quotas)
|
| 235 |
+
candidate_ids = merge_quota_results(list(per_cluster_results), quotas)
|
| 236 |
+
|
| 237 |
+
# Supplement with short-term session context
|
| 238 |
+
if st_vec is not None:
|
| 239 |
+
seen_so_far = seen | set(candidate_ids)
|
| 240 |
+
st_results = await qdrant_svc.search_by_vector(
|
| 241 |
+
query_vector=st_vec.tolist(),
|
| 242 |
+
limit=_ST_SUPPLEMENT,
|
| 243 |
+
exclude_ids=seen_so_far,
|
| 244 |
+
)
|
| 245 |
+
for aid in st_results:
|
| 246 |
+
if aid not in set(candidate_ids):
|
| 247 |
+
candidate_ids.append(aid)
|
| 248 |
+
paper_cluster_map[aid] = -1 # short-term supplement
|
| 249 |
|
| 250 |
if not candidate_ids:
|
| 251 |
+
return [], {}
|
| 252 |
|
| 253 |
+
# ── Step 5: Fetch candidate vectors + metadata ────────────────────
|
|
|
|
| 254 |
cand_vectors = await qdrant_svc.get_paper_vectors(candidate_ids)
|
| 255 |
+
cand_meta = await turso_svc.fetch_metadata_batch(candidate_ids)
|
| 256 |
+
cand_missing = [cid for cid in candidate_ids if cid not in cand_meta]
|
| 257 |
+
if cand_missing:
|
| 258 |
+
try:
|
| 259 |
+
arxiv_cand_meta = await arxiv_svc.fetch_metadata_batch(cand_missing)
|
| 260 |
+
cand_meta.update(arxiv_cand_meta)
|
| 261 |
+
except Exception as e:
|
| 262 |
+
print(f"[recommendations] arXiv fallback for {len(cand_missing)} IDs failed: {e}")
|
| 263 |
+
|
| 264 |
+
# Cache fetched metadata to SQLite for category suppression
|
| 265 |
+
await db.cache_turso_metadata_batch(list(cand_meta.values()))
|
| 266 |
+
|
| 267 |
+
# Only process candidates with both vectors and metadata
|
| 268 |
valid_ids = [cid for cid in candidate_ids if cid in cand_vectors and cid in cand_meta]
|
| 269 |
if not valid_ids:
|
| 270 |
+
return candidate_ids[:limit], {}
|
| 271 |
|
| 272 |
valid_embs = np.array([cand_vectors[cid] for cid in valid_ids], dtype=np.float32)
|
| 273 |
valid_meta = [cand_meta[cid] for cid in valid_ids]
|
|
|
|
| 275 |
lt_vec = await profiles.load_profile(user_id, "long_term")
|
| 276 |
neg_vec = await profiles.load_profile(user_id, "negative")
|
| 277 |
|
| 278 |
+
# ── Step 6: Heuristic re-ranking ──────────────────────────────────
|
| 279 |
reranked_ids, reranked_scores, reranked_embs = rerank_candidates(
|
| 280 |
candidate_ids=valid_ids,
|
| 281 |
candidate_embeddings=valid_embs,
|
|
|
|
| 285 |
negative_vec=neg_vec,
|
| 286 |
)
|
| 287 |
|
| 288 |
+
# ── Step 4.3: Category suppression ────────────────────────────────
|
| 289 |
+
suppressed = await db.get_suppressed_categories(user_id)
|
| 290 |
+
if suppressed:
|
| 291 |
+
kept = [
|
| 292 |
+
i for i, cid in enumerate(reranked_ids)
|
| 293 |
+
if cand_meta.get(cid, {}).get("category", "") not in suppressed
|
| 294 |
+
]
|
| 295 |
+
if kept:
|
| 296 |
+
reranked_ids = [reranked_ids[i] for i in kept]
|
| 297 |
+
reranked_scores = [reranked_scores[i] for i in kept]
|
| 298 |
+
reranked_embs = reranked_embs[kept]
|
| 299 |
+
|
| 300 |
+
# ── Step 7: MMR diversity enforcement ─────────────────────────────
|
| 301 |
query_vec = lt_vec if lt_vec is not None else aligned_embs.mean(axis=0)
|
| 302 |
mmr_selected = mmr_rerank(
|
| 303 |
query_embedding=query_vec,
|
|
|
|
| 308 |
top_k=limit,
|
| 309 |
)
|
| 310 |
|
| 311 |
+
# ── Step 8: Exploration injection ─────────────────────────────────
|
| 312 |
final = inject_exploration(
|
| 313 |
selected_ids=mmr_selected,
|
| 314 |
all_candidate_ids=reranked_ids,
|
| 315 |
n_explore=2,
|
| 316 |
)
|
| 317 |
+
final = final[:limit + 2]
|
| 318 |
+
|
| 319 |
+
# Phase 4.5: Build per-paper instrumentation tags
|
| 320 |
+
exploration_set = set(final) - set(mmr_selected)
|
| 321 |
+
paper_tags: dict[str, dict] = {}
|
| 322 |
+
for aid in final:
|
| 323 |
+
cluster_idx = paper_cluster_map.get(aid)
|
| 324 |
+
if aid in exploration_set:
|
| 325 |
+
source = "exploration"
|
| 326 |
+
elif cluster_idx == -1:
|
| 327 |
+
source = "short_term_supplement"
|
| 328 |
+
elif cluster_idx is not None:
|
| 329 |
+
source = f"cluster_{cluster_idx}"
|
| 330 |
+
else:
|
| 331 |
+
source = "tier1_unknown"
|
| 332 |
+
paper_tags[aid] = {
|
| 333 |
+
"ranker_version": _RANKER_VERSION,
|
| 334 |
+
"candidate_source": source,
|
| 335 |
+
"cluster_id": str(cluster_idx) if cluster_idx is not None and cluster_idx >= 0 else "",
|
| 336 |
+
}
|
| 337 |
+
|
| 338 |
+
return final, paper_tags
|
| 339 |
|
| 340 |
except Exception as e:
|
| 341 |
print(f"[recommendations] multi-interest search failed: {e}")
|
| 342 |
+
return [], {}
|
| 343 |
|
| 344 |
|
| 345 |
# ── Tier 2: EWMA single-vector search ────────────────────────────────────────
|
|
|
|
| 367 |
limit=limit,
|
| 368 |
exclude_ids=seen,
|
| 369 |
)
|
|
|
|
|
|
app/routers/saved.py
CHANGED
|
@@ -3,12 +3,12 @@ Saved papers router.
|
|
| 3 |
|
| 4 |
GET /saved
|
| 5 |
– Shows all papers the user has currently saved (positive_list)
|
| 6 |
-
– Metadata fetched via
|
| 7 |
"""
|
| 8 |
import uuid
|
| 9 |
from fastapi import APIRouter, Request, Cookie
|
| 10 |
from fastapi.responses import HTMLResponse
|
| 11 |
-
from app import arxiv_svc, user_state as us
|
| 12 |
from app.config import COOKIE_NAME
|
| 13 |
from app.templates_env import templates
|
| 14 |
|
|
@@ -27,7 +27,18 @@ async def saved_papers(
|
|
| 27 |
|
| 28 |
papers = []
|
| 29 |
if saved_ids:
|
| 30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
papers = [
|
| 32 |
{**meta[aid], "saved": True, "dismissed": False}
|
| 33 |
for aid in saved_ids
|
|
|
|
| 3 |
|
| 4 |
GET /saved
|
| 5 |
– Shows all papers the user has currently saved (positive_list)
|
| 6 |
+
– Metadata fetched via Turso DB (Phase 3.5), arXiv API fallback
|
| 7 |
"""
|
| 8 |
import uuid
|
| 9 |
from fastapi import APIRouter, Request, Cookie
|
| 10 |
from fastapi.responses import HTMLResponse
|
| 11 |
+
from app import arxiv_svc, db, turso_svc, user_state as us
|
| 12 |
from app.config import COOKIE_NAME
|
| 13 |
from app.templates_env import templates
|
| 14 |
|
|
|
|
| 27 |
|
| 28 |
papers = []
|
| 29 |
if saved_ids:
|
| 30 |
+
# Phase 3.5: Turso primary, arXiv API fallback
|
| 31 |
+
meta = await turso_svc.fetch_metadata_batch(saved_ids)
|
| 32 |
+
missing = [aid for aid in saved_ids if aid not in meta]
|
| 33 |
+
if missing:
|
| 34 |
+
try:
|
| 35 |
+
arxiv_meta = await arxiv_svc.fetch_metadata_batch(missing)
|
| 36 |
+
meta.update(arxiv_meta)
|
| 37 |
+
except Exception as e:
|
| 38 |
+
print(f"[saved] arXiv fallback for {len(missing)} IDs failed: {e}")
|
| 39 |
+
# Phase 4.3: Cache to SQLite so dismissal category JOINs work
|
| 40 |
+
await db.cache_turso_metadata_batch(list(meta.values()))
|
| 41 |
+
|
| 42 |
papers = [
|
| 43 |
{**meta[aid], "saved": True, "dismissed": False}
|
| 44 |
for aid in saved_ids
|
app/routers/search.py
CHANGED
|
@@ -14,7 +14,7 @@ Phase 3.5: Metadata now fetched from Turso cloud DB (fast, includes citations)
|
|
| 14 |
import uuid
|
| 15 |
from fastapi import APIRouter, Request, Cookie
|
| 16 |
from fastapi.responses import HTMLResponse
|
| 17 |
-
from app import arxiv_svc, turso_svc, user_state as us, hybrid_search_svc
|
| 18 |
from app.config import COOKIE_NAME, ARXIV_MAX_RESULTS
|
| 19 |
from app.templates_env import templates
|
| 20 |
|
|
@@ -53,6 +53,9 @@ async def search(
|
|
| 53 |
except Exception as e:
|
| 54 |
print(f"[search] arXiv fallback for {len(missing)} IDs failed: {e}")
|
| 55 |
|
|
|
|
|
|
|
|
|
|
| 56 |
# Preserve ranking order from hybrid search
|
| 57 |
papers = [meta[aid] for aid in arxiv_ids if aid in meta]
|
| 58 |
|
|
|
|
| 14 |
import uuid
|
| 15 |
from fastapi import APIRouter, Request, Cookie
|
| 16 |
from fastapi.responses import HTMLResponse
|
| 17 |
+
from app import arxiv_svc, db, turso_svc, user_state as us, hybrid_search_svc
|
| 18 |
from app.config import COOKIE_NAME, ARXIV_MAX_RESULTS
|
| 19 |
from app.templates_env import templates
|
| 20 |
|
|
|
|
| 53 |
except Exception as e:
|
| 54 |
print(f"[search] arXiv fallback for {len(missing)} IDs failed: {e}")
|
| 55 |
|
| 56 |
+
# Phase 4.3: Cache to SQLite so dismissal category JOINs work
|
| 57 |
+
await db.cache_turso_metadata_batch(list(meta.values()))
|
| 58 |
+
|
| 59 |
# Preserve ranking order from hybrid search
|
| 60 |
papers = [meta[aid] for aid in arxiv_ids if aid in meta]
|
| 61 |
|
app/templates/index.html
CHANGED
|
@@ -31,14 +31,17 @@
|
|
| 31 |
<!-- Recommendations section -->
|
| 32 |
<div>
|
| 33 |
<h2 class="text-lg font-semibold mb-3">Recommended for You</h2>
|
| 34 |
-
<div id="rec-section"
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
<
|
|
|
|
|
|
|
|
|
|
| 42 |
</div>
|
| 43 |
</div>
|
| 44 |
</div>
|
|
|
|
| 31 |
<!-- Recommendations section -->
|
| 32 |
<div>
|
| 33 |
<h2 class="text-lg font-semibold mb-3">Recommended for You</h2>
|
| 34 |
+
<div id="rec-section-wrapper" class="relative">
|
| 35 |
+
<span id="rec-spinner" class="htmx-indicator loading loading-spinner loading-sm absolute right-0 top-0"></span>
|
| 36 |
+
<div id="rec-section"
|
| 37 |
+
hx-get="/api/recommendations"
|
| 38 |
+
hx-trigger="load"
|
| 39 |
+
hx-indicator="#rec-spinner"
|
| 40 |
+
hx-swap="innerHTML">
|
| 41 |
+
<div class="flex items-center gap-2 text-base-content/50">
|
| 42 |
+
<span class="loading loading-spinner loading-sm"></span>
|
| 43 |
+
<span>Loading recommendations…</span>
|
| 44 |
+
</div>
|
| 45 |
</div>
|
| 46 |
</div>
|
| 47 |
</div>
|
app/templates/partials/action_buttons.html
CHANGED
|
@@ -2,12 +2,16 @@
|
|
| 2 |
Action buttons for a paper card.
|
| 3 |
Expects: paper_id (or paper.arxiv_id), saved (bool), dismissed (bool)
|
| 4 |
Optional: source ("search" | "recommendation" | "saved"), position (int)
|
|
|
|
| 5 |
These are returned directly by the /api/papers/{id}/save endpoint
|
| 6 |
so they also work as a standalone partial.
|
| 7 |
#}
|
| 8 |
{% set pid = paper_id if paper_id is defined else paper.arxiv_id %}
|
| 9 |
{% set is_saved = saved if saved is defined else (paper.saved | default(false)) %}
|
| 10 |
{% set _source = source if source is defined else "search" %}
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
{% if is_saved %}
|
| 13 |
<!-- Already saved — show saved state, allow unsave via not-interested -->
|
|
@@ -19,7 +23,7 @@
|
|
| 19 |
hx-post="/api/papers/{{ pid }}/not-interested"
|
| 20 |
hx-target="#paper-{{ pid }}"
|
| 21 |
hx-swap="outerHTML swap:200ms"
|
| 22 |
-
hx-vals='{"source": "{{ _source }}"}'>
|
| 23 |
Remove
|
| 24 |
</button>
|
| 25 |
</div>
|
|
@@ -28,9 +32,9 @@
|
|
| 28 |
<!-- Save -->
|
| 29 |
<button class="btn btn-primary btn-xs"
|
| 30 |
hx-post="/api/papers/{{ pid }}/save"
|
| 31 |
-
hx-target="
|
| 32 |
hx-swap="innerHTML"
|
| 33 |
-
hx-vals='{"source": "{{ _source }}", "position": "{{ position | default(0) }}"}'>
|
| 34 |
⭐ Save
|
| 35 |
</button>
|
| 36 |
<!-- Not interested (removes the whole card) -->
|
|
@@ -38,8 +42,9 @@
|
|
| 38 |
hx-post="/api/papers/{{ pid }}/not-interested"
|
| 39 |
hx-target="#paper-{{ pid }}"
|
| 40 |
hx-swap="outerHTML swap:200ms"
|
| 41 |
-
hx-vals='{"source": "{{ _source }}"}'>
|
| 42 |
✕ Not interested
|
| 43 |
</button>
|
| 44 |
</div>
|
| 45 |
{% endif %}
|
|
|
|
|
|
| 2 |
Action buttons for a paper card.
|
| 3 |
Expects: paper_id (or paper.arxiv_id), saved (bool), dismissed (bool)
|
| 4 |
Optional: source ("search" | "recommendation" | "saved"), position (int)
|
| 5 |
+
Phase 4.5 (optional, read from paper): ranker_version, candidate_source, cluster_id (set by recommendations.py); each defaults to "" when absent
|
| 6 |
These are returned directly by the /api/papers/{id}/save endpoint
|
| 7 |
so they also work as a standalone partial.
|
| 8 |
#}
|
| 9 |
{% set pid = paper_id if paper_id is defined else paper.arxiv_id %}
|
| 10 |
{% set is_saved = saved if saved is defined else (paper.saved | default(false)) %}
|
| 11 |
{% set _source = source if source is defined else "search" %}
|
| 12 |
+
{% set _ranker_version = paper.ranker_version | default("") if paper is defined else "" %}
|
| 13 |
+
{% set _candidate_source = paper.candidate_source | default("") if paper is defined else "" %}
|
| 14 |
+
{% set _cluster_id = paper.cluster_id | default("") if paper is defined else "" %}
|
| 15 |
|
| 16 |
{% if is_saved %}
|
| 17 |
<!-- Already saved — show saved state, allow unsave via not-interested -->
|
|
|
|
| 23 |
hx-post="/api/papers/{{ pid }}/not-interested"
|
| 24 |
hx-target="#paper-{{ pid }}"
|
| 25 |
hx-swap="outerHTML swap:200ms"
|
| 26 |
+
hx-vals='{"source": "{{ _source }}", "ranker_version": "{{ _ranker_version }}", "candidate_source": "{{ _candidate_source }}", "cluster_id": "{{ _cluster_id }}"}'>
|
| 27 |
Remove
|
| 28 |
</button>
|
| 29 |
</div>
|
|
|
|
| 32 |
<!-- Save -->
|
| 33 |
<button class="btn btn-primary btn-xs"
|
| 34 |
hx-post="/api/papers/{{ pid }}/save"
|
| 35 |
+
hx-target="[id='actions-{{ pid }}']"
|
| 36 |
hx-swap="innerHTML"
|
| 37 |
+
hx-vals='{"source": "{{ _source }}", "position": "{{ position | default(0) }}", "ranker_version": "{{ _ranker_version }}", "candidate_source": "{{ _candidate_source }}", "cluster_id": "{{ _cluster_id }}"}'>
|
| 38 |
⭐ Save
|
| 39 |
</button>
|
| 40 |
<!-- Not interested (removes the whole card) -->
|
|
|
|
| 42 |
hx-post="/api/papers/{{ pid }}/not-interested"
|
| 43 |
hx-target="#paper-{{ pid }}"
|
| 44 |
hx-swap="outerHTML swap:200ms"
|
| 45 |
+
hx-vals='{"source": "{{ _source }}", "ranker_version": "{{ _ranker_version }}", "candidate_source": "{{ _candidate_source }}", "cluster_id": "{{ _cluster_id }}"}'>
|
| 46 |
✕ Not interested
|
| 47 |
</button>
|
| 48 |
</div>
|
| 49 |
{% endif %}
|
| 50 |
+
|
app/templates/partials/paper_card.html
CHANGED
|
@@ -25,11 +25,14 @@
|
|
| 25 |
{% endif %}
|
| 26 |
</div>
|
| 27 |
|
| 28 |
-
<!-- Meta: arXiv ID + year -->
|
| 29 |
<div class="text-xs text-base-content/50">
|
| 30 |
[{{ paper.arxiv_id }}]
|
| 31 |
{% if paper.published %} · {{ paper.published[:4] }}{% endif %}
|
| 32 |
{% if authors_list %} · {{ authors_list | join(", ") }}{% endif %}
|
|
|
|
|
|
|
|
|
|
| 33 |
</div>
|
| 34 |
|
| 35 |
<!-- Abstract (truncated) -->
|
|
|
|
| 25 |
{% endif %}
|
| 26 |
</div>
|
| 27 |
|
| 28 |
+
<!-- Meta: arXiv ID + year + citations -->
|
| 29 |
<div class="text-xs text-base-content/50">
|
| 30 |
[{{ paper.arxiv_id }}]
|
| 31 |
{% if paper.published %} · {{ paper.published[:4] }}{% endif %}
|
| 32 |
{% if authors_list %} · {{ authors_list | join(", ") }}{% endif %}
|
| 33 |
+
{% if paper.citation_count %}
|
| 34 |
+
· <span class="font-medium text-base-content/70" title="{{ paper.influential_citations|default(0) }} influential">📊 {{ paper.citation_count }} citations</span>
|
| 35 |
+
{% endif %}
|
| 36 |
</div>
|
| 37 |
|
| 38 |
<!-- Abstract (truncated) -->
|
app/turso_svc.py
CHANGED
|
@@ -59,9 +59,11 @@ async def fetch_metadata_batch(arxiv_ids: list[str]) -> dict[str, dict]:
|
|
| 59 |
pipeline_url = url.rstrip("/")
|
| 60 |
# Convert to HTTP API URL format
|
| 61 |
if pipeline_url.startswith("libsql://"):
|
| 62 |
-
pipeline_url =
|
| 63 |
-
|
| 64 |
-
pipeline_url = "https://" + pipeline_url
|
|
|
|
|
|
|
| 65 |
|
| 66 |
payload = {
|
| 67 |
"requests": [
|
|
|
|
| 59 |
pipeline_url = url.rstrip("/")
|
| 60 |
# Convert to HTTP API URL format
|
| 61 |
if pipeline_url.startswith("libsql://"):
|
| 62 |
+
pipeline_url = "https://" + pipeline_url[len("libsql://"):]
|
| 63 |
+
elif pipeline_url.startswith("http://"):
|
| 64 |
+
pipeline_url = "https://" + pipeline_url[len("http://"):]
|
| 65 |
+
elif not pipeline_url.startswith("https://"):
|
| 66 |
+
pipeline_url = "https://" + pipeline_url
|
| 67 |
|
| 68 |
payload = {
|
| 69 |
"requests": [
|
docs/TASK-TRACKER.md
CHANGED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
# ResearchIT — Master Task Tracker
|
| 2 |
|
| 3 |
> **Purpose**: Single source of truth for all completed, in-progress, and upcoming work.
|
| 4 |
-
> **Last updated**: 2026-04-
|
| 5 |
-
> **Current phase**: Phase
|
| 6 |
|
| 7 |
---
|
| 8 |
|
|
@@ -241,21 +241,25 @@
|
|
| 241 |
|
| 242 |
---
|
| 243 |
|
| 244 |
-
## Phase 4: Recommendation Pipeline Fixes
|
| 245 |
|
| 246 |
-
> *
|
| 247 |
-
> *
|
| 248 |
|
| 249 |
### 4.1 — Replace RRF with Importance-Weighted Quota Fusion
|
| 250 |
-
- [
|
| 251 |
- `w_k = importance_k / sum(importance_k)`
|
| 252 |
- `slot_k = max(floor(F × w_k), F_min=3)` — every cluster gets at least 3 slots
|
| 253 |
- Distribute remainder by largest fractional part
|
| 254 |
-
- [
|
|
|
|
|
|
|
| 255 |
- Replace `multi_interest_search()` with per-cluster separate ANN queries
|
| 256 |
-
-
|
| 257 |
-
-
|
| 258 |
-
-
|
|
|
|
|
|
|
| 259 |
|
| 260 |
### 4.2 — Pre-populate Metadata Store ✅ DONE (via Turso)
|
| 261 |
- [x] Bulk-loaded arXiv metadata from Kaggle to Turso cloud DB (Phase 3.5)
|
|
@@ -265,13 +269,60 @@
|
|
| 265 |
- [x] **Impact**: Search time dropped from ~10.7s to ~1.75s on HF Spaces
|
| 266 |
|
| 267 |
### 4.3 — Hungarian Matching for Cluster Stability
|
| 268 |
-
- [
|
| 269 |
-
-
|
| 270 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 271 |
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 275 |
|
| 276 |
---
|
| 277 |
|
|
@@ -306,7 +357,8 @@
|
|
| 306 |
|
| 307 |
> *Replace heuristic scorer with a trained LightGBM lambdarank model.*
|
| 308 |
> *Blocked by: ≥500 labeled interactions OR citation-graph bootstrap*
|
| 309 |
-
> *Estimated effort: ~2-4 weeks*
|
|
|
|
| 310 |
|
| 311 |
- [ ] Citation-graph pseudo-labels from unarXive 2022 (cited = relevance 2, co-cited = 1, random = 0)
|
| 312 |
- [ ] Author-as-user simulation
|
|
@@ -329,11 +381,30 @@
|
|
| 329 |
|
| 330 |
## Phase 8: LLM Interest Summaries + Distilled Re-ranker 📋 NOT STARTED
|
| 331 |
|
| 332 |
-
> *Estimated effort: ~
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 337 |
|
| 338 |
---
|
| 339 |
|
|
@@ -380,18 +451,19 @@
|
|
| 380 |
| Test File | Count | Status |
|
| 381 |
|---|---|---|
|
| 382 |
| `tests/test_profiles.py` | 11 | ✅ Passing |
|
| 383 |
-
| `tests/test_clustering.py` |
|
| 384 |
| `tests/test_reranker_diversity.py` | 13 | ✅ Passing |
|
| 385 |
-
| `tests/
|
|
|
|
| 386 |
| `tests/test_qdrant_svc.py` | — | ✅ Passing |
|
| 387 |
| `tests/test_arxiv_svc.py` | — | ✅ Passing |
|
| 388 |
-
| `tests/test_integration.py` | — | ✅ Passing |
|
| 389 |
| `tests/test_user_state.py` | — | ✅ Passing |
|
| 390 |
| `tests/test_saved.py` | — | ✅ Passing |
|
| 391 |
| `tests/test_hybrid_search.py` | 21 | ✅ Passing |
|
| 392 |
| `tests/test_search_router.py` | 6 | ✅ Passing |
|
| 393 |
| `tests/test_live_search.py` | 8 | ✅ Passing |
|
| 394 |
-
| **Total** | **
|
| 395 |
| `test_e2e_recs.py` (standalone) | 1 | ✅ E2E simulation |
|
| 396 |
|
| 397 |
---
|
|
@@ -404,8 +476,8 @@
|
|
| 404 |
| L2-normalize before Ward clustering | ✅ Applied | `app/recommend/clustering.py` |
|
| 405 |
| Medoid not centroid | ✅ Applied | `app/recommend/clustering.py` → `_find_medoid()` |
|
| 406 |
| Negative EWMA wired into reranking | ✅ Applied | `app/recommend/reranker.py` → Feature 5 |
|
| 407 |
-
| RRF → quota fusion for recommendations |
|
| 408 |
-
| Hungarian cluster matching |
|
| 409 |
-
| Per-item short-term negative decay | [!] Backlog | Phase
|
| 410 |
-
| Category-level suppression |
|
| 411 |
| BGE-reranker NEVER in hot path | ✅ Followed | Heuristic scorer used instead |
|
|
|
|
| 1 |
# ResearchIT — Master Task Tracker
|
| 2 |
|
| 3 |
> **Purpose**: Single source of truth for all completed, in-progress, and upcoming work.
|
| 4 |
+
> **Last updated**: 2026-04-26
|
| 5 |
+
> **Current phase**: Phase 4.5 (Instrumentation Foundation) — COMPLETE ✔
|
| 6 |
|
| 7 |
---
|
| 8 |
|
|
|
|
| 241 |
|
| 242 |
---
|
| 243 |
|
| 244 |
+
## Phase 4: Recommendation Pipeline Fixes ✅ COMPLETE
|
| 245 |
|
| 246 |
+
> *Fixed the known architectural debt in the recommendation pipeline.*
|
| 247 |
+
> *Detailed plan: `docs/phases/PHASE4-Recommendation-Pipeline-Fixes.md`*
|
| 248 |
|
| 249 |
### 4.1 — Replace RRF with Importance-Weighted Quota Fusion
|
| 250 |
+
- [x] Create `app/recommend/fusion.py` — quota allocation logic
|
| 251 |
- `w_k = importance_k / sum(importance_k)`
|
| 252 |
- `slot_k = max(floor(F × w_k), F_min=3)` — every cluster gets at least 3 slots
|
| 253 |
- Distribute remainder by largest fractional part
|
| 254 |
+
- [x] Create `tests/test_fusion.py` — **20 unit tests** for quota allocation
|
| 255 |
+
- Proportionality, floor enforcement, total invariant, edge cases, Doc 06 worked examples
|
| 256 |
+
- [x] Refactor `_multi_interest_recommend()` in `recommendations.py`
|
| 257 |
- Replace `multi_interest_search()` with per-cluster separate ANN queries
|
| 258 |
+
- Use `asyncio.gather()` for concurrent searches (~15ms wall-clock)
|
| 259 |
+
- Allocate feed slots proportionally via `allocate_quotas()`
|
| 260 |
+
- Deduplicate across clusters (first-occurrence = highest-ranked cluster wins)
|
| 261 |
+
- MMR over merged union (unchanged)
|
| 262 |
+
- [x] Keep `qdrant_svc.multi_interest_search()` in codebase (no deletion)
|
| 263 |
|
| 264 |
### 4.2 — Pre-populate Metadata Store ✅ DONE (via Turso)
|
| 265 |
- [x] Bulk-loaded arXiv metadata from Kaggle to Turso cloud DB (Phase 3.5)
|
|
|
|
| 269 |
- [x] **Impact**: Search time dropped from ~10.7s to ~1.75s on HF Spaces
|
| 270 |
|
| 271 |
### 4.3 — Hungarian Matching for Cluster Stability
|
| 272 |
+
- [x] Add `stabilize_cluster_ids()` function to `clustering.py`
|
| 273 |
+
- Uses `scipy.optimize.linear_sum_assignment` (already a dependency)
|
| 274 |
+
- Cost matrix: `1 - cosine_sim(new_medoid, old_medoid)` — computationally trivial at K≤7
|
| 275 |
+
- Matched clusters keep old indices; new clusters get next available
|
| 276 |
+
- Min cosine threshold (0.5) rejects unrelated matches
|
| 277 |
+
- [x] Call between `compute_clusters()` and `save_clusters_to_db()` in recommendations.py
|
| 278 |
+
- [x] **10 tests** in `test_clustering.py` — perturbed clusters preserve indices,
|
| 279 |
+
unrelated match rejection, K growth/shrink, custom thresholds
|
| 280 |
+
|
| 281 |
+
### 4.4 — Category-Level Negative Suppression
|
| 282 |
+
- [x] Add `get_suppressed_categories()` to `db.py`
|
| 283 |
+
- Joins `interactions` + `paper_metadata` to find categories with ≥3 dismissals
|
| 284 |
+
- **Primary category only** (decision: avoid over-suppression)
|
| 285 |
+
- **14-day window** (standard default, τ_neg = 14 days)
|
| 286 |
+
- [x] Add suppression filter in `_multi_interest_recommend()` after reranking
|
| 287 |
+
- [x] Cache Turso metadata to `paper_metadata` via `cache_turso_metadata_batch()`
|
| 288 |
+
- [x] **8 tests** in `test_db.py` — threshold, partitioning, user isolation, custom threshold
|
| 289 |
+
- [~] Per-item short-term decay → **deferred to Phase 6** (LightGBM feature)
|
| 290 |
+
|
| 291 |
+
**Gaps**: None.
|
| 292 |
+
|
| 293 |
+
---
|
| 294 |
+
|
| 295 |
+
## Phase 4.5: Instrumentation Foundation ✅ COMPLETE
|
| 296 |
|
| 297 |
+
> *Added telemetry columns to the interactions table so every saved/dismissed paper*
|
| 298 |
+
> *can be attributed to its pipeline tier, cluster origin, and ranker version.*
|
| 299 |
+
> *Doc 07 (ADR A4) identified this as the single most valuable early investment —*
|
| 300 |
+
> *retrofitting these fields after real user data exists is painful and blocks all*
|
| 301 |
+
> *later counterfactual evaluation.*
|
| 302 |
+
|
| 303 |
+
### Schema changes
|
| 304 |
+
- [x] Add `ranker_version TEXT` to `interactions` table — pipeline version tag
|
| 305 |
+
- [x] Add `candidate_source TEXT` to `interactions` — e.g. `cluster_0`, `exploration`, `ewma_longterm`, `qdrant_recommend`, `short_term_supplement`
|
| 306 |
+
- [x] Add `cluster_id INTEGER` to `interactions` — interest cluster index (NULL if N/A)
|
| 307 |
+
- [x] ALTER TABLE migration for existing DBs (safe try/except, idempotent)
|
| 308 |
+
|
| 309 |
+
### Pipeline tagging
|
| 310 |
+
- [x] Add `_RANKER_VERSION` constant to `recommendations.py`
|
| 311 |
+
- [x] Tag Tier 1 papers with cluster origin, exploration status, short-term supplement
|
| 312 |
+
- [x] Tag Tier 2 papers as `ewma_longterm`
|
| 313 |
+
- [x] Tag Tier 3 papers as `qdrant_recommend`
|
| 314 |
+
- [x] Build `paper_cluster_map` before quota merge (first-occurrence = cluster attribution)
|
| 315 |
+
- [x] Exploration papers tagged as `candidate_source='exploration'`
|
| 316 |
+
|
| 317 |
+
### End-to-end flow
|
| 318 |
+
- [x] `recommendations.py` embeds tags in paper dicts
|
| 319 |
+
- [x] `action_buttons.html` includes tags in `hx-vals` JSON
|
| 320 |
+
- [x] `events.py` accepts `ranker_version`, `candidate_source`, `cluster_id` Form fields
|
| 321 |
+
- [x] `db.log_interaction()` stores all three new columns
|
| 322 |
+
|
| 323 |
+
**Files modified**: `app/db.py`, `app/routers/events.py`, `app/routers/recommendations.py`, `app/templates/partials/action_buttons.html`
|
| 324 |
+
|
| 325 |
+
**Gaps**: None. `propensity` and `policy_id` fields deferred until ε-greedy exploration (Phase 9).
|
| 326 |
|
| 327 |
---
|
| 328 |
|
|
|
|
| 357 |
|
| 358 |
> *Replace heuristic scorer with a trained LightGBM lambdarank model.*
|
| 359 |
> *Blocked by: ≥500 labeled interactions OR citation-graph bootstrap*
|
| 360 |
+
> *Estimated effort: ~2-4 weeks*
|
| 361 |
+
> *Architecture decision: one-stage LambdaMART first (Doc 07 ADR A3)*
|
| 362 |
|
| 363 |
- [ ] Citation-graph pseudo-labels from unarXive 2022 (cited = relevance 2, co-cited = 1, random = 0)
|
| 364 |
- [ ] Author-as-user simulation
|
|
|
|
| 381 |
|
| 382 |
## Phase 8: LLM Interest Summaries + Distilled Re-ranker 📋 NOT STARTED
|
| 383 |
|
| 384 |
+
> *Estimated effort: ~10-12 weeks (Doc 07)*
|
| 385 |
+
> *Detailed research plan: `docs/research/07-LLM-Summaries-Reranker-and-Scaling-Research.md`*
|
| 386 |
+
> *Entry criteria: Phase 7 eval producing stable nDCG@10; cluster stability Jaccard ≥0.7 over 7 days*
|
| 387 |
+
|
| 388 |
+
### 8a — Claude-generated per-cluster interest summaries (Doc 07 §A)
|
| 389 |
+
- [ ] Cluster snapshot versioning (ADR A1)
|
| 390 |
+
- [ ] Content-addressed caching: `sha256(sorted(paper_ids) + prompt_version + model)`
|
| 391 |
+
- [ ] Shared summaries (not per-user) — Haiku 4.5 + Batch API (~$50-80/month @ 1K users)
|
| 392 |
+
- [ ] Nightly regeneration job with 7-day TTL + event-triggered refresh
|
| 393 |
+
- [ ] "You're reading about X" UI framing with sub-theme bullets
|
| 394 |
+
- [ ] Anthropic Citations API for hallucination prevention
|
| 395 |
+
|
| 396 |
+
### 8b — Distilled cross-encoder reranker (Doc 07 §B)
|
| 397 |
+
- [ ] Deploy `cross-encoder/ms-marco-TinyBERT-L-2-v2` INT8 ONNX as MVP
|
| 398 |
+
- [ ] 6ms budget for 20 pairs on CPU (AVX-512 VNNI)
|
| 399 |
+
- [ ] TinyBERT score as LightGBM feature (Option C architecture)
|
| 400 |
+
- [ ] Custom distillation from BGE-reranker-v2-m3 only if the held-out gap exceeds 3 nDCG points
|
| 401 |
+
- [ ] MarginMSE loss + SciNCL citation-graph hard negatives
|
| 402 |
+
|
| 403 |
+
### 8c — Use-cases and information-gain design doc (Doc 07 §C)
|
| 404 |
+
- [ ] 8 user personas (P1 cold-start through P8 stay-current)
|
| 405 |
+
- [ ] Information-gain table (save=3-5×, dismiss-as-label=−3-4×, passive skip=−0.1×)
|
| 406 |
+
- [ ] Mode-switching UI: "Stay Current" vs "Lit Review" toggle
|
| 407 |
+
- [ ] Failure mode detection rules (feed collapse, stale profile, filter bubble)
|
| 408 |
|
| 409 |
---
|
| 410 |
|
|
|
|
| 451 |
| Test File | Count | Status |
|
| 452 |
|---|---|---|
|
| 453 |
| `tests/test_profiles.py` | 11 | ✅ Passing |
|
| 454 |
+
| `tests/test_clustering.py` | 21 | ✅ Passing (9 compute + 10 Hungarian + 2 persistence) |
|
| 455 |
| `tests/test_reranker_diversity.py` | 13 | ✅ Passing |
|
| 456 |
+
| `tests/test_fusion.py` | 20 | ✅ Passing (Phase 4.1) |
|
| 457 |
+
| `tests/test_db.py` | 19 | ✅ Passing (includes 4 Turso cache + 8 suppression) |
|
| 458 |
| `tests/test_qdrant_svc.py` | — | ✅ Passing |
|
| 459 |
| `tests/test_arxiv_svc.py` | — | ✅ Passing |
|
| 460 |
+
| `tests/test_integration.py` | — | ✅ Passing (includes quota pipeline E2E) |
|
| 461 |
| `tests/test_user_state.py` | — | ✅ Passing |
|
| 462 |
| `tests/test_saved.py` | — | ✅ Passing |
|
| 463 |
| `tests/test_hybrid_search.py` | 21 | ✅ Passing |
|
| 464 |
| `tests/test_search_router.py` | 6 | ✅ Passing |
|
| 465 |
| `tests/test_live_search.py` | 8 | ✅ Passing |
|
| 466 |
+
| **Total** | **171** | ✅ |
|
| 467 |
| `test_e2e_recs.py` (standalone) | 1 | ✅ E2E simulation |
|
| 468 |
|
| 469 |
---
|
|
|
|
| 476 |
| L2-normalize before Ward clustering | ✅ Applied | `app/recommend/clustering.py` |
|
| 477 |
| Medoid not centroid | ✅ Applied | `app/recommend/clustering.py` → `_find_medoid()` |
|
| 478 |
| Negative EWMA wired into reranking | ✅ Applied | `app/recommend/reranker.py` → Feature 5 |
|
| 479 |
+
| RRF → quota fusion for recommendations | ✅ Applied | `app/recommend/fusion.py` (Phase 4.1) |
|
| 480 |
+
| Hungarian cluster matching | ✅ Applied | `app/recommend/clustering.py` → `stabilize_cluster_ids()` (Phase 4.3) |
|
| 481 |
+
| Per-item short-term negative decay | [!] Backlog | Phase 6 (LightGBM feature) |
|
| 482 |
+
| Category-level suppression | ✅ Applied | `app/db.py` → `get_suppressed_categories()` (Phase 4.4) |
|
| 483 |
| BGE-reranker NEVER in hot path | ✅ Followed | Heuristic scorer used instead |
|
docs/phases/PHASE3-Hybrid-Semantic-Search.md
CHANGED
|
@@ -3,7 +3,7 @@
|
|
| 3 |
> **Purpose**: Replace the Phase 1 placeholder arXiv keyword API search with real vector-based
|
| 4 |
> semantic search using BGE-M3 encoding + Qdrant dense + Zilliz sparse + RRF fusion.
|
| 5 |
>
|
| 6 |
-
> **Status**:
|
| 7 |
> **Estimated effort**: ~2-3 weeks
|
| 8 |
> **Predecessor**: Phase 2c (complete) — the recommendation pipeline
|
| 9 |
> **Deployment target**: Hugging Face Spaces (Docker SDK, free tier: 16GB RAM, 2 vCPUs)
|
|
|
|
| 3 |
> **Purpose**: Replace the Phase 1 placeholder arXiv keyword API search with real vector-based
|
| 4 |
> semantic search using BGE-M3 encoding + Qdrant dense + Zilliz sparse + RRF fusion.
|
| 5 |
>
|
| 6 |
+
> **Status**: ✅ Complete
|
| 7 |
> **Estimated effort**: ~2-3 weeks
|
| 8 |
> **Predecessor**: Phase 2c (complete) — the recommendation pipeline
|
| 9 |
> **Deployment target**: Hugging Face Spaces (Docker SDK, free tier: 16GB RAM, 2 vCPUs)
|
docs/phases/PHASE4-Recommendation-Pipeline-Fixes.md
ADDED
|
@@ -0,0 +1,603 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Phase 4 — Recommendation Pipeline Fixes
|
| 2 |
+
|
| 3 |
+
> **Purpose**: Fix the 3 remaining architectural faults identified by Doc 06 in the
|
| 4 |
+
> recommendation pipeline: replace RRF with importance-weighted quota fusion, add
|
| 5 |
+
> Hungarian matching for cluster stability, and wire category-level negative suppression.
|
| 6 |
+
>
|
| 7 |
+
> **Status**: ✅ Complete
|
| 8 |
+
> **Estimated effort**: ~1 week
|
| 9 |
+
> **Predecessor**: Phase 3.5 (complete) — Turso metadata DB
|
| 10 |
+
> **Deployment target**: Same — Hugging Face Spaces (no infra changes)
|
| 11 |
+
|
| 12 |
+
---
|
| 13 |
+
|
| 14 |
+
## Why This Matters
|
| 15 |
+
|
| 16 |
+
The recommendation engine works today — all 3 tiers cascade correctly, EWMA profiles
|
| 17 |
+
update, Ward clustering detects interests, and MMR enforces diversity. But Doc 06
|
| 18 |
+
identified three concrete faults that degrade quality for multi-interest users:
|
| 19 |
+
|
| 20 |
+
| # | Fault | Impact | Who gets hurt |
|
| 21 |
+
|---|---|---|---|
|
| 22 |
+
| **4.1** | RRF fuses interest clusters by consensus, not proportionally | Dominant cluster drowns minority interests | User who likes both NLP (70%) and RL (30%) never sees RL papers |
|
| 23 |
+
| **4.3** | Cluster indices shuffle on every recluster | Future analytics and UI labels break | Any user who saves a new paper |
|
| 24 |
+
| **4.4** | No category-level negative suppression | Dismissed topics keep reappearing | User who dismisses 5 physics papers still gets physics recs |
|
| 25 |
+
|
| 26 |
+
**What's already fixed (not Phase 4)**:
|
| 27 |
+
- ✅ α_long = 0.03 (was 0.10, fixed Phase 2a — PinnerSage rejected 0.10)
|
| 28 |
+
- ✅ L2-normalize before Ward (fixed Phase 2b — Doc 06 fault #4)
|
| 29 |
+
- ✅ Negative EWMA penalty in reranker (fixed Phase 2c — Feature 5, weight 0.15)
|
| 30 |
+
- ✅ Metadata store pre-populated (Phase 3.5 — Turso, 1.23GB)
|
| 31 |
+
|
| 32 |
+
---
|
| 33 |
+
|
| 34 |
+
## Current Architecture vs Target Architecture
|
| 35 |
+
|
| 36 |
+
### Current Retrieval (Phase 2b — being fixed)
|
| 37 |
+
|
| 38 |
+
```
|
| 39 |
+
Cluster medoids + short-term vector
|
| 40 |
+
│
|
| 41 |
+
▼
|
| 42 |
+
Single Qdrant prefetch+RRF call
|
| 43 |
+
├── Prefetch: medoid_1 (limit=40)
|
| 44 |
+
├── Prefetch: medoid_2 (limit=30)
|
| 45 |
+
├── Prefetch: medoid_3 (limit=25)
|
| 46 |
+
└── Prefetch: short_term (limit=25)
|
| 47 |
+
│
|
| 48 |
+
▼
|
| 49 |
+
FusionQuery(fusion=Fusion.RRF)
|
| 50 |
+
│ ← papers near ALL cluster centroids get boosted
|
| 51 |
+
│ ← minority interests get drowned
|
| 52 |
+
▼
|
| 53 |
+
~100 candidates → rerank → MMR → serve
|
| 54 |
+
```
|
| 55 |
+
|
| 56 |
+
**Problem**: RRF was designed for fusing *different retrievers on the same query*
|
| 57 |
+
(BM25 + vector). Here we're fusing *different queries for the same user*. Consensus
|
| 58 |
+
means "near the centroid of everything" — the exact failure multi-interest models
|
| 59 |
+
exist to prevent.
|
| 60 |
+
|
| 61 |
+
### Target Retrieval (Phase 4)
|
| 62 |
+
|
| 63 |
+
```
|
| 64 |
+
compute_clusters() → K clusters with importance scores
|
| 65 |
+
│
|
| 66 |
+
▼
|
| 67 |
+
allocate_quotas([imp_1, imp_2, ...], total=100, min=3)
|
| 68 |
+
→ [55, 30, 15] (proportional, each ≥ 3)
|
| 69 |
+
│
|
| 70 |
+
▼
|
| 71 |
+
asyncio.gather( ← concurrent, ~15ms wall-clock
|
| 72 |
+
search_by_vector(medoid_1, limit=55×3), # 3× over-fetch for rerank headroom
|
| 73 |
+
search_by_vector(medoid_2, limit=30×3),
|
| 74 |
+
search_by_vector(medoid_3, limit=15×3),
|
| 75 |
+
search_by_vector(short_term, limit=25), # session boost
|
| 76 |
+
)
|
| 77 |
+
│
|
| 78 |
+
▼
|
| 79 |
+
Deduplicate across clusters
|
| 80 |
+
(assign each paper to its highest-ranked cluster)
|
| 81 |
+
│
|
| 82 |
+
▼
|
| 83 |
+
Category suppression: drop papers from suppressed categories
|
| 84 |
+
│
|
| 85 |
+
▼
|
| 86 |
+
Rerank → MMR → exploration → serve
|
| 87 |
+
```
|
| 88 |
+
|
| 89 |
+
**Evidence this is correct**:
|
| 90 |
+
- PinnerSage (KDD 2020): samples 3 medoids proportional to importance — no RRF
|
| 91 |
+
- Taobao ULIM (RecSys 2025): per-category parallel retrieval with quota — +5.54% clicks
|
| 92 |
+
- Pinterest Bucketized-ANN (SIGIR 2023): ensures minority items aren't dropped
|
| 93 |
+
- Twitter kNN-Embed: candidates per cluster proportional to mixture weight
|
| 94 |
+
- Bruch et al. (SIGIR 2022): RRF optimises Recall not nDCG — quota gives better nDCG
|
| 95 |
+
|
| 96 |
+
---
|
| 97 |
+
|
| 98 |
+
## 4.1 — Replace RRF with Importance-Weighted Quota Fusion
|
| 99 |
+
|
| 100 |
+
### New File: `app/recommend/fusion.py`
|
| 101 |
+
|
| 102 |
+
Pure-math module with zero I/O dependencies. Contains one function:
|
| 103 |
+
|
| 104 |
+
```python
|
| 105 |
+
def allocate_quotas(
|
| 106 |
+
importances: list[float],
|
| 107 |
+
total_slots: int = 100,
|
| 108 |
+
min_slots: int = 3,
|
| 109 |
+
) -> list[int]:
|
| 110 |
+
"""
|
| 111 |
+
Importance-weighted quota allocation with a minimum floor.
|
| 112 |
+
|
| 113 |
+
Each cluster gets feed slots proportional to its importance,
|
| 114 |
+
with a guaranteed minimum of `min_slots` to protect minority interests.
|
| 115 |
+
|
| 116 |
+
Algorithm:
|
| 117 |
+
1. Normalise: w_k = importance_k / sum(importances)
|
| 118 |
+
2. Raw allocation: raw_k = total_slots × w_k
|
| 119 |
+
3. Apply floor: slot_k = max(floor(raw_k), min_slots)
|
| 120 |
+
4. Distribute remainder by largest fractional part
|
| 121 |
+
5. Guarantee: sum(slots) == total_slots
|
| 122 |
+
|
| 123 |
+
This is the Doc 06 formula verbatim:
|
| 124 |
+
slot_k = max(⌊F × w_k⌋, F_min=3)
|
| 125 |
+
|
| 126 |
+
Reference: PinnerSage (KDD 2020), Taobao ULIM (RecSys 2025),
|
| 127 |
+
Pinterest Bucketized-ANN (SIGIR 2023).
|
| 128 |
+
"""
|
| 129 |
+
```
|
| 130 |
+
|
| 131 |
+
**Worked example** (from Doc 06 §"Worked example"):
|
| 132 |
+
- 3 clusters with importances [0.55, 0.30, 0.15], total_slots=30
|
| 133 |
+
- Raw allocation: [16.5, 9.0, 4.5]
|
| 134 |
+
- Floor applied: [16, 9, 4] (all ≥ 3, so floor has no effect)
|
| 135 |
+
- Remainder: 30 - 29 = 1 slot → goes to cluster 0 (fractional part 0.5; cluster 2 also has 0.5, and the tie is broken here in favour of cluster 0)
|
| 136 |
+
- Final: [17, 9, 4] — minority cluster gets 4 slots, not 0
|
| 137 |
+
|
| 138 |
+
**Edge case — tiny cluster**:
|
| 139 |
+
- 4 clusters with importances [0.60, 0.25, 0.10, 0.05], total_slots=30
|
| 140 |
+
- Raw allocation: [18.0, 7.5, 3.0, 1.5]
|
| 141 |
+
- Without floor: [18, 7, 3, 1] — smallest cluster gets 1 paper
|
| 142 |
+
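- With floor (min=3): [18, 7, 3, 3] — smallest cluster gets 3 papers; this sums to 31, so one slot must be taken back from a cluster above the floor (e.g. [17, 7, 3, 3]) to keep sum(slots) == total_slots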
|
| 143 |
+
|
| 144 |
+
### Modified File: `app/routers/recommendations.py`
|
| 145 |
+
|
| 146 |
+
The `_multi_interest_recommend()` function changes its retrieval step:
|
| 147 |
+
|
| 148 |
+
**What gets removed**:
|
| 149 |
+
- The `_CLUSTER_LIMITS = [40, 30, 25, 20, 15, 15, 15]` hardcoded list
|
| 150 |
+
- The call to `qdrant_svc.multi_interest_search()` (the prefetch+RRF path)
|
| 151 |
+
- Building the `interest_vectors` list of `(medoid_embedding, limit)` tuples
|
| 152 |
+
|
| 153 |
+
**What replaces it**:
|
| 154 |
+
```python
|
| 155 |
+
import asyncio
|
| 156 |
+
from app.recommend.fusion import allocate_quotas
|
| 157 |
+
|
| 158 |
+
# Step 2: Quota-based parallel retrieval (replaces RRF)
|
| 159 |
+
quotas = allocate_quotas(
|
| 160 |
+
importances=[c.importance for c in clusters],
|
| 161 |
+
total_slots=100, # wide retrieval net
|
| 162 |
+
min_slots=3, # every cluster gets at least 3 slots
|
| 163 |
+
)
|
| 164 |
+
|
| 165 |
+
# Launch concurrent ANN searches — one per cluster + session
|
| 166 |
+
search_coros = []
|
| 167 |
+
for cluster, quota in zip(clusters, quotas):
|
| 168 |
+
search_coros.append(
|
| 169 |
+
qdrant_svc.search_by_vector(
|
| 170 |
+
query_vector=cluster.medoid_embedding.tolist(),
|
| 171 |
+
limit=quota * 3, # 3× over-fetch for rerank headroom
|
| 172 |
+
exclude_ids=seen,
|
| 173 |
+
)
|
| 174 |
+
)
|
| 175 |
+
# Add short-term session vector if available
|
| 176 |
+
st_vec = await profiles.load_profile(user_id, "short_term")
|
| 177 |
+
if st_vec is not None:
|
| 178 |
+
search_coros.append(
|
| 179 |
+
qdrant_svc.search_by_vector(
|
| 180 |
+
query_vector=st_vec.tolist(),
|
| 181 |
+
limit=25,
|
| 182 |
+
exclude_ids=seen,
|
| 183 |
+
)
|
| 184 |
+
)
|
| 185 |
+
|
| 186 |
+
# Execute all searches concurrently (~15ms wall-clock)
|
| 187 |
+
per_cluster_results = await asyncio.gather(*search_coros)
|
| 188 |
+
|
| 189 |
+
# Deduplicate: first occurrence wins (highest-ranked cluster)
|
| 190 |
+
seen_in_results = set()
|
| 191 |
+
candidate_ids = []
|
| 192 |
+
for result_list in per_cluster_results:
|
| 193 |
+
for arxiv_id in result_list:
|
| 194 |
+
if arxiv_id not in seen_in_results:
|
| 195 |
+
seen_in_results.add(arxiv_id)
|
| 196 |
+
candidate_ids.append(arxiv_id)
|
| 197 |
+
```
|
| 198 |
+
|
| 199 |
+
**Key design decisions**:
|
| 200 |
+
|
| 201 |
+
1. **`asyncio.gather()` for concurrency** — Each `search_by_vector()` call takes ~5-15ms.
|
| 202 |
+
With `asyncio.gather()`, 3-7 concurrent queries run in ~15-25ms wall-clock — same as
|
| 203 |
+
the old single prefetch call.
|
| 204 |
+
|
| 205 |
+
2. **3× over-fetch** — We fetch `quota × 3` candidates per cluster, then let the reranker
|
| 206 |
+
pick the best `quota` from each. This gives the heuristic scorer enough headroom to
|
| 207 |
+
find quality papers even if some candidates are poor matches.
|
| 208 |
+
|
| 209 |
+
3. **First-occurrence deduplication** — Papers appearing in multiple cluster results are
|
| 210 |
+
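assigned to the first result list that contains them, scanning the per-cluster results in list order (first encounter, not necessarily the cluster that ranked them highest). This is simple,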
|
| 211 |
+
deterministic, and matches the PinnerSage pattern.
|
| 212 |
+
|
| 213 |
+
4. **`multi_interest_search()` is NOT deleted** — The function stays in `qdrant_svc.py`
|
| 214 |
+
for potential future use. We simply stop calling it from the recommendations router.
|
| 215 |
+
|
| 216 |
+
### Latency Impact
|
| 217 |
+
|
| 218 |
+
| Stage | Before (RRF) | After (Quota) |
|
| 219 |
+
|---|---|---|
|
| 220 |
+
| Qdrant retrieval | ~15-25ms (1 prefetch call) | ~15-25ms (3-7 concurrent calls) |
|
| 221 |
+
| Dedup + quota | N/A | <1ms |
|
| 222 |
+
| Rerank + MMR | ~12ms | ~12ms (unchanged) |
|
| 223 |
+
| **Total pipeline** | ~30ms | ~30ms |
|
| 224 |
+
|
| 225 |
+
No latency regression. The concurrent gather matches the prefetch parallelism.
|
| 226 |
+
|
| 227 |
+
---
|
| 228 |
+
|
| 229 |
+
## 4.3 — Hungarian Matching for Cluster Stability
|
| 230 |
+
|
| 231 |
+
### Why This Matters
|
| 232 |
+
|
| 233 |
+
When a user saves a new paper, `compute_clusters()` runs Ward clustering from scratch.
|
| 234 |
+
The cluster that was "NLP papers" yesterday might get `cluster_idx=2` today and
|
| 235 |
+
`cluster_idx=0` tomorrow. This breaks:
|
| 236 |
+
|
| 237 |
+
- Future analytics ("which cluster does the user engage with most?")
|
| 238 |
+
- Future UI labels ("Your Interest: Natural Language Processing")
|
| 239 |
+
- A/B test logs that reference cluster indices
|
| 240 |
+
- Doc 06 §"Clustering specifics" calls this "the real operational risk"
|
| 241 |
+
|
| 242 |
+
### Modified File: `app/recommend/clustering.py`
|
| 243 |
+
|
| 244 |
+
Add a new function called between `compute_clusters()` and `save_clusters_to_db()`:
|
| 245 |
+
|
| 246 |
+
```python
|
| 247 |
+
from scipy.optimize import linear_sum_assignment
|
| 248 |
+
|
| 249 |
+
def stabilize_cluster_ids(
|
| 250 |
+
new_clusters: list[InterestCluster],
|
| 251 |
+
old_clusters: list[dict] | None,
|
| 252 |
+
paper_vectors: dict[str, list[float]] | None = None,
|
| 253 |
+
) -> list[InterestCluster]:
|
| 254 |
+
"""
|
| 255 |
+
Remap new cluster indices to match previous clusters via Hungarian matching.
|
| 256 |
+
|
| 257 |
+
1. Compute cost matrix: cost[i][j] = 1 - cosine_sim(new_medoid_i, old_medoid_j)
|
| 258 |
+
2. Solve assignment with scipy.optimize.linear_sum_assignment
|
| 259 |
+
3. Remap new cluster_idx to matched old cluster_idx
|
| 260 |
+
4. Genuinely new clusters (no match) get next available index
|
| 261 |
+
|
| 262 |
+
At K ≤ 7 this is trivially fast (7×7 matrix).
|
| 263 |
+
|
| 264 |
+
Reference: Doc 06 §"Clustering specifics" — "persist cluster→medoid-paper-id
|
| 265 |
+
mapping across reclusterings and use Hungarian matching against previous medoids."
|
| 266 |
+
"""
|
| 267 |
+
```
|
| 268 |
+
|
| 269 |
+
**Algorithm walkthrough**:
|
| 270 |
+
|
| 271 |
+
1. Load previous clusters from SQLite via `load_clusters_from_db(user_id)`
|
| 272 |
+
2. If `old_clusters is None` (first time): no remapping needed, return as-is
|
| 273 |
+
3. Build a cost matrix of shape `(K_new, K_old)`:
|
| 274 |
+
- For each pair, fetch the old medoid embedding from `paper_vectors`
|
| 275 |
+
- `cost[i][j] = 1 - cosine_similarity(new_medoid_i, old_medoid_j)`
|
| 276 |
+
4. Run `scipy.optimize.linear_sum_assignment(cost_matrix)` — O(K³), trivial at K≤7
|
| 277 |
+
5. For matched pairs `(new_i, old_j)` where `cost < 0.5` (cosine sim > 0.5):
|
| 278 |
+
assign `new_clusters[new_i].cluster_idx = old_clusters[old_j]['cluster_idx']`
|
| 279 |
+
6. For unmatched new clusters: assign the next available index
|
| 280 |
+
|
| 281 |
+
**Where it's called** — in `_multi_interest_recommend()` in `recommendations.py`:
|
| 282 |
+
|
| 283 |
+
```python
|
| 284 |
+
# Step 1: Compute interest clusters
|
| 285 |
+
clusters = compute_clusters(aligned_ids, aligned_embs)
|
| 286 |
+
|
| 287 |
+
# Step 1.5: Stabilise cluster IDs against previous run
|
| 288 |
+
old_clusters = await load_clusters_from_db(user_id)
|
| 289 |
+
clusters = stabilize_cluster_ids(clusters, old_clusters, vectors)
|
| 290 |
+
|
| 291 |
+
# Step 1.6: Persist (now with stable IDs)
|
| 292 |
+
await save_clusters_to_db(user_id, clusters)
|
| 293 |
+
```
|
| 294 |
+
|
| 295 |
+
### What Needs to Change
|
| 296 |
+
|
| 297 |
+
The old medoid embeddings need to be compared against new medoid embeddings. The old
|
| 298 |
+
medoid embeddings aren't stored in SQLite (only the `medoid_paper_id` is). Two options:
|
| 299 |
+
|
| 300 |
+
**Option A** (recommended): Use the `paper_vectors` dict that's already loaded at the
|
| 301 |
+
top of `_multi_interest_recommend()` (line 128: `vectors = await qdrant_svc.get_paper_vectors(positives)`).
|
| 302 |
+
Old medoid paper IDs are likely in this set since the medoid IS a saved paper. If not,
|
| 303 |
+
do a small `get_paper_vectors([old_medoid_id])` call.
|
| 304 |
+
|
| 305 |
+
**Option B**: Store medoid embeddings as BLOBs in `user_clusters` table. This adds a
|
| 306 |
+
4KB column but avoids any Qdrant call. Overhead is negligible.
|
| 307 |
+
|
| 308 |
+
**Decision**: Option A — avoids schema migration and the vectors are already in memory.
|
| 309 |
+
|
| 310 |
+
---
|
| 311 |
+
|
| 312 |
+
## 4.4 — Category-Level Negative Suppression
|
| 313 |
+
|
| 314 |
+
### Design Decisions (Per User Input)
|
| 315 |
+
|
| 316 |
+
1. **Primary category only** — arXiv papers have multiple categories (e.g., `cs.CV`, `cs.AI`).
|
| 317 |
+
Suppression applies to the **primary category only** to avoid suffocating the recommendation
|
| 318 |
+
graph. A paper tagged `[cs.CV, cs.AI]` is only suppressed if `cs.CV` (primary) is
|
| 319 |
+
suppressed, not if `cs.AI` is.
|
| 320 |
+
|
| 321 |
+
2. **τ_neg = 14 days** — Standard default from the literature. If a user dismisses ≥3 papers
|
| 322 |
+
from the same primary category within 14 days, that category stays suppressed for as long
|
| 323 |
+
as at least 3 of those dismissals still fall inside the trailing 14-day window.
|
| 324 |
+
|
| 325 |
+
### ⚠️ Critical Implementation Detail: Category Format Mismatch
|
| 326 |
+
|
| 327 |
+
The arXiv API and Turso store categories in **different formats**:
|
| 328 |
+
- **arXiv API** (`arxiv_svc.py`): uses arXiv codes like `cs.CV`, `cs.CL`, `stat.ML`
|
| 329 |
+
- **Turso** (`turso_svc.py`): uses `primary_topic` which contains human-readable labels
|
| 330 |
+
like `"AI/ML"`, `"Computer Vision"`, `"NLP/Computational Linguistics"`
|
| 331 |
+
- Both write to `paper_metadata.category` via different paths
|
| 332 |
+
|
| 333 |
+
This means `paper_metadata.category` contains a **mix of both formats** depending on
|
| 334 |
+
which service populated it. The suppression logic must handle this:
|
| 335 |
+
|
| 336 |
+
```python
|
| 337 |
+
# In the suppression filter, normalise category comparison:
|
| 338 |
+
# - Papers from arXiv have codes: "cs.CV"
|
| 339 |
+
# - Papers from Turso have labels: "Computer Vision"
|
| 340 |
+
# Both may appear in suppressed_cats, so we suppress on exact match
|
| 341 |
+
```
|
| 342 |
+
|
| 343 |
+
**Resolution**: The `get_suppressed_categories()` query will return whatever format is
|
| 344 |
+
in the database. The filter in `recommendations.py` will compare candidate categories
|
| 345 |
+
(from Turso metadata) against the suppressed set. Since recommendations primarily use
|
| 346 |
+
Turso for metadata, the formats will match. For the rare arXiv-fallback case, we accept
|
| 347 |
+
the slight inconsistency — it's a minor gap that self-corrects as more Turso data is used.
|
| 348 |
+
|
| 349 |
+
### What's Already Done
|
| 350 |
+
|
| 351 |
+
The EWMA negative profile is already wired as Feature 5 in `reranker.py`:
|
| 352 |
+
```python
|
| 353 |
+
# Feature 5: cosine_sim_negative (0.15 penalty weight)
|
| 354 |
+
neg_penalty = cosine_sim(candidate, neg_profile) * 0.15
|
| 355 |
+
final_score -= neg_penalty
|
| 356 |
+
```
|
| 357 |
+
|
| 358 |
+
This gives a "soft" directional signal: papers semantically similar to dismissed papers
|
| 359 |
+
get demoted. What's missing is the "hard" category-level suppression.
|
| 360 |
+
|
| 361 |
+
### What's NOT Being Done (Deferred)
|
| 362 |
+
|
| 363 |
+
**Per-item temporal decay** (`score -= α × exp(-dt / τ)`) is deferred to Phase 6.
|
| 364 |
+
Reasoning:
|
| 365 |
+
- Requires per-dismissed-item timestamps matched against candidates
|
| 366 |
+
- Most naturally expressed as a LightGBM feature (`days_since_most_recent_similar_dismissal`)
|
| 367 |
+
- The EWMA negative penalty already covers the directional signal
|
| 368 |
+
- Adding hand-tuned temporal formulas when LightGBM is the next phase would create throwaway code
|
| 369 |
+
|
| 370 |
+
### Modified File: `app/db.py`
|
| 371 |
+
|
| 372 |
+
Add one new helper function:
|
| 373 |
+
|
| 374 |
+
```python
|
| 375 |
+
async def get_suppressed_categories(
|
| 376 |
+
user_id: str,
|
| 377 |
+
threshold: int = 3,
|
| 378 |
+
days: int = 14,
|
| 379 |
+
) -> set[str]:
|
| 380 |
+
"""
|
| 381 |
+
Find primary arXiv categories where the user has dismissed ≥ threshold
|
| 382 |
+
papers within the last `days` days.
|
| 383 |
+
|
| 384 |
+
Joins interactions (event_type='not_interested') against paper_metadata
|
| 385 |
+
to get the category of each dismissed paper.
|
| 386 |
+
|
| 387 |
+
Returns: set of category strings to suppress, in whatever format paper_metadata stores (e.g., arXiv codes {'cs.CV', 'physics.optics'} or Turso labels such as 'Computer Vision')
|
| 388 |
+
"""
|
| 389 |
+
async with aiosqlite.connect(DB_PATH) as db:
|
| 390 |
+
cur = await db.execute(
|
| 391 |
+
"""SELECT pm.category, COUNT(*) as cnt
|
| 392 |
+
FROM interactions i
|
| 393 |
+
JOIN paper_metadata pm ON i.paper_id = pm.arxiv_id
|
| 394 |
+
WHERE i.user_id = ?
|
| 395 |
+
AND i.event_type = 'not_interested'
|
| 396 |
+
AND i.timestamp >= datetime('now', ?)
|
| 397 |
+
GROUP BY pm.category
|
| 398 |
+
HAVING cnt >= ?""",
|
| 399 |
+
(user_id, f"-{days} days", threshold),
|
| 400 |
+
)
|
| 401 |
+
rows = await cur.fetchall()
|
| 402 |
+
return {row[0] for row in rows if row[0]}
|
| 403 |
+
```
|
| 404 |
+
|
| 405 |
+
**Data dependency**: This requires dismissed papers to have their metadata in
|
| 406 |
+
`paper_metadata`. Currently:
|
| 407 |
+
- Papers from **arXiv API** (`arxiv_svc.py`) are automatically cached via `db.cache_metadata()`
|
| 408 |
+
- Papers from **Turso** (`turso_svc.py`) are **NOT cached** to `paper_metadata`
|
| 409 |
+
|
| 410 |
+
This is a gap. When a user dismisses a paper whose metadata came from Turso (the common
|
| 411 |
+
case since Phase 3.5), the category won't be in `paper_metadata` and the suppression
|
| 412 |
+
join will miss it.
|
| 413 |
+
|
| 414 |
+
**Fix**: Add a `cache_turso_metadata()` helper in the recommendations router that writes
|
| 415 |
+
Turso-sourced paper dicts to `paper_metadata` after fetching. This is a small INSERT OR
|
| 416 |
+
IGNORE — ~1ms overhead for 20 papers. We should also add this to `search.py` and
|
| 417 |
+
`saved.py` so ALL metadata paths feed the cache.
|
| 418 |
+
|
| 419 |
+
### Modified File: `app/routers/recommendations.py`
|
| 420 |
+
|
| 421 |
+
In `_multi_interest_recommend()`, after re-ranking but before MMR:
|
| 422 |
+
|
| 423 |
+
```python
|
| 424 |
+
# Step 3.5: Category suppression
|
| 425 |
+
suppressed_cats = await db.get_suppressed_categories(user_id)
|
| 426 |
+
if suppressed_cats:
|
| 427 |
+
# Filter out candidates whose primary category is suppressed
|
| 428 |
+
reranked_ids_filtered = []
|
| 429 |
+
reranked_scores_filtered = []
|
| 430 |
+
reranked_embs_list = []
|
| 431 |
+
for i, rid in enumerate(reranked_ids):
|
| 432 |
+
cat = cand_meta.get(rid, {}).get("category", "")
|
| 433 |
+
# Extract primary category (first in the list, or the whole string)
|
| 434 |
+
primary_cat = cat.split()[0] if cat else ""
|
| 435 |
+
if primary_cat not in suppressed_cats:
|
| 436 |
+
reranked_ids_filtered.append(rid)
|
| 437 |
+
reranked_scores_filtered.append(reranked_scores[i])
|
| 438 |
+
reranked_embs_list.append(reranked_embs[i])
|
| 439 |
+
|
| 440 |
+
if reranked_ids_filtered:
|
| 441 |
+
reranked_ids = reranked_ids_filtered
|
| 442 |
+
reranked_scores = reranked_scores_filtered
|
| 443 |
+
reranked_embs = np.array(reranked_embs_list, dtype=np.float32)
|
| 444 |
+
```
|
| 445 |
+
|
| 446 |
+
---
|
| 447 |
+
|
| 448 |
+
## What Does NOT Change
|
| 449 |
+
|
| 450 |
+
These are explicitly out of scope for Phase 4:
|
| 451 |
+
|
| 452 |
+
| Component | Why it stays |
|
| 453 |
+
|---|---|
|
| 454 |
+
| **Search pipeline** (`search.py`, `hybrid_search_svc.py`) | RRF is correct for search (different retrievers, same query) |
|
| 455 |
+
| **α_long = 0.03** (`profiles.py`) | Already fixed in Phase 2a |
|
| 456 |
+
| **L2 normalization** (`clustering.py`) | Already applied before Ward in Phase 2b |
|
| 457 |
+
| **Negative EWMA Feature 5** (`reranker.py`) | Already wired in Phase 2c |
|
| 458 |
+
| **`qdrant_svc.multi_interest_search()`** | Kept in codebase, just no longer called by recs |
|
| 459 |
+
| **Per-item temporal decay** | Deferred to Phase 6 (LightGBM feature) |
|
| 460 |
+
| **Templates / UI** | No frontend changes |
|
| 461 |
+
| **Infrastructure** | Same deployment, same databases |
|
| 462 |
+
|
| 463 |
+
---
|
| 464 |
+
|
| 465 |
+
## Files Changed — Complete Map
|
| 466 |
+
|
| 467 |
+
| File | Action | Lines Changed (est.) | What Changes |
|
| 468 |
+
|---|---|---|---|
|
| 469 |
+
| `app/recommend/fusion.py` | **NEW** | ~60 | `allocate_quotas()` function |
|
| 470 |
+
| `app/routers/recommendations.py` | **MODIFY** | ~40 | Replace RRF call with quota + parallel search; add category suppression |
|
| 471 |
+
| `app/recommend/clustering.py` | **MODIFY** | ~50 | Add `stabilize_cluster_ids()` with Hungarian matching |
|
| 472 |
+
| `app/db.py` | **MODIFY** | ~20 | Add `get_suppressed_categories()` |
|
| 473 |
+
| `tests/test_fusion.py` | **NEW** | ~80 | Unit tests for quota allocation |
|
| 474 |
+
| `tests/test_clustering.py` | **MODIFY** | ~30 | Add test for Hungarian matching stability |
|
| 475 |
+
| `tests/test_search_router.py` | **NO CHANGE** | 0 | Search pipeline untouched |
|
| 476 |
+
| `tests/test_integration.py` | **NO CHANGE** | 0 | Integration tests use mocks, unaffected |
|
| 477 |
+
|
| 478 |
+
**Total new/modified production code**: ~170 lines
|
| 479 |
+
**Total new test code**: ~110 lines
|
| 480 |
+
|
| 481 |
+
---
|
| 482 |
+
|
| 483 |
+
## Implementation Order
|
| 484 |
+
|
| 485 |
+
Each step leaves the app in a working state. Tests pass after every step.
|
| 486 |
+
|
| 487 |
+
### Step 1 — Create `fusion.py` + unit tests (~30 min)
|
| 488 |
+
|
| 489 |
+
Build `allocate_quotas()` in isolation with thorough unit tests:
|
| 490 |
+
|
| 491 |
+
- `test_basic_allocation` — 3 clusters, verify proportionality
|
| 492 |
+
- `test_floor_enforcement` — tiny cluster still gets `min_slots`
|
| 493 |
+
- `test_total_equals_requested` — sum always equals `total_slots`
|
| 494 |
+
- `test_single_cluster` — all slots go to the one cluster
|
| 495 |
+
- `test_equal_importances` — even split
|
| 496 |
+
- `test_many_clusters_with_floor` — 7 clusters, floor forces redistribution
|
| 497 |
+
|
| 498 |
+
### Step 2 — Refactor `_multi_interest_recommend()` (~1 hour)
|
| 499 |
+
|
| 500 |
+
Replace the RRF call with quota + `asyncio.gather()`. Key changes:
|
| 501 |
+
1. Remove `_CLUSTER_LIMITS` hardcoded list
|
| 502 |
+
2. Import `allocate_quotas` from `fusion.py`
|
| 503 |
+
3. Replace `multi_interest_search()` with per-cluster `search_by_vector()` calls
|
| 504 |
+
4. Add deduplication logic
|
| 505 |
+
5. Wire short-term vector as a separate search
|
| 506 |
+
|
| 507 |
+
**Test**: Run `python -m pytest tests/ -v` — all tests must pass.
|
| 508 |
+
|
| 509 |
+
### Step 3 — Add Hungarian matching to `clustering.py` (~1 hour)
|
| 510 |
+
|
| 511 |
+
1. Add `stabilize_cluster_ids()` function
|
| 512 |
+
2. Call it in `_multi_interest_recommend()` between `compute_clusters()` and `save_clusters_to_db()`
|
| 513 |
+
3. Add test: create clusters, slightly perturb, verify indices preserved
|
| 514 |
+
|
| 515 |
+
**Test**: Run `python -m pytest tests/test_clustering.py -v`
|
| 516 |
+
|
| 517 |
+
### Step 4 — Add category suppression (~30 min)
|
| 518 |
+
|
| 519 |
+
1. Add `get_suppressed_categories()` to `db.py`
|
| 520 |
+
2. Add suppression filter in `_multi_interest_recommend()` after reranking
|
| 521 |
+
3. Ensure Turso metadata is cached to `paper_metadata` for the join to work
|
| 522 |
+
|
| 523 |
+
**Test**: Run full `python -m pytest tests/ -v`
|
| 524 |
+
|
| 525 |
+
### Step 5 — End-to-end verification (~30 min)
|
| 526 |
+
|
| 527 |
+
1. Run `python test_e2e_recs.py` — verify recommendations generate correctly
|
| 528 |
+
2. Verify latency stays comparable (~7-8s end-to-end including network I/O)
|
| 529 |
+
3. Run full `python -m pytest tests/ -v` — 125+ tests, zero regressions
|
| 530 |
+
|
| 531 |
+
---
|
| 532 |
+
|
| 533 |
+
## Test Plan
|
| 534 |
+
|
| 535 |
+
### New Unit Tests: `tests/test_fusion.py`
|
| 536 |
+
|
| 537 |
+
| Test | What it verifies |
|
| 538 |
+
|---|---|
|
| 539 |
+
| `test_basic_proportional_allocation` | 3 clusters with [0.5, 0.3, 0.2] → ~[50, 30, 20] slots |
|
| 540 |
+
| `test_floor_protects_minority` | Tiny importance still gets ≥ `min_slots` |
|
| 541 |
+
| `test_sum_always_equals_total` | No slots lost or gained during allocation |
|
| 542 |
+
| `test_single_cluster` | One cluster gets all slots |
|
| 543 |
+
| `test_equal_importances` | N clusters get total/N each |
|
| 544 |
+
| `test_remainder_distribution` | Remainder goes to largest fractional part |
|
| 545 |
+
|
| 546 |
+
### New Unit Test: `tests/test_clustering.py`
|
| 547 |
+
|
| 548 |
+
| Test | What it verifies |
|
| 549 |
+
|---|---|
|
| 550 |
+
| `test_hungarian_preserves_indices` | Slight perturbation doesn't shuffle indices |
|
| 551 |
+
|
| 552 |
+
### Regression
|
| 553 |
+
|
| 554 |
+
- All 125 existing tests must pass
|
| 555 |
+
- `test_e2e_recs.py` must complete successfully
|
| 556 |
+
|
| 557 |
+
---
|
| 558 |
+
|
| 559 |
+
## Risks and Mitigations
|
| 560 |
+
|
| 561 |
+
| Risk | Impact | Mitigation |
|
| 562 |
+
|---|---|---|
|
| 563 |
+
| **Concurrent searches slower than prefetch** | Higher latency | `asyncio.gather()` runs them truly concurrently. Each is ~5-15ms. Wall-clock ~ max(all), not sum(all). |
|
| 564 |
+
| **Floor forces too many slots** | With 7 clusters, floor=3 requires 21 minimum slots. If `total_slots` < 21, the floors alone exceed the budget. | `allocate_quotas()` will clamp: if `K × min_slots > total`, reduce floor proportionally. At `total_slots=100` and `MAX_CLUSTERS=7`, minimum is 21, well within budget. |
|
| 565 |
+
| **Hungarian matching with different K** | New clustering produces fewer/more clusters than before | Handle rectangular cost matrices. `linear_sum_assignment` natively supports non-square matrices. Unmatched new clusters get fresh indices. |
|
| 566 |
+
| **`paper_metadata` missing for suppression join** | `get_suppressed_categories()` returns empty set | **Real gap found** — Turso metadata is not cached to `paper_metadata`. Fix: add `cache_turso_metadata()` calls in search/rec/saved routers. |
|
| 567 |
+
| **Turso categories vs arXiv categories format** | Turso stores human-readable categories ("AI/ML"), arXiv uses codes ("cs.AI") | **Real gap found** — both formats coexist in `paper_metadata.category`. Suppression will work within each format. Cross-format inconsistency is minor and self-corrects as Turso dominates. |
|
| 568 |
+
| **`search_by_vector` already does 2× over-fetch internally** | Asking for `quota*3` then `search_by_vector` internally doubles it | **Real gap found** — `search_by_vector()` at line 234 already fetches `limit*2` when `exclude_ids` is set. So asking for `quota*3` will actually fetch `quota*6` from Qdrant. This is fine (more candidates for reranker) but should be noted for tuning. |
|
| 569 |
+
|
| 570 |
+
---
|
| 571 |
+
|
| 572 |
+
## Verification Checklist
|
| 573 |
+
|
| 574 |
+
Before declaring Phase 4 complete:
|
| 575 |
+
|
| 576 |
+
- [ ] `python -m pytest tests/ -v` — all tests pass (130+ including new tests)
|
| 577 |
+
- [ ] `test_fusion.py` — 6+ quota allocation tests pass
|
| 578 |
+
- [ ] `test_clustering.py` — Hungarian matching test passes
|
| 579 |
+
- [ ] `test_e2e_recs.py` — end-to-end recommendations generate correctly
|
| 580 |
+
- [ ] Recommendations include papers from minority clusters (quota working)
|
| 581 |
+
- [ ] Cluster indices remain stable across consecutive saves
|
| 582 |
+
- [ ] Category suppression activates after ≥3 dismissals of same category
|
| 583 |
+
- [ ] Search pipeline is completely unaffected (RRF still used for search)
|
| 584 |
+
- [ ] Latency comparable to Phase 3.5 baseline
|
| 585 |
+
- [ ] All 3 recommendation tiers still cascade correctly (Tier 1 → 2 → 3)
|
| 586 |
+
|
| 587 |
+
---
|
| 588 |
+
|
| 589 |
+
## References
|
| 590 |
+
|
| 591 |
+
- PinnerSage (Pal et al., KDD 2020) — Ward + medoid + importance sampling, no RRF
|
| 592 |
+
- Taobao ULIM (Meng et al., RecSys 2025) — quota allocation, +5.54% clicks
|
| 593 |
+
- Pinterest Bucketized-ANN (SIGIR 2023) — minority representation protection
|
| 594 |
+
- Twitter kNN-Embed (arXiv:2205.06205) — per-cluster proportional drawing
|
| 595 |
+
- Bruch et al. (SIGIR 2022) — RRF optimises Recall not nDCG
|
| 596 |
+
- YouTube (Xia et al., 2023) — 3× gain from richer negative treatment
|
| 597 |
+
- Doc 06 §"The fusion fault in Doc 03" — full RRF critique
|
| 598 |
+
- Doc 06 §"Clustering specifics" — Hungarian matching recommendation
|
| 599 |
+
- Doc 06 §"Negative signals" — three-layer negative design
|
| 600 |
+
|
| 601 |
+
---
|
| 602 |
+
|
| 603 |
+
*Last updated: 2026-04-23*
|
docs/research/03-MultiInterest-Recommender-Architecture.md
CHANGED
|
@@ -266,7 +266,7 @@ Each cluster gets feed slots proportional to its importance, with a floor of 3 t
|
|
| 266 |
|
| 267 |
**Note:** RRF *is* correct for the search bar (fusing dense + sparse for the *same* query). Only the recommendation pipeline needs quota.
|
| 268 |
|
| 269 |
-
**Status:** ⚠️ Code still uses RRF.
|
| 270 |
|
| 271 |
---
|
| 272 |
|
|
|
|
| 266 |
|
| 267 |
**Note:** RRF *is* correct for the search bar (fusing dense + sparse for the *same* query). Only the recommendation pipeline needs quota.
|
| 268 |
|
| 269 |
+
**Status:** ⚠️ Code still uses RRF. Phase 4 planned — see `docs/phases/PHASE4-Recommendation-Pipeline-Fixes.md`.
|
| 270 |
|
| 271 |
---
|
| 272 |
|
docs/research/07-LLM-Summaries-Reranker-and-Scaling-Research.md
ADDED
|
@@ -0,0 +1,426 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ResearchIT Phase 4 Implementation Plan and Phase 5 Preview — Research Report for Amin
|
| 2 |
+
|
| 3 |
+
This report synthesizes 2024–2026 sources (RecSys/SIGIR/KDD/NeurIPS/ACL/EMNLP papers, production blogs from Pinterest, Spotify, YouTube, Netflix, and documentation from BAAI, Jina, Mixedbread, Anthropic) into an implementation-ready plan. The headline recommendation is to run Phase 4a (Claude summaries) and 4d (use-cases doc) in parallel over weeks 1–3 after a one-week ADR sprint, then spend weeks 4–9 on 4b (distilled reranker) — total ~10–12 weeks for Phase 4 with buffer. Nearly every Phase 5 workstream (exploration, IPS, propensity logging, telemetry schema) must be architected *before* Phase 4 code lands, even though the workstreams themselves are gated on user-count thresholds. The single most valuable decision to make now is the telemetry event schema, because retrofitting propensity, policy-id, and position fields after you have real-user data is painful and blocks all later counterfactual evaluation.
|
| 4 |
+
|
| 5 |
+
## A. Phase 4a — Claude-API-generated per-cluster interest summaries
|
| 6 |
+
|
| 7 |
+
### A.1 Prompt engineering
|
| 8 |
+
|
| 9 |
+
The closest published analogue to Amin's use case is **Scholar Inbox** (Flicke et al., ACL 2025 Demo, arXiv 2504.08385), which generates 4-level hierarchical labels (field → subfield → subsubfield → method) from t-SNE paper clusters using Qwen; their appendix §6.1 contains the exact prompt. Microsoft's **TnT-LLM** (KDD 2024) and **TopicGPT** (Pham et al., NAACL 2024) converge on the same pattern: structured XML-tagged inputs, constrained vocabulary, and JSON output. The recommended template for ResearchIT:
|
| 10 |
+
|
| 11 |
+
```
|
| 12 |
+
You are summarizing a research interest cluster for a specific user.
|
| 13 |
+
|
| 14 |
+
USER PROFILE CONTEXT (tone only, not content):
|
| 15 |
+
{short profile string}
|
| 16 |
+
|
| 17 |
+
CLUSTER MEDOID PAPER (most representative):
|
| 18 |
+
<medoid><title>{...}</title><abstract>{...}</abstract></medoid>
|
| 19 |
+
|
| 20 |
+
NEAREST NEIGHBOR PAPERS:
|
| 21 |
+
<papers>
|
| 22 |
+
<paper id="1"><title>...</title><abstract>...</abstract></paper>
|
| 23 |
+
... (up to 20)
|
| 24 |
+
</papers>
|
| 25 |
+
|
| 26 |
+
TASK: Produce JSON {"label": "<1-sentence 'You're reading about X, particularly Y' framing>", "themes": [<≤5-word bullet>, ... up to 4]}
|
| 27 |
+
|
| 28 |
+
RULES:
|
| 29 |
+
- Every technical term in "label" and "themes" MUST appear verbatim in at least one provided title or abstract.
|
| 30 |
+
- Do NOT introduce methods, datasets, or concepts not present in inputs.
|
| 31 |
+
- If fewer than 3 papers share a theme, omit it.
|
| 32 |
+
- Prefer specific phrases ("retrieval-augmented generation evaluation") over generic ones ("NLP research").
|
| 33 |
+
- Output JSON only.
|
| 34 |
+
```
|
| 35 |
+
|
| 36 |
+
**Start zero-shot with this constrained prompt; add 2–3 hand-written few-shot examples only to anchor the "You're reading about X" voice.** Spotify Research's Dec 2024 "Contextualized Recommendations Through Personalized Narratives Using LLMs" post found zero-shot adequate but converged on 3–5 "golden" style examples for tone. The Anthropic cookbook's `using_citations.ipynb` demonstrates the **Citations API**, which returns structured citation objects and explicitly "will not return citations pointing to documents or locations that were not provided as valid sources" — **use the Citations API for ResearchIT**, it eliminates the hallucination vector at the API level.
|
| 37 |
+
|
| 38 |
+
### A.2 Regeneration frequency
|
| 39 |
+
|
| 40 |
+
The 2024–2026 literature (Google's arXiv 2510.20260 on "Balancing Fine-tuning and RAG for Dynamic LLM Recommendation Updates"; Spotify's production narratives cache per-item) strongly favors **event-triggered regeneration over fixed nightly cadence**. Concrete hybrid policy:
|
| 41 |
+
|
| 42 |
+
Regenerate when the medoid paper changes, when Jaccard distance between old and new paper-ID sets exceeds 0.3, or when a cluster is added/merged/split. Apply a **7-day TTL fallback** even when nothing changes (captures embedding/context drift). **Do not regenerate nightly** — it is roughly 7× the cost for negligible UX gain on Ward clusters whose membership is stable over the timescale of a single day.
|
| 43 |
+
|
| 44 |
+
### A.3 Pricing (April 2026) and cost estimate
|
| 45 |
+
|
| 46 |
+
Verified current pricing from platform.claude.com/docs, cross-checked against Finout/MetaCTO/PE Collective reporting: **Haiku 4.5 at $1/$5 per MTok in/out**, **Sonnet 4.6 at $3/$15**, **Opus 4.7 at $5/$25** (released April 16, 2026, with a new tokenizer that can inflate token counts up to 35%). Cache reads are 10% of base input; cache writes 125% (5-minute) or 200% (1-hour). Batch API gives a flat 50% discount with ≤24h turnaround and stacks with caching. Haiku 3 is deprecated April 19, 2026 — do not build against it.
|
| 47 |
+
|
| 48 |
+
For 1,000 users × 5 clusters × 20-paper contexts (~6,000 input tokens each) regenerated weekly, monthly traffic is ~130M input + ~3.25M output tokens. Total monthly cost by model:
|
| 49 |
+
|
| 50 |
+
- **Haiku 4.5 + Batch API: ~$73/month; with prompt caching on stable prefix, ~$50–60/month**
|
| 51 |
+
- Sonnet 4.6 + Batch API: ~$220/month (~$150–180 with caching)
|
| 52 |
+
- Opus 4.6/4.7 + Batch API: ~$366/month (~$280 with caching)
|
| 53 |
+
|
| 54 |
+
**Recommendation: Haiku 4.5 + Batch API is the right default.** The task (label a cluster from provided abstracts) sits comfortably within Haiku's capability. Reserve Sonnet for offline A/B quality evaluation on a minority of calls. Skip Opus entirely for this task. Prompt caching savings are modest because each cluster's paper context is unique per cluster; the real economic lever is the **shared cross-user dedup** (§A.7), not prompt caching within a single call.
|
| 55 |
+
|
| 56 |
+
### A.4 Content-addressed caching
|
| 57 |
+
|
| 58 |
+
Construct the cache key as `sha256(sorted(paper_ids) + prompt_version + model + schema_version)`. Sort paper IDs before hashing for order-independence; include prompt and model version so stale summaries don't survive a template change; **omit user ID** from the shared cache key (that's the entire point — §A.7). Use an immutable, content-addressed store (`summaries[hash] = {label, themes, generated_at, model, tokens_used}`) — never overwrite; let old entries age out on a 90-day LRU. This mirrors CDN asset hashing (`main.a3f2b1c9.js`) and matches the Anthropic Claude Code cache-invalidation discussion (issue #29230) recommending SHA-256 of all source files be part of the cache key.
|
| 59 |
+
|
| 60 |
+
The expected exact-match dedup rate for a full 20-paper set is low (papers are drawn from a corpus of 3M+ arXiv papers), but a **two-tier cache** with a "narrow" key (medoid + top-5 neighbors) as fallback increases hit rate substantially.
|
| 61 |
+
|
| 62 |
+
### A.5 Explainable-recommender UX in academic search
|
| 63 |
+
|
| 64 |
+
None of Scholar Inbox, Connected Papers, Elicit, ResearchRabbit, Semantic Scholar, Consensus, or Undermind currently displays a **personalized "You're reading about X" per-user cluster narrative**. Scholar Inbox's Scholar Map labels are the closest analogue but are global/shared across users. This means ResearchIT's Phase 4a is **genuinely novel UX for academic search**, and the right place to borrow heavily is Spotify (which reports up to 4× CTR on niche content when LLM narratives personalize discovery) and Wang et al.'s "LLMs for User Interest Exploration in Large-scale Recommendation Systems" (RecSys 2024, arXiv 2405.16363), an architecturally identical recipe (interest clusters + constrained LLM descriptions). Lubos et al.'s UMAP 2024 user study on "LLM-generated Explanations for Recommender Systems" confirms users rate LLM explanations highly for decision support.
|
| 65 |
+
|
| 66 |
+
UX recommendation: lead with the 1-sentence "You're reading about X, particularly Y" framing, then an expandable bullet list of 3–5 sub-themes, with **source paper titles as linkable chips** under each bullet (the Anthropic Citations / deterministic-quoting pattern, which kills trust issues by letting users verify). A subtle "regenerated on {date}" timestamp plus a manual refresh button gives users control.
|
| 67 |
+
|
| 68 |
+
### A.6 Hallucination prevention
|
| 69 |
+
|
| 70 |
+
The 2024–2026 state-of-the-art for grounding evaluation is **MiniCheck** (Tang, Laban, Durrett, EMNLP 2024, arXiv 2404.10774) — a 770M-parameter fine-tuned Flan-T5 that matches GPT-4 fact-checking accuracy at ~400× lower cost. Ranked strongest-to-weakest, grounding techniques are: (1) deterministic quoting (surface verbatim source text in the UI); (2) **Anthropic Citations API** (native, recommended); (3) prompt-based "use only phrases from source" rules; (4) post-hoc NLI verification with MiniCheck-FT5; (5) constrained decoding (overkill for 1-sentence labels).
|
| 71 |
+
|
| 72 |
+
Recommended stack: Anthropic Citations API + explicit "verbatim-phrase" rule in prompt + post-hoc substring verification on noun phrases (reject and regenerate if >1 unsupported phrase). Run MiniCheck-FT5 offline on a sample as an ongoing faithfulness metric. Zhou et al. (Findings EMNLP 2023) "context-faithful prompting" shows instruction-only grounding measurably reduces hallucination but is not sufficient alone — combine with a verification layer.
|
| 73 |
+
|
| 74 |
+
### A.7 Per-user vs shared summaries
|
| 75 |
+
|
| 76 |
+
**Use a hybrid two-stage design.** Stage 1 generates a **shared, content-addressed, public-paper-only** cluster description (the Claude call gets only paper titles/abstracts, never user profile text) — identical cluster content produces identical summary across users and days, enabling aggressive dedup. Stage 2 wraps the shared summary with per-user framing either via client-side string templating ("You're reading about {shared_label}") or via a lightweight per-user LLM pass cached at `(user_id, shared_hash)`.
|
| 77 |
+
|
| 78 |
+
This matches Spotify's item-level-narrative + per-user-context split and Google's arXiv 2510.20260 offline-bulk/online-lookup separation. **Privacy payoff:** shared summaries are pure functions of public arXiv content, so they can ride Anthropic's Batch API with ZDR safely, be logged freely, and be cached cross-user. User profile text never leaves your infrastructure (or does so only in a heavily-filtered form for Stage 2). This is the architectural decision (ADR A2) that must be made **before** building the caching layer, because switching from per-user to shared requires a full cache-schema migration post-launch.
|
| 79 |
+
|
| 80 |
+
## B. Phase 4b — Distilled cross-encoder reranker
|
| 81 |
+
|
| 82 |
+
### B.1 FlashRank recipe and student candidates
|
| 83 |
+
|
| 84 |
+
**FlashRank (PrithivirajDamodaran) does not train its own students** — it repackages existing open checkpoints as quantized ONNX. The default "Nano" is `ms-marco-TinyBERT-L-2-v2` (14M params, ~17MB fp32, ~6MB INT8), "Small" is `ms-marco-MiniLM-L-12-v2`, and "Medium" is `rank-T5-flan`. The engineering pattern to steal is ONNX + INT8 dynamic quantization + the `tokenizers` Rust library only (no PyTorch/transformers at runtime), keeping cold-start under 500ms on serverless.
|
| 85 |
+
|
| 86 |
+
For Amin's 6ms-for-20-pairs CPU budget (≈0.3ms/pair), **the only candidates that fit with headroom are 2-layer students**:
|
| 87 |
+
|
| 88 |
+
| Model | Params | INT8 CPU latency/pair | BEIR nDCG@10 |
|
| 89 |
+
|---|---|---|---|
|
| 90 |
+
| **ms-marco-TinyBERT-L-2-v2** | 14M | ~0.3–1.0ms | ~43–45 |
|
| 91 |
+
| ms-marco-MiniLM-L-4-v2 | 19M | ~1.5–2ms | ~46 |
|
| 92 |
+
| ms-marco-MiniLM-L-6-v2 | 22M | ~3–5ms (tight on budget) | ~48 |
|
| 93 |
+
| jina-reranker-v1-turbo-en | 38M | ~3–5ms | 49.60 (95% of jina-base) |
|
| 94 |
+
| jina-reranker-v1-tiny-en | 33M | ~2–3ms | 48.54 (92.5%) |
|
| 95 |
+
| mxbai-rerank-xsmall-v1 | 71M | ~8–12ms (over budget) | 43.9 |
|
| 96 |
+
|
| 97 |
+
Tonellotto et al.'s "Shallow Cross-Encoders" (SIGIR 2024, arXiv 2403.20222) found that at latency ≤10ms on CPU, TinyBERT-gBCE reaches nDCG@10 of 0.652 on TREC-DL-2019, a +51% gain over MonoBERT-Large (0.431). **The architectural choice (2L vs 12L) matters more than the teacher weights at tight latency.** Don't pick a bigger student.
|
| 98 |
+
|
| 99 |
+
### B.2 Domain adaptation — how much does arXiv-specific fine-tuning buy?
|
| 100 |
+
|
| 101 |
+
**Typical gain from in-domain distillation at the 2-layer scale: +1 to +3 nDCG@10 points on SciDocs**, not 10. MedCPT (PubMed, Jin et al. arXiv 2307.00589) surpasses BM25 only after ~150M query-article pairs, showing diminishing returns for modest training budgets. The listwise-distillation paper arXiv 2505.19274 demonstrates that a general RankT5-3B teacher is competitive with in-domain rerankers on SciDocs/SciFact/NFCorpus, within noise. **No BGE-reranker-v2 checkpoint fine-tuned on scientific text exists on Hugging Face as of April 2026** (searched).
|
| 102 |
+
|
| 103 |
+
### B.3 Distillation objectives
|
| 104 |
+
|
| 105 |
+
A recent reproducibility study (arXiv 2603.03010) benchmarks nine loss functions across nine backbones with SPLADE-v3 top-1000 candidates. Average rank across out-of-domain BEIR datasets (lower is better):
|
| 106 |
+
|
| 107 |
+
1. InfoNCE (rank 1.83)
|
| 108 |
+
2. **MarginMSE** (2.17) — Hofstätter-style pairwise distillation
|
| 109 |
+
3. DistillRankNet (3.61)
|
| 110 |
+
4. ADR-MSE (3.66)
|
| 111 |
+
5. Hinge (3.99)
|
| 112 |
+
6. BCE (5.74) — significantly worse than every other
|
| 113 |
+
|
| 114 |
+
Critically, "**MarginMSE with BM25-mined negatives is statistically equivalent to InfoNCE with ColBERTv2 hard negatives**" — loss formulation matters more than negative-pool quality. BAAI/BGE uses MarginMSE + self-knowledge-distillation from ensembles. Jina uses explicit KL on logits from the full-size teacher. Yang, He, Yang's SIGIR 2024 paper proposes CKL (contrastively-weighted KL) outperforming MarginMSE+plain KL on MS MARCO + BEIR zero-shot, but the gap is small.
|
| 115 |
+
|
| 116 |
+
**Recommended loss:** `L = α·MarginMSE(student, teacher, pos, neg) + β·KL(σ(student/T), σ(teacher/T)) + γ·BCE(pos, 1)` with α=1.0, β=0.5, γ=0.1, T=1.0. MarginMSE alone is a fine MVP.
|
| 117 |
+
|
| 118 |
+
### B.4 Integration architecture
|
| 119 |
+
|
| 120 |
+
Three options: (A) TinyBERT score as one feature in a second LightGBM pass; (B) TinyBERT as a direct re-ranker on top-20 replacing LightGBM at that stage; (C) two-stage LightGBM with TinyBERT in between. Bing's LambdaMART over hundreds of features (including BERT scores), Pinterest's TransActV2 feeding neural scores into GBDT, Google/DeepMind's DASALC+TFR-BERT, and TREC TOT 2025 (arXiv 2601.15518) all converge on **the neural score as one feature among many in a final LambdaMART**, not as a terminal reranker.
|
| 121 |
+
|
| 122 |
+
**Recommendation: Option C (≈Option A).** Keep the upstream LightGBM-lambdarank, score the top-20 with TinyBERT (~0.3ms/pair × 20 = ~6ms), and feed the student scores back into a second LightGBM pass that has access to the full personalization feature set. **Do not do Option B** — replacing LightGBM with TinyBERT at top-20 throws away user features, citation-graph features, and temporal decay that LightGBM already incorporates. Engineered features for LightGBM-2: `tinybert_score`, `tinybert_rank_position`, `tinybert_score_normalized_within_query`, and the interaction `tinybert_score − bm25_score`.
|
| 123 |
+
|
| 124 |
+
### B.5 Hard negative mining
|
| 125 |
+
|
| 126 |
+
The 2024 standard is **NV-Retriever** (arXiv 2407.15831) "positive-aware" filtering: mine top-100 ANN neighbors, then filter with the teacher cross-encoder, dropping candidates whose teacher score is within 0.3 of the positive (likely false negatives or duplicates). For academic papers, supplement with SPECTER/SciNCL citation-graph negatives: **SPECTER** uses 2 "citation-of-citation" hard negatives per query; **SciNCL** (Ostendorff et al.) improves on this by sampling from a continuous citation embedding space (PyTorch-BigGraph over S2ORC) with controlled distance margins (k_min=3998, k_max=4000 on a 52M-node graph), delivering +1.8 points on SciDocs. Recommended mix per (seed, positive): 3 SciNCL-style citation-of-citation negatives, 5 teacher-filtered ANN negatives (top 10–100 with teacher score below 95th percentile), 2 random in-batch. Critically, re-score all candidates with BGE-reranker-v2-m3 and **drop any within 0.3 teacher-score of the positive**.
|
| 127 |
+
|
| 128 |
+
### B.6 Evaluation and distillation quality gap
|
| 129 |
+
|
| 130 |
+
Typical retention rates from the 2024–2025 literature: jina-v1-base → jina-v1-turbo retains 95% (52.45 → 49.60); TinyBERT-4L retains ~96.8% of BERT-base on GLUE; MiniLM-L6 → MiniLM-L2 rerank retains ~85–90%. **For Amin's ~20× compression from BGE-reranker-v2-m3 (278M) → TinyBERT-L2 (14M), expect 82–88% retention of nDCG@10.** If below 80%, something is wrong (bad negatives, insufficient data, teacher-label leakage into eval).
|
| 131 |
+
|
| 132 |
+
Run evaluations on SciDocs (focusing on Co-view / Co-read / Cite / Co-cite tasks), SciRepEval proximity tasks, the BEIR scientific subset (NFCorpus, SciDocs, SciFact, TREC-COVID), and held-out unarXive 2024–2026 queries with citation-graph ground truth. **CPU latency protocol: 50 warmup inferences discarded, 1000 measured inferences at seq_len=128, batch=20; report P50/P95/P99, not mean** (Pinterest standard).
|
| 133 |
+
|
| 134 |
+
### B.7 Off-the-shelf scientific-domain rerankers
|
| 135 |
+
|
| 136 |
+
**There is no well-maintained small (<50M param) scientific-domain cross-encoder reranker on Hugging Face as of April 2026 that beats MS MARCO-trained TinyBERT on SciDocs at the 6ms budget.** SPECTER/SPECTER2/SciNCL are bi-encoders (embedders), not rerankers. MedCPT is biomedical-specific. Third-party SciBERT cross-encoders exist but are not validated at MS-MARCO MiniLM-L6 quality. No BAAI bge-reranker fine-tuned on a scientific corpus has been published.
|
| 137 |
+
|
| 138 |
+
**Decision tree:**
|
| 139 |
+
|
| 140 |
+
- **If Amin already has a pseudo-label pipeline producing >200K (query, doc, teacher_score) triples** → distill TinyBERT-L-2 from bge-reranker-v2-m3 on arXiv data. Expect +1–3 nDCG over off-the-shelf.
|
| 141 |
+
- **If Amin wants MVP now** → deploy `cross-encoder/ms-marco-TinyBERT-L-2-v2` with INT8 ONNX (HF already ships `onnx/model_qint8_avx512_vnni.onnx`), measure on held-out eval. If gap vs teacher is <3 nDCG@10, ship; distill later if needed.
|
| 142 |
+
|
| 143 |
+
**Strong recommendation: go off-the-shelf first.** Distillation is ~2–4 weeks of solo-dev work and the marginal gain at 2-layer scale is usually small. Time is better spent on hard-negative mining and LightGBM-2 feature engineering.
|
| 144 |
+
|
| 145 |
+
### B.8 ONNX / FastAPI hot path
|
| 146 |
+
|
| 147 |
+
Latency ranking for BERT-base-class inference on x86 with AVX-512 VNNI:
|
| 148 |
+
|
| 149 |
+
- PyTorch eager fp32: baseline (1.0×)
|
| 150 |
+
- PyTorch INT8 dynamic CPU: 0.4×
|
| 151 |
+
- ONNX Runtime fp32: 0.3×
|
| 152 |
+
- **ONNX Runtime + INT8 dynamic AVX-512 VNNI: 0.15–0.25× (roughly 4–6× faster than PyTorch eager; up to ~2× faster than ORT fp32)**
|
| 153 |
+
- torch.compile: 1.5–2× faster than eager (≈0.5–0.67× relative latency) but still behind ONNX on CPU
|
| 154 |
+
|
| 155 |
+
For TinyBERT-L-2-v2 on Render's standard ~2 vCPU x86: fp32 PyTorch seq=128 ≈3–5ms/pair; **INT8 ONNX ≈0.3–1.0ms/pair single-thread; batched 20 pairs ≈2–4ms total wall-clock on AVX-512 VNNI hardware** (2–3× slower without VNNI). Production code pattern:
|
| 156 |
+
|
| 157 |
+
```python
|
| 158 |
+
import onnxruntime as ort
|
| 159 |
+
from tokenizers import Tokenizer
|
| 160 |
+
|
| 161 |
+
sess_options = ort.SessionOptions()
|
| 162 |
+
sess_options.intra_op_num_threads = 2 # match Render vCPUs
|
| 163 |
+
sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
|
| 164 |
+
sess_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
|
| 165 |
+
sess_options.add_session_config_entry("session.intra_op.allow_spinning", "0")
|
| 166 |
+
|
| 167 |
+
session = ort.InferenceSession(
|
| 168 |
+
"model_qint8_avx512_vnni.onnx", sess_options, providers=["CPUExecutionProvider"]
|
| 169 |
+
)
|
| 170 |
+
tokenizer = Tokenizer.from_pretrained("cross-encoder/ms-marco-TinyBERT-L-2-v2")
|
| 171 |
+
tokenizer.enable_truncation(max_length=128)
|
| 172 |
+
tokenizer.enable_padding(length=128)
|
| 173 |
+
|
| 174 |
+
def score_pairs(query, docs):
|
| 175 |
+
enc = tokenizer.encode_batch([(query, d) for d in docs])
|
| 176 |
+
return session.run(None, {
|
| 177 |
+
"input_ids": np.array([e.ids for e in enc], dtype=np.int64),
|
| 178 |
+
"attention_mask": np.array([e.attention_mask for e in enc], dtype=np.int64),
|
| 179 |
+
"token_type_ids": np.array([e.type_ids for e in enc], dtype=np.int64),
|
| 180 |
+
})[0].squeeze(-1).tolist()
|
| 181 |
+
```
|
| 182 |
+
|
| 183 |
+
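Critical tips: pin padding length to enable kernel fusion; use `tokenizers` (Rust, ~0.1ms for 20 pairs) not `transformers.AutoTokenizer` (~5ms); cache sessions globally; disable thread spinning; skip QAT (dynamic INT8 costs <0.5 nDCG). Note that the snippet above uses `np` without importing it — add `import numpy as np` alongside the other imports before running it.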
|
| 184 |
+
|
| 185 |
+
### B.9 Latency scaling top-20 → top-50 → top-100
|
| 186 |
+
|
| 187 |
+
Linearity is approximately valid but with caveats. K=20→50 ≈ 2.5× latency (6ms → 15ms) with modest sub-linear batching gains of 5–10% from amortized Python/tokenization overhead. K=100 ≈ 4.5× rather than 5×. Memory pressure kicks in at K≥64 with seq_len=512 but not at seq_len=128. Render's 2-vCPU boxes saturate at intra_op_num_threads=2.
|
| 188 |
+
|
| 189 |
+
| K | Strategy | Expected latency |
|
| 190 |
+
|---|---|---|
|
| 191 |
+
| 20 | single batch of 20 | 2–6ms |
|
| 192 |
+
| 50 | single batch of 50 | 6–15ms |
|
| 193 |
+
| 100 | 2 batches of 50, pipelined | 12–25ms |
|
| 194 |
+
| 200 | upgrade to MiniLM-L-4 or go async | 30–50ms |
|
| 195 |
+
|
| 196 |
+
**Beyond K=50, the right move is NOT to batch harder but to prune harder upstream** — make LightGBM-1 more selective. Pinterest and Bing aggressively trim before the expensive stage.
|
| 197 |
+
|
| 198 |
+
## C. Phase 4d — Use-cases and information-gain design doc
|
| 199 |
+
|
| 200 |
+
### C.1 User personas
|
| 201 |
+
|
| 202 |
+
Foundational literature: Bates's "berrypicking" (Online Review 1989) — real scholarly search is iterative, multi-source, goal-mutating, not one-shot. Ellis/Wilson's six activities (starting, chaining, browsing, differentiating, monitoring, extracting) map cleanly: monitoring = stay-current mode; chaining+differentiating+extracting = literature-review mode. Al-Shboul & Abrizah (2014, Journal of Academic Librarianship) is the explicit persona-template reference. Gordon et al. (Taylor & Francis 2020/2021) quantify scholarly pain: only 15.4% of physicists feel successful at staying current; 28.6% feel unsuccessful. Mysore et al. (CHIIR 2023) and Soufan, Ruthven, Azzopardi (CHIIR 2024) empirically confirm berrypicking in modern AI/ML workflows. Niwanputri et al. (SIGIR ICTIR 2025) "Untangling Cognitive Processes in Academic Information Searching" is the 2025 SIGIR anchor. **Scholar Inbox (Flicke et al. 2025, arXiv 2504.08385)** is the closest comparable system — they released an 800k-rating dataset and use an active-learning rating onboarding pattern.
|
| 203 |
+
|
| 204 |
+
Drop-in persona cards for the doc:
|
| 205 |
+
|
| 206 |
+
| # | Persona | Profile state | Mode | Day-1 signal | UX demand |
|
| 207 |
+
|---|---|---|---|---|---|
|
| 208 |
+
| P1 | Brand-new (cold start) | Empty EWMA | Exploration-forced | Categories + 5–10 ratings | Active-learning onboarding (Scholar Inbox) |
|
| 209 |
+
| P2 | PhD student, active | 50–500 interactions, 2–4 tight clusters | Stay-current/deep | Daily skim, narrow topic | Don't flood with diversity early |
|
| 210 |
+
| P3 | Senior researcher/PI | 1k+ interactions, 8–15 clusters | Mixed monitoring | Scan many, save few, dismiss often | No single cluster >40% |
|
| 211 |
+
| P4 | Cross-disciplinary | Multiple distinct medoids | Parallel stay-current | Per-cluster cadence diverges | Cluster-balanced delivery |
|
| 212 |
+
| P5 | Lapsed (3-mo gap) | α_long preserved, α_short stale | Re-orient | High dismissal first 3 sessions | "What changed" framing |
|
| 213 |
+
| P6 | Cold-restart pivot | Has history, wants new field | Explicit pivot | System seeds new cluster | "Start new interest" UI |
|
| 214 |
+
| P7 | Literature-review session | Any profile + deep-session intent | Deep single-cluster | Many click-throughs, long dwells | Suppress MMR, amplify depth |
|
| 215 |
+
| P8 | Stay-current daily | Any profile, 10-min daily | Monitoring | Fast skim, binary save/dismiss | Strong MMR, proportional cluster coverage |
|
| 216 |
+
|
| 217 |
+
### C.2 Information gain per interaction
|
| 218 |
+
|
| 219 |
+
Foundational: Joachims (KDD 2002) clicks as relative pairwise preferences; Joachims et al. (TOIS 2007) eye-tracking validates ~80% reliability for "click i, skip i−1" pairs; Yi et al. (RecSys 2014) dwell time ≥30s as valid-engagement threshold; Xie et al. (WWW 2023) "valid read" = click + sufficient dwell; Yin et al. (WSDM 2013) "Silence is also evidence" — short dwell after click is negative, not missing. **The central paper** is Wang et al. RecSys 2023 (arXiv 2308.12256): dislike as feature only → −0.34% dislike rate (not significant); dislike as **feature AND training label** → −2.44% dislike rate, **−9.60% repeated dislike on same creator**, −2.05% dismissing users, and counterfactually **60.8% reduction in similar-content recommendations versus 22% when dislike is feature-only**. Implicit skip as negative label delivered +0.40% user enjoyment, +0.61% DAU≥1h.
|
| 220 |
+
|
| 221 |
+
Drop-in information-gain table (normalized to click = 1.0 baseline):
|
| 222 |
+
|
| 223 |
+
| Interaction | Sign | Relative strength | EWMA update | ~Bits info |
|
| 224 |
+
|---|---|---|---|---|
|
| 225 |
+
| Explicit category at onboarding | + | 5–10× | α_long seed | 3–5 |
|
| 226 |
+
| Save / bookmark | + | 3–5× | α_short + α_long | ~2 |
|
| 227 |
+
| Click-through to arXiv (no dwell) | + | 1.0× | α_short | ~0.5 |
|
| 228 |
+
| Long dwell (>30s) on abstract | + | 2–3× | α_short elevated | ~1 |
|
| 229 |
+
| Short dwell (<5s) after click | − weak | −0.5× | small α_neg | ~0.3 |
|
| 230 |
+
| Share / export to bib | + | 4–6× | α_short + α_long strong | ~2–3 |
|
| 231 |
+
| Dismiss (feature only) | − | −1× | Layer-1 only | ~0.3 |
|
| 232 |
+
| **Dismiss (feature + training label + similar suppression)** | − | **−3× to −4×** | All three layers | ~1.5–2 |
|
| 233 |
+
| "Don't recommend cluster" mute | − | −10× | Hard filter persistent | 3+ |
|
| 234 |
+
| Passive skip / scroll-past | − very weak | −0.1× | Aggregate only | ~0.05 |
|
| 235 |
+
| Revisit saved paper | + | 2× | α_long | ~1 |
|
| 236 |
+
|
| 237 |
+
**Product principles derived:** every save must move the EWMA profile measurably (if α_short=0.40 doesn't produce a visible medoid shift after one save, the profile is broken); dismissals must be 1-click because a fully wired dismissal carries ~3–4× the signal of a click and ~30× that of a passive skip (per the table above); dwell must be normalized per device/context; explicit negatives must enter both the LightGBM feature vector AND the training label — feature-only is essentially wasted.
|
| 238 |
+
|
| 239 |
+
### C.3 Longitudinal journeys
|
| 240 |
+
|
| 241 |
+
Time-drift literature (Koren KDD 2009 timeSVD++; Mansoury CIKM 2020 feedback loop; TDLRP-MF MDPI Systems 2025; TransActV2 arXiv 2506.02267) validates Amin's α_short/α_long split. The temporal-drift papers consistently show α_short ≈ 10× α_long is healthy; Amin's 13× ratio is in range. Per-persona day-1/7/30/90 table: P1 progresses from explicit ratings + popularity-biased exploration to 1–2 tight clusters by day 7, to 2–4 stable medoids at 30 days, to indistinguishable from P2 at 90 days. P5 on return at d=90 starts with a stale short-term (α_short) profile; down-weight the long-term (α_long) profile by (1−α_long)^90 ≈ 0.065 (with α_long ≈ 0.03) to partially refresh. P7 is session-scoped only (MMR λ down, cluster depth up, session-TTL long). P8 is steady monitoring at 10-min daily, evolving slowly in α_long regime.
|
| 242 |
+
|
| 243 |
+
### C.4 Instrumentation priorities
|
| 244 |
+
|
| 245 |
+
Production references: Spotify Event Delivery Infrastructure (8M events/s, schema-first, session-context qualifies every signal); Pinterest TransActV2 (arXiv 2506.02267, real-time top-100 sequence, **p99 latency as production-critical metric not mean**); YouTube Covington RecSys 2016 + Wang 2023 (80B signals/day, separate logging for watch/search/subscribe/dismiss/satisfaction); OpenTelemetry Weaver (2025) for schema-first telemetry with SDK generation. The schema must be frozen before any real-user logging (ADR A4) because post-launch migrations are painful.
|
| 246 |
+
|
| 247 |
+
Minimum event families to log: session_start/end + mode_declared; feed_request/served with slot_index, cluster_id, medoid_id, popularity_prior_weight, mmr_lambda, exploration_flag; positive (click, dwell_end, save, bookmark, share, export_bib, revisit) with dwell_ms, scroll_depth, device_context; negative (dismiss, mute_cluster, hide_author, explicit_dislike) with reason_code, layer_applied; profile ops (ewma_step, cluster_rebuild, medoid_shift) with α_used, silhouette_delta; model ops with per-stage latency; health/error events (empty_candidate_set, stale_profile_warning, popularity_fallback_triggered). **Log p50/p95/p99 latency percentiles per stage.** Nightly aggregations for SLO dashboards: personalized-to-popularity ratio (target ≥0.85 after day 7), cluster-share Gini (alert >0.7), exploration-slot fire rate (target 1/10 ±50% drift alert), per-cluster dismiss rate (>35% → mute candidate), save-to-click ratio, α_short day-over-day distance (alert if zero for 14 days), time-between-sessions (detects lapsed users).
|
| 248 |
+
|
| 249 |
+
### C.5 Product principles
|
| 250 |
+
|
| 251 |
+
Netflix North-Star thinking (Gibson Biddle) suggests **"saves per active week"** as ResearchIT's primary leading indicator — tied to customer value, directly moves α_long, not gameable by dismissals. Spotify contextual-session principle: a skip in stay-current mode ≠ a skip in lit-review mode. Pinterest tail-latency principle: operational metrics on p99 not mean. Stated principles for ResearchIT: every save must measurably move the profile; dismissals are always 1-click, always logged as both feature and label; three-layer negatives have distinct half-lives (session/α_neg=0.15/persistent-until-unmuted); context qualifies every signal; exploration is a budget not an afterthought; cluster balance beats global top-K for cross-disciplinary users; cold-start is active not passive (Scholar Inbox pattern); latency SLOs on p99; stale profiles must announce themselves; never dark-launch a ranker change without a popularity-baseline A/B.
|
| 252 |
+
|
| 253 |
+
### C.6 Mode-switching / intent-conditioned recommendation
|
| 254 |
+
|
| 255 |
+
Broder (SIGIR Forum 2002) navigational/informational/transactional extends to informational-narrow (lit-review) vs informational-broad (stay-current). **Jannach, Mobasher et al. TORS 2024 (arXiv 2406.16350) "A Survey on Intent-Aware Recommender Systems"** is the 2024 anchor — categorizes diversification-based, intent-prediction, and latent-intent modeling; identifies gap of offline-only evaluation. RecSys 2024 reproducibility study "A Worrying Reproducibility Study of Intent-Aware Recommendation Models" is cautionary: most intent-aware claims don't replicate. **Industry validates explicit mode switching over fully-latent intent** (Pinterest Homefeed vs Related-Pins vs Search; Spotify Deep-Focus vs What's-New).
|
| 256 |
+
|
| 257 |
+
Recommendation: start with an **explicit two-mode toggle** in UI ("Stay Current" / "Lit Review"): stay-current has high MMR λ, per-cluster quota on, small popularity prior, 10-min session TTL; lit-review has low MMR λ, high single-cluster depth, citation-chain exposure, 60-min session TTL. Add latent intent fallback: if session shows 3 consecutive clicks into one cluster with long dwells, quietly switch to lit-review. Defer sophisticated latent-intent models.
|
| 258 |
+
|
| 259 |
+
### C.7 Failure modes and detection
|
| 260 |
+
|
| 261 |
+
Chaney, Stewart, Engelhardt (RecSys 2018) prove feedback loops amplify homogeneity; Mansoury et al. (CIKM 2020) quantify bubble intensification across rounds; Nguyen et al. (WWW 2014) first longitudinal filter-bubble measurement; Tang et al. (arXiv 2508.11239, Aug 2025) "Mitigating Filter Bubble from Community Detection" defines filter-bubble index = fraction of recs inside user's own community — **directly operationalizable using Ward clusters as the Louvain analog**. Drop-in detection rules:
|
| 262 |
+
|
| 263 |
+
| Failure | Detection rule | Mitigation |
|
| 264 |
+
|---|---|---|
|
| 265 |
+
| Feed collapse | 7-day rolling cluster-share Gini >0.7 OR top-cluster share >0.6 | Force MMR λ up; inject exploration; cap per-cluster at 40% |
|
| 266 |
+
| Stale profile | α_short unchanged for 14 days AND last session >30 days | "Refresh interests" card; boost popularity prior; Scholar-Inbox-style re-prompt |
|
| 267 |
+
| Cluster fragmentation | Cluster count >K_max OR >40% clusters with <3 neighbors | Lower Ward threshold; merge |
|
| 268 |
+
| Cluster over-merging | Silhouette week-over-week Δ <−0.15 | Raise Ward threshold; split top-variance cluster |
|
| 269 |
+
| Filter bubble | Filter-bubble index >0.95 for 30 days | Cross-cluster sampling; raise exploration budget |
|
| 270 |
+
| Popularity collapse | popularity_fallback >0.2 DAU/day | Ranker may be broken; verify LightGBM not degenerate |
|
| 271 |
+
| Latency regression | p99 > SLO for 1h | Standard SRE playbook |
|
| 272 |
+
| Dismissal ineffective | In-cluster rec rate within 7 days of dismiss > baseline | Verify three-layer pipeline; check layer-2 re-training |
|
| 273 |
+
| Feedback-loop amplification | Avg pairwise served-item similarity trending up 4+ weeks | CD-CGCN community-aware negative sampling |
|
| 274 |
+
| Cold-start stuck | Personalized score share <0.3 at day 7 | Push active-learning prompts; lower warm threshold |
|
| 275 |
+
|
| 276 |
+
## D. Phase 5 preview at Phase-4-level detail
|
| 277 |
+
|
| 278 |
+
### D.1 Epsilon-greedy exploration
|
| 279 |
+
|
| 280 |
+
**Spotify BaRT** (McInerney, Lacker, Hansen, Higley, Bouchard, Gruson, Mehrotra; RecSys 2018; DOI 10.1145/3240323.3240354) is the canonical reference. Two-stage contextual bandit over Home shelves (rows + explanations) and cards (playlists). Reward = factorization machine over user × item × explanation × context features predicting a binary stream event (≥30s listen). Epsilon-greedy per-slot: with probability ε pick uniformly among candidates, otherwise argmax. Conditional exploration separates "explore the item" from "explore the explanation" sharing one reward model — this keeps propensities tractable. Training uses counterfactual risk minimization with IPS on logs. Heavier exploration for new users, lighter for established.
|
| 281 |
+
|
| 282 |
+
**Pinterest "Warmer for Less"** (arXiv 2512.17277, Dec 2025) targets industrial cold-start items: **targeted lightweight augmentations (~+5% params) to the main model can match heavier bespoke approaches**. Strongly validates leaning on BGE-M3 content embeddings + light corrections for new arXiv papers rather than a separate CF/graph cold-start pipeline.
|
| 283 |
+
|
| 284 |
+
Literature consensus on exploration budget clusters at **5–15%, with 10% as default**. For ResearchIT:
|
| 285 |
+
|
| 286 |
+
- **Pre-launch → 100 users: ε-greedy at ε=0.10, slot-reservation pattern** (reserve 1/10 feed slots for exploration candidates — cleaner and lower-variance than per-slot coin flips).
|
| 287 |
+
- **100–500 users: stratified exploration** (ε distributed over arXiv primary categories the user hasn't engaged with × medoid-to-item cosine uncertainty).
|
| 288 |
+
- **500–1K users, >1K eng/week: Beta-Bernoulli Thompson sampling at category level.**
|
| 289 |
+
- **>5K users, >10K eng/week: neural-linear bandit (mtNLB-style, KDD 2024 DOI 10.1145/3637528.3671649) reusing LightGBM scorer as representation — only if ε-greedy shows regret plateau.**
|
| 290 |
+
|
| 291 |
+
Thompson vs ε-greedy: Chapelle & Li (NeurIPS 2011) and Vermorel & Mohri (2005) show vanilla ε-greedy routinely matches or beats TS/UCB at small N. TS at item level across 1.6M items with <1K users is infeasible; TS at category or cluster level is tractable. Other contextual bandit references: LinUCB (Li, Chu, Langford, Schapire WWW 2010); NeuralUCB (Zhou ICML 2020); NeUClust (Atalar et al. arXiv 2410.14586, Oct 2024) — contextual-combinatorial for list recommendations; ENR (CIKM 2023) epistemic neural nets for scalable TS; Ban/Qi/He WebConf 2024 tutorial.
|
| 292 |
+
|
| 293 |
+
### D.2 LightFM collaborative filtering
|
| 294 |
+
|
| 295 |
+
LightFM (Kula 2015, arXiv 1507.08439) is legacy-but-still-competitive; in 2026 it remains perfect for Render's CPU-only deployment because every user/item embedding is a sum of feature embeddings (including a unique-ID feature), enabling **strong cold-start with metadata — exactly ResearchIT's setting**. Alternatives: implicit ALS (industrial baseline but no cold-items); LightGCN (SIGIR 2020 arXiv 2002.02126, ~16% avg lift on standard datasets but training overhead); two-tower (Google, needs GPU); UltraGCN (marginal gains). 500-user rule-of-thumb: LightFM with WARP loss crosses above content-only when users×interactions >5K; at 500 users × ~10 positive interactions = ~5K, exactly threshold.
|
| 296 |
+
|
| 297 |
+
**Integration: Pattern 2 (CF score as a LightGBM feature).** Spotify and Pinterest production consistently run CF + content-based candidate generators in parallel with a learned ranker blending them; within the ranker, CF is one feature among many. This gracefully handles users with weak CF signal because LightGBM learns to down-weight it. Don't do separate quota slots (worst at blending score scales). Warm-start uses LightFM's feature-averaging: a new user with claimed research categories/authors gets a warm embedding without any interaction history.
|
| 298 |
+
|
| 299 |
+
### D.3 Dismissal-labeled LightGBM retraining
|
| 300 |
+
|
| 301 |
+
**Minimum viable signal: ~1K dismissal events total** to distinguish systematic item-level dismissals from session noise. **For LightGBM retraining with dismissals as labels: ~10K events.** At 500 users × 50 impressions/week × 5% dismissal rate = ~1,250 dismissals/week → ~10K takes ~8 weeks of steady use if all 500 users are active every week (proportionally longer at lower weekly activity). **Action: add dismissals as features now; add as labels only at scale.** Asymmetric loss via LightGBM's `is_unbalance=True` or explicit `scale_pos_weight`; a dismissal costs more than a missed save because it actively damaged the session. Focal loss (Lin et al.) and class-balanced loss (Cui CVPR 2019) supportable via LightGBM custom objective but only worth it when imbalance exceeds ~1:20.
|
| 302 |
+
|
| 303 |
+
Session-overfitting mitigations: include "fraction of session slots dismissed so far" and "dominant category of session dismissals" as features so LightGBM can learn to discount anomalous sessions; decay dismissal weight by session-age; **within-session negative sampling** (contrast dismissed items against other items shown *in the same session*, not global catalog) — the Wang et al. 2023 pattern. IPS/SNIPS/DR corrections require propensity logging from day 1; for ResearchIT's known policy, exploration slots have propensity = ε / num_candidates, exploit slots ≈1. Apply 99th-percentile weight clipping. SNIPS is the best default (Eugene Yan's benchmarking); DR via Open Bandit Pipeline for robustness; arXiv 2509.00333 (Sept 2025) IPS-weighted BPR + propensity regularizer is a concrete code pattern.
|
| 304 |
+
|
| 305 |
+
### D.4 Other Phase 5+ previews
|
| 306 |
+
|
| 307 |
+
**Semantic IDs / TIGER** (Rajput et al. NeurIPS 2023, arXiv 2305.05065): item = tuple of discrete codewords from RQ-VAE over content embedding; Transformer seq2seq decodes next-item autoregressively. +29% NDCG@5, +17.3% Recall@5 on Beauty vs S³-Rec. **ActionPiece** (Hou et al. ICML 2025 Spotlight, arXiv 2502.13581) is context-aware tokenization (same action → different tokens depending on neighbors) and outperforms TIGER-style context-independent semantic IDs. Spotify Research Sept 2025 "Semantic IDs for Generative Search and Recommendation" (Penha et al.) shows task-specific Semantic IDs fail to generalize cross-task. **Would TIGER work on CPU for 1.6M corpus?** RQ-VAE training is feasible (hours), but autoregressive Transformer decoding with beam=10 hits hundreds of ms/request on Render CPU. **Defer indefinitely** — it solves embedding-table-cost at scale, which is not ResearchIT's pain. Entry threshold: >10K users AND ANN on 1.6M becomes the bottleneck AND a GPU becomes available.
|
| 308 |
+
|
| 309 |
+
**PinnerFormer** (Pancha et al. KDD 2022, arXiv 2205.04507): single-vector user embedding from transformer over recent engagement sequence; novel dense-all-action loss predicts a random positive action within a 14-day future window from any random sequence position. Batch daily inference closes most of the gap to realtime (0.243 vs 0.251 Recall). **Defer indefinitely for solo-dev pre-launch.** A cheap equivalent is mean of BGE-M3 vectors over recent engagements — already what Amin's medoid retrieval does (PinnerSage's original approach). Entry threshold: ≥10K users AND ≥50 avg interactions/user AND a clear need for sequence modeling AND GPU availability.
|
| 310 |
+
|
| 311 |
+
**DPP / Sliding Spectrum Decomposition.** Classic DPP: Kulesza & Taskar 2011; Chen, Zhang, Zhou KDD 2018 (YouTube-scale). SSD: Huang, Wang, Peng, Wang KDD 2021 (arXiv 2107.05204) — originally Xiaohongshu, adopted by Pinterest in early 2025. Pinterest's April 2026 engineering blog ("Evolution of Multi-Objective Optimization at Pinterest Home feed") documents DPP → SSD migration with >2% time-spent-impression week-1 lift. SSD in PyTorch is cleaner than DPP (avoids PSD enforcement, log-dets, Cholesky stability). **For ResearchIT: MMR is fine at 500 users.** Upgrade entry threshold: feed size ≥20 AND ≥2 diversity axes (category × recency × reading-difficulty) AND visible user complaints of "too-similar" results >5% rate.
|
| 312 |
+
|
| 313 |
+
**Calibration of LightGBM scores.** Default binary log-loss training is often near-calibrated; miscalibration mostly appears with `lambdarank`/`rank_xendcg` objectives — then calibration is **essential before multi-objective fusion or thresholding**. Platt scaling (sigmoid(a·score + b)) is small-data-friendly and parametric; isotonic regression is non-parametric and needs ~≥1K calibration points; beta calibration (Kull, Silva Filho, Flach AISTATS 2017) sits between. LinkedIn's in-model isotonic calibration layer and Google's "Scale Calibration of Deep Ranking Models" (Yan et al. KDD 2022) are recent pointers. **For ResearchIT:** isotonic regression on held-out 10–20% of training interactions, refit weekly. When it matters: thresholding (p(save)>0.3), ranking-fusion (combining CF + LightGBM + exploration bonus). When it doesn't: pure ranking by raw LightGBM output. Do this right after 4b (~2 days of work).
|
| 314 |
+
|
| 315 |
+
**Active learning for cold-start.** Nature Scientific Reports 2025 "Active learning algorithm for alleviating the user cold-start problem of recommender systems" uses decision-tree-based item selection with Like/Dislike/Unknown answers, 20-query cap, 3 like-constraints per user — but found online evaluation with 50 real users could not confirm offline lift. MDPI 2024 review and CIKM 2025 "Harnessing Light for Cold-Start Recommendations" confirm uncertainty+popularity hybrid queries as dominant pattern. **Practical pattern for ResearchIT: 2×3 grid at signup** — 2 triplets of 3 papers each spanning 6 arXiv subfields, user picks best per triplet, yielding a seed medoid from ~2 queries. This is Netflix's post-signup "pick 3 you like" flow. Entry threshold: ≥50 signups/week AND measurable onboarding drop-off.
|
| 316 |
+
|
| 317 |
+
### D.5 Scaling infrastructure (SQLite → Supabase)
|
| 318 |
+
|
| 319 |
+
SQLite's single-writer lock ceiling: **~50 writes/second with WAL on SSD, ~10 in default mode**. Any long INSERT blocks all writes. FTS5 shares this limit. For ResearchIT at 500 users × 50 events/session × few sessions/week, still fine. Breaks when: concurrent cluster-snapshot writes + live event logging conflict; >100 concurrent users with mutable state; ML-training jobs run alongside API writes. Supabase Postgres features for recsys: pgvector 0.7 with halfvec (50% memory savings) and parallel HNSW builds (30× faster); Row-Level Security for lab/team multi-tenancy (one `lab_id` column, policy `lab_id = auth.jwt()->>'lab_id'`); realtime subscriptions. Free tier is 500MB; paid starts ~$25/mo.
|
| 320 |
+
|
| 321 |
+
**Migration trigger: hit ~500MB SQLite OR visible writer contention OR concurrent cluster-snapshot + event-log conflicts.** Use immutable snapshot tables (`clusters_v42`, `clusters_v43`) with pointer-table atomic swap; Qdrant/Zilliz collection aliases for zero-downtime rebuilds; keep last 2 snapshots for rollback. Vector cache invalidation: version cluster_snapshot_id on cached candidates; background job refills.
|
| 322 |
+
|
| 323 |
+
### D.6 A/B testing at ~500 users
|
| 324 |
+
|
| 325 |
+
Statistical power at N=500 (α=0.05 two-sided, 80% power, 50/50 split, i.e. 250 users per arm): binary metric with baseline p=0.10 has **MDE ≈ 8–9 percentage points absolute** (10% → ~18.5%, ~85% relative; an MDE of 5.5 points would need roughly 550+ users per arm); continuous metric MDE ≈ 0.25σ Cohen's d. **Only large lifts are detectable at this scale.** CUPED (Deng, Xu, Kohavi, Walker WSDM 2013) reduces required N by 2–3× on predictable metrics; 2024/2025 extensions include arXiv 2410.09027 (Lin & Crespo, Etsy) and arXiv 2510.03468 (CUPED + trimmed mean for heavy tails).
|
| 326 |
+
|
| 327 |
+
**For solo pre-launch: scipy.stats + evidently.ai-style notebook now.** GrowthBook (self-hosted, open-source, SQL-based) is the right upgrade at ≥1K users with ≥1 concurrent experiment/month. Skip Statsig (vendor dependency). Skip switchback unless adding shared team feeds where spillover matters. Experiment templates: exploration-% ablation (5/10/15 with primary = 7-day save rate, secondary = session length + dismissal rate); CF on/off at 50/50 user-level randomization; dismissal-feature vs dismissal-label over ≥4 weeks.
|
| 328 |
+
|
| 329 |
+
### D.7 Multi-tenancy / group recommendation
|
| 330 |
+
|
| 331 |
+
Masthoff (2015 survey) taxonomy holds: Average/Additive Utilitarian; **Least Misery** (good for veto scenarios like "labmate dislikes biology → don't recommend to whole lab"); Most Pleasure; **Average Without Misery** (recommended compromise — average but filter below per-individual threshold); Approval Voting / Borda / Kemeny rank aggregation. Fairness-aware 2024–2025: Stratigi et al. (JIIS 2021) SDAA/SIAA sequential satisfaction-balancing; FAccT 2025 "Group Fair Rated Preference Aggregation: Ties Are (Mostly) All You Need" (Fate-Break and Fate-Rate). LLM-based group rec 2025: arXiv 2505.05016 "Pitfalls of Growing Group Complexity" — LLMs often implicitly do Average; explicit prompts for Least Misery change behavior. Academic-collaboration-context group-RS papers remain rare — you'd be doing mostly greenfield work.
|
| 332 |
+
|
| 333 |
+
**Recommendation for ResearchIT**: **Average-Without-Misery with a tunable misery threshold**, enforced via Postgres RLS per-lab. Lab profile surfaces only aggregate signals (counts, category histograms) — never individual read/save events — unless explicitly opted-in; GDPR consent language must be explicit because "labmate X saved this" is a personal-data disclosure. Entry threshold: real user demand (multiple lab opt-ins requested) post-launch; **not in Phase 5 core scope**.
|
| 334 |
+
|
| 335 |
+
## E. Offline evaluation scale-up
|
| 336 |
+
|
| 337 |
+
**Regression testing in CI.** Frozen eval set as a Git-LFS artifact with version-pinned manifest (split date, author allowlist, citation-pair count, dataset hash) — never mutate without bumping `eval_set_v1.0.parquet → v1.1.parquet`. Pytest + GitHub Actions on every PR touching `retrieval/rerank/rank/diversify/`. Threshold-based assertions: hard fail if nDCG@10 drops >3% absolute or Recall@50 drops >2%; soft warn (xfail strict=False) if ILS/entropy moves >10%. Use bootstrap 1000-replicate 95% CIs to fail only when the baseline is excluded. PRs that intentionally move metrics must update `eval/baselines/main.json` with an `EVAL_DELTA_JUSTIFICATION`. CPU budget: freeze to 5k-query subsample (~5 min on Render free tier); full eval is nightly cron. **Tooling: DIY pytest now (~200 LOC, zero deps).** As a later upgrade, **Evidently AI** (open-source) has a built-in GitHub Action wrapping Python tests and failing CI on threshold violations with 15+ ranking metrics. DeepEval is overkill for ranking.
|
| 338 |
+
|
| 339 |
+
**Per-stage attribution.** IJCAI-22 "Neural Re-ranking in Multi-stage Recommender Systems: A Review" and Pinterest's WebConf 2023 "End-to-End Diversification" paper: each stage needs its own intermediate ground truth plus a joint evaluation. For ResearchIT: retrieval = Recall@200 (ceiling for all downstream); rerank = nDCG@50 on retrieved set + Precision@10; diversify = ΔnDCG@10 and ΔILS/entropy pre-vs-post. Log `stage_metrics.jsonl` per eval with `{run_id, stage, metric, value, params_hash}`; a "regression diagnosis" script compares PR vs main across stages. Hron et al. 2021 "On component interactions in two-stage recommender systems" is the theoretical grounding — retrieval-rerank interactions are non-trivial. Pinterest reports retrieval-layer diversification gives +8% diversity in candidate set but only +1% at final rank — stage-specific diversity deltas matter.
|
| 340 |
+
|
| 341 |
+
**Experiment tracking.** Append-only `eval_runs.jsonl` now (`{run_id, git_sha, timestamp, dataset_hash, config_hash, metrics, stage_metrics}` with Streamlit/Jupyter for plotting). Adopt MLflow locally (SQLite backend) at Phase 4b when distillation creates many hyperparameter-tuning runs. Skip W&B unless/until a collaborator appears (free tier fine but cloud dependency). Skip DVC (Git-LFS + manifests cover 80% of value). Signal to upgrade from JSONL to MLflow: "I can't find the run from 3 weeks ago in grep."
|
| 342 |
+
|
| 343 |
+
**Synthetic user generation.** RecSim NG (Google 2021), RecBole simulators, **Balog & Zhai 2024 "User Simulation for Evaluating Information Access Systems" (Foundations & Trends, 261 pages) is the foundational survey**. 2025 LLM-agent simulators: UserSimCRS v2 (Balog & Zhai 2025), RecUserSim (Chen et al. WWW 2025 arXiv 2507.22897). Sim4IA workshop at SIGIR 2024 is the community reference. Concrete plan: extract 2–5k author personas from unarXive 2022 author graphs spanning deep specialists, bridge authors, early-career, prolific surveyers, methodology-transfer; choice model `p(save|paper) = σ(α·cos(paper,centroid) + β·cited_by_persona + γ·recency − δ·already_saved)`; add drift by slightly updating centroid with each saved paper. Evaluation: longitudinal nDCG trajectories, calibration of saved/dismissed ratio (expect 15–25%), exploration metric for bridge authors. Budget 2–3 weeks; start 100 personas × 30 days, scale to 2k later. Always triangulate against held-out real data.
|
| 344 |
+
|
| 345 |
+
**Cluster evaluation.** Silhouette coefficient + Davies-Bouldin index daily (Chicco et al. 2025 PeerJ — SC+DBI superior to Dunn/CH on convex clusters). Stability across time is the production-critical metric: Hungarian match day-over-day via `scipy.optimize.linear_sum_assignment` with cost = −|C_i ∩ C_j|; per-cluster Jaccard after matching; aggregate mean Jaccard and fraction with J≥0.8. Complement with Adjusted Rand Index across consecutive days and object-level stability (Toms et al. WorldCat; Toussi 2017). Alert threshold: mean Jaccard <0.7 for 3 consecutive days. **Cluster snapshot versioning is architecturally necessary before Phase 4a** because summaries will be keyed to cluster IDs.
|
| 346 |
+
|
| 347 |
+
**Counterfactual evaluation.** Required from day 1 of Phase 4 — every displayed recommendation must log `p(shown|context)` under the active policy. Without propensities, IPS/SNIPS/DR are retroactively impossible. Inject 5% ε-greedy exploration for non-degenerate propensities. Estimator choice (per Eugene Yan benchmarking + JTIE 2025 reproducible study): **SNIPS is best default** (no hyperparameter, lower variance than IPS); Direct Method alongside for low-variance potentially-biased imputation; Switch-DR in moderate-overlap regimes. **Tooling: Open Bandit Pipeline** (Saito et al.) in Python. JTIE 2025 reporting template: always report oracle decomposition, overlap diagnostics, estimator components, and effective sample size. **ESS <100 = unreliable; don't ship.**
|
| 348 |
+
|
| 349 |
+
## F. Planning and requirements
|
| 350 |
+
|
| 351 |
+
### F.1 Architectural decisions blocking Phase 4 start (ADR sprint, week 0)
|
| 352 |
+
|
| 353 |
+
These seven decisions must be captured as ADRs *before* any Phase 4 code lands:
|
| 354 |
+
|
| 355 |
+
- **A1 Cluster snapshot versioning.** SQLite table `cluster_snapshots(snapshot_date, cluster_id, paper_id, centroid_blob)`, 30-day retention, Hungarian-matched stable IDs as separate column. Without this, Phase 4a cache invalidation is guesswork.
|
| 356 |
+
- **A2 Per-user vs shared cluster summaries.** **Recommended: shared.** Each cluster's summary is cached once per `(cluster_stable_id, snapshot_date)`. Per-user summaries add 3–5× Claude cost with marginal UX gain pre-launch. Shared ≈$50–80/month; per-user easily $500+. This choice is hard to change later because it requires a schema migration.
|
| 357 |
+
- **A3 LightGBM v1 vs v2.** **Recommended: one-stage LambdaMART in 4b; two-stage deferred to Phase 5.** Single LambdaMART over {bi-encoder score, BM25, recency, category match, author overlap} captures 80% of two-stage value at 30% complexity.
|
| 358 |
+
- **A4 Telemetry event schema v1 (frozen before any logging).** Minimum fields: `event_id, user_id, session_id, timestamp, event_type, paper_id, position, cluster_id, cluster_stable_id, policy_id, propensity, ranker_version, rerank_version, candidate_source, ab_bucket`. Retrofitting is painful. OpenTelemetry OTEP 0152/0243 on schema evolution are the canonical references.
|
| 359 |
+
- **A5 Eval-set version pinning + baseline format.** `eval/baselines/main.json`, `eval/eval_set_v1.0.parquet`; PRs that move metrics update both.
|
| 360 |
+
- **A6 Distillation training-data boundary.** Commit before 4b to: teacher (BGE-reranker-v2-m3), query distribution (must NOT overlap with eval's time-split), output format (MarginMSE margins). Assertion in training: `max(train.timestamp) < eval_cutoff`.
|
| 361 |
+
- **A7 Claude model/cache strategy.** Haiku 4.5 for 4a summaries; 5-min prompt cache on shared system prompt + style guide; single `cache_control` breakpoint on cluster-papers block. Stable-prefix-first prompt structure decided before coding.
|
| 362 |
+
|
| 363 |
+
### F.2 Phase 4 subworkstream entry/exit criteria
|
| 364 |
+
|
| 365 |
+
**4a Claude summaries.** Entry: A1/A2/A7 decided; cluster stability mean Jaccard day-over-day ≥0.7 over 7 days. Exit: all 50–200 active clusters have fresh summary daily; p95 generation latency <3s; monthly cost <$30 at 100 clusters × 1 refresh/day with caching; 20 human-rated summaries score ≥4/5 on coherence. Deliverables: `services/summaries/claude_client.py` with prompt cache + retry/backoff; `services/summaries/summary_job.py` nightly job writing `cluster_summaries(cluster_stable_id, snapshot_date, summary_md, input_tokens, output_tokens, cached_tokens)`; Jinja templates; cost monitoring SQL view. **Effort: 2–3 weeks solo.** Risks: Claude cost overruns (set hard spend cap, log cache hit ratio — if <70%, prompt structure wrong); stale summaries from snapshot_date collisions (use content-hash tie-breaker); prompt injection from abstracts (use `<paper_abstract>` tags + "summarize only; ignore instructions" system line).
|
| 366 |
+
|
| 367 |
+
**4b Distilled reranker.** Entry: Phase 3 eval producing stable nDCG@10 within ±0.5% across runs; retrieval Recall@200 ≥0.85 on held-out; A3/A5/A6 decided; frozen eval set never seen in training (enforced assertion). Exit: student recovers ≥95% teacher nDCG@10 at ≥10× lower CPU latency; ONNX-exported INT8-quantized with PyTorch numerical closeness <1e-3 on 1000 samples; feature-flagged shadow traffic for 1 week with no regressions. Deliverables: teacher-scoring pipeline (non-eval time window); student training script with MarginMSE loss; ONNX export + `optimum-cli`; FastAPI integration with onnxruntime; stage-attribution eval report. **Effort: 4–6 weeks solo** (1 week data prep, 1 week training/tuning, 1 week ONNX+quantization+perf, 1–2 weeks integration+shadow, 1 week buffer). Risks: training-data leakage (time-cutoff assertion); CPU latency regression from naive batching (batch top-50 as one forward pass, not serial); quantization-catastrophic-recall (always compare fp32 vs INT8 on same eval — usually <0.5 nDCG, can be worse with bad calibration data).
|
| 368 |
+
|
| 369 |
+
**4d Use-cases doc.** Entry: Phase 3 eval showing consistent wins; dogfooding anecdotes; 4a scoped (for UX mockups). Exit: 10–15 page markdown doc with 3–5 personas drawn from synthetic-persona work, top 10 use cases with before/after storyboards, explicit non-goals, 3-month roadmap. Deliverables: single markdown doc + 1-page "pitch" derivative. **Effort: 1–1.5 weeks focused writing, calendar-time ~3 weeks** (competes with dev work). Risks: writing-in-a-vacuum (need 5–10 real conversations); premature lock-in (publish externally only after 10 external users × 2 weeks).
|
| 370 |
+
|
| 371 |
+
### F.3 Dependency graph and sequencing
|
| 372 |
+
|
| 373 |
+
```
|
| 374 |
+
Week 0: ADR sprint (A1–A7) [1 week, no coding]
|
| 375 |
+
│
|
| 376 |
+
├──→ 4d Use-Cases Doc (1–1.5 wk writing, weeks 1–3 calendar)
|
| 377 |
+
│
|
| 378 |
+
├──→ 4a Claude Summaries (2–3 wk, weeks 1–3) — needs A1, A2, A7
|
| 379 |
+
│
|
| 380 |
+
└──→ 4b Distilled Reranker (4–6 wk, weeks 4–9) — needs A3, A5, A6
|
| 381 |
+
```
|
| 382 |
+
|
| 383 |
+
**Sequencing rationale: 4a first (cheapest, most visible, low risk, UI-validating, infrastructure reused by 4d); 4d in parallel (writing surfaces missing features); 4b last (largest quality lift but biggest risk, benefits from 4a UI being in prod and 4d clarifying what matters).** Add 30% buffer to every estimate — solo-dev posts uniformly show actual timelines are 1.5–2× initial estimates. **Realistic Phase 4 total: 10–12 weeks with parallelization and buffer; ~8 weeks if nothing breaks (it will).**
|
| 384 |
+
|
| 385 |
+
Week-by-week plan:
|
| 386 |
+
|
| 387 |
+
| Week | 4a | 4b | 4d | Cross-cutting |
|
| 388 |
+
|---|---|---|---|---|
|
| 389 |
+
| 0 | — | — | — | ADR sprint A1–A7 |
|
| 390 |
+
| 1 | Claude client + cache | Teacher scoring script | Persona draft | CI regression harness v1 |
|
| 391 |
+
| 2 | Nightly summary job + DB | 500k-pair sampling + MarginMSE training | Use case storyboards | Synthetic persona sim v0 |
|
| 392 |
+
| 3 | UI integration + human eval | Training runs (MLflow) | External review + polish | Stage-attribution diagnostic |
|
| 393 |
+
| 4 | Cost polish; freeze | ONNX + INT8 export | done | — |
|
| 394 |
+
| 5 | monitoring buffer | CPU perf optimization | — | Cluster stability alerts live |
|
| 395 |
+
| 6 | — | FastAPI integration + flag | — | — |
|
| 396 |
+
| 7 | — | Shadow traffic + debug | — | — |
|
| 397 |
+
| 8 | — | Full rollout + eval report | — | Phase 4 retrospective |
|
| 398 |
+
| 9–10 | — | buffer | — | Plan Phase 5 entry threshold review |
|
| 399 |
+
|
| 400 |
+
### F.4 "Good enough" exit criteria
|
| 401 |
+
|
| 402 |
+
4a: summaries ship to 100% of clusters, cost within budget, no correctness incidents for 2 weeks. 4b: ≥95% teacher nDCG@10 recovery, CPU p95 <200ms for top-50 rerank, 1 week of clean shadow traffic. 4d: 3 external readers provide feedback → 1 revision → published. General rubric for solo dev: primary objective met + smallest acceptable safety net = ship. Resist the "perfect" standard — solo devs chasing "done" on every phase never launch. Log tech debt in `TODO.md`; every 6–8 weeks, run a 2-week refactoring cycle (Matt Robertson solo-dev pattern).
|
| 403 |
+
|
| 404 |
+
### F.5 Phase 5 entry thresholds
|
| 405 |
+
|
| 406 |
+
| Workstream | Entry threshold | Rationale |
|
| 407 |
+
|---|---|---|
|
| 408 |
+
| ε-greedy exploration | **Day 1 of Phase 4 (even with 1 user)** | Required architectural decision, not future workstream — without exploration no propensities, without propensities no retrospective IPS |
|
| 409 |
+
| LightFM / hybrid CF | ≥100 users OR ≥500 saves total | CF beats pure content only once interaction signal overlaps; below ~500 saves, content+recency wins |
|
| 410 |
+
| Dismissal retraining (as labels) | ≥5K dismissal events AND propensity-logged | Fewer means IPS variance explodes (ESS<100); propensities must come from day 1 or impossible to apply later |
|
| 411 |
+
| Semantic IDs (TIGER) | ≥10K users AND ANN bottleneck measurable AND GPU available | Solves embedding-table-cost at scale — not ResearchIT's pain at 10K users × 1.6M papers |
|
| 412 |
+
| PinnerFormer | ≥10K users AND ≥50 avg interactions/user AND basic sequence features built AND GPU available | Dense-all-action loss needs 14-day future prediction window per user; <50 interactions/user has nothing to learn |
|
| 413 |
+
| DPP / SSD diversity | MMR clustering complaints >5% of user feedback | 500+ LOC complexity not worth it until MMR visibly fails |
|
| 414 |
+
| Calibration (isotonic) | Before any multi-objective score fusion | ~2 days of work; schedule right after 4b |
|
| 415 |
+
| Active learning onboarding | ≥50 signups/week AND measurable funnel drop-off | Nature 2025 study couldn't confirm offline lift online with 50 real users |
|
| 416 |
+
| SQLite → Supabase | ~500MB DB OR writer contention OR cluster-job + event-log collisions | SQLite fine for ResearchIT workload until one of these fires |
|
| 417 |
+
| GrowthBook (from scipy) | ≥1K users AND ≥1 concurrent experiment/month | scipy + notebook covers pre-launch |
|
| 418 |
+
| Lab/group profiles | Multiple explicit lab opt-in requests post-launch | Not in Phase 5 core; greenfield for academic context |
|
| 419 |
+
|
| 420 |
+
### F.6 Cross-cutting risks
|
| 421 |
+
|
| 422 |
+
Telemetry gaps bite hardest in Phase 5 (IPS impossible without propensities): **freeze schema before any logging (A4); include policy_id, propensity, shown_position, ranker_version**. Training data leakage produces phantom lift in 4b: eval-time-cutoff assertion in training script; never use eval queries as teacher-scoring queries. Claude cost overruns: Haiku + shared summaries + caching + hard dashboard cap + daily cost view. Cluster instability causes mis-cached summaries and UI label-jumping: Hungarian-matched stable IDs + Jaccard <0.7 alert. Solo-dev estimation drift: multiply all estimates by 1.5, parallelize ADR+writing with dev, commit to a hard "good enough" definition per workstream. Evaluation-overfit (CI green but real users unhappy): run synthetic-persona longitudinal sim alongside static eval; once you have real users, weight live metrics > offline. Eval-set rot: every 6 months recompute with new cutoff, bump version, re-baseline intentionally.
|
| 423 |
+
|
| 424 |
+
## Conclusion
|
| 425 |
+
|
| 426 |
+
Phase 4 is mostly a 10–12 week engineering effort bounded by two real constraints — solo-dev capacity and a 6ms CPU budget for the cross-encoder — and one architectural constraint: **every downstream Phase 5 workstream depends on decisions made in week 0 of Phase 4**. The ADR sprint is the non-negotiable entry gate. Within Phase 4, the highest-leverage sequencing is 4a (Claude summaries, shared-not-per-user, Haiku 4.5 + Batch API, ~$50–80/mo) in parallel with 4d writing, then 4b (distilled reranker, off-the-shelf TinyBERT-L-2-v2 INT8 ONNX first, distill only if held-out gap >3 nDCG, Option-C LightGBM integration). The novel contribution of Phase 4a is that **no other academic recommender currently shows personalized "You're reading about X" cluster narratives** — Scholar Inbox's shared labels are the closest analogue. The novel contribution of 4b for a solo dev is recognizing that the Shallow Cross-Encoders finding (SIGIR 2024) plus FlashRank's ONNX packaging pattern plus HF-shipped AVX-512-VNNI INT8 models means 6ms for 20 pairs on CPU is genuinely achievable without custom distillation — distillation is the more-complex fallback, not the default. For Phase 5, the single most valuable action that costs nothing now is **logging propensity and policy_id from day 1**, which unlocks SNIPS/DR counterfactual evaluation for every later workstream. The dismissal-as-label YouTube finding (Wang et al. 2023: 22% → 60.8% similar-content reduction when dismissals are both features AND labels) is the best-justified Phase 5 quality lever, but it needs ~10K dismissals and is ~80 weeks away at pre-launch scale — so in the interim, dismissals enter as features only, and the real Phase 5 quality investment should be (in order) calibration of LightGBM scores, ε-greedy exploration at 10%, stratified exploration by unused arXiv category, and LightFM-as-LightGBM-feature once interactions cross 5K. 
Everything else — TIGER, PinnerFormer, DPP, group rec, active learning, neural bandits — should be deferred until a specific production pain signal fires.
|
docs/walkthroughs/02-Phase2-MultiInterest-Recommender.md
CHANGED
|
@@ -17,7 +17,7 @@ EWMA profiles update (background, non-blocking)
|
|
| 17 |
↓
|
| 18 |
Ward clustering → K distinct interest medoids (auto K per user)
|
| 19 |
↓
|
| 20 |
-
Qdrant prefetch + RRF fusion (~15-25ms, single API call)
|
| 21 |
↓
|
| 22 |
Heuristic re-ranking of ~100 candidates (~1-2ms)
|
| 23 |
↓
|
|
|
|
| 17 |
↓
|
| 18 |
Ward clustering → K distinct interest medoids (auto K per user)
|
| 19 |
↓
|
| 20 |
+
Qdrant prefetch + RRF fusion (~15-25ms, single API call) [⚠️ Replaced by Quota Fusion in Phase 4]
|
| 21 |
↓
|
| 22 |
Heuristic re-ranking of ~100 candidates (~1-2ms)
|
| 23 |
↓
|
docs/walkthroughs/03-Code-Summary-and-Test-Plan.md
CHANGED
|
@@ -36,7 +36,7 @@ The current application is a fully functional FastAPI + HTMX research paper disc
|
|
| 36 |
|
| 37 |
## 2. Comprehensive Testing Plan
|
| 38 |
|
| 39 |
-
The current test suite has **
|
| 40 |
|
| 41 |
### A. Automated Testing (Current & Ongoing)
|
| 42 |
|
|
|
|
| 36 |
|
| 37 |
## 2. Comprehensive Testing Plan
|
| 38 |
|
| 39 |
+
The current test suite has **125 passing tests** (as of Phase 3.5) executing via `pytest`. Our testing strategy is split into three layers: Automated, Manual, and Analytics-based evaluation.
|
| 40 |
|
| 41 |
### A. Automated Testing (Current & Ongoing)
|
| 42 |
|
docs/walkthroughs/04-Next-Steps-and-Phase-Plan.md
CHANGED
|
@@ -15,29 +15,27 @@
|
|
| 15 |
|---|---|---|
|
| 16 |
| Qdrant Cloud (1.6M BGE-M3 papers) | ✅ Live | BQ enabled, HNSW m=32, `arxiv_bgem3_dense` collection |
|
| 17 |
| Phase 1: Zero-ML Recommender | ✅ Complete | Qdrant BEST_SCORE with raw IDs, 55 tests |
|
| 18 |
-
| Phase 2a: EWMA Profiles | ✅ Complete | Long-term (α=0.
|
| 19 |
-
| Phase 2b: Ward Clustering + Prefetch+RRF | ✅ Complete |
|
| 20 |
-
| Phase 2c: Heuristic Re-ranking + MMR | ✅ Complete |
|
|
|
|
|
|
|
| 21 |
| SQLite (interactions, profiles, clusters, metadata cache) | ✅ Live | WAL mode, async via aiosqlite |
|
| 22 |
| HTMX Frontend | ✅ Live | Search, save, dismiss, recommendations |
|
| 23 |
-
| Test Suite | ✅
|
| 24 |
|
| 25 |
### What's NOT Built Yet
|
| 26 |
|
| 27 |
| Component | Planned In | Blocked By |
|
| 28 |
|---|---|---|
|
| 29 |
-
| **
|
| 30 |
-
| Recommendation fixes (RRF→quota, α tuning) | Phase 4 | Code refactor only |
|
| 31 |
-
| LightGBM lambdarank re-ranker | Phase 6 | Need ≥500 labeled save/dismiss interactions |
|
| 32 |
| Cold-start onboarding (category picker / ORCID) | Phase 5 | Not yet designed |
|
| 33 |
-
|
|
| 34 |
-
| Pre-populated metadata store | Phase 4 | arXiv API is the latency bottleneck (~7.6s cold) |
|
| 35 |
| LLM interest summaries per cluster | Phase 8 | Needs Claude/Groq API integration |
|
| 36 |
|
| 37 |
-
> **Note
|
| 38 |
-
>
|
| 39 |
-
>
|
| 40 |
-
> hybrid search is the **#1 priority** for the next phase.
|
| 41 |
|
| 42 |
### Dataset Coverage
|
| 43 |
|
|
@@ -47,7 +45,7 @@
|
|
| 47 |
| Newest paper | `2505.04101` (~May 2025) |
|
| 48 |
| Total papers | 1,596,587 |
|
| 49 |
| Payload stored in Qdrant | `arxiv_id` only |
|
| 50 |
-
| Metadata source | arXiv API (
|
| 51 |
|
| 52 |
---
|
| 53 |
|
|
@@ -126,7 +124,7 @@ The 6 research documents contain several contradictions. Here is each one and it
|
|
| 126 |
|
| 127 |
This is what PinnerSage, Taobao ULIM, and Pinterest Bucketized-ANN actually deploy.
|
| 128 |
|
| 129 |
-
**Current status**: RRF is still in the codebase.
|
| 130 |
|
| 131 |
### 2. EWMA α_long = 0.10 vs 0.03
|
| 132 |
|
|
@@ -136,7 +134,7 @@ This is what PinnerSage, Taobao ULIM, and Pinterest Bucketized-ANN actually depl
|
|
| 136 |
|
| 137 |
**Resolution**: PinnerSage tested λ=0.1 and **explicitly rejected it as too recent-biased**. Their optimal was λ=0.01. Doc 06 recommends α_long=0.03 as a compromise.
|
| 138 |
|
| 139 |
-
**Current status**: α=0.
|
| 140 |
|
| 141 |
### 3. BGE-reranker-v2 in the Hot Path
|
| 142 |
|
|
@@ -231,46 +229,52 @@ Final results → fetch metadata → render
|
|
| 231 |
|
| 232 |
### Phase 4: Recommendation Pipeline Fixes (~1 week)
|
| 233 |
|
|
|
|
|
|
|
| 234 |
Corrections to the existing recommendation pipeline based on Doc 06's findings.
|
|
|
|
| 235 |
|
| 236 |
#### 4.1 Replace RRF with Importance-Weighted Quota Fusion
|
| 237 |
-
**Why**: RRF lets dominant clusters swamp minor interests — the exact failure mode multi-interest models exist to prevent.
|
| 238 |
|
| 239 |
-
**What to
|
|
|
|
|
|
|
|
|
|
| 240 |
|
| 241 |
-
**New flow**:
|
| 242 |
```
|
| 243 |
clusters = compute_clusters(...)
|
| 244 |
-
|
| 245 |
-
for each
|
| 246 |
-
|
| 247 |
-
candidates_k = qdrant search with medoid_k (limit = slots_k × 3)
|
| 248 |
-
rerank within cluster_k via LightGBM / heuristic
|
| 249 |
-
take top slots_k
|
| 250 |
-
deduplicate across clusters (assign to highest-ranked)
|
| 251 |
-
MMR over the merged union
|
| 252 |
```
|
| 253 |
|
| 254 |
-
#### 4.2 Tune α_long from 0.10 → 0.03
|
| 255 |
-
**Why**: PinnerSage explicitly rejected 0.10 as too recent-biased.
|
| 256 |
|
| 257 |
-
|
| 258 |
|
| 259 |
-
#### 4.3 Wire the Negative Profile
|
| 260 |
-
**Why**: Currently computed and stored but never used. YouTube showed a 3× gain from using dislikes as both features and labels.
|
| 261 |
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 267 |
|
| 268 |
-
|
| 269 |
-
**Why**: The arXiv API is the #1 latency bottleneck (~7.6 seconds cold for 50 papers).
|
| 270 |
|
| 271 |
-
|
| 272 |
|
| 273 |
-
|
| 274 |
|
| 275 |
---
|
| 276 |
|
|
@@ -379,8 +383,9 @@ Add LightFM hybrid model with switching strategy:
|
|
| 379 |
- ≥10 interactions: LightFM
|
| 380 |
Retrain LightGBM with dismissals as negative labels (YouTube's 3× gain from dual labels).
|
| 381 |
|
| 382 |
-
#### 9.3 Category-Level Negative Suppression
|
| 383 |
-
If ≥3 dismissals hit the same arXiv category within
|
|
|
|
| 384 |
|
| 385 |
---
|
| 386 |
|
|
@@ -388,17 +393,13 @@ If ≥3 dismissals hit the same arXiv category within a week, suppress that cate
|
|
| 388 |
|
| 389 |
If you can only do three things, do these:
|
| 390 |
|
| 391 |
-
### 1. Build hybrid semantic search (Phase 3)
|
| 392 |
-
**Impact**: Replaces the arXiv keyword API placeholder with real vector-based search. This is what the 1.6M BGE-M3 embeddings in Qdrant were built for. Transforms the product from a keyword aggregator into a semantic discovery engine.
|
| 393 |
-
**Effort**: 4 new service files + router swap. ~2-3 weeks.
|
| 394 |
|
| 395 |
-
### 2. Pre-populate the metadata store (Phase
|
| 396 |
-
**Impact**: Drops cold metadata fetch from 7,600ms to <5ms. Single biggest latency win.
|
| 397 |
-
**Effort**: Download Kaggle dataset, write a bulk-insert script, run once.
|
| 398 |
|
| 399 |
-
### 3. Replace RRF with quota fusion in recommendations (Phase 4.1)
|
| 400 |
**Impact**: Prevents the dominant cluster from drowning out minority interests. Fixes the core multi-interest failure mode.
|
| 401 |
-
**Effort**:
|
| 402 |
|
| 403 |
---
|
| 404 |
|
|
@@ -415,5 +416,7 @@ If you can only do three things, do these:
|
|
| 415 |
| — | [Phase 1 Walkthrough](PHASE1-Zero-ML-Recommender.md) | Zero-ML recommender code tour | ✅ Complete |
|
| 416 |
| — | [Phase 2 Recommender Walkthrough](02-Phase2-MultiInterest-Recommender.md) | Multi-interest engine implementation | ✅ Complete |
|
| 417 |
| — | [Code Summary & Test Plan](03-Code-Summary-and-Test-Plan.md) | Codebase summary and testing strategy | ✅ Complete |
|
| 418 |
-
| — | [Phase 2 Hybrid Search Plan](../phases/PHASE2-Hybrid-Search-Plan.md) | BGE-M3 + Zilliz hybrid search
|
|
|
|
|
|
|
| 419 |
| — | **This Document** | Revised phase plan synthesizing all research | ✅ Current |
|
|
|
|
| 15 |
|---|---|---|
|
| 16 |
| Qdrant Cloud (1.6M BGE-M3 papers) | ✅ Live | BQ enabled, HNSW m=32, `arxiv_bgem3_dense` collection |
|
| 17 |
| Phase 1: Zero-ML Recommender | ✅ Complete | Qdrant BEST_SCORE with raw IDs, 55 tests |
|
| 18 |
+
| Phase 2a: EWMA Profiles | ✅ Complete | Long-term (α=0.03 ✅), Short-term (α=0.40), Negative (α=0.15) |
|
| 19 |
+
| Phase 2b: Ward Clustering + Prefetch+RRF | ✅ Complete | L2-norm + adaptive gap threshold, 2+ clusters on real data |
|
| 20 |
+
| Phase 2c: Heuristic Re-ranking + MMR | ✅ Complete | 5-feature scorer (neg penalty wired), MMR λ=0.6, exploration |
|
| 21 |
+
| Phase 3: Hybrid Semantic Search | ✅ Complete | BGE-M3 + Qdrant dense + Zilliz sparse + RRF, 123 tests |
|
| 22 |
+
| Phase 3.5: Turso Metadata DB | ✅ Complete | 1.23GB metadata + citations, search ~10.7s → ~1.75s |
|
| 23 |
| SQLite (interactions, profiles, clusters, metadata cache) | ✅ Live | WAL mode, async via aiosqlite |
|
| 24 |
| HTMX Frontend | ✅ Live | Search, save, dismiss, recommendations |
|
| 25 |
+
| Test Suite | ✅ 125 tests passing | Unit, integration, E2E simulation, search pipeline |
|
| 26 |
|
| 27 |
### What's NOT Built Yet
|
| 28 |
|
| 29 |
| Component | Planned In | Blocked By |
|
| 30 |
|---|---|---|
|
| 31 |
+
| **Rec pipeline fixes (RRF→quota, Hungarian, neg suppression)** | **Phase 4 (NEXT)** | Code refactor only |
|
|
|
|
|
|
|
| 32 |
| Cold-start onboarding (category picker / ORCID) | Phase 5 | Not yet designed |
|
| 33 |
+
| LightGBM lambdarank re-ranker | Phase 6 | Need ≥500 labeled save/dismiss interactions |
|
|
|
|
| 34 |
| LLM interest summaries per cluster | Phase 8 | Needs Claude/Groq API integration |
|
| 35 |
|
| 36 |
+
> **Note**: Hybrid Search (Phase 3), Turso Metadata (Phase 3.5), α_long tuning, L2
|
| 37 |
+
> normalization, and negative profile wiring are all DONE. The next priority is fixing
|
| 38 |
+
> the recommendation fusion from RRF → quota (Phase 4).
|
|
|
|
| 39 |
|
| 40 |
### Dataset Coverage
|
| 41 |
|
|
|
|
| 45 |
| Newest paper | `2505.04101` (~May 2025) |
|
| 46 |
| Total papers | 1,596,587 |
|
| 47 |
| Payload stored in Qdrant | `arxiv_id` only |
|
| 48 |
+
| Metadata source | Turso DB (primary) → arXiv API (fallback) → SQLite cache |
|
| 49 |
|
| 50 |
---
|
| 51 |
|
|
|
|
| 124 |
|
| 125 |
This is what PinnerSage, Taobao ULIM, and Pinterest Bucketized-ANN actually deploy.
|
| 126 |
|
| 127 |
+
**Current status**: RRF is still in the codebase. Phase 4 plan created — see `docs/phases/PHASE4-Recommendation-Pipeline-Fixes.md`.
|
| 128 |
|
| 129 |
### 2. EWMA α_long = 0.10 vs 0.03
|
| 130 |
|
|
|
|
| 134 |
|
| 135 |
**Resolution**: PinnerSage tested λ=0.1 and **explicitly rejected it as too recent-biased**. Their optimal was λ=0.01. Doc 06 recommends α_long=0.03 as a compromise.
|
| 136 |
|
| 137 |
+
**Current status**: ✅ Already fixed — α=0.03 in `app/recommend/profiles.py:30`.
|
| 138 |
|
| 139 |
### 3. BGE-reranker-v2 in the Hot Path
|
| 140 |
|
|
|
|
| 229 |
|
| 230 |
### Phase 4: Recommendation Pipeline Fixes (~1 week)
|
| 231 |
|
| 232 |
+
> **Detailed plan**: [`docs/phases/PHASE4-Recommendation-Pipeline-Fixes.md`](../phases/PHASE4-Recommendation-Pipeline-Fixes.md)
|
| 233 |
+
|
| 234 |
Corrections to the existing recommendation pipeline based on Doc 06's findings.
|
| 235 |
+
Items 4.2 (α_long tuning) and 4.3-old (negative profile wiring) are already done.
|
| 236 |
|
| 237 |
#### 4.1 Replace RRF with Importance-Weighted Quota Fusion
|
| 238 |
+
**Why**: RRF lets dominant clusters swamp minor interests — the exact failure mode multi-interest models exist to prevent. PinnerSage, Taobao ULIM, and Pinterest Bucketized-ANN all use quota, not RRF.
|
| 239 |
|
| 240 |
+
**What to build**:
|
| 241 |
+
- New `app/recommend/fusion.py` — `allocate_quotas()` function
|
| 242 |
+
- Refactor `_multi_interest_recommend()` to use `asyncio.gather()` for concurrent per-cluster searches
|
| 243 |
+
- Deduplicate across clusters (first-occurrence wins)
|
| 244 |
|
|
|
|
| 245 |
```
|
| 246 |
clusters = compute_clusters(...)
|
| 247 |
+
quotas = allocate_quotas([c.importance for c in clusters], total=100, min=3)
|
| 248 |
+
results = await asyncio.gather(*[search_by_vector(medoid_k, limit=quota_k*3) for each k])
|
| 249 |
+
deduplicate → rerank → MMR → serve
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 250 |
```
|
| 251 |
|
| 252 |
+
#### ~~4.2 Tune α_long from 0.10 → 0.03~~ ✅ ALREADY DONE (Phase 2a)
|
|
|
|
| 253 |
|
| 254 |
+
α_long is already 0.03 in `app/recommend/profiles.py:30`.
|
| 255 |
|
| 256 |
+
#### ~~4.3-old Wire the Negative Profile~~ ✅ ALREADY DONE (Phase 2c)
|
|
|
|
| 257 |
|
| 258 |
+
Negative EWMA is already Feature 5 in `app/recommend/reranker.py` with 0.15 penalty weight.
|
| 259 |
+
|
| 260 |
+
#### 4.3 Hungarian Matching for Cluster Stability
|
| 261 |
+
**Why**: Cluster indices shuffle when users save new papers, breaking analytics and future UI.
|
| 262 |
+
|
| 263 |
+
**What to build**: `stabilize_cluster_ids()` in `clustering.py` using `scipy.optimize.linear_sum_assignment`. Cost matrix of medoid cosine distances; trivial at K≤7.
|
| 264 |
+
|
| 265 |
+
#### 4.4 Category-Level Negative Suppression
|
| 266 |
+
**Why**: YouTube (2023) showed 3× gain from richer negative treatment.
|
| 267 |
+
|
| 268 |
+
**Decisions resolved**:
|
| 269 |
+
- **Primary category only** — avoids over-suppression from secondary tags
|
| 270 |
+
- **14-day window** — standard default (τ_neg = 14 days)
|
| 271 |
+
- **Per-item temporal decay** → deferred to Phase 6 (LightGBM feature)
|
| 272 |
|
| 273 |
+
**What to build**: `get_suppressed_categories()` in `db.py` (SQL join: interactions × paper_metadata), filter in `_multi_interest_recommend()` after reranking.
|
|
|
|
| 274 |
|
| 275 |
+
#### ~~4.5 Pre-populate Metadata Store~~ ✅ ALREADY DONE (Phase 3.5 — Turso)
|
| 276 |
|
| 277 |
+
Turso cloud DB with 1.23GB of metadata + citation counts. Search time: ~10.7s → ~1.75s.
|
| 278 |
|
| 279 |
---
|
| 280 |
|
|
|
|
| 383 |
- ≥10 interactions: LightFM
|
| 384 |
Retrain LightGBM with dismissals as negative labels (YouTube's 3× gain from dual labels).
|
| 385 |
|
| 386 |
+
#### ~~9.3 Category-Level Negative Suppression~~ → Moved to Phase 4.4
|
| 387 |
+
If ≥3 dismissals hit the same primary arXiv category within 14 days, suppress that category.
|
| 388 |
+
**Decision**: Primary category only, τ_neg = 14 days. See Phase 4 plan.
|
| 389 |
|
| 390 |
---
|
| 391 |
|
|
|
|
| 393 |
|
| 394 |
If you can only do three things, do these:
|
| 395 |
|
| 396 |
+
### 1. ~~Build hybrid semantic search (Phase 3)~~ ✅ DONE
|
|
|
|
|
|
|
| 397 |
|
| 398 |
+
### 2. ~~Pre-populate the metadata store (Phase 3.5)~~ ✅ DONE
|
|
|
|
|
|
|
| 399 |
|
| 400 |
+
### 3. Replace RRF with quota fusion in recommendations (Phase 4.1) ← NEXT
|
| 401 |
**Impact**: Prevents the dominant cluster from drowning out minority interests. Fixes the core multi-interest failure mode.
|
| 402 |
+
**Effort**: New `fusion.py` + refactor `_multi_interest_recommend()`. ~1 week for all 3 Phase 4 items.
|
| 403 |
|
| 404 |
---
|
| 405 |
|
|
|
|
| 416 |
| — | [Phase 1 Walkthrough](PHASE1-Zero-ML-Recommender.md) | Zero-ML recommender code tour | ✅ Complete |
|
| 417 |
| — | [Phase 2 Recommender Walkthrough](02-Phase2-MultiInterest-Recommender.md) | Multi-interest engine implementation | ✅ Complete |
|
| 418 |
| — | [Code Summary & Test Plan](03-Code-Summary-and-Test-Plan.md) | Codebase summary and testing strategy | ✅ Complete |
|
| 419 |
+
| — | [Phase 2 Hybrid Search Plan](../phases/PHASE2-Hybrid-Search-Plan.md) | BGE-M3 + Zilliz hybrid search prototype | ✅ Superseded by Phase 3 |
|
| 420 |
+
| — | [Phase 3 Hybrid Semantic Search](../phases/PHASE3-Hybrid-Semantic-Search.md) | Full hybrid search implementation plan | ✅ Complete |
|
| 421 |
+
| — | [Phase 4 Recommendation Fixes](../phases/PHASE4-Recommendation-Pipeline-Fixes.md) | Quota fusion, Hungarian matching, negative suppression | 📋 Planned |
|
| 422 |
| — | **This Document** | Revised phase plan synthesizing all research | ✅ Current |
|
tests/test_clustering.py
CHANGED
|
@@ -15,6 +15,7 @@ import numpy as np
|
|
| 15 |
|
| 16 |
from app.recommend.clustering import (
|
| 17 |
compute_clusters,
|
|
|
|
| 18 |
InterestCluster,
|
| 19 |
MIN_PAPERS_FOR_CLUSTERING,
|
| 20 |
MAX_CLUSTERS,
|
|
@@ -110,6 +111,7 @@ def test_importance_is_sorted_descending():
|
|
| 110 |
def test_few_papers_returns_single_cluster():
|
| 111 |
"""When papers < MIN_PAPERS_FOR_CLUSTERING, return a single catch-all cluster."""
|
| 112 |
ids = ["p1", "p2", "p3"]
|
|
|
|
| 113 |
rng = np.random.RandomState(11)
|
| 114 |
embs = rng.randn(3, 1024).astype(np.float32)
|
| 115 |
# Normalise
|
|
@@ -155,6 +157,237 @@ def test_find_medoid():
|
|
| 155 |
assert idx == 1, f"Expected medoid idx 1, got {idx}"
|
| 156 |
|
| 157 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 158 |
# ── DB persistence test ──────────────────────────────────────────────────────
|
| 159 |
|
| 160 |
@pytest.fixture
|
|
|
|
| 15 |
|
| 16 |
from app.recommend.clustering import (
|
| 17 |
compute_clusters,
|
| 18 |
+
stabilize_cluster_ids,
|
| 19 |
InterestCluster,
|
| 20 |
MIN_PAPERS_FOR_CLUSTERING,
|
| 21 |
MAX_CLUSTERS,
|
|
|
|
| 111 |
def test_few_papers_returns_single_cluster():
|
| 112 |
"""When papers < MIN_PAPERS_FOR_CLUSTERING, return a single catch-all cluster."""
|
| 113 |
ids = ["p1", "p2", "p3"]
|
| 114 |
+
assert len(ids) < MIN_PAPERS_FOR_CLUSTERING, "test precondition: ids must be below threshold"
|
| 115 |
rng = np.random.RandomState(11)
|
| 116 |
embs = rng.randn(3, 1024).astype(np.float32)
|
| 117 |
# Normalise
|
|
|
|
| 157 |
assert idx == 1, f"Expected medoid idx 1, got {idx}"
|
| 158 |
|
| 159 |
|
| 160 |
+
# ── Hungarian matching / cluster ID stabilisation (Phase 4.3) ────────────────
|
| 161 |
+
|
| 162 |
+
def _make_two_cluster_pair(seed: int = 0) -> tuple[list, list]:
    """
    Build a (new_clusters, old_clusters) pair for the stabilisation tests.

    Each list holds one InterestCluster per topic (A and B); all four medoids
    are drawn close to one of two shared topic centres.  ``old`` is ordered
    [A, B] with cluster_idx 0 and 1, whereas ``new`` is ordered [B, A] with
    cluster_idx 0 and 1, so pairing by position would match the wrong topics.
    """
    rng = np.random.RandomState(seed)
    dim = 1024

    def _unit_center():
        # Random direction scaled to unit length.
        vec = rng.randn(dim).astype(np.float32)
        vec /= np.linalg.norm(vec)
        return vec

    # Draw order is part of the fixture: centre A first, then centre B.
    center_a = _unit_center()
    center_b = _unit_center()

    def _near(center, n=5, spread=0.001):
        # Gaussian noise in 1024-d has a norm of roughly sqrt(dim) * spread, so
        # spread=0.05 would give noise of about 1.6 and swamp the unit-length
        # center; spread=0.001 keeps each sample's cosine sim to it >= 0.99.
        samples = []
        while len(samples) < n:
            sample = center + rng.randn(dim).astype(np.float32) * spread
            sample /= np.linalg.norm(sample)
            samples.append(sample)
        return samples

    # One medoid per (topic, generation); only the first sample of each batch
    # is kept, but every batch is still drawn in full.
    medoid_a_new, medoid_b_new, medoid_a_old, medoid_b_old = (
        _near(center)[0] for center in (center_a, center_b, center_a, center_b)
    )

    def _cluster(idx, paper_id, medoid, importance):
        return InterestCluster(cluster_idx=idx, medoid_paper_id=paper_id,
                               medoid_embedding=medoid, paper_ids=[paper_id],
                               importance=importance)

    old = [
        _cluster(0, "old_a", medoid_a_old, 5.0),
        _cluster(1, "old_b", medoid_b_old, 3.0),
    ]
    # Topic B comes first here, the reverse of ``old``.
    new = [
        _cluster(0, "new_b", medoid_b_new, 3.0),
        _cluster(1, "new_a", medoid_a_new, 5.0),
    ]
    return new, old
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
def test_stabilize_matches_semantically_equivalent_clusters():
    """
    Topic A held idx 0 and topic B held idx 1 before reclustering, but the new
    clustering lists them in the opposite order.  stabilize_cluster_ids should
    give each topic its previous index back.
    """
    # Fixture layout: new = [B, A], old = [A (idx 0), B (idx 1)].
    new_clusters, old_clusters = _make_two_cluster_pair()
    stabilised = stabilize_cluster_ids(new_clusters, old_clusters)

    by_medoid = {}
    for cluster in stabilised:
        by_medoid[cluster.medoid_paper_id] = cluster.cluster_idx

    assert by_medoid["new_a"] == 0, f"Topic A should be idx 0, got {by_medoid}"
    assert by_medoid["new_b"] == 1, f"Topic B should be idx 1, got {by_medoid}"
|
| 226 |
+
|
| 227 |
+
|
| 228 |
+
def test_stabilize_preserves_all_clusters():
    """Stabilisation returns exactly as many clusters as it was given."""
    new_clusters, old_clusters = _make_two_cluster_pair()
    result = stabilize_cluster_ids(new_clusters, old_clusters)
    returned, expected = len(result), len(new_clusters)
    assert returned == expected
|
| 233 |
+
|
| 234 |
+
|
| 235 |
+
def test_stabilize_unique_indices():
    """No two clusters in the stabilised output may share a cluster_idx."""
    new_clusters, old_clusters = _make_two_cluster_pair()
    indices = []
    for cluster in stabilize_cluster_ids(new_clusters, old_clusters):
        indices.append(cluster.cluster_idx)
    distinct = set(indices)
    assert len(indices) == len(distinct), f"Duplicate indices: {indices}"
|
| 241 |
+
|
| 242 |
+
|
| 243 |
+
def test_stabilize_no_old_clusters_returns_unchanged():
    """An empty old-cluster list means the new clusters come back as given."""
    pair = _make_two_cluster_pair()
    new = pair[0]
    no_history = []
    result = stabilize_cluster_ids(new, no_history)
    assert result == new
|
| 248 |
+
|
| 249 |
+
|
| 250 |
+
def test_stabilize_no_new_clusters_returns_empty():
    """An empty new-cluster list yields an empty result, whatever the old list holds."""
    pair = _make_two_cluster_pair()
    old = pair[1]
    nothing_new = []
    result = stabilize_cluster_ids(nothing_new, old)
    assert result == []
|
| 255 |
+
|
| 256 |
+
|
| 257 |
+
def test_stabilize_rejects_unrelated_match():
    """
    Doc 06 requirement: a new cluster must NOT take over an old cluster's
    index when their cosine similarity is under the threshold (default 0.5).
    A genuinely new topic should not steal an old index merely because the
    Hungarian assignment had no better candidate to offer.
    """
    rng = np.random.RandomState(7)
    dim = 1024

    def _rand_unit():
        raw = rng.randn(dim).astype(np.float32)
        length = np.linalg.norm(raw)
        return raw / length

    # Two independent random directions; the old one is drawn first.
    old_vec = _rand_unit()
    new_vec = _rand_unit()

    # Nothing forces orthogonality here; the precondition below only checks
    # that this seed produced a pair well under the 0.5 threshold.
    cos_sim = float(new_vec @ old_vec)
    assert abs(cos_sim) < 0.3, f"test precondition failed: cos_sim={cos_sim}"

    old = [
        InterestCluster(cluster_idx=5, medoid_paper_id="old_topic",
                        medoid_embedding=old_vec, paper_ids=[], importance=1.0),
    ]
    new = [
        InterestCluster(cluster_idx=0, medoid_paper_id="new_topic",
                        medoid_embedding=new_vec, paper_ids=[], importance=1.0),
    ]

    stabilised = stabilize_cluster_ids(new, old)
    assigned_idx = stabilised[0].cluster_idx
    # idx 5 belongs to the unrelated old topic and must not be inherited.
    assert assigned_idx != 5, \
        "Unrelated topic inherited old cluster's index (threshold not enforced)"
|
| 288 |
+
|
| 289 |
+
|
| 290 |
+
def test_stabilize_custom_threshold():
    """The min_cosine_sim argument decides whether a near-duplicate medoid matches."""
    rng = np.random.RandomState(13)
    dim = 1024

    base = rng.randn(dim).astype(np.float32)
    base /= np.linalg.norm(base)

    # Tiny perturbation: spread 0.001 in 1024-d leaves cos_sim around 0.9995.
    noise = rng.randn(dim).astype(np.float32) * 0.001
    perturbed = base + noise
    perturbed /= np.linalg.norm(perturbed)

    old = [
        InterestCluster(cluster_idx=2, medoid_paper_id="old",
                        medoid_embedding=base, paper_ids=[], importance=1.0),
    ]
    new = [
        InterestCluster(cluster_idx=0, medoid_paper_id="new",
                        medoid_embedding=perturbed, paper_ids=[], importance=1.0),
    ]

    # Default threshold (0.5): the ~0.9995 similarity is enough to inherit idx 2.
    lenient = stabilize_cluster_ids(new, old)
    assert lenient[0].cluster_idx == 2

    # Threshold 0.99999 is above the actual similarity, so idx 2 is not inherited.
    strict = stabilize_cluster_ids(new, old, min_cosine_sim=0.99999)
    assert strict[0].cluster_idx != 2
|
| 312 |
+
|
| 313 |
+
|
| 314 |
+
def test_stabilize_more_new_than_old():
    """Cluster count grows 1 → 2: the matching cluster keeps idx 0, the other gets a different idx."""
    rng = np.random.RandomState(21)
    dim = 1024

    def _normalised(vec):
        vec /= np.linalg.norm(vec)
        return vec

    # Draw order: base direction, its perturbation, then an unrelated direction.
    base = _normalised(rng.randn(dim).astype(np.float32))
    close = _normalised(base + rng.randn(dim).astype(np.float32) * 0.001)
    far = _normalised(rng.randn(dim).astype(np.float32))

    old = [
        InterestCluster(cluster_idx=0, medoid_paper_id="o",
                        medoid_embedding=base, paper_ids=[], importance=1.0),
    ]
    new = []
    for idx, paper_id, medoid, weight in ((0, "n1", close, 2.0), (1, "n2", far, 1.0)):
        new.append(InterestCluster(cluster_idx=idx, medoid_paper_id=paper_id,
                                   medoid_embedding=medoid, paper_ids=[],
                                   importance=weight))

    result = stabilize_cluster_ids(new, old)
    assigned = {}
    for cluster in result:
        assigned[cluster.medoid_paper_id] = cluster.cluster_idx
    assert assigned["n1"] == 0  # near-duplicate of the old medoid inherits its idx
    assert assigned["n2"] != 0  # unrelated cluster must end up elsewhere
|
| 338 |
+
|
| 339 |
+
|
| 340 |
+
def test_stabilize_fewer_new_than_old():
    """Cluster count shrinks 2 → 1: the survivor takes the idx of the old cluster it resembles."""
    rng = np.random.RandomState(25)
    dim = 1024

    def _normalised(vec):
        vec /= np.linalg.norm(vec)
        return vec

    # Draw order: first old medoid, second old medoid, then a perturbation of the first.
    base = _normalised(rng.randn(dim).astype(np.float32))
    other = _normalised(rng.randn(dim).astype(np.float32))
    close = _normalised(base + rng.randn(dim).astype(np.float32) * 0.001)

    old = []
    for idx, paper_id, medoid, weight in ((7, "oA", base, 2.0), (9, "oB", other, 1.0)):
        old.append(InterestCluster(cluster_idx=idx, medoid_paper_id=paper_id,
                                   medoid_embedding=medoid, paper_ids=[],
                                   importance=weight))
    new = [
        InterestCluster(cluster_idx=0, medoid_paper_id="nA",
                        medoid_embedding=close, paper_ids=[], importance=1.0),
    ]

    result = stabilize_cluster_ids(new, old)
    assert len(result) == 1
    survivor = result[0]
    assert survivor.cluster_idx == 7  # idx of the old cluster built from `base`
|
| 363 |
+
|
| 364 |
+
|
| 365 |
+
def test_stabilize_new_cluster_gets_fresh_index():
    """
    With more new clusters than old ones, the extra cluster must receive an
    index that does not collide with the index inherited by the matched one.
    """
    rng = np.random.RandomState(99)
    dim = 1024

    def emb():
        # Fresh random unit vector on every call.
        raw = rng.randn(dim).astype(np.float32)
        return raw / np.linalg.norm(raw)

    old_medoid = emb()
    old = [
        InterestCluster(cluster_idx=0, medoid_paper_id="old_a", medoid_embedding=old_medoid,
                        paper_ids=[], importance=1.0),
    ]

    # "new_a" is an exact copy of the old medoid; "new_brand" is an independent draw.
    matched = InterestCluster(cluster_idx=0, medoid_paper_id="new_a",
                              medoid_embedding=old[0].medoid_embedding.copy(),
                              paper_ids=[], importance=1.0)
    unmatched = InterestCluster(cluster_idx=1, medoid_paper_id="new_brand",
                                medoid_embedding=emb(),
                                paper_ids=[], importance=1.0)
    new = [matched, unmatched]

    stabilised = stabilize_cluster_ids(new, old)
    indices = {}
    for cluster in stabilised:
        indices[cluster.medoid_paper_id] = cluster.cluster_idx
    assert indices["new_a"] == 0, "Matched cluster should inherit old index 0"
    assert indices["new_brand"] != 0, "New unmatched cluster must not collide with idx 0"
|
| 389 |
+
|
| 390 |
+
|
| 391 |
# ── DB persistence test ──────────────────────────────────────────────────────
|
| 392 |
|
| 393 |
@pytest.fixture
|
tests/test_db.py
CHANGED
|
@@ -116,3 +116,319 @@ async def test_metadata_cache_batch(tmp_db):
|
|
| 116 |
assert "paper0" in result
|
| 117 |
assert "paper2" in result
|
| 118 |
assert "paper99" not in result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
assert "paper0" in result
|
| 117 |
assert "paper2" in result
|
| 118 |
assert "paper99" not in result
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
# ── Phase 4.2: cache_turso_metadata_batch ────────────────────────────────────
|
| 122 |
+
|
| 123 |
+
@pytest.mark.asyncio
async def test_cache_turso_metadata_batch_writes_all(tmp_db):
    """Every dict given to cache_turso_metadata_batch can be read back via get_cached_metadata."""
    import app.db as db
    await db.init_db()

    transformer_paper = {
        "arxiv_id": "1706.03762",
        "title": "Attention Is All You Need",
        "abstract": "Transformers.",
        "authors": '["Vaswani"]',
        "category": "cs.CL",
        "published": "2017-06-12",
        "year": 2017,
        "citation_count": 50000,
    }
    # This record carries no citation_count key.
    other_paper = {
        "arxiv_id": "2001.00001",
        "title": "Another Paper",
        "abstract": "...",
        "authors": '["Smith"]',
        "category": "cs.CV",
        "published": "2020-01-01",
        "year": 2020,
    }
    await db.cache_turso_metadata_batch([transformer_paper, other_paper])

    first = await db.get_cached_metadata("1706.03762")
    assert first is not None
    assert first["title"] == "Attention Is All You Need"
    assert first["category"] == "cs.CL"

    second = await db.get_cached_metadata("2001.00001")
    assert second is not None
    assert second["category"] == "cs.CV"
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
@pytest.mark.asyncio
async def test_cache_turso_metadata_batch_empty(tmp_db):
    """An empty batch must be accepted without raising."""
    import app.db as db
    await db.init_db()
    no_papers = []
    # The test passes simply by this call returning normally.
    await db.cache_turso_metadata_batch(no_papers)
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
@pytest.mark.asyncio
async def test_cache_turso_metadata_batch_skips_missing_arxiv_id(tmp_db):
    """A row lacking arxiv_id must not prevent the valid row in the same batch from being stored."""
    import app.db as db
    await db.init_db()

    without_id = {"title": "No ID", "category": "cs.LG"}  # no arxiv_id key at all
    with_id = {
        "arxiv_id": "good.123", "title": "Good", "category": "cs.AI",
        "abstract": "", "authors": "[]", "published": "2024-01-01",
    }
    await db.cache_turso_metadata_batch([without_id, with_id])

    stored = await db.get_cached_metadata("good.123")
    assert stored is not None
    assert stored["title"] == "Good"
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
@pytest.mark.asyncio
async def test_cache_turso_metadata_batch_upserts(tmp_db):
    """Writing the same arxiv_id twice leaves only the second version in the cache."""
    import app.db as db
    await db.init_db()

    # Two versions of paper "p1" that differ in title and category.
    versions = [
        {"arxiv_id": "p1", "title": "V1", "category": "cs.LG",
         "abstract": "", "authors": "[]", "published": "2024-01-01"},
        {"arxiv_id": "p1", "title": "V2", "category": "cs.CV",
         "abstract": "", "authors": "[]", "published": "2024-01-01"},
    ]
    # One batch call per version, oldest first.
    for version in versions:
        await db.cache_turso_metadata_batch([version])

    latest = await db.get_cached_metadata("p1")
    assert latest["title"] == "V2"
    assert latest["category"] == "cs.CV"
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
# ── Phase 4.4: get_suppressed_categories ──────────────────────────────────────
|
| 203 |
+
|
| 204 |
+
@pytest.mark.asyncio
async def test_suppressed_empty_for_new_user(tmp_db):
    """A user with no logged interactions gets an empty suppression set."""
    import app.db as db
    await db.init_db()
    suppressed = await db.get_suppressed_categories("never-dismissed")
    nothing = set()
    assert suppressed == nothing
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
@pytest.mark.asyncio
async def test_suppressed_below_threshold_not_returned(tmp_db):
    """Two dismissals in a category stay under the default threshold of 3, so nothing is suppressed."""
    import app.db as db
    await db.init_db()

    paper_ids = ("p1", "p2")
    # Cache metadata first so both papers are recorded under cs.CV.
    for title, aid in zip(("t0", "t1"), paper_ids):
        record = {
            "arxiv_id": aid, "title": title, "abstract": "",
            "authors": "[]", "category": "cs.CV", "published": "2024-01-01",
        }
        await db.cache_metadata(record)

    # Only two dismissals, one fewer than the threshold.
    for aid in paper_ids:
        await db.log_interaction("u1", aid, "not_interested")

    suppressed = await db.get_suppressed_categories("u1")
    assert "cs.CV" not in suppressed
|
| 229 |
+
|
| 230 |
+
|
| 231 |
+
@pytest.mark.asyncio
async def test_suppressed_at_threshold_returned(tmp_db):
    """Reaching three dismissals within one category suppresses that category."""
    import app.db as db
    await db.init_db()

    paper_ids = ("p1", "p2", "p3")
    # All metadata is cached before any dismissal is logged.
    for title, aid in zip(("t0", "t1", "t2"), paper_ids):
        record = {
            "arxiv_id": aid, "title": title, "abstract": "",
            "authors": "[]", "category": "physics.optics", "published": "2024-01-01",
        }
        await db.cache_metadata(record)
    for aid in paper_ids:
        await db.log_interaction("u1", aid, "not_interested")

    suppressed = await db.get_suppressed_categories("u1")
    assert "physics.optics" in suppressed
|
| 246 |
+
|
| 247 |
+
|
| 248 |
+
@pytest.mark.asyncio
async def test_suppressed_only_counts_not_interested(tmp_db):
    """Three 'save' events in one category must not suppress it; only dismissals count."""
    import app.db as db
    await db.init_db()

    paper_ids = ("p1", "p2", "p3")
    for aid in paper_ids:
        record = {
            "arxiv_id": aid, "title": "t", "abstract": "",
            "authors": "[]", "category": "cs.CL", "published": "2024-01-01",
        }
        await db.cache_metadata(record)

    # Same count as the default threshold, but the event type is "save".
    for aid in paper_ids:
        await db.log_interaction("u1", aid, "save")

    suppressed = await db.get_suppressed_categories("u1")
    assert "cs.CL" not in suppressed
|
| 264 |
+
|
| 265 |
+
|
| 266 |
+
@pytest.mark.asyncio
async def test_suppressed_partitions_categories(tmp_db):
    """Dismissal counts are kept per category: three in cs.AI suppress it, one in cs.LG does not."""
    import app.db as db
    await db.init_db()

    # Three cs.AI papers followed by a single cs.LG paper; each one is cached
    # and then dismissed before moving on to the next.
    dismissed = [("a1", "cs.AI"), ("a2", "cs.AI"), ("a3", "cs.AI"), ("lone", "cs.LG")]
    for aid, category in dismissed:
        await db.cache_metadata({
            "arxiv_id": aid, "title": "t", "abstract": "",
            "authors": "[]", "category": category, "published": "2024-01-01",
        })
        await db.log_interaction("u1", aid, "not_interested")

    suppressed = await db.get_suppressed_categories("u1")
    assert "cs.AI" in suppressed
    assert "cs.LG" not in suppressed
|
| 287 |
+
|
| 288 |
+
|
| 289 |
+
@pytest.mark.asyncio
async def test_suppressed_ignores_other_users(tmp_db):
    """Dismissals logged by userA must leave userB's suppression set empty."""
    import app.db as db
    await db.init_db()

    # userA dismisses three cs.CV papers; userB never interacts.
    for aid in ("p1", "p2", "p3"):
        record = {
            "arxiv_id": aid, "title": "t", "abstract": "",
            "authors": "[]", "category": "cs.CV", "published": "2024-01-01",
        }
        await db.cache_metadata(record)
        await db.log_interaction("userA", aid, "not_interested")

    for_user_a = await db.get_suppressed_categories("userA")
    for_user_b = await db.get_suppressed_categories("userB")
    assert "cs.CV" in for_user_a
    assert for_user_b == set()
|
| 305 |
+
|
| 306 |
+
|
| 307 |
+
@pytest.mark.asyncio
async def test_suppressed_empty_category_excluded(tmp_db):
    """Dismissed papers whose category is '' must not add '' to the suppression set."""
    import app.db as db
    await db.init_db()

    # Three dismissals (the default threshold), all on papers with a blank category.
    for aid in ("e1", "e2", "e3"):
        uncategorised = {
            "arxiv_id": aid, "title": "t", "abstract": "",
            "authors": "[]", "category": "", "published": "2024-01-01",
        }
        await db.cache_metadata(uncategorised)
        await db.log_interaction("u1", aid, "not_interested")

    suppressed = await db.get_suppressed_categories("u1")
    assert "" not in suppressed
|
| 321 |
+
|
| 322 |
+
|
| 323 |
+
@pytest.mark.asyncio
async def test_suppressed_custom_threshold(tmp_db):
    """With two dismissals logged, threshold=2 suppresses the category and threshold=5 does not."""
    import app.db as db
    await db.init_db()

    for aid in ("x1", "x2"):
        record = {
            "arxiv_id": aid, "title": "t", "abstract": "",
            "authors": "[]", "category": "math.NT", "published": "2024-01-01",
        }
        await db.cache_metadata(record)
        await db.log_interaction("u1", aid, "not_interested")

    at_two = await db.get_suppressed_categories("u1", threshold=2)
    assert "math.NT" in at_two

    at_five = await db.get_suppressed_categories("u1", threshold=5)
    assert "math.NT" not in at_five
|
| 340 |
+
|
| 341 |
+
|
| 342 |
+
# ── Phase 4.5: Instrumentation columns ───────────────────────────────────────
|
| 343 |
+
|
| 344 |
+
@pytest.mark.asyncio
async def test_instrumentation_columns_exist(tmp_db):
    """After init_db, the interactions table exposes ranker_version, candidate_source and cluster_id."""
    import app.db as db
    import aiosqlite
    await db.init_db()

    column_names = set()
    async with aiosqlite.connect(tmp_db) as conn:
        cur = await conn.execute("PRAGMA table_info(interactions)")
        # Element 1 of each table_info row is the column name.
        for info_row in await cur.fetchall():
            column_names.add(info_row[1])

    assert "ranker_version" in column_names
    assert "candidate_source" in column_names
    assert "cluster_id" in column_names
|
| 356 |
+
|
| 357 |
+
|
| 358 |
+
@pytest.mark.asyncio
async def test_log_interaction_stores_instrumentation_fields(tmp_db):
    """log_interaction should persist ranker_version, candidate_source, cluster_id."""
    import app.db as db
    import aiosqlite
    await db.init_db()
    await db.log_interaction(
        user_id="u1",
        paper_id="p1",
        event_type="save",
        source="recommendation",
        ranker_version="v4.1_test",
        candidate_source="cluster_0",
        cluster_id=0,
    )
    # Read the row back through a separate connection to the same database file.
    async with aiosqlite.connect(tmp_db) as conn:
        conn.row_factory = aiosqlite.Row
        cur = await conn.execute(
            "SELECT ranker_version, candidate_source, cluster_id FROM interactions WHERE paper_id = 'p1'"
        )
        fetched = await cur.fetchone()
    # fetchone() yields None when nothing matched; fail with a clear message
    # instead of the opaque TypeError that dict(None) would raise.
    assert fetched is not None, "log_interaction did not insert a row for paper_id 'p1'"
    row = dict(fetched)
    assert row["ranker_version"] == "v4.1_test"
    assert row["candidate_source"] == "cluster_0"
    assert row["cluster_id"] == 0
|
| 382 |
+
|
| 383 |
+
|
| 384 |
+
@pytest.mark.asyncio
async def test_log_interaction_instrumentation_defaults_to_null(tmp_db):
    """Omitting instrumentation fields should store NULLs (backward compat)."""
    import app.db as db
    import aiosqlite
    await db.init_db()
    # Call without any of the three instrumentation keyword arguments.
    await db.log_interaction("u1", "p2", "save", source="search")
    async with aiosqlite.connect(tmp_db) as conn:
        conn.row_factory = aiosqlite.Row
        cur = await conn.execute(
            "SELECT ranker_version, candidate_source, cluster_id FROM interactions WHERE paper_id = 'p2'"
        )
        fetched = await cur.fetchone()
    # fetchone() yields None when nothing matched; fail with a clear message
    # instead of the opaque TypeError that dict(None) would raise.
    assert fetched is not None, "log_interaction did not insert a row for paper_id 'p2'"
    row = dict(fetched)
    assert row["ranker_version"] is None
    assert row["candidate_source"] is None
    assert row["cluster_id"] is None
|
| 400 |
+
|
| 401 |
+
|
| 402 |
+
@pytest.mark.asyncio
async def test_migration_idempotent(tmp_db):
    """Running init_db() a second time on the same database must not raise."""
    import app.db as db
    # The second pass re-runs the ALTER TABLE migration on an already-migrated
    # schema; the test passes simply by both calls returning normally.
    for _ in range(2):
        await db.init_db()
|
| 409 |
+
|
| 410 |
+
|
| 411 |
+
@pytest.mark.asyncio
async def test_instrumentation_exploration_tag(tmp_db):
    """Exploration papers should be stored with candidate_source='exploration'."""
    import app.db as db
    import aiosqlite
    await db.init_db()
    # The interaction is logged with an explicit cluster_id of None.
    await db.log_interaction(
        user_id="u1",
        paper_id="explore_paper",
        event_type="save",
        source="recommendation",
        ranker_version="v4.1_quota_hungarian_suppression",
        candidate_source="exploration",
        cluster_id=None,
    )
    async with aiosqlite.connect(tmp_db) as conn:
        conn.row_factory = aiosqlite.Row
        cur = await conn.execute(
            "SELECT candidate_source, cluster_id FROM interactions WHERE paper_id = 'explore_paper'"
        )
        fetched = await cur.fetchone()
    # fetchone() yields None when nothing matched; fail with a clear message
    # instead of the opaque TypeError that dict(None) would raise.
    assert fetched is not None, "log_interaction did not insert a row for paper_id 'explore_paper'"
    row = dict(fetched)
    assert row["candidate_source"] == "exploration"
    assert row["cluster_id"] is None
|
| 434 |
+
|
tests/test_fusion.py
ADDED
|
@@ -0,0 +1,231 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Tests for importance-weighted quota fusion.
|
| 3 |
+
|
| 4 |
+
Covers:
|
| 5 |
+
- Proportional allocation (dominant cluster gets most slots)
|
| 6 |
+
- Floor guarantee (every cluster gets at least min_slots)
|
| 7 |
+
- Total slots == sum of allocated slots (or >= when floors force it)
|
| 8 |
+
- Remainder distributed correctly
|
| 9 |
+
- Single cluster gets all slots
|
| 10 |
+
- Equal importances → roughly equal allocation
|
| 11 |
+
- Zero importances fall back to equal distribution
|
| 12 |
+
- merge_quota_results deduplication and order
|
| 13 |
+
"""
|
| 14 |
+
from app.recommend.fusion import allocate_quotas, merge_quota_results
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
# ── allocate_quotas ───────────────────────────────────────────────────────────
|
| 18 |
+
|
| 19 |
+
def test_proportional_allocation():
    """With importances 7 vs 3, the first cluster must be given more slots than the second."""
    slots = allocate_quotas([7.0, 3.0], total_slots=100, min_slots=3)
    assert len(slots) == 2
    dominant, minor = slots[0], slots[1]
    assert dominant > minor, "Dominant cluster (imp=7) should get more slots than minor (imp=3)"
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def test_floor_guarantee():
    """No cluster may fall below min_slots, however small its importance."""
    # A 99:1 split: the pure proportional share of the small cluster is 1 slot.
    slots = allocate_quotas([99.0, 1.0], total_slots=100, min_slots=3)
    below_floor = any(count < 3 for count in slots)
    assert not below_floor, f"Floor violated: {slots}"
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def test_total_slots_met():
    """When the floor is not binding, the allocation must add up to total_slots exactly."""
    total = 100
    slots = allocate_quotas([5.0, 3.0, 2.0], total_slots=total, min_slots=3)
    allocated = sum(slots)
    assert allocated == total, f"Expected sum={total}, got {sum(slots)} from {slots}"
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def test_floor_overrides_total():
    """The per-cluster floor still holds when the floors alone add up to more than total_slots."""
    # Seven clusters at 3 slots each need 21 slots, one more than the 20 requested.
    cluster_count = 7
    slots = allocate_quotas([1.0] * cluster_count, total_slots=20, min_slots=3)
    below_floor = any(count < 3 for count in slots)
    assert not below_floor, f"Floor violated under pressure: {slots}"
    assert len(slots) == 7
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def test_single_cluster_gets_all():
    """With only one cluster, all 50 requested slots go to it."""
    only_cluster = [5.0]
    slots = allocate_quotas(only_cluster, total_slots=50, min_slots=3)
    expected = [50]
    assert slots == expected
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def test_equal_importances_roughly_equal():
    """Three equally important clusters sharing 99 slots get 33 each."""
    slots = allocate_quotas([1.0] * 3, total_slots=99, min_slots=3)
    assert len(slots) == 3
    even_split = [33, 33, 33]
    assert slots == even_split, f"Expected equal split [33,33,33], got {slots}"
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def test_zero_importances_fallback():
    """All-zero weights must not crash the allocator.

    The result has to be three quotas that sum to the 30 requested slots,
    each of them at least the floor of 3.
    """
    zero_weights = [0.0] * 3
    slots = allocate_quotas(zero_weights, total_slots=30, min_slots=3)
    assert len(slots) == 3
    assert sum(slots) == 30
    assert not any(s < 3 for s in slots)
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def test_empty_importances():
    """No clusters in, no quotas out: an empty list is returned."""
    result = allocate_quotas([], total_slots=100)
    assert result == []
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def test_remainder_distributed():
    """The leftover slot from an inexact split is handed to one cluster.

    100 / 3 = 33.333, so each of the three equal clusters gets 33 and one
    of them picks up the single remaining slot.
    """
    weights = [1.0] * 3
    slots = allocate_quotas(weights, total_slots=100, min_slots=3)
    assert sum(slots) == 100
    # Which cluster receives the extra slot is left open here.
    assert sorted(slots) == [33, 33, 34]
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def test_two_cluster_sum_correct():
    """A 70/30 split of 100 slots sums to exactly 100 and keeps its ordering."""
    slots = allocate_quotas([70.0, 30.0], total_slots=100, min_slots=3)
    assert sum(slots) == 100
    dominant, minority = slots[0], slots[1]
    assert dominant >= minority
    assert minority >= 3
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
def test_doc06_worked_example():
    """
    Reproduce the worked example from Doc 06.

    Inputs: importances = [0.55, 0.30, 0.15], total = 30, min = 3

        raw shares     = [16.5, 9.0, 4.5]
        rounded down   = [16, 9, 4]   (sum = 29)
        leftover       = 1 → awarded to index 0 (fractional part 0.5)
        expected       = [17, 9, 4]
    """
    expected = [17, 9, 4]
    slots = allocate_quotas([0.55, 0.30, 0.15], total_slots=30, min_slots=3)
    assert slots == expected, f"Doc 06 example expected [17, 9, 4], got {slots}"
    assert sum(slots) == 30
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def test_doc06_tiny_cluster_floor():
    """
    Reproduce the tiny-cluster edge case from Doc 06.

    Inputs: importances = [0.60, 0.25, 0.10, 0.05], total = 30, min = 3

        raw shares = [18.0, 7.5, 3.0, 1.5]

    The last cluster's raw share (1.5) is below the floor, so it must be
    raised to at least 3.
    """
    weights = [0.60, 0.25, 0.10, 0.05]
    slots = allocate_quotas(weights, total_slots=30, min_slots=3)
    # Smallest cluster is lifted to the floor rather than left with ~1 slot.
    assert slots[3] >= 3, f"Floor violated: smallest cluster got {slots[3]}"
    # The first three clusters keep a strictly decreasing order.
    assert slots[0] > slots[1] and slots[1] > slots[2]
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
def test_fractional_priority_deterministic():
    """
    Leftover slots are spread one apiece when all fractional parts tie.

    Inputs: importances = [10, 10, 10], total = 20, min = 3

        raw shares   = [6.667, 6.667, 6.667]
        rounded down = [6, 6, 6]   (sum = 18)
        leftover     = 2 → two clusters receive +1

    Only the multiset of quotas is pinned, so any ordering of
    [7, 7, 6] is accepted.
    """
    tied_weights = [10.0] * 3
    slots = allocate_quotas(tied_weights, total_slots=20, min_slots=3)
    assert sum(slots) == 20
    assert sorted(slots, reverse=True) == [7, 7, 6]
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
def test_fractional_priority_prefers_larger_frac():
    """
    The cluster with the larger fractional part receives the leftover slot first.

    Case 1 — exact division (baseline, no leftover):
        importances = [2, 3], total = 10, min = 3
        raw shares  = [4.0, 6.0]  →  expected [4, 6]

    Case 2 — unequal fractional parts (the behaviour named by this test):
        importances  = [1, 2], total = 100, min = 3
        raw shares   = [33.333, 66.667]
        rounded down = [33, 66]   (sum = 99)
        leftover     = 1 → index 1 has the larger fraction (0.667 > 0.333)
        expected     = [33, 67]
    """
    # Baseline: shares divide exactly, nothing is left to hand out.
    exact = allocate_quotas([2.0, 3.0], total_slots=10, min_slots=3)
    assert exact == [4, 6]

    # The baseline has a remainder of 0, so it cannot show which cluster is
    # preferred. This case has one leftover slot and two different fractions.
    # Both quotas are far above the floor, so min_slots cannot interfere.
    uneven = allocate_quotas([1.0, 2.0], total_slots=100, min_slots=3)
    assert uneven == [33, 67], (
        f"Leftover slot should go to the larger fractional part, got {uneven}"
    )
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
def test_many_clusters_floor_overflow():
    """
    The floor guarantee may push the allocation past ``total_slots``.

    Ten clusters at a minimum of 3 need 30 slots, but only 20 are
    requested. Every cluster still gets 3, so the sum is 30.
    """
    n_clusters = 10
    slots = allocate_quotas([1.0] * n_clusters, total_slots=20, min_slots=3)
    assert len(slots) == 10
    assert not any(s < 3 for s in slots)
    # 10 × 3: the minimum dominates the requested total.
    assert sum(slots) == 30
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
def test_zero_importances_respects_floor_edge():
    """
    All-zero weights combined with an undersized budget still honour the floor.

    Three clusters at a minimum of 3 need 9 slots, but only 6 are requested.
    """
    zero_weights = [0.0] * 3
    slots = allocate_quotas(zero_weights, total_slots=6, min_slots=3)
    assert not any(s < 3 for s in slots)
    assert len(slots) == 3
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
def test_dominant_cluster_does_not_starve_minority():
    """
    Fairness check taken from Doc 06.

    A user who is 70% NLP and 30% RL must still see RL papers. The RL
    cluster ending up with zero slots is the RRF failure mode this guards
    against.
    """
    nlp_weight, rl_weight = 70.0, 30.0
    slots = allocate_quotas([nlp_weight, rl_weight], total_slots=30, min_slots=3)
    assert slots[1] >= 3, f"Minority RL cluster starved: got {slots[1]}"
    # Fairness must not flatten the ranking: the larger interest stays ahead.
    assert slots[0] > slots[1]
    assert sum(slots) == 30
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
def test_allocation_order_matches_input():
    """Quotas come back in input order, so the largest cluster (index 0) leads.

    With weights 50/25/25 the first quota must be no smaller than either
    of the two tied clusters that follow it.
    """
    slots = allocate_quotas([50.0, 25.0, 25.0], total_slots=100, min_slots=3)
    for later in (1, 2):
        assert slots[0] >= slots[later]
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
# ── merge_quota_results ───────────────────────────────────────────────────────
|
| 193 |
+
|
| 194 |
+
def test_merge_respects_quota():
    """Each cluster contributes at most its quota, and actually contributes.

    Cluster A offers five papers against a quota of three, so it must be
    truncated; cluster B offers exactly three. The upper-bound checks
    alone would also pass on an empty result, so the exact per-cluster
    counts are pinned as well.
    """
    cluster_a = ["a1", "a2", "a3", "a4", "a5"]
    cluster_b = ["b1", "b2", "b3"]
    result = merge_quota_results([cluster_a, cluster_b], quotas=[3, 3])
    a_count = sum(1 for r in result if r.startswith("a"))
    b_count = sum(1 for r in result if r.startswith("b"))
    assert a_count <= 3, f"Cluster A exceeded quota: {a_count}"
    assert b_count <= 3, f"Cluster B exceeded quota: {b_count}"
    # Without this, a merge that returned nothing would satisfy both bounds.
    assert (a_count, b_count) == (3, 3), (
        f"Quota not filled: A={a_count}, B={b_count}"
    )
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
def test_merge_deduplicates():
    """A paper listed by two clusters shows up exactly once in the merged feed."""
    per_cluster = [
        ["shared", "a1", "a2"],
        ["shared", "b1", "b2"],
    ]
    result = merge_quota_results(per_cluster, quotas=[3, 3])
    occurrences = result.count("shared")
    assert occurrences == 1, "Duplicate 'shared' should appear only once"
|
| 211 |
+
|
| 212 |
+
|
| 213 |
+
def test_merge_preserves_order():
    """Merged output lists the first cluster's papers ahead of the second's."""
    first, second = ["a1", "a2"], ["b1", "b2"]
    result = merge_quota_results([first, second], quotas=[2, 2])
    assert result == ["a1", "a2", "b1", "b2"]
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
def test_merge_empty_cluster():
    """A cluster with no papers adds nothing; its neighbour still fills its quota."""
    populated = ["a1", "a2", "a3"]
    empty = []
    result = merge_quota_results([populated, empty], quotas=[3, 3])
    assert result == ["a1", "a2", "a3"]
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
def test_merge_empty_input():
    """Merging zero clusters with zero quotas yields an empty list."""
    merged = merge_quota_results([], [])
    assert merged == []
|
tests/test_integration.py
CHANGED
|
@@ -3,6 +3,7 @@ Integration tests: full HTTP request/response cycle via FastAPI TestClient.
|
|
| 3 |
Tests the complete pipeline: search → save → recommendations.
|
| 4 |
"""
|
| 5 |
import pytest
|
|
|
|
| 6 |
from fastapi.testclient import TestClient
|
| 7 |
|
| 8 |
|
|
@@ -148,7 +149,8 @@ def test_recommendations_after_save(client, monkeypatch):
|
|
| 148 |
return ["1706.03762"]
|
| 149 |
monkeypatch.setattr(qs, "recommend", fake_recommend)
|
| 150 |
|
| 151 |
-
# Also mock metadata fetch so we don't hit
|
|
|
|
| 152 |
import app.arxiv_svc as arxiv
|
| 153 |
async def fake_batch(ids):
|
| 154 |
return {
|
|
@@ -162,7 +164,8 @@ def test_recommendations_after_save(client, monkeypatch):
|
|
| 162 |
"year": 2017,
|
| 163 |
}
|
| 164 |
}
|
| 165 |
-
monkeypatch.setattr(
|
|
|
|
| 166 |
|
| 167 |
client.get("/")
|
| 168 |
client.post("/api/papers/0704.0002/save", data={"source": "search"})
|
|
@@ -173,6 +176,110 @@ def test_recommendations_after_save(client, monkeypatch):
|
|
| 173 |
|
| 174 |
# ── Full pipeline smoke test ───────────────────────────────────────────────────
|
| 175 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
def test_full_pipeline_smoke(client, monkeypatch):
|
| 177 |
"""
|
| 178 |
1. User visits home → gets cookie
|
|
@@ -211,6 +318,7 @@ def test_full_pipeline_smoke(client, monkeypatch):
|
|
| 211 |
return ["2302.11382"]
|
| 212 |
monkeypatch.setattr(qs, "recommend", fake_rec)
|
| 213 |
|
|
|
|
| 214 |
async def fake_meta(ids):
|
| 215 |
return {
|
| 216 |
"2302.11382": {
|
|
@@ -223,7 +331,8 @@ def test_full_pipeline_smoke(client, monkeypatch):
|
|
| 223 |
"year": 2023,
|
| 224 |
}
|
| 225 |
}
|
| 226 |
-
monkeypatch.setattr(
|
|
|
|
| 227 |
|
| 228 |
resp = client.get("/api/recommendations")
|
| 229 |
assert resp.status_code == 200
|
|
|
|
| 3 |
Tests the complete pipeline: search → save → recommendations.
|
| 4 |
"""
|
| 5 |
import pytest
|
| 6 |
+
from unittest.mock import AsyncMock
|
| 7 |
from fastapi.testclient import TestClient
|
| 8 |
|
| 9 |
|
|
|
|
| 149 |
return ["1706.03762"]
|
| 150 |
monkeypatch.setattr(qs, "recommend", fake_recommend)
|
| 151 |
|
| 152 |
+
# Also mock metadata fetch so we don't hit Turso DB in this test
|
| 153 |
+
import app.turso_svc as turso
|
| 154 |
import app.arxiv_svc as arxiv
|
| 155 |
async def fake_batch(ids):
|
| 156 |
return {
|
|
|
|
| 164 |
"year": 2017,
|
| 165 |
}
|
| 166 |
}
|
| 167 |
+
monkeypatch.setattr(turso, "fetch_metadata_batch", fake_batch)
|
| 168 |
+
monkeypatch.setattr(arxiv, "fetch_metadata_batch", AsyncMock(return_value={}))
|
| 169 |
|
| 170 |
client.get("/")
|
| 171 |
client.post("/api/papers/0704.0002/save", data={"source": "search"})
|
|
|
|
| 176 |
|
| 177 |
# ── Full pipeline smoke test ───────────────────────────────────────────────────
|
| 178 |
|
| 179 |
+
def test_quota_pipeline_preserves_minority_cluster(client, monkeypatch):
    """
    Phase 4.1 end-to-end check for quota fusion.

    Five saved papers form two distinct interests (three "NLP", two "RL").
    The rendered recommendations must mention candidates from BOTH pools;
    losing the minority interest is the failure mode attributed to RRF.

    All external services are stubbed: vector lookup and vector search on
    ``qdrant_svc``, profile loading on ``recommend.profiles``, and metadata
    fetches on ``turso_svc`` / ``arxiv_svc``.
    """
    from unittest.mock import AsyncMock

    import numpy as np
    import app.qdrant_svc as qs
    import app.turso_svc as turso
    import app.arxiv_svc as arxiv
    import app.recommend.profiles as prof_mod

    # First request establishes the user cookie.
    client.get("/")

    dim = 1024
    rng = np.random.RandomState(42)

    def _unit_center():
        # Random direction, normalised in place to unit length.
        center = rng.randn(dim).astype(np.float32)
        center /= np.linalg.norm(center)
        return center

    def _near(center):
        # Small perturbation of a topic centre, re-normalised.
        v = center + rng.randn(dim).astype(np.float32) * 0.05
        return (v / np.linalg.norm(v)).tolist()

    # Draw order from the seeded RNG is kept stable: NLP centre, RL centre,
    # saved papers, NLP candidates, RL candidates.
    nlp_center = _unit_center()
    rl_center = _unit_center()

    # Saved papers: three close to the NLP centre, two close to the RL centre.
    saved_ids = ["nlp_a", "nlp_b", "nlp_c", "rl_a", "rl_b"]
    saved_vectors = {
        aid: _near(nlp_center if aid.startswith("nlp") else rl_center)
        for aid in saved_ids
    }

    # Candidate pool: 50 NLP-ish papers followed by 50 RL-ish papers.
    nlp_candidates = [f"nlp_cand_{i}" for i in range(50)]
    rl_candidates = [f"rl_cand_{i}" for i in range(50)]
    candidate_vectors = {cid: _near(nlp_center) for cid in nlp_candidates}
    candidate_vectors.update((cid, _near(rl_center)) for cid in rl_candidates)

    known_vectors = {**saved_vectors, **candidate_vectors}

    async def fake_get_paper_vectors(ids):
        return {aid: known_vectors[aid] for aid in ids if aid in known_vectors}

    async def fake_search_by_vector(query_vector, limit, exclude_ids=None):
        # Serve candidates from whichever topic centre the query is closer to.
        q = np.array(query_vector, dtype=np.float32)
        q /= np.linalg.norm(q)
        nlp_sim = float(q @ nlp_center)
        rl_sim = float(q @ rl_center)
        pool = nlp_candidates if nlp_sim > rl_sim else rl_candidates
        skip = exclude_ids or set()
        hits = [p for p in pool if p not in skip]
        return hits[:limit]

    monkeypatch.setattr(qs, "get_paper_vectors", fake_get_paper_vectors)
    monkeypatch.setattr(qs, "search_by_vector", fake_search_by_vector)

    # No stored profile: the EWMA short-term lookup gets None back.
    async def fake_load_profile(uid, kind):
        return None

    async def fake_interaction_count(uid, kind):
        return 0

    monkeypatch.setattr(prof_mod, "load_profile", fake_load_profile)
    monkeypatch.setattr(prof_mod, "get_interaction_count", fake_interaction_count)

    def _meta_for(aid):
        # Minimal record including a category so the templates can render.
        return {
            "arxiv_id": aid,
            "title": f"Title {aid}",
            "abstract": "...",
            "authors": "[]",
            "category": "cs.CL" if aid.startswith("nlp") else "cs.LG",
            "published": "2024-01-01",
            "year": 2024,
        }

    async def fake_meta(ids):
        return {aid: _meta_for(aid) for aid in ids}

    monkeypatch.setattr(turso, "fetch_metadata_batch", fake_meta)
    monkeypatch.setattr(arxiv, "fetch_metadata_batch", AsyncMock(return_value={}))

    # Five saves, intended to cross the MIN_PAPERS_FOR_CLUSTERING threshold
    # (the threshold value is defined outside this test).
    for aid in saved_ids:
        client.post(f"/api/papers/{aid}/save", data={"source": "search"})

    resp = client.get("/api/recommendations")
    assert resp.status_code == 200

    # Both candidate pools must be represented in the rendered feed.
    body = resp.text
    has_nlp_rec = any(cid in body for cid in nlp_candidates)
    has_rl_rec = any(cid in body for cid in rl_candidates)
    assert has_nlp_rec, "No NLP cluster recs — dominant cluster failed to surface"
    assert has_rl_rec, "Minority RL cluster starved — quota fusion is not working"
|
| 281 |
+
|
| 282 |
+
|
| 283 |
def test_full_pipeline_smoke(client, monkeypatch):
|
| 284 |
"""
|
| 285 |
1. User visits home → gets cookie
|
|
|
|
| 318 |
return ["2302.11382"]
|
| 319 |
monkeypatch.setattr(qs, "recommend", fake_rec)
|
| 320 |
|
| 321 |
+
import app.turso_svc as turso
|
| 322 |
async def fake_meta(ids):
|
| 323 |
return {
|
| 324 |
"2302.11382": {
|
|
|
|
| 331 |
"year": 2023,
|
| 332 |
}
|
| 333 |
}
|
| 334 |
+
monkeypatch.setattr(turso, "fetch_metadata_batch", fake_meta)
|
| 335 |
+
monkeypatch.setattr(arxiv, "fetch_metadata_batch", AsyncMock(return_value={}))
|
| 336 |
|
| 337 |
resp = client.get("/api/recommendations")
|
| 338 |
assert resp.status_code == 200
|
tests/test_search_router.py
CHANGED
|
@@ -1,10 +1,13 @@
|
|
| 1 |
"""
|
| 2 |
-
Layer 3: Search router integration tests — Phase 3.
|
| 3 |
|
| 4 |
Tests /search endpoint with mocked hybrid_search_svc.
|
| 5 |
Validates: ranking preservation, arXiv fallback, saved/dismissed state,
|
| 6 |
HTMX partials, and that empty queries don't trigger hybrid search.
|
| 7 |
|
|
|
|
|
|
|
|
|
|
| 8 |
No network, no model, no external services needed.
|
| 9 |
"""
|
| 10 |
import pytest
|
|
@@ -41,15 +44,17 @@ def test_search_hybrid_returns_papers(client, monkeypatch):
|
|
| 41 |
"""
|
| 42 |
/search?q=... should use hybrid search and render paper cards.
|
| 43 |
We mock hybrid_search_svc.search() to return known IDs and
|
| 44 |
-
|
| 45 |
"""
|
| 46 |
import app.hybrid_search_svc as hs
|
|
|
|
| 47 |
import app.arxiv_svc as arxiv
|
| 48 |
|
| 49 |
monkeypatch.setattr(hs, "search", AsyncMock(return_value=[
|
| 50 |
"1706.03762", "2301.00001",
|
| 51 |
]))
|
| 52 |
-
|
|
|
|
| 53 |
"1706.03762": {
|
| 54 |
"arxiv_id": "1706.03762",
|
| 55 |
"title": "Attention Is All You Need",
|
|
@@ -69,6 +74,8 @@ def test_search_hybrid_returns_papers(client, monkeypatch):
|
|
| 69 |
"year": 2023,
|
| 70 |
},
|
| 71 |
}))
|
|
|
|
|
|
|
| 72 |
|
| 73 |
resp = client.get("/search?q=transformer+attention")
|
| 74 |
assert resp.status_code == 200
|
|
@@ -82,13 +89,15 @@ def test_search_hybrid_preserves_ranking(client, monkeypatch):
|
|
| 82 |
returned by hybrid_search_svc.search() — i.e., paper A before paper B.
|
| 83 |
"""
|
| 84 |
import app.hybrid_search_svc as hs
|
|
|
|
| 85 |
import app.arxiv_svc as arxiv
|
| 86 |
|
| 87 |
# Hybrid search returns A first, then B
|
| 88 |
monkeypatch.setattr(hs, "search", AsyncMock(return_value=[
|
| 89 |
"2401.00001", "1706.03762",
|
| 90 |
]))
|
| 91 |
-
|
|
|
|
| 92 |
"2401.00001": {
|
| 93 |
"arxiv_id": "2401.00001",
|
| 94 |
"title": "First Paper Should Appear First",
|
|
@@ -102,6 +111,7 @@ def test_search_hybrid_preserves_ranking(client, monkeypatch):
|
|
| 102 |
"category": "cs.CL", "published": "2017-06-12", "year": 2017,
|
| 103 |
},
|
| 104 |
}))
|
|
|
|
| 105 |
|
| 106 |
resp = client.get("/search?q=test+query")
|
| 107 |
# First paper should appear before second paper in HTML
|
|
@@ -144,12 +154,14 @@ def test_search_sets_saved_dismissed_flags(client, monkeypatch):
|
|
| 144 |
based on the user's state.
|
| 145 |
"""
|
| 146 |
import app.hybrid_search_svc as hs
|
|
|
|
| 147 |
import app.arxiv_svc as arxiv
|
| 148 |
|
| 149 |
monkeypatch.setattr(hs, "search", AsyncMock(return_value=[
|
| 150 |
"1706.03762", "2301.00001",
|
| 151 |
]))
|
| 152 |
-
|
|
|
|
| 153 |
"1706.03762": {
|
| 154 |
"arxiv_id": "1706.03762", "title": "Saved Paper",
|
| 155 |
"abstract": "...", "authors": '["A"]',
|
|
@@ -161,6 +173,7 @@ def test_search_sets_saved_dismissed_flags(client, monkeypatch):
|
|
| 161 |
"category": "cs.AI", "published": "2023-01-01", "year": 2023,
|
| 162 |
},
|
| 163 |
}))
|
|
|
|
| 164 |
|
| 165 |
# First: visit home to get cookie, then save a paper
|
| 166 |
client.get("/")
|
|
@@ -180,16 +193,19 @@ def test_search_htmx_partial_with_hybrid(client, monkeypatch):
|
|
| 180 |
same as before the hybrid search swap.
|
| 181 |
"""
|
| 182 |
import app.hybrid_search_svc as hs
|
|
|
|
| 183 |
import app.arxiv_svc as arxiv
|
| 184 |
|
| 185 |
monkeypatch.setattr(hs, "search", AsyncMock(return_value=["1706.03762"]))
|
| 186 |
-
|
|
|
|
| 187 |
"1706.03762": {
|
| 188 |
"arxiv_id": "1706.03762", "title": "HTMX Test Paper",
|
| 189 |
"abstract": "...", "authors": '["A"]',
|
| 190 |
"category": "cs.CL", "published": "2017-06-12", "year": 2017,
|
| 191 |
},
|
| 192 |
}))
|
|
|
|
| 193 |
|
| 194 |
resp = client.get(
|
| 195 |
"/search?q=transformer",
|
|
|
|
| 1 |
"""
|
| 2 |
+
Layer 3: Search router integration tests — Phase 3 + 3.5.
|
| 3 |
|
| 4 |
Tests /search endpoint with mocked hybrid_search_svc.
|
| 5 |
Validates: ranking preservation, arXiv fallback, saved/dismissed state,
|
| 6 |
HTMX partials, and that empty queries don't trigger hybrid search.
|
| 7 |
|
| 8 |
+
Phase 3.5: Turso is now the primary metadata source, arXiv API is fallback.
|
| 9 |
+
All tests mock turso_svc.fetch_metadata_batch to avoid hitting the real DB.
|
| 10 |
+
|
| 11 |
No network, no model, no external services needed.
|
| 12 |
"""
|
| 13 |
import pytest
|
|
|
|
| 44 |
"""
|
| 45 |
/search?q=... should use hybrid search and render paper cards.
|
| 46 |
We mock hybrid_search_svc.search() to return known IDs and
|
| 47 |
+
turso_svc.fetch_metadata_batch() to return metadata for those IDs.
|
| 48 |
"""
|
| 49 |
import app.hybrid_search_svc as hs
|
| 50 |
+
import app.turso_svc as turso
|
| 51 |
import app.arxiv_svc as arxiv
|
| 52 |
|
| 53 |
monkeypatch.setattr(hs, "search", AsyncMock(return_value=[
|
| 54 |
"1706.03762", "2301.00001",
|
| 55 |
]))
|
| 56 |
+
# Phase 3.5: Turso is the primary metadata source
|
| 57 |
+
monkeypatch.setattr(turso, "fetch_metadata_batch", AsyncMock(return_value={
|
| 58 |
"1706.03762": {
|
| 59 |
"arxiv_id": "1706.03762",
|
| 60 |
"title": "Attention Is All You Need",
|
|
|
|
| 74 |
"year": 2023,
|
| 75 |
},
|
| 76 |
}))
|
| 77 |
+
# arXiv fallback returns empty (Turso found everything)
|
| 78 |
+
monkeypatch.setattr(arxiv, "fetch_metadata_batch", AsyncMock(return_value={}))
|
| 79 |
|
| 80 |
resp = client.get("/search?q=transformer+attention")
|
| 81 |
assert resp.status_code == 200
|
|
|
|
| 89 |
returned by hybrid_search_svc.search() — i.e., paper A before paper B.
|
| 90 |
"""
|
| 91 |
import app.hybrid_search_svc as hs
|
| 92 |
+
import app.turso_svc as turso
|
| 93 |
import app.arxiv_svc as arxiv
|
| 94 |
|
| 95 |
# Hybrid search returns A first, then B
|
| 96 |
monkeypatch.setattr(hs, "search", AsyncMock(return_value=[
|
| 97 |
"2401.00001", "1706.03762",
|
| 98 |
]))
|
| 99 |
+
# Phase 3.5: Turso is the primary metadata source
|
| 100 |
+
monkeypatch.setattr(turso, "fetch_metadata_batch", AsyncMock(return_value={
|
| 101 |
"2401.00001": {
|
| 102 |
"arxiv_id": "2401.00001",
|
| 103 |
"title": "First Paper Should Appear First",
|
|
|
|
| 111 |
"category": "cs.CL", "published": "2017-06-12", "year": 2017,
|
| 112 |
},
|
| 113 |
}))
|
| 114 |
+
monkeypatch.setattr(arxiv, "fetch_metadata_batch", AsyncMock(return_value={}))
|
| 115 |
|
| 116 |
resp = client.get("/search?q=test+query")
|
| 117 |
# First paper should appear before second paper in HTML
|
|
|
|
| 154 |
based on the user's state.
|
| 155 |
"""
|
| 156 |
import app.hybrid_search_svc as hs
|
| 157 |
+
import app.turso_svc as turso
|
| 158 |
import app.arxiv_svc as arxiv
|
| 159 |
|
| 160 |
monkeypatch.setattr(hs, "search", AsyncMock(return_value=[
|
| 161 |
"1706.03762", "2301.00001",
|
| 162 |
]))
|
| 163 |
+
# Phase 3.5: Turso is the primary metadata source
|
| 164 |
+
monkeypatch.setattr(turso, "fetch_metadata_batch", AsyncMock(return_value={
|
| 165 |
"1706.03762": {
|
| 166 |
"arxiv_id": "1706.03762", "title": "Saved Paper",
|
| 167 |
"abstract": "...", "authors": '["A"]',
|
|
|
|
| 173 |
"category": "cs.AI", "published": "2023-01-01", "year": 2023,
|
| 174 |
},
|
| 175 |
}))
|
| 176 |
+
monkeypatch.setattr(arxiv, "fetch_metadata_batch", AsyncMock(return_value={}))
|
| 177 |
|
| 178 |
# First: visit home to get cookie, then save a paper
|
| 179 |
client.get("/")
|
|
|
|
| 193 |
same as before the hybrid search swap.
|
| 194 |
"""
|
| 195 |
import app.hybrid_search_svc as hs
|
| 196 |
+
import app.turso_svc as turso
|
| 197 |
import app.arxiv_svc as arxiv
|
| 198 |
|
| 199 |
monkeypatch.setattr(hs, "search", AsyncMock(return_value=["1706.03762"]))
|
| 200 |
+
# Phase 3.5: Turso is the primary metadata source
|
| 201 |
+
monkeypatch.setattr(turso, "fetch_metadata_batch", AsyncMock(return_value={
|
| 202 |
"1706.03762": {
|
| 203 |
"arxiv_id": "1706.03762", "title": "HTMX Test Paper",
|
| 204 |
"abstract": "...", "authors": '["A"]',
|
| 205 |
"category": "cs.CL", "published": "2017-06-12", "year": 2017,
|
| 206 |
},
|
| 207 |
}))
|
| 208 |
+
monkeypatch.setattr(arxiv, "fetch_metadata_batch", AsyncMock(return_value={}))
|
| 209 |
|
| 210 |
resp = client.get(
|
| 211 |
"/search?q=transformer",
|