Spaces:

siddhm11
/

ResearchIT

Running

siddhm11 committed on Apr 26

Commit

61d5f0d

1 Parent(s): 10fbe3b

Phase 4 complete + Phase 4.5 instrumentation foundation

Phase 4 (Recommendation Pipeline Fixes) - all implemented:
- 4.1: Importance-weighted quota fusion (fusion.py, 20 tests)
- 4.2: Turso metadata (done in Phase 3.5)
- 4.3: Hungarian matching for cluster stability (10 tests)
- 4.4: Category-level negative suppression (8 tests)

Phase 4.5 (Instrumentation Foundation) - NEW:
- Added ranker_version, candidate_source, cluster_id to interactions table
- ALTER TABLE migration for existing DBs (idempotent)
- Pipeline tagging: Tier 1 papers tagged by cluster/exploration
- End-to-end flow: recommendations.py -> templates -> events.py -> db.py
- 5 dedicated instrumentation tests

TASK-TRACKER: Phase 4 marked COMPLETE, Phase 4.5 added, Phase 8 expanded
Test count: 123 -> 176 (175 passing, 1 pre-existing flaky)

Files changed (24) hide show

app/db.py +104 -11
app/recommend/clustering.py +90 -0
app/recommend/fusion.py +103 -0
app/routers/events.py +12 -0
app/routers/recommendations.py +198 -60
app/routers/saved.py +14 -3
app/routers/search.py +4 -1
app/templates/index.html +11 -8
app/templates/partials/action_buttons.html +9 -4
app/templates/partials/paper_card.html +4 -1
app/turso_svc.py +5 -3
docs/TASK-TRACKER.md +102 -30
docs/phases/PHASE3-Hybrid-Semantic-Search.md +1 -1
docs/phases/PHASE4-Recommendation-Pipeline-Fixes.md +603 -0
docs/research/03-MultiInterest-Recommender-Architecture.md +1 -1
docs/research/07-LLM-Summaries-Reranker-and-Scaling-Research.md +426 -0
docs/walkthroughs/02-Phase2-MultiInterest-Recommender.md +1 -1
docs/walkthroughs/03-Code-Summary-and-Test-Plan.md +1 -1
docs/walkthroughs/04-Next-Steps-and-Phase-Plan.md +55 -52
tests/test_clustering.py +233 -0
tests/test_db.py +316 -0
tests/test_fusion.py +231 -0
tests/test_integration.py +112 -3
tests/test_search_router.py +22 -6

app/db.py CHANGED Viewed

@@ -6,6 +6,11 @@ Tables
 interactions        – every user action (save, not_interested, click, view)
 paper_qdrant_map    – arxiv_id → integer Qdrant point ID (cached lazily)
 paper_metadata      – arXiv API response cache (title, abstract, …)
 """
 import aiosqlite
 from app.config import DB_PATH
@@ -17,14 +22,17 @@ PRAGMA journal_mode=WAL;
 PRAGMA synchronous=NORMAL;
 CREATE TABLE IF NOT EXISTS interactions (
-    id            INTEGER PRIMARY KEY AUTOINCREMENT,
-    user_id       TEXT    NOT NULL,
-    paper_id      TEXT    NOT NULL,
-    event_type    TEXT    NOT NULL,   -- save | not_interested | click | view
-    source        TEXT,               -- search | recommendation
-    position      INTEGER,
-    query_id      TEXT,
-    timestamp     TEXT    NOT NULL DEFAULT (datetime('now'))
 );
 CREATE INDEX IF NOT EXISTS idx_ui_user_ts
@@ -73,10 +81,25 @@ CREATE TABLE IF NOT EXISTS user_clusters (
 """
 async def init_db() -> None:
     """Create tables if they don't exist. Called once at startup."""
     async with aiosqlite.connect(DB_PATH) as db:
         await db.executescript(_SCHEMA)
         await db.commit()
@@ -89,13 +112,18 @@ async def log_interaction(
     source: str | None = None,
     position: int | None = None,
     query_id: str | None = None,
 ) -> None:
     async with aiosqlite.connect(DB_PATH) as db:
         await db.execute(
             """INSERT INTO interactions
-               (user_id, paper_id, event_type, source, position, query_id)
-               VALUES (?, ?, ?, ?, ?, ?)""",
-            (user_id, paper_id, event_type, source, position, query_id),
         )
         await db.commit()
@@ -273,3 +301,68 @@ async def get_user_clusters(user_id: str) -> list[dict]:
         )
         rows = await cur.fetchall()
         return [dict(r) for r in rows]

 interactions        – every user action (save, not_interested, click, view)
 paper_qdrant_map    – arxiv_id → integer Qdrant point ID (cached lazily)
 paper_metadata      – arXiv API response cache (title, abstract, …)
+Phase 4.5 instrumentation columns (interactions table):
+  ranker_version    – identifies which pipeline version served the paper
+  candidate_source  – granular origin: 'cluster_0', 'exploration', 'ewma_longterm', 'qdrant_recommend', etc.
+  cluster_id        – which interest cluster served this paper (NULL if N/A)
 """
 import aiosqlite
 from app.config import DB_PATH
 PRAGMA synchronous=NORMAL;
 CREATE TABLE IF NOT EXISTS interactions (
+    id               INTEGER PRIMARY KEY AUTOINCREMENT,
+    user_id          TEXT    NOT NULL,
+    paper_id         TEXT    NOT NULL,
+    event_type       TEXT    NOT NULL,   -- save | not_interested | click | view
+    source           TEXT,               -- search | recommendation
+    position         INTEGER,
+    query_id         TEXT,
+    ranker_version   TEXT,               -- Phase 4.5: pipeline version tag
+    candidate_source TEXT,               -- Phase 4.5: 'cluster_0' | 'exploration' | 'ewma' | 'qdrant_recommend'
+    cluster_id       INTEGER,            -- Phase 4.5: interest cluster index (NULL if N/A)
+    timestamp        TEXT    NOT NULL DEFAULT (datetime('now'))
 );
 CREATE INDEX IF NOT EXISTS idx_ui_user_ts
 """
# ── Phase 4.5: ALTER TABLE migration for existing DBs ─────────────────────────
# SQLite has no "ADD COLUMN IF NOT EXISTS", so every statement is attempted and
# only the "duplicate column name" failure is tolerated.
_MIGRATION_4_5 = [
    "ALTER TABLE interactions ADD COLUMN ranker_version TEXT",
    "ALTER TABLE interactions ADD COLUMN candidate_source TEXT",
    "ALTER TABLE interactions ADD COLUMN cluster_id INTEGER",
]


async def init_db() -> None:
    """
    Create tables if they don't exist, then apply the Phase 4.5 column migration.

    Called once at startup.  The migration is idempotent: a column that is
    already present is skipped.  Any other failure (locked database, missing
    table, malformed statement) is re-raised so a half-migrated schema is
    noticed at startup instead of surfacing later as failing INSERTs.

    Raises:
        sqlite3.OperationalError: a migration statement failed for a reason
            other than the column already existing.
    """
    # Local import: only needed to name the exception type raised by the driver.
    import sqlite3

    async with aiosqlite.connect(DB_PATH) as db:
        await db.executescript(_SCHEMA)
        # Phase 4.5: add instrumentation columns to existing DBs
        for stmt in _MIGRATION_4_5:
            try:
                await db.execute(stmt)
            except sqlite3.OperationalError as exc:
                # Fresh databases already get these columns from _SCHEMA, so
                # "duplicate column name" is the expected, harmless outcome.
                if "duplicate column name" not in str(exc).lower():
                    raise
        await db.commit()
     source: str | None = None,
     position: int | None = None,
     query_id: str | None = None,
+    ranker_version: str | None = None,
+    candidate_source: str | None = None,
+    cluster_id: int | None = None,
 ) -> None:
     async with aiosqlite.connect(DB_PATH) as db:
         await db.execute(
             """INSERT INTO interactions
+               (user_id, paper_id, event_type, source, position, query_id,
+                ranker_version, candidate_source, cluster_id)
+               VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
+            (user_id, paper_id, event_type, source, position, query_id,
+             ranker_version, candidate_source, cluster_id),
         )
         await db.commit()
         )
         rows = await cur.fetchall()
         return [dict(r) for r in rows]
# ── Phase 4.3: Category suppression helpers ───────────────────────────────────
async def cache_turso_metadata_batch(papers: list[dict]) -> None:
    """
    Write Turso paper dicts to the paper_metadata SQLite cache.

    Called after every Turso fetch so dismissal-category JOINs work.
    Entries without an ``arxiv_id`` are skipped; when nothing is left to
    write, the database is not opened at all.

    Caching is best-effort: a row that SQLite rejects is skipped so the rest
    of the batch is still written, but the failure is logged rather than
    silently discarded.

    Args:
        papers: paper dicts; the keys read are ``arxiv_id``, ``title``,
            ``abstract``, ``authors``, ``category`` and ``published``.
            Missing optional keys fall back to an empty string
            (``"[]"`` for ``authors``).
    """
    # Local imports keep this helper self-contained.
    import logging
    import sqlite3

    if not papers:
        return

    rows = [
        {
            "arxiv_id": paper["arxiv_id"],
            "title": paper.get("title", ""),
            "abstract": paper.get("abstract", ""),
            "authors": paper.get("authors", "[]"),
            "category": paper.get("category", ""),
            "published": paper.get("published", ""),
        }
        for paper in papers
        if paper.get("arxiv_id")
    ]
    if not rows:
        return

    log = logging.getLogger(__name__)
    async with aiosqlite.connect(DB_PATH) as conn:
        for row in rows:
            try:
                await conn.execute(
                    """INSERT OR REPLACE INTO paper_metadata
                       (arxiv_id, title, abstract, authors, category, published)
                       VALUES (:arxiv_id, :title, :abstract, :authors, :category, :published)""",
                    row,
                )
            except sqlite3.Error as exc:
                # One bad row (e.g. a value SQLite cannot bind) must not
                # prevent the remaining rows from being cached.
                log.warning(
                    "paper_metadata cache write failed for %s: %s",
                    row["arxiv_id"],
                    exc,
                )
        await conn.commit()
async def get_suppressed_categories(
    user_id: str,
    threshold: int = 3,
    window_days: int = 14,
) -> set[str]:
    """
    Collect the categories this user has repeatedly dismissed.

    A category counts as suppressed once the user has logged at least
    ``threshold`` ``not_interested`` events on papers of that category
    within the past ``window_days`` days.  Categories are resolved by joining
    interactions against paper_metadata, so only papers present in that cache
    (see cache_turso_metadata_batch) contribute; rows with an empty category
    are ignored.

    Args:
        user_id:     user whose dismissals are inspected
        threshold:   minimum number of dismissals for a category to qualify
        window_days: how many days back from now to look

    Returns:
        Set of suppressed category strings; empty when nothing qualifies.
    """
    # SQLite date modifier, e.g. "-14" which the query extends to "-14 days".
    window_modifier = f"-{window_days}"
    async with aiosqlite.connect(DB_PATH) as conn:
        cursor = await conn.execute(
            """SELECT pm.category, COUNT(*) AS cnt
               FROM interactions i
               JOIN paper_metadata pm ON i.paper_id = pm.arxiv_id
               WHERE i.user_id = ?
                 AND i.event_type = 'not_interested'
                 AND i.timestamp >= datetime('now', ? || ' days')
                 AND pm.category != ''
               GROUP BY pm.category
               HAVING COUNT(*) >= ?""",
            (user_id, window_modifier, threshold),
        )
        suppressed: set[str] = set()
        for record in await cursor.fetchall():
            # Column 0 is the category; column 1 (the count) is not needed.
            suppressed.add(record[0])
        return suppressed

app/recommend/clustering.py CHANGED Viewed

@@ -20,6 +20,7 @@ import json
 from dataclasses import dataclass, field
 import numpy as np
 from scipy.cluster.hierarchy import ward, fcluster
 from scipy.spatial.distance import pdist
 from app import db
@@ -183,6 +184,95 @@ def _find_medoid(embeddings: np.ndarray, centroid: np.ndarray) -> int:
     return int(np.argmin(distances))
 # ── Persistence ───────────────────────────────────────────────────────────────
 async def save_clusters_to_db(user_id: str, clusters: list[InterestCluster]) -> None:

 from dataclasses import dataclass, field
 import numpy as np
 from scipy.cluster.hierarchy import ward, fcluster
+from scipy.optimize import linear_sum_assignment
 from scipy.spatial.distance import pdist
 from app import db
     return int(np.argmin(distances))
# ── Cluster ID stabilisation (Phase 4.2) ─────────────────────────────────────
# Hungarian matches below this cosine similarity are rejected as "unrelated".
# Doc 06 §"Clustering specifics": a genuinely new interest must not steal an
# old cluster's identity just because Hungarian found the least-bad assignment.
CLUSTER_MATCH_MIN_COSINE = 0.5


def stabilize_cluster_ids(
    new_clusters: "list[InterestCluster]",
    old_clusters: "list[InterestCluster]",
    min_cosine_sim: float = CLUSTER_MATCH_MIN_COSINE,
) -> "list[InterestCluster]":
    """
    Preserve cluster identity across reclusters using the Hungarian algorithm.

    Every time the user saves a paper we recluster from scratch.  Without
    stabilisation, cluster indices shuffle (NLP was 0, now it's 2), breaking
    future analytics and UI labels.

    Algorithm:
      1. Build cost matrix: cost[i][j] = 1 - cosine_sim(new_medoid_i, old_medoid_j)
      2. Solve with scipy linear_sum_assignment (handles rectangular matrices)
      3. Matched pairs with cosine_sim >= min_cosine_sim inherit the old idx
      4. Weak matches (cosine_sim < min_cosine_sim) and unmatched new clusters
         get the lowest index that no old cluster used

    Step 4 reserves *every* old index, not only the ones that were inherited.
    Otherwise a rejected weak match would simply be handed the rejected old
    cluster's index again by the "next free index" search, which is exactly
    the identity theft the threshold exists to prevent.

    Args:
        new_clusters:   freshly computed clusters (cluster_idx values ignored)
        old_clusters:   clusters from the previous recluster (stable reference)
        min_cosine_sim: reject matches below this cosine similarity (default 0.5)

    Returns:
        New InterestCluster objects, in the order of ``new_clusters``, with
        stable cluster_idx values.  If either input list is empty,
        ``new_clusters`` is returned unchanged.
    """
    if not old_clusters or not new_clusters:
        return new_clusters

    def _safe_norm(embs: np.ndarray) -> np.ndarray:
        # L2-normalise rows; all-zero rows are left as zeros instead of
        # dividing by zero, so they score 0 similarity against everything.
        norms = np.linalg.norm(embs, axis=1, keepdims=True)
        return embs / np.where(norms < 1e-10, 1.0, norms)

    new_embs = _safe_norm(
        np.array([c.medoid_embedding for c in new_clusters], dtype=np.float32)
    )
    old_embs = _safe_norm(
        np.array([c.medoid_embedding for c in old_clusters], dtype=np.float32)
    )

    # Cosine similarity (n_new × n_old); Hungarian minimises 1 - similarity.
    sim = new_embs @ old_embs.T
    row_ind, col_ind = linear_sum_assignment(1.0 - sim)

    # Accept only pairs whose cosine similarity clears the threshold.
    # Weak matches would steal an old cluster's identity for an unrelated topic.
    new_to_stable: dict[int, int] = {
        int(r): old_clusters[int(c)].cluster_idx
        for r, c in zip(row_ind, col_ind)
        if float(sim[r, c]) >= min_cosine_sim
    }

    # Reserve all previous indices so fresh clusters never recycle the
    # identity of an old cluster they were judged unrelated to.
    used_ids: set[int] = {c.cluster_idx for c in old_clusters}
    next_id = 0
    result: list[InterestCluster] = []
    for i, cluster in enumerate(new_clusters):
        stable_idx = new_to_stable.get(i)
        if stable_idx is None:
            # No strong match — assign the next index nobody has used.
            while next_id in used_ids:
                next_id += 1
            stable_idx = next_id
            used_ids.add(stable_idx)
            next_id += 1
        result.append(InterestCluster(
            cluster_idx=stable_idx,
            medoid_paper_id=cluster.medoid_paper_id,
            medoid_embedding=cluster.medoid_embedding,
            paper_ids=cluster.paper_ids,
            importance=cluster.importance,
        ))
    return result
 # ── Persistence ───────────────────────────────────────────────────────────────
 async def save_clusters_to_db(user_id: str, clusters: list[InterestCluster]) -> None:

app/recommend/fusion.py ADDED Viewed

	@@ -0,0 +1,103 @@

+"""
+Importance-weighted quota fusion for multi-interest recommendations.
+Replaces RRF for the recommendation pipeline (not search).
+RRF is correct for search (different retrievers, same query).
+For recommendations (different cluster queries, same user), RRF lets
+the dominant cluster drown minority interests.  Quota ensures every
+interest cluster gets a guaranteed floor of slots.
+Reference: doc 06 §3.1 — "importance-weighted quota with a floor"
+  w_k = importance_k / sum(importance_k)
+  slot_k = max(floor(F * w_k), F_min)   # F = total, F_min = 3
+  # distribute remainder by largest fractional part
+"""
+from __future__ import annotations
def allocate_quotas(
    importances: list[float],
    total_slots: int,
    min_slots: int = 3,
) -> list[int]:
    """
    Allocate recommendation slots proportionally to cluster importances,
    with a guaranteed minimum per cluster.

    Negative importances are treated as zero.  Left as-is, a negative weight
    shrinks the normalising sum and inflates the other clusters' shares far
    beyond ``total_slots`` (e.g. ``[2, -1]`` with 100 slots would hand the
    first cluster 200).

    Args:
        importances: importance score per cluster, same order as clusters
        total_slots: total candidate slots to distribute (e.g. 100)
        min_slots:   minimum slots guaranteed to every cluster (default 3)

    Returns:
        List of slot counts, same length and order as importances.
        sum(result) >= total_slots (may exceed if floor constraints force it).
    """
    n = len(importances)
    if n == 0:
        return []
    if n == 1:
        # A lone cluster takes the whole budget, whatever its importance.
        return [max(total_slots, min_slots)]

    weights = [max(imp, 0.0) for imp in importances]
    total_imp = sum(weights)
    if total_imp <= 0:
        # Degenerate: equal distribution with floor guarantee; the first
        # `extra` clusters absorb the indivisible leftover.
        per, extra = divmod(total_slots, n)
        return [max(per + (1 if i < extra else 0), min_slots) for i in range(n)]

    # Proportional raw allocations
    raw = [w / total_imp * total_slots for w in weights]
    # Apply floor: max(floor(raw_i), min_slots); raw is non-negative here, so
    # int() truncation is a true floor.
    floored = [max(int(r), min_slots) for r in raw]
    remainder = total_slots - sum(floored)
    if remainder <= 0:
        # Floor guarantees already account for all slots (or more)
        return floored

    # Distribute remainder slots by largest fractional part of raw allocations
    # (stable sort: ties go to the earlier, i.e. more important, cluster).
    fracs = sorted(range(n), key=lambda i: raw[i] % 1.0, reverse=True)
    for j in range(remainder):
        floored[fracs[j % n]] += 1
    return floored
def merge_quota_results(
    per_cluster_ids: list[list[str]],
    quotas: list[int],
) -> list[str]:
    """
    Merge per-cluster search results respecting quota allocations.

    Clusters are processed one after another in the order given (importance
    order): each contributes up to its quota of ids not already taken by an
    earlier cluster, and its block is appended to the output before the next
    cluster is considered.  The output is therefore grouped by cluster, not
    interleaved.  Ids claimed by an earlier cluster do not count against a
    later cluster's quota.

    Args:
        per_cluster_ids: list of arxiv_id lists, one per cluster (importance order)
        quotas:          slot count for each cluster (same order); a quota
                         <= 0 contributes nothing

    Returns:
        Merged list of arxiv_ids, deduplicated, quota-bounded per cluster.
        If the two inputs differ in length, the extra entries of the longer
        one are ignored.
    """
    seen: set[str] = set()
    merged: list[str] = []
    for cluster_ids, quota in zip(per_cluster_ids, quotas):
        taken = 0
        for aid in cluster_ids:
            if taken >= quota:
                break
            if aid in seen:
                # Already supplied by an earlier cluster (or a duplicate
                # within this list) — skip without spending quota.
                continue
            merged.append(aid)
            seen.add(aid)
            taken += 1
    return merged

app/routers/events.py CHANGED Viewed

@@ -24,6 +24,9 @@ async def save_paper(
     source: str = Form(default="search"),
     position: int = Form(default=0),
     query_id: str = Form(default=""),
     user_id: str | None = Cookie(default=None, alias=COOKIE_NAME),
 ):
     user_id = user_id or str(uuid.uuid4())
@@ -35,6 +38,9 @@ async def save_paper(
         source=source,
         position=position or None,
         query_id=query_id or None,
     )
     us.record_positive(user_id, paper_id)
@@ -57,6 +63,9 @@ async def not_interested(
     source: str = Form(default="search"),
     position: int = Form(default=0),
     query_id: str = Form(default=""),
     user_id: str | None = Cookie(default=None, alias=COOKIE_NAME),
 ):
     user_id = user_id or str(uuid.uuid4())
@@ -68,6 +77,9 @@ async def not_interested(
         source=source,
         position=position or None,
         query_id=query_id or None,
     )
     us.record_negative(user_id, paper_id)

     source: str = Form(default="search"),
     position: int = Form(default=0),
     query_id: str = Form(default=""),
+    ranker_version: str = Form(default=""),
+    candidate_source: str = Form(default=""),
+    cluster_id: str = Form(default=""),
     user_id: str | None = Cookie(default=None, alias=COOKIE_NAME),
 ):
     user_id = user_id or str(uuid.uuid4())
         source=source,
         position=position or None,
         query_id=query_id or None,
+        ranker_version=ranker_version or None,
+        candidate_source=candidate_source or None,
+        cluster_id=int(cluster_id) if cluster_id else None,
     )
     us.record_positive(user_id, paper_id)
     source: str = Form(default="search"),
     position: int = Form(default=0),
     query_id: str = Form(default=""),
+    ranker_version: str = Form(default=""),
+    candidate_source: str = Form(default=""),
+    cluster_id: str = Form(default=""),
     user_id: str | None = Cookie(default=None, alias=COOKIE_NAME),
 ):
     user_id = user_id or str(uuid.uuid4())
         source=source,
         position=position or None,
         query_id=query_id or None,
+        ranker_version=ranker_version or None,
+        candidate_source=candidate_source or None,
+        cluster_id=int(cluster_id) if cluster_id else None,
     )
     us.record_negative(user_id, paper_id)

app/routers/recommendations.py CHANGED Viewed

@@ -6,16 +6,21 @@ GET /api/recommendations
   – Returns the recommendations partial HTML
 Recommendation pipeline (cascading fallback):
-  Phase 2b: Multi-interest clustering → prefetch + RRF fusion  (≥5 saves)
-  Phase 2a: EWMA long-term vector → single vector search       (≥3 saves)
-  Phase 1:  Qdrant BEST_SCORE Recommend API with raw IDs       (≥1 save)
 """
-import json
 import uuid
 import numpy as np
 from fastapi import APIRouter, Request, Cookie
 from fastapi.responses import HTMLResponse
-from app import qdrant_svc, arxiv_svc, user_state as us
 from app.config import COOKIE_NAME, REC_LIMIT, REC_MIN_POSITIVES
 from app.templates_env import templates
 from app.recommend import profiles
@@ -23,16 +28,28 @@ from app.recommend.clustering import (
     compute_clusters,
     save_clusters_to_db,
     load_clusters_from_db,
     MIN_PAPERS_FOR_CLUSTERING,
 )
 from app.recommend.reranker import rerank_candidates
 from app.recommend.diversity import mmr_rerank, inject_exploration
 router = APIRouter(prefix="/api")
 # Minimum EWMA interactions before switching from ID-based to vector-based recs
 _MIN_EWMA_INTERACTIONS = 3
 @router.get("/recommendations", response_class=HTMLResponse)
 async def get_recommendations(
@@ -56,14 +73,27 @@ async def get_recommendations(
     seen = us.all_seen(user_id)
-    # ── Tier 1: Multi-interest clustering + RRF (Phase 2b, ≥5 saves) ─────
-    rec_arxiv_ids = await _multi_interest_recommend(user_id, state, seen, REC_LIMIT)
-    # ── Tier 2: EWMA single-vector search (Phase 2a, ≥3 saves) ───────────
     if not rec_arxiv_ids:
         rec_arxiv_ids = await _ewma_recommend(user_id, seen, REC_LIMIT)
-    # ── Tier 3: Qdrant Recommend API (Phase 1 fallback, ≥1 save) ─────────
     if not rec_arxiv_ids:
         rec_arxiv_ids = await qdrant_svc.recommend(
             positive_arxiv_ids=state.positive_list,
@@ -71,16 +101,43 @@ async def get_recommendations(
             seen_arxiv_ids=seen,
             limit=REC_LIMIT,
         )
     if not rec_arxiv_ids:
         return _empty_resp()
-    meta = await arxiv_svc.fetch_metadata_batch(rec_arxiv_ids)
-    papers = [
-        {**meta[aid], "saved": False, "dismissed": False}
-        for aid in rec_arxiv_ids
-        if aid in meta
-    ]
     resp = templates.TemplateResponse(
         request,
@@ -91,35 +148,34 @@ async def get_recommendations(
     return resp
-# ── Tier 1: Multi-interest clustering + prefetch RRF ─────────────────────────
-# Per-cluster candidate limits (descending by importance)
-_CLUSTER_LIMITS = [40, 30, 25, 20, 15, 15, 15]
 async def _multi_interest_recommend(
     user_id: str, state, seen: set[str], limit: int
-) -> list[str]:
     """
-    Full recommendation pipeline (Phase 2b + 2c):
       1. Ward clustering → identify distinct interests
-      2. Prefetch + RRF → retrieve ~100 candidates
-      3. Heuristic re-ranking → score candidates
-      4. MMR diversity → select top-k with diversity
-      5. Exploration injection → 1-2 serendipitous papers
-    Only activates when the user has ≥ MIN_PAPERS_FOR_CLUSTERING saves.
-    Returns [] to trigger fallback to Tier 2.
     """
     positives = state.positive_list
     if len(positives) < MIN_PAPERS_FOR_CLUSTERING:
-        return []
     try:
         # Fetch embeddings for all saved papers
         vectors = await qdrant_svc.get_paper_vectors(positives)
         if len(vectors) < MIN_PAPERS_FOR_CLUSTERING:
-            return []
         # Build aligned arrays (only papers we got vectors for)
         aligned_ids = [pid for pid in positives if pid in vectors]
@@ -129,38 +185,89 @@ async def _multi_interest_recommend(
         # ── Step 1: Compute interest clusters ─────────────────────────────
         clusters = compute_clusters(aligned_ids, aligned_embs)
         await save_clusters_to_db(user_id, clusters)
-        # ── Step 2: Multi-interest retrieval via prefetch + RRF ───────────
-        interest_vectors = []
-        for i, cluster in enumerate(clusters):
-            per_cluster_limit = _CLUSTER_LIMITS[i] if i < len(_CLUSTER_LIMITS) else 15
-            interest_vectors.append(
-                (cluster.medoid_embedding.tolist(), per_cluster_limit)
-            )
         st_vec = await profiles.load_profile(user_id, "short_term")
-        st_list = st_vec.tolist() if st_vec is not None else None
-        candidate_ids = await qdrant_svc.multi_interest_search(
-            interest_vectors=interest_vectors,
-            short_term_vector=st_list,
-            exclude_ids=seen,
-            total_limit=100,  # retrieve wide, narrow with re-ranking
-        )
         if not candidate_ids:
-            return []
-        # ── Step 3: Re-rank candidates ────────────────────────────────────
-        # Fetch embeddings + metadata for candidates
         cand_vectors = await qdrant_svc.get_paper_vectors(candidate_ids)
-        cand_meta = await arxiv_svc.fetch_metadata_batch(candidate_ids)
-        # Only process candidates we have both vectors and metadata for
         valid_ids = [cid for cid in candidate_ids if cid in cand_vectors and cid in cand_meta]
         if not valid_ids:
-            return candidate_ids[:limit]  # fallback: return raw retrieval
         valid_embs = np.array([cand_vectors[cid] for cid in valid_ids], dtype=np.float32)
         valid_meta = [cand_meta[cid] for cid in valid_ids]
@@ -168,6 +275,7 @@ async def _multi_interest_recommend(
         lt_vec = await profiles.load_profile(user_id, "long_term")
         neg_vec = await profiles.load_profile(user_id, "negative")
         reranked_ids, reranked_scores, reranked_embs = rerank_candidates(
             candidate_ids=valid_ids,
             candidate_embeddings=valid_embs,
@@ -177,7 +285,19 @@ async def _multi_interest_recommend(
             negative_vec=neg_vec,
         )
-        # ── Step 4: MMR diversity enforcement ─────────────────────────────
         query_vec = lt_vec if lt_vec is not None else aligned_embs.mean(axis=0)
         mmr_selected = mmr_rerank(
             query_embedding=query_vec,
@@ -188,18 +308,38 @@ async def _multi_interest_recommend(
             top_k=limit,
         )
-        # ── Step 5: Exploration injection ─────────────────────────────────
         final = inject_exploration(
             selected_ids=mmr_selected,
             all_candidate_ids=reranked_ids,
             n_explore=2,
         )
-        return final[:limit + 2]  # allow slightly over limit for exploration
     except Exception as e:
         print(f"[recommendations] multi-interest search failed: {e}")
-        return []
 # ── Tier 2: EWMA single-vector search ────────────────────────────────────────
@@ -227,5 +367,3 @@ async def _ewma_recommend(
         limit=limit,
         exclude_ids=seen,
     )

   – Returns the recommendations partial HTML
 Recommendation pipeline (cascading fallback):
+  Phase 2b / 4.1: Multi-interest clustering → quota fusion     (≥5 saves)
+  Phase 2a:       EWMA long-term vector → single vector search  (≥3 saves)
+  Phase 1:        Qdrant BEST_SCORE Recommend API with raw IDs  (≥1 save)
+Phase 4 changes vs Phase 2b:
+  - RRF replaced with importance-weighted quota fusion (doc 06 §3.1)
+  - Hungarian matching stabilises cluster IDs across reclusters (4.2)
+  - Category-level suppression filters strongly disliked topics (4.3)
 """
+import asyncio
 import uuid
 import numpy as np
 from fastapi import APIRouter, Request, Cookie
 from fastapi.responses import HTMLResponse
+from app import db, qdrant_svc, arxiv_svc, turso_svc, user_state as us
 from app.config import COOKIE_NAME, REC_LIMIT, REC_MIN_POSITIVES
 from app.templates_env import templates
 from app.recommend import profiles
     compute_clusters,
     save_clusters_to_db,
     load_clusters_from_db,
+    stabilize_cluster_ids,
     MIN_PAPERS_FOR_CLUSTERING,
 )
+from app.recommend.fusion import allocate_quotas, merge_quota_results
 from app.recommend.reranker import rerank_candidates
 from app.recommend.diversity import mmr_rerank, inject_exploration
 router = APIRouter(prefix="/api")
+# Phase 4.5: Pipeline version tag for instrumentation.  Bump this on any
+# change to the ranking logic so A/B attribution is possible.
+_RANKER_VERSION = "v4.1_quota_hungarian_suppression"
 # Minimum EWMA interactions before switching from ID-based to vector-based recs
 _MIN_EWMA_INTERACTIONS = 3
+# Candidate oversampling factor per cluster (fetch more than quota to handle dedup)
+_OVERSAMPLE = 3
+# Short-term session context: fixed supplementary pool size
+_ST_SUPPLEMENT = 20
 @router.get("/recommendations", response_class=HTMLResponse)
 async def get_recommendations(
     seen = us.all_seen(user_id)
+    # Phase 4.5: paper_tags maps arxiv_id → instrumentation metadata
+    # populated by whichever tier serves the result.
+    paper_tags: dict[str, dict] = {}
+    rec_arxiv_ids: list[str] = []
+    # ── Tier 1: Multi-interest clustering + quota fusion (≥5 saves) ──────
+    rec_arxiv_ids, paper_tags = await _multi_interest_recommend(
+        user_id, state, seen, REC_LIMIT,
+    )
+    # ── Tier 2: EWMA single-vector search (≥3 saves) ──────────────────────
     if not rec_arxiv_ids:
         rec_arxiv_ids = await _ewma_recommend(user_id, seen, REC_LIMIT)
+        for aid in rec_arxiv_ids:
+            paper_tags[aid] = {
+                "ranker_version": _RANKER_VERSION,
+                "candidate_source": "ewma_longterm",
+                "cluster_id": "",
+            }
+    # ── Tier 3: Qdrant Recommend API (≥1 save fallback) ───────────────────
     if not rec_arxiv_ids:
         rec_arxiv_ids = await qdrant_svc.recommend(
             positive_arxiv_ids=state.positive_list,
             seen_arxiv_ids=seen,
             limit=REC_LIMIT,
         )
+        for aid in rec_arxiv_ids:
+            paper_tags[aid] = {
+                "ranker_version": _RANKER_VERSION,
+                "candidate_source": "qdrant_recommend",
+                "cluster_id": "",
+            }
     if not rec_arxiv_ids:
         return _empty_resp()
+    # Phase 3.5: Turso primary, arXiv API fallback
+    meta = await turso_svc.fetch_metadata_batch(rec_arxiv_ids)
+    missing = [aid for aid in rec_arxiv_ids if aid not in meta]
+    if missing:
+        try:
+            arxiv_meta = await arxiv_svc.fetch_metadata_batch(missing)
+            meta.update(arxiv_meta)
+        except Exception as e:
+            print(f"[recommendations] arXiv fallback for {len(missing)} IDs failed: {e}")
+    # Cache to SQLite so category suppression JOINs work (Phase 4.3)
+    await db.cache_turso_metadata_batch(list(meta.values()))
+    papers = []
+    for aid in rec_arxiv_ids:
+        if aid not in meta:
+            continue
+        tags = paper_tags.get(aid, {})
+        papers.append({
+            **meta[aid],
+            "saved": False,
+            "dismissed": False,
+            # Phase 4.5 instrumentation — embedded in card, flows back via HTMX
+            "ranker_version": tags.get("ranker_version", _RANKER_VERSION),
+            "candidate_source": tags.get("candidate_source", ""),
+            "cluster_id": tags.get("cluster_id", ""),
+        })
     resp = templates.TemplateResponse(
         request,
     return resp
+# ── Tier 1: Multi-interest clustering + quota fusion ─────────────────────────
 async def _multi_interest_recommend(
     user_id: str, state, seen: set[str], limit: int
+) -> tuple[list[str], dict[str, dict]]:
     """
+    Full recommendation pipeline (Phase 2b + Phase 4 corrections):
       1. Ward clustering → identify distinct interests
+      2. Quota allocation → per-cluster slot budgets (replaces RRF)
+      3. Parallel per-cluster ANN searches → retrieve candidates
+      4. Hungarian matching → stabilise cluster IDs across reclusters
+      5. Category suppression → remove strongly disliked topics
+      6. Heuristic re-ranking → score candidates
+      7. MMR diversity → select top-k with diversity
+      8. Exploration injection → serendipitous papers
+    Returns ([], {}) to trigger fallback to Tier 2.
+    Phase 4.5: second element is {arxiv_id: {ranker_version, candidate_source, cluster_id}}.
     """
     positives = state.positive_list
     if len(positives) < MIN_PAPERS_FOR_CLUSTERING:
+        return [], {}
     try:
         # Fetch embeddings for all saved papers
         vectors = await qdrant_svc.get_paper_vectors(positives)
         if len(vectors) < MIN_PAPERS_FOR_CLUSTERING:
+            return [], {}
         # Build aligned arrays (only papers we got vectors for)
         aligned_ids = [pid for pid in positives if pid in vectors]
         # ── Step 1: Compute interest clusters ─────────────────────────────
         clusters = compute_clusters(aligned_ids, aligned_embs)
+        # ── Step 4.2: Stabilise cluster IDs with Hungarian matching ───────
+        old_clusters_data = await load_clusters_from_db(user_id)
+        if old_clusters_data:
+            from app.recommend.clustering import InterestCluster
+            old_clusters = [
+                InterestCluster(
+                    cluster_idx=row["cluster_idx"],
+                    medoid_paper_id=row["medoid_paper_id"],
+                    medoid_embedding=np.array(
+                        vectors[row["medoid_paper_id"]], dtype=np.float32
+                    ) if row["medoid_paper_id"] in vectors else np.zeros(1024, dtype=np.float32),
+                    paper_ids=[],
+                    importance=row["importance"],
+                )
+                for row in old_clusters_data
+            ]
+            clusters = stabilize_cluster_ids(clusters, old_clusters)
         await save_clusters_to_db(user_id, clusters)
+        # ── Step 2: Quota allocation ───────────────────────────────────────
+        importances = [c.importance for c in clusters]
+        quotas = allocate_quotas(importances, total_slots=100, min_slots=3)
+        # ── Step 3: Parallel per-cluster ANN searches ─────────────────────
         st_vec = await profiles.load_profile(user_id, "short_term")
+        search_tasks = [
+            qdrant_svc.search_by_vector(
+                query_vector=c.medoid_embedding.tolist(),
+                limit=quota * _OVERSAMPLE,
+                exclude_ids=seen,
+            )
+            for c, quota in zip(clusters, quotas)
+        ]
+        per_cluster_results = await asyncio.gather(*search_tasks)
+        # Phase 4.5: Build paper → cluster mapping BEFORE merge (so we know
+        # which cluster each paper was retrieved from).
+        paper_cluster_map: dict[str, int] = {}
+        for cluster, result_ids in zip(clusters, per_cluster_results):
+            for aid in result_ids:
+                if aid not in paper_cluster_map:  # first-occurrence wins
+                    paper_cluster_map[aid] = cluster.cluster_idx
+        # Apply quota merge (dedup globally, respect per-cluster quotas)
+        candidate_ids = merge_quota_results(list(per_cluster_results), quotas)
+        # Supplement with short-term session context
+        if st_vec is not None:
+            seen_so_far = seen | set(candidate_ids)
+            st_results = await qdrant_svc.search_by_vector(
+                query_vector=st_vec.tolist(),
+                limit=_ST_SUPPLEMENT,
+                exclude_ids=seen_so_far,
+            )
+            for aid in st_results:
+                if aid not in set(candidate_ids):
+                    candidate_ids.append(aid)
+                    paper_cluster_map[aid] = -1  # short-term supplement
         if not candidate_ids:
+            return [], {}
+        # ── Step 5: Fetch candidate vectors + metadata ────────────────────
         cand_vectors = await qdrant_svc.get_paper_vectors(candidate_ids)
+        cand_meta = await turso_svc.fetch_metadata_batch(candidate_ids)
+        cand_missing = [cid for cid in candidate_ids if cid not in cand_meta]
+        if cand_missing:
+            try:
+                arxiv_cand_meta = await arxiv_svc.fetch_metadata_batch(cand_missing)
+                cand_meta.update(arxiv_cand_meta)
+            except Exception as e:
+                print(f"[recommendations] arXiv fallback for {len(cand_missing)} IDs failed: {e}")
+        # Cache fetched metadata to SQLite for category suppression
+        await db.cache_turso_metadata_batch(list(cand_meta.values()))
+        # Only process candidates with both vectors and metadata
         valid_ids = [cid for cid in candidate_ids if cid in cand_vectors and cid in cand_meta]
         if not valid_ids:
+            return candidate_ids[:limit], {}
         valid_embs = np.array([cand_vectors[cid] for cid in valid_ids], dtype=np.float32)
         valid_meta = [cand_meta[cid] for cid in valid_ids]
         lt_vec = await profiles.load_profile(user_id, "long_term")
         neg_vec = await profiles.load_profile(user_id, "negative")
+        # ── Step 6: Heuristic re-ranking ──────────────────────────────────
         reranked_ids, reranked_scores, reranked_embs = rerank_candidates(
             candidate_ids=valid_ids,
             candidate_embeddings=valid_embs,
             negative_vec=neg_vec,
         )
+        # ── Step 4.3: Category suppression ────────────────────────────────
+        suppressed = await db.get_suppressed_categories(user_id)
+        if suppressed:
+            kept = [
+                i for i, cid in enumerate(reranked_ids)
+                if cand_meta.get(cid, {}).get("category", "") not in suppressed
+            ]
+            if kept:
+                reranked_ids = [reranked_ids[i] for i in kept]
+                reranked_scores = [reranked_scores[i] for i in kept]
+                reranked_embs = reranked_embs[kept]
+        # ── Step 7: MMR diversity enforcement ─────────────────────────────
         query_vec = lt_vec if lt_vec is not None else aligned_embs.mean(axis=0)
         mmr_selected = mmr_rerank(
             query_embedding=query_vec,
             top_k=limit,
         )
+        # ── Step 8: Exploration injection ─────────────────────────────────
         final = inject_exploration(
             selected_ids=mmr_selected,
             all_candidate_ids=reranked_ids,
             n_explore=2,
         )
+        final = final[:limit + 2]
+        # Phase 4.5: Build per-paper instrumentation tags
+        exploration_set = set(final) - set(mmr_selected)
+        paper_tags: dict[str, dict] = {}
+        for aid in final:
+            cluster_idx = paper_cluster_map.get(aid)
+            if aid in exploration_set:
+                source = "exploration"
+            elif cluster_idx == -1:
+                source = "short_term_supplement"
+            elif cluster_idx is not None:
+                source = f"cluster_{cluster_idx}"
+            else:
+                source = "tier1_unknown"
+            paper_tags[aid] = {
+                "ranker_version": _RANKER_VERSION,
+                "candidate_source": source,
+                "cluster_id": str(cluster_idx) if cluster_idx is not None and cluster_idx >= 0 else "",
+            }
+        return final, paper_tags
     except Exception as e:
         print(f"[recommendations] multi-interest search failed: {e}")
+        return [], {}
 # ── Tier 2: EWMA single-vector search ────────────────────────────────────────
         limit=limit,
         exclude_ids=seen,
     )

app/routers/saved.py CHANGED Viewed

@@ -3,12 +3,12 @@ Saved papers router.
 GET /saved
   – Shows all papers the user has currently saved (positive_list)
-  – Metadata fetched via arXiv API + SQLite cache
 """
 import uuid
 from fastapi import APIRouter, Request, Cookie
 from fastapi.responses import HTMLResponse
-from app import arxiv_svc, user_state as us
 from app.config import COOKIE_NAME
 from app.templates_env import templates
@@ -27,7 +27,18 @@ async def saved_papers(
     papers = []
     if saved_ids:
-        meta = await arxiv_svc.fetch_metadata_batch(saved_ids)
         papers = [
             {**meta[aid], "saved": True, "dismissed": False}
             for aid in saved_ids

 GET /saved
   – Shows all papers the user has currently saved (positive_list)
+  – Metadata fetched via Turso DB (Phase 3.5), arXiv API fallback
 """
 import uuid
 from fastapi import APIRouter, Request, Cookie
 from fastapi.responses import HTMLResponse
+from app import arxiv_svc, db, turso_svc, user_state as us
 from app.config import COOKIE_NAME
 from app.templates_env import templates
     papers = []
     if saved_ids:
+        # Phase 3.5: Turso primary, arXiv API fallback
+        meta = await turso_svc.fetch_metadata_batch(saved_ids)
+        missing = [aid for aid in saved_ids if aid not in meta]
+        if missing:
+            try:
+                arxiv_meta = await arxiv_svc.fetch_metadata_batch(missing)
+                meta.update(arxiv_meta)
+            except Exception as e:
+                print(f"[saved] arXiv fallback for {len(missing)} IDs failed: {e}")
+        # Phase 4.3: Cache to SQLite so dismissal category JOINs work
+        await db.cache_turso_metadata_batch(list(meta.values()))
         papers = [
             {**meta[aid], "saved": True, "dismissed": False}
             for aid in saved_ids

app/routers/search.py CHANGED Viewed

@@ -14,7 +14,7 @@ Phase 3.5: Metadata now fetched from Turso cloud DB (fast, includes citations)
 import uuid
 from fastapi import APIRouter, Request, Cookie
 from fastapi.responses import HTMLResponse
-from app import arxiv_svc, turso_svc, user_state as us, hybrid_search_svc
 from app.config import COOKIE_NAME, ARXIV_MAX_RESULTS
 from app.templates_env import templates
@@ -53,6 +53,9 @@ async def search(
                 except Exception as e:
                     print(f"[search] arXiv fallback for {len(missing)} IDs failed: {e}")
             # Preserve ranking order from hybrid search
             papers = [meta[aid] for aid in arxiv_ids if aid in meta]

 import uuid
 from fastapi import APIRouter, Request, Cookie
 from fastapi.responses import HTMLResponse
+from app import arxiv_svc, db, turso_svc, user_state as us, hybrid_search_svc
 from app.config import COOKIE_NAME, ARXIV_MAX_RESULTS
 from app.templates_env import templates
                 except Exception as e:
                     print(f"[search] arXiv fallback for {len(missing)} IDs failed: {e}")
+            # Phase 4.3: Cache to SQLite so dismissal category JOINs work
+            await db.cache_turso_metadata_batch(list(meta.values()))
             # Preserve ranking order from hybrid search
             papers = [meta[aid] for aid in arxiv_ids if aid in meta]

app/templates/index.html CHANGED Viewed

@@ -31,14 +31,17 @@
   <!-- Recommendations section -->
   <div>
     <h2 class="text-lg font-semibold mb-3">Recommended for You</h2>
-    <div id="rec-section"
-         hx-get="/api/recommendations"
-         hx-trigger="load"
-         hx-indicator="#rec-spinner"
-         hx-swap="innerHTML">
-      <div class="flex items-center gap-2 text-base-content/50">
-        <span id="rec-spinner" class="htmx-indicator loading loading-spinner loading-sm"></span>
-        <span>Loading recommendations…</span>
       </div>
     </div>
   </div>

   <!-- Recommendations section -->
   <div>
     <h2 class="text-lg font-semibold mb-3">Recommended for You</h2>
+    <div id="rec-section-wrapper" class="relative">
+      <span id="rec-spinner" class="htmx-indicator loading loading-spinner loading-sm absolute right-0 top-0"></span>
+      <div id="rec-section"
+           hx-get="/api/recommendations"
+           hx-trigger="load"
+           hx-indicator="#rec-spinner"
+           hx-swap="innerHTML">
+        <div class="flex items-center gap-2 text-base-content/50">
+          <span class="loading loading-spinner loading-sm"></span>
+          <span>Loading recommendations…</span>
+        </div>
       </div>
     </div>
   </div>

app/templates/partials/action_buttons.html CHANGED Viewed

@@ -2,12 +2,16 @@
   Action buttons for a paper card.
   Expects: paper_id (or paper.arxiv_id), saved (bool), dismissed (bool)
   Optional: source ("search" | "recommendation" | "saved"), position (int)
   These are returned directly by the /api/papers/{id}/save endpoint
   so they also work as a standalone partial.
 #}
 {% set pid = paper_id if paper_id is defined else paper.arxiv_id %}
 {% set is_saved = saved if saved is defined else (paper.saved | default(false)) %}
 {% set _source = source if source is defined else "search" %}
 {% if is_saved %}
   <!-- Already saved — show saved state, allow unsave via not-interested -->
@@ -19,7 +23,7 @@
             hx-post="/api/papers/{{ pid }}/not-interested"
             hx-target="#paper-{{ pid }}"
             hx-swap="outerHTML swap:200ms"
-            hx-vals='{"source": "{{ _source }}"}'>
       Remove
     </button>
   </div>
@@ -28,9 +32,9 @@
     <!-- Save -->
     <button class="btn btn-primary btn-xs"
             hx-post="/api/papers/{{ pid }}/save"
-            hx-target="#actions-{{ pid }}"
             hx-swap="innerHTML"
-            hx-vals='{"source": "{{ _source }}", "position": "{{ position | default(0) }}"}'>
       ⭐ Save
     </button>
     <!-- Not interested (removes the whole card) -->
@@ -38,8 +42,9 @@
             hx-post="/api/papers/{{ pid }}/not-interested"
             hx-target="#paper-{{ pid }}"
             hx-swap="outerHTML swap:200ms"
-            hx-vals='{"source": "{{ _source }}"}'>
       ✕ Not interested
     </button>
   </div>
 {% endif %}

   Action buttons for a paper card.
   Expects: paper_id (or paper.arxiv_id), saved (bool), dismissed (bool)
   Optional: source ("search" | "recommendation" | "saved"), position (int)
+  Phase 4.5: ranker_version, candidate_source, cluster_id (set by recommendations.py)
   These are returned directly by the /api/papers/{id}/save endpoint
   so they also work as a standalone partial.
 #}
 {% set pid = paper_id if paper_id is defined else paper.arxiv_id %}
 {% set is_saved = saved if saved is defined else (paper.saved | default(false)) %}
 {% set _source = source if source is defined else "search" %}
+{% set _ranker_version = paper.ranker_version | default("") if paper is defined else "" %}
+{% set _candidate_source = paper.candidate_source | default("") if paper is defined else "" %}
+{% set _cluster_id = paper.cluster_id | default("") if paper is defined else "" %}
 {% if is_saved %}
   <!-- Already saved — show saved state, allow unsave via not-interested -->
             hx-post="/api/papers/{{ pid }}/not-interested"
             hx-target="#paper-{{ pid }}"
             hx-swap="outerHTML swap:200ms"
+            hx-vals='{"source": "{{ _source }}", "ranker_version": "{{ _ranker_version }}", "candidate_source": "{{ _candidate_source }}", "cluster_id": "{{ _cluster_id }}"}'>
       Remove
     </button>
   </div>
     <!-- Save -->
     <button class="btn btn-primary btn-xs"
             hx-post="/api/papers/{{ pid }}/save"
+            hx-target="[id='actions-{{ pid }}']"
             hx-swap="innerHTML"
+            hx-vals='{"source": "{{ _source }}", "position": "{{ position | default(0) }}", "ranker_version": "{{ _ranker_version }}", "candidate_source": "{{ _candidate_source }}", "cluster_id": "{{ _cluster_id }}"}'>
       ⭐ Save
     </button>
     <!-- Not interested (removes the whole card) -->
             hx-post="/api/papers/{{ pid }}/not-interested"
             hx-target="#paper-{{ pid }}"
             hx-swap="outerHTML swap:200ms"
+            hx-vals='{"source": "{{ _source }}", "ranker_version": "{{ _ranker_version }}", "candidate_source": "{{ _candidate_source }}", "cluster_id": "{{ _cluster_id }}"}'>
       ✕ Not interested
     </button>
   </div>
 {% endif %}

app/templates/partials/paper_card.html CHANGED Viewed

@@ -25,11 +25,14 @@
     {% endif %}
   </div>
-  <!-- Meta: arXiv ID + year -->
   <div class="text-xs text-base-content/50">
     [{{ paper.arxiv_id }}]
     {% if paper.published %} · {{ paper.published[:4] }}{% endif %}
     {% if authors_list %} · {{ authors_list | join(", ") }}{% endif %}
   </div>
   <!-- Abstract (truncated) -->

     {% endif %}
   </div>
+  <!-- Meta: arXiv ID + year + citations -->
   <div class="text-xs text-base-content/50">
     [{{ paper.arxiv_id }}]
     {% if paper.published %} · {{ paper.published[:4] }}{% endif %}
     {% if authors_list %} · {{ authors_list | join(", ") }}{% endif %}
+    {% if paper.citation_count %}
+    · <span class="font-medium text-base-content/70" title="{{ paper.influential_citations|default(0) }} influential">📊 {{ paper.citation_count }} citations</span>
+    {% endif %}
   </div>
   <!-- Abstract (truncated) -->

app/turso_svc.py CHANGED Viewed

@@ -59,9 +59,11 @@ async def fetch_metadata_batch(arxiv_ids: list[str]) -> dict[str, dict]:
     pipeline_url = url.rstrip("/")
     # Convert to HTTP API URL format
     if pipeline_url.startswith("libsql://"):
-        pipeline_url = pipeline_url.replace("libsql://", "https://")
-    if not pipeline_url.startswith("https://"):
-        pipeline_url = "https://" + pipeline_url.lstrip("https://").lstrip("http://")
     payload = {
         "requests": [

     pipeline_url = url.rstrip("/")
     # Convert to HTTP API URL format
     if pipeline_url.startswith("libsql://"):
+        pipeline_url = "https://" + pipeline_url[len("libsql://"):]
+    elif pipeline_url.startswith("http://"):
+        pipeline_url = "https://" + pipeline_url[len("http://"):]
+    elif not pipeline_url.startswith("https://"):
+        pipeline_url = "https://" + pipeline_url
     payload = {
         "requests": [

docs/TASK-TRACKER.md CHANGED Viewed

@@ -1,8 +1,8 @@
 # ResearchIT — Master Task Tracker
 > **Purpose**: Single source of truth for all completed, in-progress, and upcoming work.
-> **Last updated**: 2026-04-20
-> **Current phase**: Phase 3.5 (Turso Metadata DB) — COMPLETE ✔
 ---
@@ -241,21 +241,25 @@
 ---
-## Phase 4: Recommendation Pipeline Fixes 📋 NOT STARTED
-> *Fix the known architectural debt in the recommendation pipeline.*
-> *Estimated effort: ~1 week*
 ### 4.1 — Replace RRF with Importance-Weighted Quota Fusion
-- [ ] Create `app/recommend/fusion.py` — quota allocation logic
   - `w_k = importance_k / sum(importance_k)`
   - `slot_k = max(floor(F × w_k), F_min=3)` — every cluster gets at least 3 slots
   - Distribute remainder by largest fractional part
-- [ ] Refactor `_multi_interest_recommend()` in `recommendations.py`
   - Replace `multi_interest_search()` with per-cluster separate ANN queries
-  - Allocate feed slots proportionally
-  - Deduplicate across clusters (assign to highest-ranked)
-  - MMR over merged union
 ### 4.2 — Pre-populate Metadata Store ✅ DONE (via Turso)
 - [x] Bulk-loaded arXiv metadata from Kaggle to Turso cloud DB (Phase 3.5)
@@ -265,13 +269,60 @@
 - [x] **Impact**: Search time dropped from ~10.7s to ~1.75s on HF Spaces
 ### 4.3 — Hungarian Matching for Cluster Stability
-- [ ] Implement Hungarian matching in `clustering.py`
-  - Match new cluster IDs to previous IDs by medoid similarity
-  - Prevents cluster IDs from shuffling between reclusterings
-### 4.4 — Wire Remaining Negative Signal Components
-- [ ] Per-item short-term decay: `score -= α × exp(-dt / τ_neg)` — needs per-item timestamp tracking
-- [ ] Category-level suppression: if ≥3 dismissals hit the same arXiv category within a week, suppress for 2 weeks
 ---
@@ -306,7 +357,8 @@
 > *Replace heuristic scorer with a trained LightGBM lambdarank model.*
 > *Blocked by: ≥500 labeled interactions OR citation-graph bootstrap*
-> *Estimated effort: ~2-4 weeks*
 - [ ] Citation-graph pseudo-labels from unarXive 2022 (cited = relevance 2, co-cited = 1, random = 0)
 - [ ] Author-as-user simulation
@@ -329,11 +381,30 @@
 ## Phase 8: LLM Interest Summaries + Distilled Re-ranker 📋 NOT STARTED
-> *Estimated effort: ~2 weeks*
-- [ ] Claude/Groq interest summaries per cluster (human-readable descriptions)
-- [ ] Distill BGE-reranker-v2-m3 offline → TinyBERT-L2 student (FlashRank recipe)
-- [ ] Deploy student score as LightGBM feature on top-20
 ---
@@ -380,18 +451,19 @@
 | Test File | Count | Status |
 |---|---|---|
 | `tests/test_profiles.py` | 11 | ✅ Passing |
-| `tests/test_clustering.py` | 10 | ✅ Passing |
 | `tests/test_reranker_diversity.py` | 13 | ✅ Passing |
-| `tests/test_db.py` | — | ✅ Passing |
 | `tests/test_qdrant_svc.py` | — | ✅ Passing |
 | `tests/test_arxiv_svc.py` | — | ✅ Passing |
-| `tests/test_integration.py` | — | ✅ Passing |
 | `tests/test_user_state.py` | — | ✅ Passing |
 | `tests/test_saved.py` | — | ✅ Passing |
 | `tests/test_hybrid_search.py` | 21 | ✅ Passing |
 | `tests/test_search_router.py` | 6 | ✅ Passing |
 | `tests/test_live_search.py` | 8 | ✅ Passing |
-| **Total** | **123** | ✅ |
 | `test_e2e_recs.py` (standalone) | 1 | ✅ E2E simulation |
 ---
@@ -404,8 +476,8 @@
 | L2-normalize before Ward clustering | ✅ Applied | `app/recommend/clustering.py` |
 | Medoid not centroid | ✅ Applied | `app/recommend/clustering.py` → `_find_medoid()` |
 | Negative EWMA wired into reranking | ✅ Applied | `app/recommend/reranker.py` → Feature 5 |
-| RRF → quota fusion for recommendations | [!] Backlog | Phase 4.1 |
-| Hungarian cluster matching | [!] Backlog | Phase 4.3 |
-| Per-item short-term negative decay | [!] Backlog | Phase 4.4 |
-| Category-level suppression | [!] Backlog | Phase 4.4 |
 | BGE-reranker NEVER in hot path | ✅ Followed | Heuristic scorer used instead |

 # ResearchIT — Master Task Tracker
 > **Purpose**: Single source of truth for all completed, in-progress, and upcoming work.
+> **Last updated**: 2026-04-26
+> **Current phase**: Phase 4.5 (Instrumentation Foundation) — COMPLETE ✔
 ---
 ---
+## Phase 4: Recommendation Pipeline Fixes ✅ COMPLETE
+> *Fixed the known architectural debt in the recommendation pipeline.*
+> *Detailed plan: `docs/phases/PHASE4-Recommendation-Pipeline-Fixes.md`*
 ### 4.1 — Replace RRF with Importance-Weighted Quota Fusion
+- [x] Create `app/recommend/fusion.py` — quota allocation logic
   - `w_k = importance_k / sum(importance_k)`
   - `slot_k = max(floor(F × w_k), F_min=3)` — every cluster gets at least 3 slots
   - Distribute remainder by largest fractional part
+- [x] Create `tests/test_fusion.py` — **20 unit tests** for quota allocation
+  - Proportionality, floor enforcement, total invariant, edge cases, Doc 06 worked examples
+- [x] Refactor `_multi_interest_recommend()` in `recommendations.py`
   - Replace `multi_interest_search()` with per-cluster separate ANN queries
+  - Use `asyncio.gather()` for concurrent searches (~15ms wall-clock)
+  - Allocate feed slots proportionally via `allocate_quotas()`
+  - Deduplicate across clusters (first-occurrence = highest-ranked cluster wins)
+  - MMR over merged union (unchanged)
+- [x] Keep `qdrant_svc.multi_interest_search()` in codebase (no deletion)
 ### 4.2 — Pre-populate Metadata Store ✅ DONE (via Turso)
 - [x] Bulk-loaded arXiv metadata from Kaggle to Turso cloud DB (Phase 3.5)
 - [x] **Impact**: Search time dropped from ~10.7s to ~1.75s on HF Spaces
 ### 4.3 — Hungarian Matching for Cluster Stability
+- [x] Add `stabilize_cluster_ids()` function to `clustering.py`
+  - Uses `scipy.optimize.linear_sum_assignment` (already a dependency)
+  - Cost matrix: `1 - cosine_sim(new_medoid, old_medoid)` — trivial at K≤7
+  - Matched clusters keep old indices; new clusters get next available
+  - Min cosine threshold (0.5) rejects unrelated matches
+- [x] Call between `compute_clusters()` and `save_clusters_to_db()` in recommendations.py
+- [x] **10 tests** in `test_clustering.py` — perturbed clusters preserve indices,
+  unrelated match rejection, K growth/shrink, custom thresholds
+### 4.4 — Category-Level Negative Suppression
+- [x] Add `get_suppressed_categories()` to `db.py`
+  - Joins `interactions` + `paper_metadata` to find categories with ≥3 dismissals
+  - **Primary category only** (decision: avoid over-suppression)
+  - **14-day window** (standard default, τ_neg = 14 days)
+- [x] Add suppression filter in `_multi_interest_recommend()` after reranking
+- [x] Cache Turso metadata to `paper_metadata` via `cache_turso_metadata_batch()`
+- [x] **8 tests** in `test_db.py` — threshold, partitioning, user isolation, custom threshold
+- [~] Per-item short-term decay → **deferred to Phase 6** (LightGBM feature)
+**Gaps**: None.
+---
+## Phase 4.5: Instrumentation Foundation ✅ COMPLETE
+> *Added telemetry columns to the interactions table so every saved/dismissed paper*
+> *can be attributed to its pipeline tier, cluster origin, and ranker version.*
+> *Doc 07 (ADR A4) identified this as the single most valuable early investment —*
+> *retrofitting these fields after real user data exists is painful and blocks all*
+> *later counterfactual evaluation.*
+### Schema changes
+- [x] Add `ranker_version TEXT` to `interactions` table — pipeline version tag
+- [x] Add `candidate_source TEXT` to `interactions` — e.g. `cluster_0`, `exploration`, `ewma_longterm`, `qdrant_recommend`, `short_term_supplement`
+- [x] Add `cluster_id INTEGER` to `interactions` — interest cluster index (NULL if N/A)
+- [x] ALTER TABLE migration for existing DBs (safe try/except, idempotent)
+### Pipeline tagging
+- [x] Add `_RANKER_VERSION` constant to `recommendations.py`
+- [x] Tag Tier 1 papers with cluster origin, exploration status, short-term supplement
+- [x] Tag Tier 2 papers as `ewma_longterm`
+- [x] Tag Tier 3 papers as `qdrant_recommend`
+- [x] Build `paper_cluster_map` before quota merge (first-occurrence = cluster attribution)
+- [x] Exploration papers tagged as `candidate_source='exploration'`
+### End-to-end flow
+- [x] `recommendations.py` embeds tags in paper dicts
+- [x] `action_buttons.html` includes tags in `hx-vals` JSON
+- [x] `events.py` accepts `ranker_version`, `candidate_source`, `cluster_id` Form fields
+- [x] `db.log_interaction()` stores all three new columns
+**Files modified**: `app/db.py`, `app/routers/events.py`, `app/routers/recommendations.py`, `app/templates/partials/action_buttons.html`
+**Gaps**: None. `propensity` and `policy_id` fields deferred until ε-greedy exploration (Phase 9).
 ---
 > *Replace heuristic scorer with a trained LightGBM lambdarank model.*
 > *Blocked by: ≥500 labeled interactions OR citation-graph bootstrap*
+> *Estimated effort: ~2-4 weeks*
+> *Architecture decision: one-stage LambdaMART first (Doc 07 ADR A3)*
 - [ ] Citation-graph pseudo-labels from unarXive 2022 (cited = relevance 2, co-cited = 1, random = 0)
 - [ ] Author-as-user simulation
 ## Phase 8: LLM Interest Summaries + Distilled Re-ranker 📋 NOT STARTED
+> *Estimated effort: ~10-12 weeks (Doc 07)*
+> *Detailed research plan: `docs/research/07-LLM-Summaries-Reranker-and-Scaling-Research.md`*
+> *Entry criteria: Phase 7 eval producing stable nDCG@10; cluster stability Jaccard ≥0.7 over 7 days*
+### 8a — Claude-generated per-cluster interest summaries (Doc 07 §A)
+- [ ] Cluster snapshot versioning (ADR A1)
+- [ ] Content-addressed caching: `sha256(sorted(paper_ids) + prompt_version + model)`
+- [ ] Shared summaries (not per-user) — Haiku 4.5 + Batch API (~$50-80/month @ 1K users)
+- [ ] Nightly regeneration job with 7-day TTL + event-triggered refresh
+- [ ] "You're reading about X" UI framing with sub-theme bullets
+- [ ] Anthropic Citations API for hallucination prevention
+### 8b — Distilled cross-encoder reranker (Doc 07 §B)
+- [ ] Deploy `cross-encoder/ms-marco-TinyBERT-L-2-v2` INT8 ONNX as MVP
+- [ ] 6ms budget for 20 pairs on CPU (AVX-512 VNNI)
+- [ ] TinyBERT score as LightGBM feature (Option C architecture)
+- [ ] Custom distillation from BGE-reranker-v2-m3 only if held-out gap >3 nDCG
+- [ ] MarginMSE loss + SciNCL citation-graph hard negatives
+### 8c — Use-cases and information-gain design doc (Doc 07 §C)
+- [ ] 8 user personas (P1 cold-start through P8 stay-current)
+- [ ] Information-gain table (save=3-5×, dismiss-as-label=−3-4×, passive skip=−0.1×)
+- [ ] Mode-switching UI: "Stay Current" vs "Lit Review" toggle
+- [ ] Failure mode detection rules (feed collapse, stale profile, filter bubble)
 ---
 | Test File | Count | Status |
 |---|---|---|
 | `tests/test_profiles.py` | 11 | ✅ Passing |
+| `tests/test_clustering.py` | 21 | ✅ Passing (9 compute + 10 Hungarian + 2 persistence) |
 | `tests/test_reranker_diversity.py` | 13 | ✅ Passing |
+| `tests/test_fusion.py` | 20 | ✅ Passing (Phase 4.1) |
+| `tests/test_db.py` | 19 | ✅ Passing (includes 4 Turso cache + 8 suppression) |
 | `tests/test_qdrant_svc.py` | — | ✅ Passing |
 | `tests/test_arxiv_svc.py` | — | ✅ Passing |
+| `tests/test_integration.py` | — | ✅ Passing (includes quota pipeline E2E) |
 | `tests/test_user_state.py` | — | ✅ Passing |
 | `tests/test_saved.py` | — | ✅ Passing |
 | `tests/test_hybrid_search.py` | 21 | ✅ Passing |
 | `tests/test_search_router.py` | 6 | ✅ Passing |
 | `tests/test_live_search.py` | 8 | ✅ Passing |
+| **Total** | **171** | ✅ |
 | `test_e2e_recs.py` (standalone) | 1 | ✅ E2E simulation |
 ---
 | L2-normalize before Ward clustering | ✅ Applied | `app/recommend/clustering.py` |
 | Medoid not centroid | ✅ Applied | `app/recommend/clustering.py` → `_find_medoid()` |
 | Negative EWMA wired into reranking | ✅ Applied | `app/recommend/reranker.py` → Feature 5 |
+| RRF → quota fusion for recommendations | ✅ Applied | `app/recommend/fusion.py` (Phase 4.1) |
+| Hungarian cluster matching | ✅ Applied | `app/recommend/clustering.py` → `stabilize_cluster_ids()` (Phase 4.3) |
+| Per-item short-term negative decay | [!] Backlog | Phase 6 (LightGBM feature) |
+| Category-level suppression | ✅ Applied | `app/db.py` → `get_suppressed_categories()` (Phase 4.4) |
 | BGE-reranker NEVER in hot path | ✅ Followed | Heuristic scorer used instead |

docs/phases/PHASE3-Hybrid-Semantic-Search.md CHANGED Viewed

@@ -3,7 +3,7 @@
 > **Purpose**: Replace the Phase 1 placeholder arXiv keyword API search with real vector-based
 > semantic search using BGE-M3 encoding + Qdrant dense + Zilliz sparse + RRF fusion.
 >
-> **Status**: 📋 Not started
 > **Estimated effort**: ~2-3 weeks
 > **Predecessor**: Phase 2c (complete) — the recommendation pipeline
 > **Deployment target**: Hugging Face Spaces (Docker SDK, free tier: 16GB RAM, 2 vCPUs)

 > **Purpose**: Replace the Phase 1 placeholder arXiv keyword API search with real vector-based
 > semantic search using BGE-M3 encoding + Qdrant dense + Zilliz sparse + RRF fusion.
 >
+> **Status**: ✅ Complete
 > **Estimated effort**: ~2-3 weeks
 > **Predecessor**: Phase 2c (complete) — the recommendation pipeline
 > **Deployment target**: Hugging Face Spaces (Docker SDK, free tier: 16GB RAM, 2 vCPUs)

docs/phases/PHASE4-Recommendation-Pipeline-Fixes.md ADDED Viewed

	@@ -0,0 +1,603 @@

+# Phase 4 — Recommendation Pipeline Fixes
+> **Purpose**: Fix the 3 remaining architectural faults identified by Doc 06 in the
+> recommendation pipeline: replace RRF with importance-weighted quota fusion, add
+> Hungarian matching for cluster stability, and wire category-level negative suppression.
+>
+> **Status**: ✅ Complete
+> **Estimated effort**: ~1 week
+> **Predecessor**: Phase 3.5 (complete) — Turso metadata DB
+> **Deployment target**: Same — Hugging Face Spaces (no infra changes)
+---
+## Why This Matters
+The recommendation engine works today — all 3 tiers cascade correctly, EWMA profiles
+update, Ward clustering detects interests, and MMR enforces diversity. But Doc 06
+identified three concrete faults that degrade quality for multi-interest users:
+| # | Fault | Impact | Who gets hurt |
+|---|---|---|---|
+| **4.1** | RRF fuses interest clusters by consensus, not proportionally | Dominant cluster drowns minority interests | User who likes both NLP (70%) and RL (30%) never sees RL papers |
+| **4.3** | Cluster indices shuffle on every recluster | Future analytics and UI labels break | Any user who saves a new paper |
+| **4.4** | No category-level negative suppression | Dismissed topics keep reappearing | User who dismisses 5 physics papers still gets physics recs |
+**What's already fixed (not Phase 4)**:
+- ✅ α_long = 0.03 (was 0.10, fixed Phase 2a — PinnerSage rejected 0.10)
+- ✅ L2-normalize before Ward (fixed Phase 2b — Doc 06 fault #4)
+- ✅ Negative EWMA penalty in reranker (fixed Phase 2c — Feature 5, weight 0.15)
+- ✅ Metadata store pre-populated (Phase 3.5 — Turso, 1.23GB)
+---
+## Current Architecture vs Target Architecture
+### Current Retrieval (Phase 2b — being fixed)
+```
+Cluster medoids + short-term vector
+      │
+      ▼
+Single Qdrant prefetch+RRF call
+  ├── Prefetch: medoid_1 (limit=40)
+  ├── Prefetch: medoid_2 (limit=30)
+  ├── Prefetch: medoid_3 (limit=25)
+  └── Prefetch: short_term (limit=25)
+      │
+      ▼
+FusionQuery(fusion=Fusion.RRF)
+      │  ← papers near ALL cluster centroids get boosted
+      │  ← minority interests get drowned
+      ▼
+~100 candidates → rerank → MMR → serve
+```
+**Problem**: RRF was designed for fusing *different retrievers on the same query*
+(BM25 + vector). Here we're fusing *different queries for the same user*. Consensus
+means "near the centroid of everything" — the exact failure multi-interest models
+exist to prevent.
+### Target Retrieval (Phase 4)
+```
+compute_clusters() → K clusters with importance scores
+      │
+      ▼
+allocate_quotas([imp_1, imp_2, ...], total=100, min=3)
+  → [55, 30, 15] (proportional, each ≥ 3)
+      │
+      ▼
+asyncio.gather(                   ← concurrent, ~15ms wall-clock
+  search_by_vector(medoid_1, limit=55×3),   # 3× over-fetch for rerank headroom
+  search_by_vector(medoid_2, limit=30×3),
+  search_by_vector(medoid_3, limit=15×3),
+  search_by_vector(short_term, limit=25),   # session boost
+)
+      │
+      ▼
+Deduplicate across clusters
+  (assign each paper to its highest-ranked cluster)
+      │
+      ▼
+Category suppression: drop papers from suppressed categories
+      │
+      ▼
+Rerank → MMR → exploration → serve
+```
+**Evidence this is correct**:
+- PinnerSage (KDD 2020): samples 3 medoids proportional to importance — no RRF
+- Taobao ULIM (RecSys 2025): per-category parallel retrieval with quota — +5.54% clicks
+- Pinterest Bucketized-ANN (SIGIR 2023): ensures minority items aren't dropped
+- Twitter kNN-Embed: candidates per cluster proportional to mixture weight
+- Bruch et al. (SIGIR 2022): RRF optimises Recall not nDCG — quota gives better nDCG
+---
+## 4.1 — Replace RRF with Importance-Weighted Quota Fusion
+### New File: `app/recommend/fusion.py`
+Pure-math module with zero I/O dependencies. Contains one function:
+```python
+def allocate_quotas(
+    importances: list[float],
+    total_slots: int = 100,
+    min_slots: int = 3,
+) -> list[int]:
+    """
+    Importance-weighted quota allocation with a minimum floor.
+    Each cluster gets feed slots proportional to its importance,
+    with a guaranteed minimum of `min_slots` to protect minority interests.
+    Algorithm:
+      1. Normalise: w_k = importance_k / sum(importances)
+      2. Raw allocation: raw_k = total_slots × w_k
+      3. Apply floor: slot_k = max(floor(raw_k), min_slots)
+      4. Distribute remainder by largest fractional part
+      5. Guarantee: sum(slots) == total_slots
+    This is the Doc 06 formula verbatim:
+      slot_k = max(⌊F × w_k⌋, F_min=3)
+    Reference: PinnerSage (KDD 2020), Taobao ULIM (RecSys 2025),
+    Pinterest Bucketized-ANN (SIGIR 2023).
+    """
+```
+**Worked example** (from Doc 06 §"Worked example"):
+- 3 clusters with importances [0.55, 0.30, 0.15], total_slots=30
+- Raw allocation: [16.5, 9.0, 4.5]
+- Floor applied: [16, 9, 4] (all ≥ 3, so floor has no effect)
+- Remainder: 30 - 29 = 1 slot → goes to cluster 0 (fractional part 0.5; cluster 2 ties at 0.5, and the tie is resolved in favour of cluster 0)
+- Final: [17, 9, 4] — minority cluster gets 4 slots, not 0
+**Edge case — tiny cluster**:
+- 4 clusters with importances [0.60, 0.25, 0.10, 0.05], total_slots=30
+- Raw allocation: [18.0, 7.5, 3.0, 1.5]
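+- Without floor: [18, 7, 3, 1] — smallest cluster gets 1 paper (floored values, before the 1 remainder slot is distributed)
+- With floor (min=3): [18, 7, 3, 3] — smallest cluster gets 3 papers. This sums to 31, so one slot must be reclaimed (e.g. from the largest cluster → [17, 7, 3, 3]) to keep sum(slots) == total_slots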
+### Modified File: `app/routers/recommendations.py`
+The `_multi_interest_recommend()` function changes its retrieval step:
+**What gets removed**:
+- The `_CLUSTER_LIMITS = [40, 30, 25, 20, 15, 15, 15]` hardcoded list
+- The call to `qdrant_svc.multi_interest_search()` (the prefetch+RRF path)
+- Building the `interest_vectors` list of `(medoid_embedding, limit)` tuples
+**What replaces it**:
+```python
+import asyncio
+from app.recommend.fusion import allocate_quotas
+# Step 2: Quota-based parallel retrieval (replaces RRF)
+quotas = allocate_quotas(
+    importances=[c.importance for c in clusters],
+    total_slots=100,   # wide retrieval net
+    min_slots=3,       # every cluster gets at least 3 slots
+)
+# Launch concurrent ANN searches — one per cluster + session
+search_coros = []
+for cluster, quota in zip(clusters, quotas):
+    search_coros.append(
+        qdrant_svc.search_by_vector(
+            query_vector=cluster.medoid_embedding.tolist(),
+            limit=quota * 3,  # 3× over-fetch for rerank headroom
+            exclude_ids=seen,
+        )
+    )
+# Add short-term session vector if available
+st_vec = await profiles.load_profile(user_id, "short_term")
+if st_vec is not None:
+    search_coros.append(
+        qdrant_svc.search_by_vector(
+            query_vector=st_vec.tolist(),
+            limit=25,
+            exclude_ids=seen,
+        )
+    )
+# Execute all searches concurrently (~15ms wall-clock)
+per_cluster_results = await asyncio.gather(*search_coros)
+# Deduplicate: first occurrence wins (highest-ranked cluster)
+seen_in_results = set()
+candidate_ids = []
+for result_list in per_cluster_results:
+    for arxiv_id in result_list:
+        if arxiv_id not in seen_in_results:
+            seen_in_results.add(arxiv_id)
+            candidate_ids.append(arxiv_id)
+```
+**Key design decisions**:
+1. **`asyncio.gather()` for concurrency** — Each `search_by_vector()` call takes ~5-15ms.
+   With `asyncio.gather()`, 3-7 concurrent queries run in ~15-25ms wall-clock — same as
+   the old single prefetch call.
+2. **3× over-fetch** — We fetch `quota × 3` candidates per cluster, then let the reranker
+   pick the best `quota` from each. This gives the heuristic scorer enough headroom to
+   find quality papers even if some candidates are poor matches.
+3. **First-occurrence deduplication** — Papers appearing in multiple cluster results are
+   assigned to the first cluster, in iteration order, whose result list contains them (not
+   necessarily the cluster that ranked them highest). This is simple, deterministic, and
+   matches the PinnerSage pattern.
+4. **`multi_interest_search()` is NOT deleted** — The function stays in `qdrant_svc.py`
+   for potential future use. We simply stop calling it from the recommendations router.
+### Latency Impact
+| Stage | Before (RRF) | After (Quota) |
+|---|---|---|
+| Qdrant retrieval | ~15-25ms (1 prefetch call) | ~15-25ms (3-7 concurrent calls) |
+| Dedup + quota | N/A | <1ms |
+| Rerank + MMR | ~12ms | ~12ms (unchanged) |
+| **Total pipeline** | ~30ms | ~30ms |
+No latency regression. The concurrent gather matches the prefetch parallelism.
+---
+## 4.3 — Hungarian Matching for Cluster Stability
+### Why This Matters
+When a user saves a new paper, `compute_clusters()` runs Ward clustering from scratch.
+The cluster that was "NLP papers" yesterday might get `cluster_idx=2` today and
+`cluster_idx=0` tomorrow. This breaks:
+- Future analytics ("which cluster does the user engage with most?")
+- Future UI labels ("Your Interest: Natural Language Processing")
+- A/B test logs that reference cluster indices
+- Doc 06 §"Clustering specifics" calls this "the real operational risk"
+### Modified File: `app/recommend/clustering.py`
+Add a new function called between `compute_clusters()` and `save_clusters_to_db()`:
+```python
+from scipy.optimize import linear_sum_assignment
+def stabilize_cluster_ids(
+    new_clusters: list[InterestCluster],
+    old_clusters: list[dict] | None,
+    paper_vectors: dict[str, list[float]] | None = None,
+) -> list[InterestCluster]:
+    """
+    Remap new cluster indices to match previous clusters via Hungarian matching.
+    1. Compute cost matrix: cost[i][j] = 1 - cosine_sim(new_medoid_i, old_medoid_j)
+    2. Solve assignment with scipy.optimize.linear_sum_assignment
+    3. Remap new cluster_idx to matched old cluster_idx
+    4. Genuinely new clusters (no match) get next available index
+    At K ≤ 7 this is trivially fast (7×7 matrix).
+    Reference: Doc 06 §"Clustering specifics" — "persist cluster→medoid-paper-id
+    mapping across reclusterings and use Hungarian matching against previous medoids."
+    """
+```
+**Algorithm walkthrough**:
+1. Load previous clusters from SQLite via `load_clusters_from_db(user_id)`
+2. If `old_clusters is None` (first time): no remapping needed, return as-is
+3. Build a cost matrix of shape `(K_new, K_old)`:
+   - For each pair, fetch the old medoid embedding from `paper_vectors`
+   - `cost[i][j] = 1 - cosine_similarity(new_medoid_i, old_medoid_j)`
+4. Run `scipy.optimize.linear_sum_assignment(cost_matrix)` — O(K³), trivial at K≤7
+5. For matched pairs `(new_i, old_j)` where `cost < 0.5` (cosine sim > 0.5):
+   assign `new_clusters[new_i].cluster_idx = old_clusters[old_j]['cluster_idx']`
+6. For unmatched new clusters: assign the next available index
+**Where it's called** — in `_multi_interest_recommend()` in `recommendations.py`:
+```python
+# Step 1: Compute interest clusters
+clusters = compute_clusters(aligned_ids, aligned_embs)
+# Step 1.5: Stabilise cluster IDs against previous run
+old_clusters = await load_clusters_from_db(user_id)
+clusters = stabilize_cluster_ids(clusters, old_clusters, vectors)
+# Step 1.6: Persist (now with stable IDs)
+await save_clusters_to_db(user_id, clusters)
+```
+### What Needs to Change
+The old medoid embeddings need to be compared against new medoid embeddings. The old
+medoid embeddings aren't stored in SQLite (only the `medoid_paper_id` is). Two options:
+**Option A** (recommended): Use the `paper_vectors` dict that's already loaded at the
+top of `_multi_interest_recommend()` (line 128: `vectors = await qdrant_svc.get_paper_vectors(positives)`).
+Old medoid paper IDs are likely in this set since the medoid IS a saved paper. If not,
+do a small `get_paper_vectors([old_medoid_id])` call.
+**Option B**: Store medoid embeddings as BLOBs in `user_clusters` table. This adds a
+4KB column but avoids any Qdrant call. Overhead is negligible.
+**Decision**: Option A — avoids schema migration and the vectors are already in memory.
+---
+## 4.4 — Category-Level Negative Suppression
+### Design Decisions (Per User Input)
+1. **Primary category only** — arXiv papers have multiple categories (e.g., `cs.CV`, `cs.AI`).
+   Suppression applies to the **primary category only** to avoid suffocating the recommendation
+   graph. A paper tagged `[cs.CV, cs.AI]` is only suppressed if `cs.CV` (primary) is
+   suppressed, not if `cs.AI` is.
+2. **τ_neg = 14 days** — Standard default from the literature. If a user dismisses ≥3 papers
+   from the same primary category within 14 days, that category is suppressed for 14 days
+   from the last dismissal.
+### ⚠️ Critical Implementation Detail: Category Format Mismatch
+The arXiv API and Turso store categories in **different formats**:
+- **arXiv API** (`arxiv_svc.py`): uses arXiv codes like `cs.CV`, `cs.CL`, `stat.ML`
+- **Turso** (`turso_svc.py`): uses `primary_topic` which contains human-readable labels
+  like `"AI/ML"`, `"Computer Vision"`, `"NLP/Computational Linguistics"`
+- Both write to `paper_metadata.category` via different paths
+This means `paper_metadata.category` contains a **mix of both formats** depending on
+which service populated it. The suppression logic must handle this:
+```python
+# In the suppression filter, normalise category comparison:
+# - Papers from arXiv have codes: "cs.CV"
+# - Papers from Turso have labels: "Computer Vision"
+# Both may appear in suppressed_cats, so we suppress on exact match
+```
+**Resolution**: The `get_suppressed_categories()` query will return whatever format is
+in the database. The filter in `recommendations.py` will compare candidate categories
+(from Turso metadata) against the suppressed set. Since recommendations primarily use
+Turso for metadata, the formats will match. For the rare arXiv-fallback case, we accept
+the slight inconsistency — it's a minor gap that self-corrects as more Turso data is used.
+### What's Already Done
+The EWMA negative profile is already wired as Feature 5 in `reranker.py`:
+```python
+# Feature 5: cosine_sim_negative (0.15 penalty weight)
+neg_penalty = cosine_sim(candidate, neg_profile) * 0.15
+final_score -= neg_penalty
+```
+This gives a "soft" directional signal: papers semantically similar to dismissed papers
+get demoted. What's missing is the "hard" category-level suppression.
+### What's NOT Being Done (Deferred)
+**Per-item temporal decay** (`score -= α × exp(-dt / τ)`) is deferred to Phase 6.
+Reasoning:
+- Requires per-dismissed-item timestamps matched against candidates
+- Most naturally expressed as a LightGBM feature (`days_since_most_recent_similar_dismissal`)
+- The EWMA negative penalty already covers the directional signal
+- Adding hand-tuned temporal formulas when LightGBM is the next phase would create throwaway code
+### Modified File: `app/db.py`
+Add one new helper function:
+```python
+async def get_suppressed_categories(
+    user_id: str,
+    threshold: int = 3,
+    days: int = 14,
+) -> set[str]:
+    """
+    Find primary arXiv categories where the user has dismissed ≥ threshold
+    papers within the last `days` days.
+    Joins interactions (event_type='not_interested') against paper_metadata
+    to get the category of each dismissed paper.
+    Returns: set of category strings to suppress (e.g., {'cs.CV', 'physics.optics'})
+    """
+    async with aiosqlite.connect(DB_PATH) as db:
+        cur = await db.execute(
+            """SELECT pm.category, COUNT(*) as cnt
+               FROM interactions i
+               JOIN paper_metadata pm ON i.paper_id = pm.arxiv_id
+               WHERE i.user_id = ?
+                 AND i.event_type = 'not_interested'
+                 AND i.timestamp >= datetime('now', ?)
+               GROUP BY pm.category
+               HAVING cnt >= ?""",
+            (user_id, f"-{days} days", threshold),
+        )
+        rows = await cur.fetchall()
+        return {row[0] for row in rows if row[0]}
+```
+**Data dependency**: This requires dismissed papers to have their metadata in
+`paper_metadata`. Currently:
+- Papers from **arXiv API** (`arxiv_svc.py`) are automatically cached via `db.cache_metadata()`
+- Papers from **Turso** (`turso_svc.py`) are **NOT cached** to `paper_metadata`
+This is a gap. When a user dismisses a paper whose metadata came from Turso (the common
+case since Phase 3.5), the category won't be in `paper_metadata` and the suppression
+join will miss it.
+**Fix**: Add a `cache_turso_metadata()` helper in the recommendations router that writes
+Turso-sourced paper dicts to `paper_metadata` after fetching. This is a small INSERT OR
+IGNORE — ~1ms overhead for 20 papers. We should also add this to `search.py` and
+`saved.py` so ALL metadata paths feed the cache.
+### Modified File: `app/routers/recommendations.py`
+In `_multi_interest_recommend()`, after re-ranking but before MMR:
+```python
+# Step 3.5: Category suppression
+suppressed_cats = await db.get_suppressed_categories(user_id)
+if suppressed_cats:
+    # Filter out candidates whose primary category is suppressed
+    reranked_ids_filtered = []
+    reranked_scores_filtered = []
+    reranked_embs_list = []
+    for i, rid in enumerate(reranked_ids):
+        cat = cand_meta.get(rid, {}).get("category", "")
+        # Extract primary category (first in the list, or the whole string)
+        primary_cat = cat.split()[0] if cat else ""
+        if primary_cat not in suppressed_cats:
+            reranked_ids_filtered.append(rid)
+            reranked_scores_filtered.append(reranked_scores[i])
+            reranked_embs_list.append(reranked_embs[i])
+    if reranked_ids_filtered:
+        reranked_ids = reranked_ids_filtered
+        reranked_scores = reranked_scores_filtered
+        reranked_embs = np.array(reranked_embs_list, dtype=np.float32)
+```
+---
+## What Does NOT Change
+These are explicitly out of scope for Phase 4:
+| Component | Why it stays |
+|---|---|
+| **Search pipeline** (`search.py`, `hybrid_search_svc.py`) | RRF is correct for search (different retrievers, same query) |
+| **α_long = 0.03** (`profiles.py`) | Already fixed in Phase 2a |
+| **L2 normalization** (`clustering.py`) | Already applied before Ward in Phase 2b |
+| **Negative EWMA Feature 5** (`reranker.py`) | Already wired in Phase 2c |
+| **`qdrant_svc.multi_interest_search()`** | Kept in codebase, just no longer called by recs |
+| **Per-item temporal decay** | Deferred to Phase 6 (LightGBM feature) |
+| **Templates / UI** | No frontend changes |
+| **Infrastructure** | Same deployment, same databases |
+---
+## Files Changed — Complete Map
+| File | Action | Lines Changed (est.) | What Changes |
+|---|---|---|---|
+| `app/recommend/fusion.py` | **NEW** | ~60 | `allocate_quotas()` function |
+| `app/routers/recommendations.py` | **MODIFY** | ~40 | Replace RRF call with quota + parallel search; add category suppression |
+| `app/recommend/clustering.py` | **MODIFY** | ~50 | Add `stabilize_cluster_ids()` with Hungarian matching |
+| `app/db.py` | **MODIFY** | ~20 | Add `get_suppressed_categories()` |
+| `tests/test_fusion.py` | **NEW** | ~80 | Unit tests for quota allocation |
+| `tests/test_clustering.py` | **MODIFY** | ~30 | Add test for Hungarian matching stability |
+| `tests/test_search_router.py` | **NO CHANGE** | 0 | Search pipeline untouched |
+| `tests/test_integration.py` | **NO CHANGE** | 0 | Integration tests use mocks, unaffected |
+**Total new/modified production code**: ~170 lines
+**Total new test code**: ~110 lines
+---
+## Implementation Order
+Each step leaves the app in a working state. Tests pass after every step.
+### Step 1 — Create `fusion.py` + unit tests (~30 min)
+Build `allocate_quotas()` in isolation with thorough unit tests:
+- `test_basic_allocation` — 3 clusters, verify proportionality
+- `test_floor_enforcement` — tiny cluster still gets `min_slots`
+- `test_total_equals_requested` — sum always equals `total_slots`
+- `test_single_cluster` — all slots go to the one cluster
+- `test_equal_importances` — even split
+- `test_many_clusters_with_floor` — 7 clusters, floor forces redistribution
+### Step 2 — Refactor `_multi_interest_recommend()` (~1 hour)
+Replace the RRF call with quota + `asyncio.gather()`. Key changes:
+1. Remove `_CLUSTER_LIMITS` hardcoded list
+2. Import `allocate_quotas` from `fusion.py`
+3. Replace `multi_interest_search()` with per-cluster `search_by_vector()` calls
+4. Add deduplication logic
+5. Wire short-term vector as a separate search
+**Test**: Run `python -m pytest tests/ -v` — all tests must pass.
+### Step 3 — Add Hungarian matching to `clustering.py` (~1 hour)
+1. Add `stabilize_cluster_ids()` function
+2. Call it in `_multi_interest_recommend()` between `compute_clusters()` and `save_clusters_to_db()`
+3. Add test: create clusters, slightly perturb, verify indices preserved
+**Test**: Run `python -m pytest tests/test_clustering.py -v`
+### Step 4 — Add category suppression (~30 min)
+1. Add `get_suppressed_categories()` to `db.py`
+2. Add suppression filter in `_multi_interest_recommend()` after reranking
+3. Ensure Turso metadata is cached to `paper_metadata` for the join to work
+**Test**: Run full `python -m pytest tests/ -v`
+### Step 5 — End-to-end verification (~30 min)
+1. Run `python test_e2e_recs.py` — verify recommendations generate correctly
+2. Verify latency stays comparable (~7-8s end-to-end including network I/O)
+3. Run full `python -m pytest tests/ -v` — 125+ tests, zero regressions
+---
+## Test Plan
+### New Unit Tests: `tests/test_fusion.py`
+| Test | What it verifies |
+|---|---|
+| `test_basic_proportional_allocation` | 3 clusters with [0.5, 0.3, 0.2] → ~[50, 30, 20] slots |
+| `test_floor_protects_minority` | Tiny importance still gets ≥ `min_slots` |
+| `test_sum_always_equals_total` | No slots lost or gained during allocation |
+| `test_single_cluster` | One cluster gets all slots |
+| `test_equal_importances` | N clusters get total/N each |
+| `test_remainder_distribution` | Remainder goes to largest fractional part |
+### New Unit Test: `tests/test_clustering.py`
+| Test | What it verifies |
+|---|---|
+| `test_hungarian_preserves_indices` | Slight perturbation doesn't shuffle indices |
+### Regression
+- All 125 existing tests must pass
+- `test_e2e_recs.py` must complete successfully
+---
+## Risks and Mitigations
+| Risk | Impact | Mitigation |
+|---|---|---|
+| **Concurrent searches slower than prefetch** | Higher latency | `asyncio.gather()` runs them truly concurrently. Each is ~5-15ms. Wall-clock ~ max(all), not sum(all). |
+| **Floor forces too many slots** | With 7 clusters, floor=3 requires 21 minimum slots. If total<21... | `allocate_quotas()` will clamp: if `K × min_slots > total`, reduce floor proportionally. At `total_slots=100` and `MAX_CLUSTERS=7`, minimum is 21, well within budget. |
+| **Hungarian matching with different K** | New clustering produces fewer/more clusters than before | Handle rectangular cost matrices. `linear_sum_assignment` natively supports non-square matrices. Unmatched new clusters get fresh indices. |
+| **`paper_metadata` missing for suppression join** | `get_suppressed_categories()` returns empty set | **Real gap found** — Turso metadata is not cached to `paper_metadata`. Fix: add `cache_turso_metadata()` calls in search/rec/saved routers. |
+| **Turso categories vs arXiv categories format** | Turso stores human-readable categories ("AI/ML"), arXiv uses codes ("cs.AI") | **Real gap found** — both formats coexist in `paper_metadata.category`. Suppression will work within each format. Cross-format inconsistency is minor and self-corrects as Turso dominates. |
+| **`search_by_vector` already does 2× over-fetch internally** | Asking for `quota*3` then `search_by_vector` internally doubles it | **Real gap found** — `search_by_vector()` at line 234 already fetches `limit*2` when `exclude_ids` is set. So asking for `quota*3` will actually fetch `quota*6` from Qdrant. This is fine (more candidates for reranker) but should be noted for tuning. |
+---
+## Verification Checklist
+Before declaring Phase 4 complete:
+- [ ] `python -m pytest tests/ -v` — all tests pass (130+ including new tests)
+- [ ] `test_fusion.py` — 6+ quota allocation tests pass
+- [ ] `test_clustering.py` — Hungarian matching test passes
+- [ ] `test_e2e_recs.py` — end-to-end recommendations generate correctly
+- [ ] Recommendations include papers from minority clusters (quota working)
+- [ ] Cluster indices remain stable across consecutive saves
+- [ ] Category suppression activates after ≥3 dismissals of same category
+- [ ] Search pipeline is completely unaffected (RRF still used for search)
+- [ ] Latency comparable to Phase 3.5 baseline
+- [ ] All 3 recommendation tiers still cascade correctly (Tier 1 → 2 → 3)
+---
+## References
+- PinnerSage (Pal et al., KDD 2020) — Ward + medoid + importance sampling, no RRF
+- Taobao ULIM (Meng et al., RecSys 2025) — quota allocation, +5.54% clicks
+- Pinterest Bucketized-ANN (SIGIR 2023) — minority representation protection
+- Twitter kNN-Embed (arXiv:2205.06205) — per-cluster proportional drawing
+- Bruch et al. (SIGIR 2022) — RRF optimises Recall not nDCG
+- YouTube (Xia et al., 2023) — 3× gain from richer negative treatment
+- Doc 06 §"The fusion fault in Doc 03" — full RRF critique
+- Doc 06 §"Clustering specifics" — Hungarian matching recommendation
+- Doc 06 §"Negative signals" — three-layer negative design
+---
+*Last updated: 2026-04-23*

docs/research/03-MultiInterest-Recommender-Architecture.md CHANGED Viewed

@@ -266,7 +266,7 @@ Each cluster gets feed slots proportional to its importance, with a floor of 3 t
 **Note:** RRF *is* correct for the search bar (fusing dense + sparse for the *same* query). Only the recommendation pipeline needs quota.
-**Status:** ⚠️ Code still uses RRF. Scheduled for Phase 4.
 ---

 **Note:** RRF *is* correct for the search bar (fusing dense + sparse for the *same* query). Only the recommendation pipeline needs quota.
+**Status:** ✅ The recommendation pipeline now uses importance-weighted quota fusion (Phase 4, `app/recommend/fusion.py`) — see `docs/phases/PHASE4-Recommendation-Pipeline-Fixes.md`.
 ---

docs/research/07-LLM-Summaries-Reranker-and-Scaling-Research.md ADDED Viewed

	@@ -0,0 +1,426 @@

+# ResearchIT Phase 4 Implementation Plan and Phase 5 Preview — Research Report for Amin
+This report synthesizes 2024–2026 sources (RecSys/SIGIR/KDD/NeurIPS/ACL/EMNLP papers, production blogs from Pinterest, Spotify, YouTube, Netflix, and documentation from BAAI, Jina, Mixedbread, Anthropic) into an implementation-ready plan. The headline recommendation is to run Phase 4a (Claude summaries) and 4d (use-cases doc) in parallel over weeks 1–3 after a one-week ADR sprint, then spend weeks 4–9 on 4b (distilled reranker) — total ~10–12 weeks for Phase 4 with buffer. Nearly every Phase 5 workstream (exploration, IPS, propensity logging, telemetry schema) must be architected *before* Phase 4 code lands, even though the workstreams themselves are gated on user-count thresholds. The single most valuable decision to make now is the telemetry event schema, because retrofitting propensity, policy-id, and position fields after you have real-user data is painful and blocks all later counterfactual evaluation.
+## A. Phase 4a — Claude-API-generated per-cluster interest summaries
+### A.1 Prompt engineering
+The closest published analogue to Amin's use case is **Scholar Inbox** (Flicke et al., ACL 2025 Demo, arXiv 2504.08385), which generates 4-level hierarchical labels (field → subfield → subsubfield → method) from t-SNE paper clusters using Qwen; their appendix §6.1 contains the exact prompt. Microsoft's **TnT-LLM** (KDD 2024) and **TopicGPT** (Pham et al., NAACL 2024) converge on the same pattern: structured XML-tagged inputs, constrained vocabulary, and JSON output. The recommended template for ResearchIT:
+```
+You are summarizing a research interest cluster for a specific user.
+USER PROFILE CONTEXT (tone only, not content):
+{short profile string}
+CLUSTER MEDOID PAPER (most representative):
+<medoid><title>{...}</title><abstract>{...}</abstract></medoid>
+NEAREST NEIGHBOR PAPERS:
+<papers>
+<paper id="1"><title>...</title><abstract>...</abstract></paper>
+... (up to 20)
+</papers>
+TASK: Produce JSON {"label": "<1-sentence 'You're reading about X, particularly Y' framing>", "themes": [<≤5-word bullet>, ... up to 4]}
+RULES:
+- Every technical term in "label" and "themes" MUST appear verbatim in at least one provided title or abstract.
+- Do NOT introduce methods, datasets, or concepts not present in inputs.
+- If fewer than 3 papers share a theme, omit it.
+- Prefer specific phrases ("retrieval-augmented generation evaluation") over generic ones ("NLP research").
+- Output JSON only.
+```
+**Start zero-shot with this constrained prompt; add 2–3 hand-written few-shot examples only to anchor the "You're reading about X" voice.** Spotify Research's Dec 2024 "Contextualized Recommendations Through Personalized Narratives Using LLMs" post found zero-shot adequate but converged on 3–5 "golden" style examples for tone. The Anthropic cookbook's `using_citations.ipynb` demonstrates the **Citations API**, which returns structured citation objects and explicitly "will not return citations pointing to documents or locations that were not provided as valid sources" — **use the Citations API for ResearchIT**, it eliminates the hallucination vector at the API level.
+### A.2 Regeneration frequency
+The 2024–2026 literature (Google's arXiv 2510.20260 on "Balancing Fine-tuning and RAG for Dynamic LLM Recommendation Updates"; Spotify's production narratives cache per-item) strongly favors **event-triggered regeneration over fixed nightly cadence**. Concrete hybrid policy:
+Regenerate when the medoid paper changes, when Jaccard distance between old and new paper-ID sets exceeds 0.3, or when a cluster is added/merged/split. Apply a **7-day TTL fallback** even when nothing changes (captures embedding/context drift). **Do not regenerate nightly** — it is roughly 7× the cost for negligible UX gain on Ward clusters whose membership is stable over the timescale of a single day.
+### A.3 Pricing (April 2026) and cost estimate
+Verified current pricing from platform.claude.com/docs, cross-checked against Finout/MetaCTO/PE Collective reporting: **Haiku 4.5 at $1/$5 per MTok in/out**, **Sonnet 4.6 at $3/$15**, **Opus 4.7 at $5/$25** (released April 16, 2026, with a new tokenizer that can inflate token counts up to 35%). Cache reads are 10% of base input; cache writes 125% (5-minute) or 200% (1-hour). Batch API gives a flat 50% discount with ≤24h turnaround and stacks with caching. Haiku 3 is deprecated April 19, 2026 — do not build against it.
+For 1,000 users × 5 clusters × 20-paper contexts (~6,000 input tokens each) regenerated weekly, monthly traffic is ~130M input + ~3.25M output tokens. Total monthly cost by model:
+- **Haiku 4.5 + Batch API: ~$73/month; with prompt caching on stable prefix, ~$50–60/month**
+- Sonnet 4.6 + Batch API: ~$220/month (~$150–180 with caching)
+- Opus 4.6/4.7 + Batch API: ~$366/month (~$280 with caching)
+**Recommendation: Haiku 4.5 + Batch API is the right default.** The task (label a cluster from provided abstracts) sits comfortably within Haiku's capability. Reserve Sonnet for offline A/B quality evaluation on a minority of calls. Skip Opus entirely for this task. Prompt caching savings are modest because each cluster's paper context is unique, so only the short, stable instruction prefix can be cached; the real economic lever is the **shared cross-user dedup** (§A.7), not prompt caching within a single call.
+### A.4 Content-addressed caching
+Construct the cache key as `sha256(sorted(paper_ids) + prompt_version + model + schema_version)`. Sort paper IDs before hashing for order-independence; include prompt and model version so stale summaries don't survive a template change; **omit user ID** from the shared cache key (that's the entire point — §A.7). Use an immutable, content-addressed store (`summaries[hash] = {label, themes, generated_at, model, tokens_used}`) — never overwrite; let old entries age out on a 90-day LRU. This mirrors CDN asset hashing (`main.a3f2b1c9.js`) and matches the Anthropic Claude Code cache-invalidation discussion (issue #29230) recommending SHA-256 of all source files be part of the cache key.
+Expected exact 20-paper dedup rate is low (papers are drawn from 3M+ arXiv), but a **two-tier cache** with a "narrow" key (medoid + top-5 neighbors) as fallback increases hit rate substantially.
+### A.5 Explainable-recommender UX in academic search
+None of Scholar Inbox, Connected Papers, Elicit, ResearchRabbit, Semantic Scholar, Consensus, or Undermind currently displays a **personalized "You're reading about X" per-user cluster narrative**. Scholar Inbox's Scholar Map labels are the closest analogue but are global/shared across users. This means ResearchIT's Phase 4a is **genuinely novel UX for academic search**, and the right place to borrow heavily is Spotify (which reports up to 4× CTR on niche content when LLM narratives personalize discovery) and Wang et al.'s "LLMs for User Interest Exploration in Large-scale Recommendation Systems" (RecSys 2024, arXiv 2405.16363), an architecturally identical recipe (interest clusters + constrained LLM descriptions). Lubos et al.'s UMAP 2024 user study on "LLM-generated Explanations for Recommender Systems" confirms users rate LLM explanations highly for decision support.
+UX recommendation: lead with the 1-sentence "You're reading about X, particularly Y" framing, then an expandable bullet list of up to 4 sub-themes (matching the `themes` cap in the §A.1 prompt), with **source paper titles as linkable chips** under each bullet (the Anthropic Citations / deterministic-quoting pattern, which kills trust issues by letting users verify). A subtle "regenerated on {date}" timestamp plus a manual refresh button gives users control.
+### A.6 Hallucination prevention
+The 2024–2026 state-of-the-art for grounding evaluation is **MiniCheck** (Tang, Laban, Durrett, EMNLP 2024, arXiv 2404.10774) — a 770M-parameter fine-tuned Flan-T5 that matches GPT-4 fact-checking accuracy at ~400× lower cost. Ranked strongest-to-weakest, grounding techniques are: (1) deterministic quoting (surface verbatim source text in the UI); (2) **Anthropic Citations API** (native, recommended); (3) prompt-based "use only phrases from source" rules; (4) post-hoc NLI verification with MiniCheck-FT5; (5) constrained decoding (overkill for 1-sentence labels).
+Recommended stack: Anthropic Citations API + explicit "verbatim-phrase" rule in prompt + post-hoc substring verification on noun phrases (reject and regenerate if >1 unsupported phrase). Run MiniCheck-FT5 offline on a sample as an ongoing faithfulness metric. Zhou et al. (Findings EMNLP 2023) "context-faithful prompting" shows instruction-only grounding measurably reduces hallucination but is not sufficient alone — combine with a verification layer.
+### A.7 Per-user vs shared summaries
+**Use a hybrid two-stage design.** Stage 1 generates a **shared, content-addressed, public-paper-only** cluster description (the Claude call gets only paper titles/abstracts, never user profile text) — identical cluster content produces identical summary across users and days, enabling aggressive dedup. Stage 2 wraps the shared summary with per-user framing either via client-side string templating ("You're reading about {shared_label}") or via a lightweight per-user LLM pass cached at `(user_id, shared_hash)`.
+This matches Spotify's item-level-narrative + per-user-context split and Google's arXiv 2510.20260 offline-bulk/online-lookup separation. **Privacy payoff:** shared summaries are pure functions of public arXiv content, so they can ride Anthropic's Batch API with ZDR safely, be logged freely, and be cached cross-user. User profile text never leaves your infrastructure (or does so only in a heavily-filtered form for Stage 2). This is the architectural decision (ADR A2) that must be made **before** building the caching layer, because switching from per-user to shared requires a full cache-schema migration post-launch.
+## B. Phase 4b — Distilled cross-encoder reranker
+### B.1 FlashRank recipe and student candidates
+**FlashRank (PrithivirajDamodaran) does not train its own students** — it repackages existing open checkpoints as quantized ONNX. The default "Nano" is `ms-marco-TinyBERT-L-2-v2` (14M params, ~17MB fp32, ~6MB INT8), "Small" is `ms-marco-MiniLM-L-12-v2`, and "Medium" is `rank-T5-flan`. The engineering pattern to steal is ONNX + INT8 dynamic quantization + the `tokenizers` Rust library only (no PyTorch/transformers at runtime), keeping cold-start under 500ms on serverless.
+For Amin's 6ms-for-20-pairs CPU budget (≈0.3ms/pair), **the only candidates that fit with headroom are 2-layer students**:
+| Model | Params | INT8 CPU latency/pair | BEIR nDCG@10 |
+|---|---|---|---|
+| **ms-marco-TinyBERT-L-2-v2** | 14M | ~0.3–1.0ms | ~43–45 |
+| ms-marco-MiniLM-L-4-v2 | 19M | ~1.5–2ms | ~46 |
+| ms-marco-MiniLM-L-6-v2 | 22M | ~3–5ms (tight on budget) | ~48 |
+| jina-reranker-v1-turbo-en | 38M | ~3–5ms | 49.60 (95% of jina-base) |
+| jina-reranker-v1-tiny-en | 33M | ~2–3ms | 48.54 (92.5%) |
+| mxbai-rerank-xsmall-v1 | 71M | ~8–12ms (over budget) | 43.9 |
+Tonellotto et al.'s "Shallow Cross-Encoders" (SIGIR 2024, arXiv 2403.20222) found that at latency ≤10ms on CPU, TinyBERT-gBCE reaches nDCG@10 of 0.652 on TREC-DL-2019, a +51% gain over MonoBERT-Large (0.431). **The architectural choice (2L vs 12L) matters more than the teacher weights at tight latency.** Don't pick a bigger student.
+### B.2 Domain adaptation — how much does arXiv-specific fine-tuning buy?
+**Typical gain from in-domain distillation at the 2-layer scale: +1 to +3 nDCG@10 points on SciDocs**, not 10. MedCPT (PubMed, Jin et al. arXiv 2307.00589) surpasses BM25 only after ~150M query-article pairs, showing diminishing returns for modest training budgets. The listwise-distillation paper arXiv 2505.19274 demonstrates that a general RankT5-3B teacher is competitive with in-domain rerankers on SciDocs/SciFact/NFCorpus, within noise. **No BGE-reranker-v2 checkpoint fine-tuned on scientific text exists on Hugging Face as of April 2026** (searched).
+### B.3 Distillation objectives
+The 2025 reproducibility study (arXiv 2603.03010) benchmarks nine loss functions across nine backbones with SPLADE-v3 top-1000 candidates. Average rank across out-of-domain BEIR:
+1. InfoNCE (rank 1.83)
+2. **MarginMSE** (2.17) — Hofstätter-style pairwise distillation
+3. DistillRankNet (3.61)
+4. ADR-MSE (3.66)
+5. Hinge (3.99)
+6. BCE (5.74) — significantly worse than every other
+Critically, "**MarginMSE with BM25-mined negatives is statistically equivalent to InfoNCE with ColBERTv2 hard negatives**" — loss formulation matters more than negative-pool quality. BAAI/BGE uses MarginMSE + self-knowledge-distillation from ensembles. Jina uses explicit KL on logits from the full-size teacher. Yang, He, Yang's SIGIR 2024 paper proposes CKL (contrastively-weighted KL) outperforming MarginMSE+plain KL on MS MARCO + BEIR zero-shot, but the gap is small.
+**Recommended loss:** `L = α·MarginMSE(student, teacher, pos, neg) + β·KL(σ(student/T), σ(teacher/T)) + γ·BCE(pos, 1)` with α=1.0, β=0.5, γ=0.1, T=1.0. MarginMSE alone is a fine MVP.
+### B.4 Integration architecture
+Three options: (A) TinyBERT score as one feature in a second LightGBM pass; (B) TinyBERT as a direct re-ranker on top-20 replacing LightGBM at that stage; (C) two-stage LightGBM with TinyBERT in between. Bing's LambdaMART over hundreds of features (including BERT scores), Pinterest's TransActV2 feeding neural scores into GBDT, Google/DeepMind's DASALC+TFR-BERT, and TREC TOT 2025 (arXiv 2601.15518) all converge on **the neural score as one feature among many in a final LambdaMART**, not as a terminal reranker.
+**Recommendation: Option C (≈Option A).** Keep the upstream LightGBM-lambdarank, score the top-20 with TinyBERT (~0.3ms/pair × 20 = ~6ms), and feed the student scores back into a second LightGBM pass that has access to the full personalization feature set. **Do not do Option B** — replacing LightGBM with TinyBERT at top-20 throws away user features, citation-graph features, and temporal decay that LightGBM already incorporates. Engineered features for LightGBM-2: `tinybert_score`, `tinybert_rank_position`, `tinybert_score_normalized_within_query`, and the interaction `tinybert_score − bm25_score`.
+### B.5 Hard negative mining
+The 2024 standard is **NV-Retriever** (arXiv 2407.15831) "positive-aware" filtering: mine top-100 ANN neighbors, then filter with the teacher cross-encoder, dropping candidates whose teacher score is within 0.3 of the positive (likely false negatives or duplicates). For academic papers, supplement with SPECTER/SciNCL citation-graph negatives: **SPECTER** uses 2 "citation-of-citation" hard negatives per query; **SciNCL** (Ostendorff et al.) improves on this by sampling from a continuous citation embedding space (PyTorch-BigGraph over S2ORC) with controlled distance margins (k_min=3998, k_max=4000 on a 52M-node graph), delivering +1.8 points on SciDocs. Recommended mix per (seed, positive): 3 SciNCL-style citation-of-citation negatives, 5 teacher-filtered ANN negatives (top 10–100 with teacher score below 95th percentile), 2 random in-batch. Critically, re-score all candidates with BGE-reranker-v2-m3 and **drop any within 0.3 teacher-score of the positive**.
+### B.6 Evaluation and distillation quality gap
+Typical retention rates from the 2024–2025 literature: jina-v1-base → jina-v1-turbo retains 95% (52.45 → 49.60); TinyBERT-4L retains ~96.8% of BERT-base on GLUE; MiniLM-L6 → MiniLM-L2 rerank retains ~85–90%. **For Amin's ~20× compression from BGE-reranker-v2-m3 (278M) → TinyBERT-L2 (14M), expect 82–88% retention of nDCG@10.** If below 80%, something is wrong (bad negatives, insufficient data, teacher-label leakage into eval).
+Run evaluations on SciDocs (focusing on Co-view / Co-read / Cite / Co-cite tasks), SciRepEval proximity tasks, the BEIR scientific subset (NFCorpus, SciDocs, SciFact, TREC-COVID), and held-out unarXive 2024–2026 queries with citation-graph ground truth. **CPU latency protocol: 50 warmup inferences discarded, 1000 measured inferences at seq_len=128, batch=20; report P50/P95/P99, not mean** (Pinterest standard).
+### B.7 Off-the-shelf scientific-domain rerankers
+**There is no well-maintained small (<50M param) scientific-domain cross-encoder reranker on Hugging Face as of April 2026 that beats MS MARCO-trained TinyBERT on SciDocs at the 6ms budget.** SPECTER/SPECTER2/SciNCL are bi-encoders (embedders), not rerankers. MedCPT is biomedical-specific. Third-party SciBERT cross-encoders exist but are not validated at MS-MARCO MiniLM-L6 quality. No BAAI bge-reranker fine-tuned on scientific corpus published.
+**Decision tree:**
+- **If Amin already has a pseudo-label pipeline producing >200K (query, doc, teacher_score) triples** → distill TinyBERT-L-2 from bge-reranker-v2-m3 on arXiv data. Expect +1–3 nDCG over off-the-shelf.
+- **If Amin wants MVP now** → deploy `cross-encoder/ms-marco-TinyBERT-L-2-v2` with INT8 ONNX (HF already ships `onnx/model_qint8_avx512_vnni.onnx`), measure on held-out eval. If gap vs teacher is <3 nDCG@10, ship; distill later if needed.
+**Strong recommendation: go off-the-shelf first.** Distillation is ~2–4 weeks of solo-dev work and the marginal gain at 2-layer scale is usually small. Time is better spent on hard-negative mining and LightGBM-2 feature engineering.
+### B.8 ONNX / FastAPI hot path
+Latency ranking for BERT-base-class inference on x86 with AVX-512 VNNI:
+- PyTorch eager fp32: baseline (1.0×)
+- PyTorch INT8 dynamic CPU: 0.4×
+- ONNX Runtime fp32: 0.3×
+- **ONNX Runtime + INT8 dynamic AVX-512 VNNI: 0.15–0.25× (up to ~6× over PyTorch eager fp32, ~1.2–2× over ORT fp32)**
+- torch.compile: ~0.5–0.67× (i.e. 1.5–2× faster than eager), but still behind ONNX on CPU
+For TinyBERT-L-2-v2 on Render's standard ~2 vCPU x86: fp32 PyTorch seq=128 ≈3–5ms/pair; **INT8 ONNX ≈0.3–1.0ms/pair single-thread; batched 20 pairs ≈2–4ms total wall-clock on AVX-512 VNNI hardware** (2–3× slower without VNNI). Production code pattern:
+```python
+import numpy as np
+import onnxruntime as ort
+from tokenizers import Tokenizer
+sess_options = ort.SessionOptions()
+sess_options.intra_op_num_threads = 2  # match Render vCPUs
+sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
+sess_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
+sess_options.add_session_config_entry("session.intra_op.allow_spinning", "0")
+session = ort.InferenceSession(
+    "model_qint8_avx512_vnni.onnx", sess_options, providers=["CPUExecutionProvider"]
+)
+tokenizer = Tokenizer.from_pretrained("cross-encoder/ms-marco-TinyBERT-L-2-v2")
+tokenizer.enable_truncation(max_length=128)
+tokenizer.enable_padding(length=128)
+def score_pairs(query, docs):
+    enc = tokenizer.encode_batch([(query, d) for d in docs])
+    return session.run(None, {
+        "input_ids":      np.array([e.ids for e in enc], dtype=np.int64),
+        "attention_mask": np.array([e.attention_mask for e in enc], dtype=np.int64),
+        "token_type_ids": np.array([e.type_ids for e in enc], dtype=np.int64),
+    })[0].squeeze(-1).tolist()
+```
+Critical tips: pin padding length to enable kernel fusion; use `tokenizers` (Rust, ~0.1ms for 20 pairs) not `transformers.AutoTokenizer` (~5ms); cache sessions globally; disable thread spinning; skip QAT (dynamic INT8 costs <0.5 nDCG).
+### B.9 Latency scaling top-20 → top-50 → top-100
+Linearity is approximately valid but with caveats. K=20→50 ≈ 2.5× latency (6ms → 15ms) with modest sub-linear batching gains of 5–10% from amortized Python/tokenization overhead. K=100 ≈ 4.5× rather than 5×. Memory pressure kicks in at K≥64 with seq_len=512 but not at seq_len=128. Render's 2-vCPU boxes saturate at intra_op_num_threads=2.
+| K | Strategy | Expected latency |
+|---|---|---|
+| 20 | single batch of 20 | 2–6ms |
+| 50 | single batch of 50 | 6–15ms |
+| 100 | 2 batches of 50, pipelined | 12–25ms |
+| 200 | upgrade to MiniLM-L-4 or go async | 30–50ms |
+**Beyond K=50, the right move is NOT to batch harder but to prune harder upstream** — make LightGBM-1 more selective. Pinterest and Bing aggressively trim before the expensive stage.
+## C. Phase 4d — Use-cases and information-gain design doc
+### C.1 User personas
+Foundational literature: Bates's "berrypicking" (Online Review 1989) — real scholarly search is iterative, multi-source, goal-mutating, not one-shot. Ellis/Wilson's six activities (starting, chaining, browsing, differentiating, monitoring, extracting) map cleanly: monitoring = stay-current mode; chaining+differentiating+extracting = literature-review mode. Al-Shboul & Abrizah (2014, Journal of Academic Librarianship) is the explicit persona-template reference. Gordon et al. (Taylor & Francis 2020/2021) quantify scholarly pain: only 15.4% of physicists feel successful at staying current; 28.6% feel unsuccessful. Mysore et al. (CHIIR 2023) and Soufan, Ruthven, Azzopardi (CHIIR 2024) empirically confirm berrypicking in modern AI/ML workflows. Niwanputri et al. (SIGIR ICTIR 2025) "Untangling Cognitive Processes in Academic Information Searching" is the 2025 SIGIR anchor. **Scholar Inbox (Flicke et al. 2025, arXiv 2504.08385)** is the closest comparable system — they released an 800k-rating dataset and use an active-learning rating onboarding pattern.
+Drop-in persona cards for the doc:
+| # | Persona | Profile state | Mode | Day-1 signal | UX demand |
+|---|---|---|---|---|---|
+| P1 | Brand-new (cold start) | Empty EWMA | Exploration-forced | Categories + 5–10 ratings | Active-learning onboarding (Scholar Inbox) |
+| P2 | PhD student, active | 50–500 interactions, 2–4 tight clusters | Stay-current/deep | Daily skim, narrow topic | Don't flood with diversity early |
+| P3 | Senior researcher/PI | 1k+ interactions, 8–15 clusters | Mixed monitoring | Scan many, save few, dismiss often | No single cluster >40% |
+| P4 | Cross-disciplinary | Multiple distinct medoids | Parallel stay-current | Per-cluster cadence diverges | Cluster-balanced delivery |
+| P5 | Lapsed (3-mo gap) | α_long preserved, α_short stale | Re-orient | High dismissal first 3 sessions | "What changed" framing |
+| P6 | Cold-restart pivot | Has history, wants new field | Explicit pivot | System seeds new cluster | "Start new interest" UI |
+| P7 | Literature-review session | Any profile + deep-session intent | Deep single-cluster | Many click-throughs, long dwells | Suppress MMR, amplify depth |
+| P8 | Stay-current daily | Any profile, 10-min daily | Monitoring | Fast skim, binary save/dismiss | Strong MMR, proportional cluster coverage |
+### C.2 Information gain per interaction
+Foundational: Joachims (KDD 2002) clicks as relative pairwise preferences; Joachims et al. (TOIS 2007) eye-tracking validates ~80% reliability for "click i, skip i−1" pairs; Yi et al. (RecSys 2014) dwell time ≥30s as valid-engagement threshold; Xie et al. (WWW 2023) "valid read" = click + sufficient dwell; Yin et al. (WSDM 2013) "Silence is also evidence" — short dwell after click is negative, not missing. **The central paper** is Wang et al. RecSys 2023 (arXiv 2308.12256): dislike as feature only → −0.34% dislike rate (not significant); dislike as **feature AND training label** → −2.44% dislike rate, **−9.60% repeated dislike on same creator**, −2.05% dismissing users, and counterfactually **60.8% reduction in similar-content recommendations versus 22% when dislike is feature-only**. Implicit skip as negative label delivered +0.40% user enjoyment, +0.61% DAU≥1h.
+Drop-in information-gain table (normalized to click = 1.0 baseline):
+| Interaction | Sign | Relative strength | EWMA update | ~Bits info |
+|---|---|---|---|---|
+| Explicit category at onboarding | + | 5–10× | α_long seed | 3–5 |
+| Save / bookmark | + | 3–5× | α_short + α_long | ~2 |
+| Click-through to arXiv (no dwell) | + | 1.0× | α_short | ~0.5 |
+| Long dwell (>30s) on abstract | + | 2–3× | α_short elevated | ~1 |
+| Short dwell (<5s) after click | − weak | −0.5× | small α_neg | ~0.3 |
+| Share / export to bib | + | 4–6× | α_short + α_long strong | ~2–3 |
+| Dismiss (feature only) | − | −1× | Layer-1 only | ~0.3 |
+| **Dismiss (feature + training label + similar suppression)** | − | **−3× to −4×** | All three layers | ~1.5–2 |
+| "Don't recommend cluster" mute | − | −10× | Hard filter persistent | 3+ |
+| Passive skip / scroll-past | − very weak | −0.1× | Aggregate only | ~0.05 |
+| Revisit saved paper | + | 2× | α_long | ~1 |
+**Product principles derived:** every save must move the EWMA profile measurably (if α_short=0.40 doesn't produce a visible medoid shift after one save, the profile is broken); dismissals must be 1-click because even a feature-only dismiss carries roughly 10× the signal of a passive skip (−1× vs −0.1× in the table above); dwell must be normalized per device/context; explicit negatives must enter both the LightGBM feature vector AND the training label — feature-only is essentially wasted.
+### C.3 Longitudinal journeys
+Time-drift literature (Koren KDD 2009 timeSVD++; Mansoury CIKM 2020 feedback loop; TDLRP-MF MDPI Systems 2025; TransActV2 arXiv 2506.02267) validates Amin's α_short/α_long split. The temporal-drift papers consistently show α_short ≈ 10× α_long is healthy; Amin's 13× ratio is in range. Per-persona day-1/7/30/90 table: P1 progresses from explicit ratings + popularity-biased exploration to 1–2 tight clusters by day 7, to 2–4 stable medoids at 30 days, to indistinguishable from P2 at 90 days. P5 on return at d=90 starts with stale α_short; decay α_long by (1−α_long)^90 ≈ 0.065 to partially refresh. P7 is session-scoped only (MMR λ down, cluster depth up, session-TTL long). P8 is steady monitoring at 10-min daily, evolving slowly in α_long regime.
+### C.4 Instrumentation priorities
+Production references: Spotify Event Delivery Infrastructure (8M events/s, schema-first, session-context qualifies every signal); Pinterest TransActV2 (arXiv 2506.02267, real-time top-100 sequence, **p99 latency as production-critical metric not mean**); YouTube Covington RecSys 2016 + Wang 2023 (80B signals/day, separate logging for watch/search/subscribe/dismiss/satisfaction); OpenTelemetry Weaver (2025) for schema-first telemetry with SDK generation. The schema must be frozen before any real-user logging (ADR A4) because post-launch migrations are painful.
+Minimum event families to log: session_start/end + mode_declared; feed_request/served with slot_index, cluster_id, medoid_id, popularity_prior_weight, mmr_lambda, exploration_flag; positive (click, dwell_end, save, bookmark, share, export_bib, revisit) with dwell_ms, scroll_depth, device_context; negative (dismiss, mute_cluster, hide_author, explicit_dislike) with reason_code, layer_applied; profile ops (ewma_step, cluster_rebuild, medoid_shift) with α_used, silhouette_delta; model ops with per-stage latency; health/error events (empty_candidate_set, stale_profile_warning, popularity_fallback_triggered). **Log p50/p95/p99 latency percentiles per stage.** Nightly aggregations for SLO dashboards: personalized-to-popularity ratio (target ≥0.85 after day 7), cluster-share Gini (alert >0.7), exploration-slot fire rate (target 1/10 ±50% drift alert), per-cluster dismiss rate (>35% → mute candidate), save-to-click ratio, α_short day-over-day distance (alert if zero for 14 days), time-between-sessions (detects lapsed users).
+### C.5 Product principles
+Netflix North-Star thinking (Gibson Biddle) suggests **"saves per active week"** as ResearchIT's primary leading indicator — tied to customer value, directly moves α_long, not gameable by dismissals. Spotify contextual-session principle: a skip in stay-current mode ≠ a skip in lit-review mode. Pinterest tail-latency principle: operational metrics on p99 not mean. Stated principles for ResearchIT: every save must measurably move the profile; dismissals are always 1-click, always logged as both feature and label; three-layer negatives have distinct half-lives (session/α_neg=0.15/persistent-until-unmuted); context qualifies every signal; exploration is a budget not an afterthought; cluster balance beats global top-K for cross-disciplinary users; cold-start is active not passive (Scholar Inbox pattern); latency SLOs on p99; stale profiles must announce themselves; never dark-launch a ranker change without a popularity-baseline A/B.
+### C.6 Mode-switching / intent-conditioned recommendation
+Broder (SIGIR Forum 2002) navigational/informational/transactional extends to informational-narrow (lit-review) vs informational-broad (stay-current). **Jannach, Mobasher et al. TORS 2024 (arXiv 2406.16350) "A Survey on Intent-Aware Recommender Systems"** is the 2024 anchor — categorizes diversification-based, intent-prediction, and latent-intent modeling; identifies gap of offline-only evaluation. RecSys 2024 reproducibility study "A Worrying Reproducibility Study of Intent-Aware Recommendation Models" is cautionary: most intent-aware claims don't replicate. **Industry validates explicit mode switching over fully-latent intent** (Pinterest Homefeed vs Related-Pins vs Search; Spotify Deep-Focus vs What's-New).
+Recommendation: start with an **explicit two-mode toggle** in UI ("Stay Current" / "Lit Review"): stay-current has high MMR λ, per-cluster quota on, small popularity prior, 10-min session TTL; lit-review has low MMR λ, high single-cluster depth, citation-chain exposure, 60-min session TTL. Add latent intent fallback: if session shows 3 consecutive clicks into one cluster with long dwells, quietly switch to lit-review. Defer sophisticated latent-intent models.
+### C.7 Failure modes and detection
+Chaney, Stewart, Engelhardt (RecSys 2018) prove feedback loops amplify homogeneity; Mansoury et al. (CIKM 2020) quantify bubble intensification across rounds; Nguyen et al. (WWW 2014) first longitudinal filter-bubble measurement; Tang et al. (arXiv 2508.11239, Aug 2025) "Mitigating Filter Bubble from Community Detection" defines filter-bubble index = fraction of recs inside user's own community — **directly operationalizable using Ward clusters as the Louvain analog**. Drop-in detection rules:
+| Failure | Detection rule | Mitigation |
+|---|---|---|
+| Feed collapse | 7-day rolling cluster-share Gini >0.7 OR top-cluster share >0.6 | Force MMR λ up; inject exploration; cap per-cluster at 40% |
+| Stale profile | α_short unchanged for 14 days AND last session >30 days | "Refresh interests" card; boost popularity prior; Scholar-Inbox-style re-prompt |
+| Cluster fragmentation | Cluster count >K_max OR >40% clusters with <3 neighbors | Lower Ward threshold; merge |
+| Cluster over-merging | Silhouette week-over-week Δ <−0.15 | Raise Ward threshold; split top-variance cluster |
+| Filter bubble | Filter-bubble index >0.95 for 30 days | Cross-cluster sampling; raise exploration budget |
+| Popularity collapse | popularity_fallback >0.2 DAU/day | Ranker may be broken; verify LightGBM not degenerate |
+| Latency regression | p99 > SLO for 1h | Standard SRE playbook |
+| Dismissal ineffective | In-cluster rec rate within 7 days of dismiss > baseline | Verify three-layer pipeline; check layer-2 re-training |
+| Feedback-loop amplification | Avg pairwise served-item similarity trending up 4+ weeks | CD-CGCN community-aware negative sampling |
+| Cold-start stuck | Personalized score share <0.3 at day 7 | Push active-learning prompts; lower warm threshold |
+## D. Phase 5 preview at Phase-4-level detail
+### D.1 Epsilon-greedy exploration
+**Spotify BaRT** (McInerney, Lacker, Hansen, Higley, Bouchard, Gruson, Mehrotra; RecSys 2018; DOI 10.1145/3240323.3240354) is the canonical reference. Two-stage contextual bandit over Home shelves (rows + explanations) and cards (playlists). Reward = factorization machine over user × item × explanation × context features predicting a binary stream event (≥30s listen). Epsilon-greedy per-slot: with probability ε pick uniformly among candidates, otherwise argmax. Conditional exploration separates "explore the item" from "explore the explanation" sharing one reward model — this keeps propensities tractable. Training uses counterfactual risk minimization with IPS on logs. Heavier exploration for new users, lighter for established.
+**Pinterest "Warmer for Less"** (arXiv 2512.17277, Dec 2025) targets industrial cold-start items: **targeted lightweight augmentations (~+5% params) to the main model can match heavier bespoke approaches**. Strongly validates leaning on BGE-M3 content embeddings + light corrections for new arXiv papers rather than a separate CF/graph cold-start pipeline.
+Literature consensus on exploration budget clusters at **5–15%, with 10% as default**. For ResearchIT:
+- **Pre-launch → 100 users: ε-greedy at ε=0.10, slot-reservation pattern** (reserve 1/10 feed slots for exploration candidates — cleaner and lower-variance than per-slot coin flips).
+- **100–500 users: stratified exploration** (ε distributed over arXiv primary categories the user hasn't engaged with × medoid-to-item cosine uncertainty).
+- **500–1K users, >1K eng/week: Beta-Bernoulli Thompson sampling at category level.**
+- **>5K users, >10K eng/week: neural-linear bandit (mtNLB-style, KDD 2024 DOI 10.1145/3637528.3671649) reusing LightGBM scorer as representation — only if ε-greedy shows regret plateau.**
+Thompson vs ε-greedy: Chapelle & Li (NeurIPS 2011) and Vermorel & Mohri (2005) show vanilla ε-greedy routinely matches or beats TS/UCB at small N. TS at item level across 1.6M items with <1K users is infeasible; TS at category or cluster level is tractable. Other contextual bandit references: LinUCB (Li, Chu, Langford, Schapire WWW 2010); NeuralUCB (Zhou ICML 2020); NeUClust (Atalar et al. arXiv 2410.14586, Oct 2024) — contextual-combinatorial for list recommendations; ENR (CIKM 2023) epistemic neural nets for scalable TS; Ban/Qi/He WebConf 2024 tutorial.
+### D.2 LightFM collaborative filtering
+LightFM (Kula 2015, arXiv 1507.08439) is legacy-but-still-competitive; in 2026 it remains perfect for Render's CPU-only deployment because every user/item embedding is a sum of feature embeddings (including a unique-ID feature), enabling **strong cold-start with metadata — exactly ResearchIT's setting**. Alternatives: implicit ALS (industrial baseline but no cold-items); LightGCN (SIGIR 2020 arXiv 2002.02126, ~16% avg lift on standard datasets but training overhead); two-tower (Google, needs GPU); UltraGCN (marginal gains). 500-user rule-of-thumb: LightFM with WARP loss crosses above content-only when users×interactions >5K; at 500 users × ~10 positive interactions = ~5K, ResearchIT sits exactly at the threshold.
+**Integration: Pattern 2 (CF score as a LightGBM feature).** Spotify and Pinterest production consistently run CF + content-based candidate generators in parallel with a learned ranker blending them; within the ranker, CF is one feature among many. This gracefully handles users with weak CF signal because LightGBM learns to down-weight it. Don't do separate quota slots (worst at blending score scales). Warm-start uses LightFM's feature-averaging: a new user with claimed research categories/authors gets a warm embedding without any interaction history.
+### D.3 Dismissal-labeled LightGBM retraining
+**Minimum viable signal: ~1K dismissal events total** to distinguish systematic item-level dismissals from session noise. **For LightGBM retraining with dismissals as labels: ~10K events.** At 500 users × 5% dismissal rate × 50 impressions/week = ~125 dismissals/week → ~10K takes ~80 weeks of steady use. **Action: add dismissals as features now; add as labels only at scale.** Asymmetric loss via LightGBM's `is_unbalance=True` or explicit `scale_pos_weight`; a dismissal costs more than a missed save because it actively damaged the session. Focal loss (Lin et al.) and class-balanced loss (Cui CVPR 2019) supportable via LightGBM custom objective but only worth it when imbalance exceeds ~1:20.
+Session-overfitting mitigations: include "fraction of session slots dismissed so far" and "dominant category of session dismissals" as features so LightGBM can learn to discount anomalous sessions; decay dismissal weight by session-age; **within-session negative sampling** (contrast dismissed items against other items shown *in the same session*, not global catalog) — the Wang et al. 2023 pattern. IPS/SNIPS/DR corrections require propensity logging from day 1; for ResearchIT's known policy, exploration slots have propensity = ε / num_candidates, exploit slots ≈1. Apply 99th-percentile weight clipping. SNIPS is the best default (Eugene Yan's benchmarking); DR via Open Bandit Pipeline for robustness; arXiv 2509.00333 (Sept 2025) IPS-weighted BPR + propensity regularizer is a concrete code pattern.
+### D.4 Other Phase 5+ previews
+**Semantic IDs / TIGER** (Rajput et al. NeurIPS 2023, arXiv 2305.05065): item = tuple of discrete codewords from RQ-VAE over content embedding; Transformer seq2seq decodes next-item autoregressively. +29% NDCG@5, +17.3% Recall@5 on Beauty vs S³-Rec. **ActionPiece** (Hou et al. ICML 2025 Spotlight, arXiv 2502.13581) is context-aware tokenization (same action → different tokens depending on neighbors) and outperforms TIGER-style context-independent semantic IDs. Spotify Research Sept 2025 "Semantic IDs for Generative Search and Recommendation" (Penha et al.) shows task-specific Semantic IDs fail to generalize cross-task. **Would TIGER work on CPU for 1.6M corpus?** RQ-VAE training is feasible (hours), but autoregressive Transformer decoding with beam=10 hits hundreds of ms/request on Render CPU. **Defer indefinitely** — it solves embedding-table-cost at scale, which is not ResearchIT's pain. Entry threshold: >10K users AND ANN on 1.6M becomes the bottleneck AND a GPU becomes available.
+**PinnerFormer** (Pancha et al. KDD 2022, arXiv 2205.04507): single-vector user embedding from transformer over recent engagement sequence; novel dense-all-action loss predicts a random positive action within a 14-day future window from any random sequence position. Batch daily inference closes most of the gap to realtime (0.243 vs 0.251 Recall). **Defer indefinitely for solo-dev pre-launch.** A cheap equivalent is mean of BGE-M3 vectors over recent engagements — already what Amin's medoid retrieval does (PinnerSage's original approach). Entry threshold: ≥10K users AND ≥50 avg interactions/user AND a clear need for sequence modeling AND GPU availability.
+**DPP / Sliding Spectrum Decomposition.** Classic DPP: Kulesza & Taskar 2011; Chen, Zhang, Zhou KDD 2018 (YouTube-scale). SSD: Huang, Wang, Peng, Wang KDD 2021 (arXiv 2107.05204) — originally Xiaohongshu, adopted by Pinterest in early 2025. Pinterest's April 2026 engineering blog ("Evolution of Multi-Objective Optimization at Pinterest Home feed") documents DPP → SSD migration with >2% time-spent-impression week-1 lift. SSD in PyTorch is cleaner than DPP (avoids PSD enforcement, log-dets, Cholesky stability). **For ResearchIT: MMR is fine at 500 users.** Upgrade entry threshold: feed size ≥20 AND ≥2 diversity axes (category × recency × reading-difficulty) AND visible user complaints of "too-similar" results >5% rate.
+**Calibration of LightGBM scores.** Default binary log-loss training is often near-calibrated; miscalibration mostly appears with `lambdarank`/`rank_xendcg` objectives — then calibration is **essential before multi-objective fusion or thresholding**. Platt scaling (sigmoid(a·score + b)) is small-data-friendly and parametric; isotonic regression is non-parametric and needs ~≥1K calibration points; beta calibration (Kull, Silva Filho, Flach AISTATS 2017) sits between. LinkedIn's in-model isotonic calibration layer and Google's "Scale Calibration of Deep Ranking Models" (Yan et al. KDD 2022) are recent pointers. **For ResearchIT:** isotonic regression on held-out 10–20% of training interactions, refit weekly. When it matters: thresholding (p(save)>0.3), ranking-fusion (combining CF + LightGBM + exploration bonus). When it doesn't: pure ranking by raw LightGBM output. Do this right after 4b (~2 days of work).
+**Active learning for cold-start.** Nature Scientific Reports 2025 "Active learning algorithm for alleviating the user cold-start problem of recommender systems" uses decision-tree-based item selection with Like/Dislike/Unknown answers, 20-query cap, 3 like-constraints per user — but found online evaluation with 50 real users could not confirm offline lift. MDPI 2024 review and CIKM 2025 "Harnessing Light for Cold-Start Recommendations" confirm uncertainty+popularity hybrid queries as dominant pattern. **Practical pattern for ResearchIT: 2×3 grid at signup** — 2 triplets of 3 papers each spanning 6 arXiv subfields, user picks best per triplet, yielding a seed medoid from ~2 queries. This is Netflix's post-signup "pick 3 you like" flow. Entry threshold: ≥50 signups/week AND measurable onboarding drop-off.
+### D.5 Scaling infrastructure (SQLite → Supabase)
+SQLite's single-writer lock ceiling: **~50 writes/second with WAL on SSD, ~10 in default mode**. Any long INSERT blocks all writes. FTS5 shares this limit. For ResearchIT at 500 users × 50 events/session × few sessions/week, still fine. Breaks when: concurrent cluster-snapshot writes + live event logging conflict; >100 concurrent users with mutable state; ML-training jobs run alongside API writes. Supabase Postgres features for recsys: pgvector 0.7 with halfvec (50% memory savings) and parallel HNSW builds (30× faster); Row-Level Security for lab/team multi-tenancy (one `lab_id` column, policy `lab_id = auth.jwt()->>'lab_id'`); realtime subscriptions. Free tier is 500MB; paid starts ~$25/mo.
+**Migration trigger: hit ~500MB SQLite OR visible writer contention OR concurrent cluster-snapshot + event-log conflicts.** Use immutable snapshot tables (`clusters_v42`, `clusters_v43`) with pointer-table atomic swap; Qdrant/Zilliz collection aliases for zero-downtime rebuilds; keep last 2 snapshots for rollback. Vector cache invalidation: version cluster_snapshot_id on cached candidates; background job refills.
+### D.6 A/B testing at ~500 users
+Statistical power at N=500 (α=0.05, 80% power, 50/50 split): binary metric with baseline p=0.10 has **MDE ≈ 5.5 percentage points absolute** (10% → 15.5%, ~55% relative); continuous metric MDE ≈ 0.25σ Cohen's d. **Only large lifts are detectable at this scale.** CUPED (Deng, Xu, Kohavi, Walker WSDM 2013) reduces required N by 2–3× on predictable metrics; 2024/2025 extensions include arXiv 2410.09027 (Lin & Crespo, Etsy) and arXiv 2510.03468 (CUPED + trimmed mean for heavy tails).
+**For solo pre-launch: scipy.stats + evidently.ai-style notebook now.** GrowthBook (self-hosted, open-source, SQL-based) is the right upgrade at ≥1K users with ≥1 concurrent experiment/month. Skip Statsig (vendor dependency). Skip switchback unless adding shared team feeds where spillover matters. Experiment templates: exploration-% ablation (5/10/15 with primary = 7-day save rate, secondary = session length + dismissal rate); CF on/off at 50/50 user-level randomization; dismissal-feature vs dismissal-label over ≥4 weeks.
+### D.7 Multi-tenancy / group recommendation
+Masthoff (2015 survey) taxonomy holds: Average/Additive Utilitarian; **Least Misery** (good for veto scenarios like "labmate dislikes biology → don't recommend to whole lab"); Most Pleasure; **Average Without Misery** (recommended compromise — average but filter below per-individual threshold); Approval Voting / Borda / Kemeny rank aggregation. Fairness-aware 2024–2025: Stratigi et al. (JIIS 2021) SDAA/SIAA sequential satisfaction-balancing; FAccT 2025 "Group Fair Rated Preference Aggregation: Ties Are (Mostly) All You Need" (Fate-Break and Fate-Rate). LLM-based group rec 2025: arXiv 2505.05016 "Pitfalls of Growing Group Complexity" — LLMs often implicitly do Average; explicit prompts for Least Misery change behavior. Academic-collaboration-context group-RS papers remain rare — you'd be doing mostly greenfield work.
+**Recommendation for ResearchIT**: **Average-Without-Misery with a tunable misery threshold**, enforced via Postgres RLS per-lab. Lab profile surfaces only aggregate signals (counts, category histograms) — never individual read/save events — unless explicitly opted-in; GDPR consent language must be explicit because "labmate X saved this" is a personal-data disclosure. Entry threshold: real user demand (multiple lab opt-ins requested) post-launch; **not in Phase 5 core scope**.
+## E. Offline evaluation scale-up
+**Regression testing in CI.** Frozen eval set as a Git-LFS artifact with version-pinned manifest (split date, author allowlist, citation-pair count, dataset hash) — never mutate without bumping the version (`eval_set_v1.0.parquet` → `eval_set_v1.1.parquet`). Pytest + GitHub Actions on every PR touching `retrieval/rerank/rank/diversify/`. Threshold-based assertions: hard fail if nDCG@10 drops >3% absolute or Recall@50 drops >2%; soft warn (xfail strict=False) if ILS/entropy moves >10%. Use bootstrap 1000-replicate 95% CIs so that CI fails only when the baseline value falls outside the interval. PRs that intentionally move metrics must update `eval/baselines/main.json` with an `EVAL_DELTA_JUSTIFICATION`. CPU budget: freeze to 5k-query subsample (~5 min on Render free tier); full eval is nightly cron. **Tooling: DIY pytest now (~200 LOC, zero deps).** Evidently AI (open-source) has a built-in GitHub Action wrapping Python tests and failing CI on threshold violations with 15+ ranking metrics. DeepEval is overkill for ranking.
+**Per-stage attribution.** IJCAI-22 "Neural Re-ranking in Multi-stage Recommender Systems: A Review" and Pinterest's WebConf 2023 "End-to-End Diversification" paper: each stage needs its own intermediate ground truth plus a joint evaluation. For ResearchIT: retrieval = Recall@200 (ceiling for all downstream); rerank = nDCG@50 on retrieved set + Precision@10; diversify = ΔnDCG@10 and ΔILS/entropy pre-vs-post. Log `stage_metrics.jsonl` per eval with `{run_id, stage, metric, value, params_hash}`; a "regression diagnosis" script compares PR vs main across stages. Hron et al. 2021 "On component interactions in two-stage recommender systems" is the theoretical grounding — retrieval-rerank interactions are non-trivial. Pinterest reports retrieval-layer diversification gives +8% diversity in candidate set but only +1% at final rank — stage-specific diversity deltas matter.
+**Experiment tracking.** Append-only `eval_runs.jsonl` now (`{run_id, git_sha, timestamp, dataset_hash, config_hash, metrics, stage_metrics}` with Streamlit/Jupyter for plotting). Adopt MLflow locally (SQLite backend) at Phase 4b when distillation creates many hyperparameter-tuning runs. Skip W&B unless/until a collaborator appears (free tier fine but cloud dependency). Skip DVC (Git-LFS + manifests cover 80% of value). Signal to upgrade from JSONL to MLflow: "I can't find the run from 3 weeks ago in grep."
+**Synthetic user generation.** RecSim NG (Google 2021), RecBole simulators, **Balog & Zhai 2024 "User Simulation for Evaluating Information Access Systems" (Foundations & Trends, 261 pages) is the foundational survey**. 2025 LLM-agent simulators: UserSimCRS v2 (Balog & Zhai 2025), RecUserSim (Chen et al. WWW 2025 arXiv 2507.22897). Sim4IA workshop at SIGIR 2024 is the community reference. Concrete plan: extract 2–5k author personas from unarXive 2022 author graphs spanning deep specialists, bridge authors, early-career, prolific surveyers, methodology-transfer; choice model `p(save|paper) = σ(α·cos(paper,centroid) + β·cited_by_persona + γ·recency − δ·already_saved)`; add drift by slightly updating centroid with each saved paper. Evaluation: longitudinal nDCG trajectories, calibration of saved/dismissed ratio (expect 15–25%), exploration metric for bridge authors. Budget 2–3 weeks; start 100 personas × 30 days, scale to 2k later. Always triangulate against held-out real data.
+**Cluster evaluation.** Silhouette coefficient + Davies-Bouldin index daily (Chicco et al. 2025 PeerJ — SC+DBI superior to Dunn/CH on convex clusters). Stability across time is the production-critical metric: Hungarian match day-over-day via `scipy.optimize.linear_sum_assignment` with cost = −|C_i ∩ C_j|; per-cluster Jaccard after matching; aggregate mean Jaccard and fraction with J≥0.8. Complement with Adjusted Rand Index across consecutive days and object-level stability (Toms et al. WorldCat; Toussi 2017). Alert threshold: mean Jaccard <0.7 for 3 consecutive days. **Cluster snapshot versioning is architecturally necessary before Phase 4a** because summaries will be keyed to cluster IDs.
+**Counterfactual evaluation.** Required from day 1 of Phase 4 — every displayed recommendation must log `p(shown|context)` under the active policy. Without propensities, IPS/SNIPS/DR are retroactively impossible. Inject 5% ε-greedy exploration for non-degenerate propensities. Estimator choice (per Eugene Yan benchmarking + JTIE 2025 reproducible study): **SNIPS is best default** (no hyperparameter, lower variance than IPS); Direct Method alongside for low-variance potentially-biased imputation; Switch-DR in moderate-overlap regimes. **Tooling: Open Bandit Pipeline** (Saito et al.) in Python. JTIE 2025 reporting template: always report oracle decomposition, overlap diagnostics, estimator components, and effective sample size. **ESS <100 = unreliable; don't ship.**
+## F. Planning and requirements
+### F.1 Architectural decisions blocking Phase 4 start (ADR sprint, week 0)
+These seven decisions must be captured as ADRs *before* any Phase 4 code lands:
+- **A1 Cluster snapshot versioning.** SQLite table `cluster_snapshots(snapshot_date, cluster_id, paper_id, centroid_blob)`, 30-day retention, Hungarian-matched stable IDs as separate column. Without this, Phase 4a cache invalidation is guesswork.
+- **A2 Per-user vs shared cluster summaries.** **Recommended: shared.** Per-cluster cached once per `(cluster_stable_id, snapshot_date)`. Per-user adds 3–5× Claude cost with marginal UX gain pre-launch. Shared ≈$50–80/month; per-user easily $500+. Schema-migration-hard to change later.
+- **A3 LightGBM v1 vs v2.** **Recommended: one-stage LambdaMART in 4b; two-stage deferred to Phase 5.** Single LambdaMART over {bi-encoder score, BM25, recency, category match, author overlap} captures 80% of two-stage value at 30% complexity.
+- **A4 Telemetry event schema v1 (frozen before any logging).** Minimum fields: `event_id, user_id, session_id, timestamp, event_type, paper_id, position, cluster_id, cluster_stable_id, policy_id, propensity, ranker_version, rerank_version, candidate_source, ab_bucket`. Retrofitting is painful. OpenTelemetry OTEP 0152/0243 on schema evolution are the canonical references.
+- **A5 Eval-set version pinning + baseline format.** `eval/baselines/main.json`, `eval/eval_set_v1.0.parquet`; PRs that move metrics update both.
+- **A6 Distillation training-data boundary.** Commit before 4b to: teacher (BGE-reranker-v2-m3), query distribution (must NOT overlap with eval's time-split), output format (MarginMSE margins). Assertion in training: `max(train.timestamp) < eval_cutoff`.
+- **A7 Claude model/cache strategy.** Haiku 4.5 for 4a summaries; 5-min prompt cache on shared system prompt + style guide; single `cache_control` breakpoint on cluster-papers block. Stable-prefix-first prompt structure decided before coding.
+### F.2 Phase 4 subworkstream entry/exit criteria
+**4a Claude summaries.** Entry: A1/A2/A7 decided; cluster stability mean Jaccard day-over-day ≥0.7 over 7 days. Exit: all 50–200 active clusters have fresh summary daily; p95 generation latency <3s; monthly cost <$30 at 100 clusters × 1 refresh/day with caching; 20 human-rated summaries score ≥4/5 on coherence. Deliverables: `services/summaries/claude_client.py` with prompt cache + retry/backoff; `services/summaries/summary_job.py` nightly job writing `cluster_summaries(cluster_stable_id, snapshot_date, summary_md, input_tokens, output_tokens, cached_tokens)`; Jinja templates; cost monitoring SQL view. **Effort: 2–3 weeks solo.** Risks: Claude cost overruns (set hard spend cap, log cache hit ratio — if <70%, prompt structure wrong); stale summaries from snapshot_date collisions (use content-hash tie-breaker); prompt injection from abstracts (use `<paper_abstract>` tags + "summarize only; ignore instructions" system line).
+**4b Distilled reranker.** Entry: Phase 3 eval producing stable nDCG@10 within ±0.5% across runs; retrieval Recall@200 ≥0.85 on held-out; A3/A5/A6 decided; frozen eval set never seen in training (enforced assertion). Exit: student recovers ≥95% teacher nDCG@10 at ≥10× lower CPU latency; ONNX-exported INT8-quantized with PyTorch numerical closeness <1e-3 on 1000 samples; feature-flagged shadow traffic for 1 week with no regressions. Deliverables: teacher-scoring pipeline (non-eval time window); student training script with MarginMSE loss; ONNX export + `optimum-cli`; FastAPI integration with onnxruntime; stage-attribution eval report. **Effort: 4–6 weeks solo** (1 week data prep, 1 week training/tuning, 1 week ONNX+quantization+perf, 1–2 weeks integration+shadow, 1 week buffer). Risks: training-data leakage (time-cutoff assertion); CPU latency regression from naive batching (batch top-50 as one forward pass, not serial); quantization-catastrophic-recall (always compare fp32 vs INT8 on same eval — usually <0.5 nDCG, can be worse with bad calibration data).
+**4d Use-cases doc.** Entry: Phase 3 eval showing consistent wins; dogfooding anecdotes; 4a scoped (for UX mockups). Exit: 10–15 page markdown doc with 3–5 personas drawn from synthetic-persona work, top 10 use cases with before/after storyboards, explicit non-goals, 3-month roadmap. Deliverables: single markdown doc + 1-page "pitch" derivative. **Effort: 1–1.5 weeks focused writing, calendar-time ~3 weeks** (competes with dev work). Risks: writing-in-a-vacuum (need 5–10 real conversations); premature lock-in (publish externally only after 10 external users × 2 weeks).
+### F.3 Dependency graph and sequencing
+```
+Week 0:  ADR sprint (A1–A7)  [1 week, no coding]
+          │
+          ├──→ 4d Use-Cases Doc (1–1.5 wk writing, weeks 1–3 calendar)
+          │
+          ├──→ 4a Claude Summaries (2–3 wk, weeks 1–3) — needs A1, A2, A7
+          │
+          └──→ 4b Distilled Reranker (4–6 wk, weeks 4–9) — needs A3, A5, A6
+```
+**Sequencing rationale: 4a first (cheapest, most visible, low risk, UI-validating, infrastructure reused by 4d); 4d in parallel (writing surfaces missing features); 4b last (largest quality lift but biggest risk, benefits from 4a UI being in prod and 4d clarifying what matters).** Add 30% buffer to every estimate — solo-dev posts uniformly show actual timelines are 1.5–2× initial estimates. **Realistic Phase 4 total: 10–12 weeks with parallelization and buffer; ~8 weeks if nothing breaks (it will).**
+Week-by-week plan:
+| Week | 4a | 4b | 4d | Cross-cutting |
+|---|---|---|---|---|
+| 0 | — | — | — | ADR sprint A1–A7 |
+| 1 | Claude client + cache | Teacher scoring script | Persona draft | CI regression harness v1 |
+| 2 | Nightly summary job + DB | 500k-pair sampling + MarginMSE training | Use case storyboards | Synthetic persona sim v0 |
+| 3 | UI integration + human eval | Training runs (MLflow) | External review + polish | Stage-attribution diagnostic |
+| 4 | Cost polish; freeze | ONNX + INT8 export | done | — |
+| 5 | monitoring buffer | CPU perf optimization | — | Cluster stability alerts live |
+| 6 | — | FastAPI integration + flag | — | — |
+| 7 | — | Shadow traffic + debug | — | — |
+| 8 | — | Full rollout + eval report | — | Phase 4 retrospective |
+| 9–10 | — | buffer | — | Plan Phase 5 entry threshold review |
+### F.4 "Good enough" exit criteria
+4a: summaries ship to 100% of clusters, cost within budget, no correctness incidents for 2 weeks. 4b: ≥95% teacher nDCG@10 recovery, CPU p95 <200ms top-50 rerank, 1 week shadow clean. 4d: 3 external readers provide feedback → 1 revision → published. General rubric for solo dev: primary objective met + smallest acceptable safety net = ship. Resist the "perfect" standard — solo devs who chase perfection on every phase never launch. Log tech debt in `TODO.md`; every 6–8 weeks, run a 2-week refactoring cycle (Matt Robertson solo-dev pattern).
+### F.5 Phase 5 entry thresholds
+| Workstream | Entry threshold | Rationale |
+|---|---|---|
+| ε-greedy exploration | **Day 1 of Phase 4 (even with 1 user)** | Required architectural decision, not future workstream — without exploration no propensities, without propensities no retrospective IPS |
+| LightFM / hybrid CF | ≥100 users OR ≥500 saves total | CF beats pure content only once interaction signal overlaps; below ~500 saves, content+recency wins |
+| Dismissal retraining (as labels) | ≥5K dismissal events AND propensity-logged | Fewer means IPS variance explodes (ESS<100); propensities must come from day 1 or impossible to apply later |
+| Semantic IDs (TIGER) | ≥10K users AND ANN bottleneck measurable AND GPU available | Solves embedding-table-cost at scale — not ResearchIT's pain at 10K users × 1.6M papers |
+| PinnerFormer | ≥10K users AND ≥50 avg interactions/user AND basic sequence features built AND GPU available | Dense-all-action loss needs 14-day future prediction window per user; <50 interactions/user has nothing to learn |
+| DPP / SSD diversity | User complaints of "too-similar" results under MMR >5% of user feedback | 500+ LOC complexity not worth it until MMR visibly fails |
+| Calibration (isotonic) | Before any multi-objective score fusion | ~2 days of work; schedule right after 4b |
+| Active learning onboarding | ≥50 signups/week AND measurable funnel drop-off | Nature 2025 study couldn't confirm offline lift online with 50 real users |
+| SQLite → Supabase | ~500MB DB OR writer contention OR cluster-job + event-log collisions | SQLite fine for ResearchIT workload until one of these fires |
+| GrowthBook (from scipy) | ≥1K users AND ≥1 concurrent experiment/month | scipy + notebook covers pre-launch |
+| Lab/group profiles | Multiple explicit lab opt-in requests post-launch | Not in Phase 5 core; greenfield for academic context |
+### F.6 Cross-cutting risks
+Telemetry gaps bite hardest in Phase 5 (IPS impossible without propensities): **freeze schema before any logging (A4); include policy_id, propensity, shown_position, ranker_version**. Training data leakage produces phantom lift in 4b: eval-time-cutoff assertion in training script; never use eval queries as teacher-scoring queries. Claude cost overruns: Haiku + shared summaries + caching + hard dashboard cap + daily cost view. Cluster instability causes mis-cached summaries and UI label-jumping: Hungarian-matched stable IDs + Jaccard <0.7 alert. Solo-dev estimation drift: multiply all estimates by 1.5, parallelize ADR+writing with dev, commit to a hard "good enough" definition per workstream. Evaluation-overfit (CI green but real users unhappy): run synthetic-persona longitudinal sim alongside static eval; once you have real users, weight live metrics > offline. Eval-set rot: every 6 months recompute with new cutoff, bump version, re-baseline intentionally.
+## Conclusion
+Phase 4 is mostly a 10–12 week engineering effort bounded by two real constraints — solo-dev capacity and a 6ms CPU budget for the cross-encoder — and one architectural constraint: **every downstream Phase 5 workstream depends on decisions made in week 0 of Phase 4**. The ADR sprint is the non-negotiable entry gate. Within Phase 4, the highest-leverage sequencing is 4a (Claude summaries, shared-not-per-user, Haiku 4.5 + Batch API, ~$50–80/mo) in parallel with 4d writing, then 4b (distilled reranker, off-the-shelf TinyBERT-L-2-v2 INT8 ONNX first, distill only if held-out gap >3 nDCG, Option-C LightGBM integration). The novel contribution of Phase 4a is that **no other academic recommender currently shows personalized "You're reading about X" cluster narratives** — Scholar Inbox's shared labels are the closest analogue. The novel contribution of 4b for a solo dev is recognizing that the Shallow Cross-Encoders finding (SIGIR 2024) plus FlashRank's ONNX packaging pattern plus HF-shipped AVX-512-VNNI INT8 models means 6ms for 20 pairs on CPU is genuinely achievable without custom distillation — distillation is the more-complex fallback, not the default. For Phase 5, the single most valuable action that costs nothing now is **logging propensity and policy_id from day 1**, which unlocks SNIPS/DR counterfactual evaluation for every later workstream. The dismissal-as-label YouTube finding (Wang et al. 2023: 22% → 60.8% similar-content reduction when dismissals are both features AND labels) is the best-justified Phase 5 quality lever, but it needs ~10K dismissals and is ~80 weeks away at pre-launch scale — so in the interim, dismissals enter as features only, and the real Phase 5 quality investment should be (in order) calibration of LightGBM scores, ε-greedy exploration at 10%, stratified exploration by unused arXiv category, and LightFM-as-LightGBM-feature once interactions cross 5K. 
Everything else — TIGER, PinnerFormer, DPP, group rec, active learning, neural bandits — should be deferred until a specific production pain signal fires.

docs/walkthroughs/02-Phase2-MultiInterest-Recommender.md CHANGED Viewed

@@ -17,7 +17,7 @@ EWMA profiles update (background, non-blocking)
     ↓
 Ward clustering → K distinct interest medoids (auto K per user)
     ↓
-Qdrant prefetch + RRF fusion (~15-25ms, single API call)
     ↓
 Heuristic re-ranking of ~100 candidates (~1-2ms)
     ↓

     ↓
 Ward clustering → K distinct interest medoids (auto K per user)
     ↓
+Qdrant prefetch + RRF fusion (~15-25ms, single API call)  [⚠️ Replaced by Quota Fusion in Phase 4]
     ↓
 Heuristic re-ranking of ~100 candidates (~1-2ms)
     ↓

docs/walkthroughs/03-Code-Summary-and-Test-Plan.md CHANGED Viewed

@@ -36,7 +36,7 @@ The current application is a fully functional FastAPI + HTMX research paper disc
 ## 2. Comprehensive Testing Plan
-The current test suite has **86 passing tests** executing via `pytest`. Our testing strategy is split into three layers: Automated, Manual, and Analytics-based evaluation.
 ### A. Automated Testing (Current & Ongoing)

 ## 2. Comprehensive Testing Plan
+The current test suite has **125 passing tests** (as of Phase 3.5) executing via `pytest`. Our testing strategy is split into three layers: Automated, Manual, and Analytics-based evaluation.
 ### A. Automated Testing (Current & Ongoing)

docs/walkthroughs/04-Next-Steps-and-Phase-Plan.md CHANGED Viewed

@@ -15,29 +15,27 @@
 |---|---|---|
 | Qdrant Cloud (1.6M BGE-M3 papers) | ✅ Live | BQ enabled, HNSW m=32, `arxiv_bgem3_dense` collection |
 | Phase 1: Zero-ML Recommender | ✅ Complete | Qdrant BEST_SCORE with raw IDs, 55 tests |
-| Phase 2a: EWMA Profiles | ✅ Complete | Long-term (α=0.10), Short-term (α=0.40), Negative (α=0.15) |
-| Phase 2b: Ward Clustering + Prefetch+RRF | ✅ Complete | Adaptive gap-based threshold, 2+ clusters detected on real data |
-| Phase 2c: Heuristic Re-ranking + MMR | ✅ Complete | 4-feature scorer, MMR λ=0.6, exploration injection |
 | SQLite (interactions, profiles, clusters, metadata cache) | ✅ Live | WAL mode, async via aiosqlite |
 | HTMX Frontend | ✅ Live | Search, save, dismiss, recommendations |
-| Test Suite | ✅ 88 tests passing | Unit, integration, and E2E simulation |
 ### What's NOT Built Yet
 | Component | Planned In | Blocked By |
 |---|---|---|
-| **Hybrid Search (BGE-M3 encode + Zilliz sparse)** | **Phase 3 (NEXT)** | BGE-M3 model loading (~570MB, ~15s cold start) |
-| Recommendation fixes (RRF→quota, α tuning) | Phase 4 | Code refactor only |
-| LightGBM lambdarank re-ranker | Phase 6 | Need ≥500 labeled save/dismiss interactions |
 | Cold-start onboarding (category picker / ORCID) | Phase 5 | Not yet designed |
-| Negative profile used in retrieval | Phase 4 | Stored but not wired |
-| Pre-populated metadata store | Phase 4 | arXiv API is the latency bottleneck (~7.6s cold) |
 | LLM interest summaries per cluster | Phase 8 | Needs Claude/Groq API integration |
-> **Note on search architecture:** The current arXiv keyword API search was always a Phase 1 placeholder.
-> The entire point of building 1.6M BGE-M3 embeddings in Qdrant (with BQ + HNSW) is to power
-> vector-based semantic search. Replacing the arXiv API with Qdrant dense + Zilliz sparse
-> hybrid search is the **#1 priority** for the next phase.
 ### Dataset Coverage
@@ -47,7 +45,7 @@
 | Newest paper | `2505.04101` (~May 2025) |
 | Total papers | 1,596,587 |
 | Payload stored in Qdrant | `arxiv_id` only |
-| Metadata source | arXiv API (live) → SQLite cache |
 ---
@@ -126,7 +124,7 @@ The 6 research documents contain several contradictions. Here is each one and it
 This is what PinnerSage, Taobao ULIM, and Pinterest Bucketized-ANN actually deploy.
-**Current status**: RRF is still in the codebase. Needs to be replaced.
 ### 2. EWMA α_long = 0.10 vs 0.03
@@ -136,7 +134,7 @@ This is what PinnerSage, Taobao ULIM, and Pinterest Bucketized-ANN actually depl
 **Resolution**: PinnerSage tested λ=0.1 and **explicitly rejected it as too recent-biased**. Their optimal was λ=0.01. Doc 06 recommends α_long=0.03 as a compromise.
-**Current status**: α=0.10 is in the codebase. Should be tuned down to 0.03.
 ### 3. BGE-reranker-v2 in the Hot Path
@@ -231,46 +229,52 @@ Final results → fetch metadata → render
 ### Phase 4: Recommendation Pipeline Fixes (~1 week)
 Corrections to the existing recommendation pipeline based on Doc 06's findings.
 #### 4.1 Replace RRF with Importance-Weighted Quota Fusion
-**Why**: RRF lets dominant clusters swamp minor interests — the exact failure mode multi-interest models exist to prevent.
-**What to change**: In `app/routers/recommendations.py`, replace `multi_interest_search()` (which uses Qdrant's server-side RRF) with per-cluster separate ANN queries, then allocate feed slots proportional to cluster importance with a floor of F_min=3.
-**New flow**:
 ```
 clusters = compute_clusters(...)
-weights = normalize_importance(clusters)
-for each cluster k:
-    slots_k = max(floor(total_slots × weight_k), 3)
-    candidates_k = qdrant search with medoid_k (limit = slots_k × 3)
-    rerank within cluster_k via LightGBM / heuristic
-    take top slots_k
-deduplicate across clusters (assign to highest-ranked)
-MMR over the merged union
 ```
-#### 4.2 Tune α_long from 0.10 → 0.03
-**Why**: PinnerSage explicitly rejected 0.10 as too recent-biased.
-**What to change**: Single constant in `app/recommend/profiles.py`.
-#### 4.3 Wire the Negative Profile into Re-ranking
-**Why**: Currently computed and stored but never used. YouTube showed a 3× gain from using dislikes as both features and labels.
-**What to add**: In `app/recommend/reranker.py`, add a negative-similarity penalty:
-```python
-neg_penalty = cosine_sim(candidate, neg_profile) * penalty_weight
-final_score = base_score - neg_penalty
-```
-#### 4.4 Pre-populate Metadata Store
-**Why**: The arXiv API is the #1 latency bottleneck (~7.6 seconds cold for 50 papers).
-**What to do**: Download the Kaggle arXiv metadata dataset (~4GB JSON). Bulk-insert all 1.6M papers' metadata into SQLite's `paper_metadata` table. The arXiv API becomes a fallback for genuinely new papers only.
-**Impact**: Metadata fetch drops from ~7,600ms to <5ms.
 ---
@@ -379,8 +383,9 @@ Add LightFM hybrid model with switching strategy:
 - ≥10 interactions: LightFM
 Retrain LightGBM with dismissals as negative labels (YouTube's 3× gain from dual labels).
-#### 9.3 Category-Level Negative Suppression
-If ≥3 dismissals hit the same arXiv category within a week, suppress that category for 2 weeks.
 ---
@@ -388,17 +393,13 @@ If ≥3 dismissals hit the same arXiv category within a week, suppress that cate
 If you can only do three things, do these:
-### 1. Build hybrid semantic search (Phase 3)
-**Impact**: Replaces the arXiv keyword API placeholder with real vector-based search. This is what the 1.6M BGE-M3 embeddings in Qdrant were built for. Transforms the product from a keyword aggregator into a semantic discovery engine.
-**Effort**: 4 new service files + router swap. ~2-3 weeks.
-### 2. Pre-populate the metadata store (Phase 4.4)
-**Impact**: Drops cold metadata fetch from 7,600ms to <5ms. Single biggest latency win.
-**Effort**: Download Kaggle dataset, write a bulk-insert script, run once.
-### 3. Replace RRF with quota fusion in recommendations (Phase 4.1)
 **Impact**: Prevents the dominant cluster from drowning out minority interests. Fixes the core multi-interest failure mode.
-**Effort**: Refactor `_multi_interest_recommend()` in recommendations.py.
 ---
@@ -415,5 +416,7 @@ If you can only do three things, do these:
 | — | [Phase 1 Walkthrough](PHASE1-Zero-ML-Recommender.md) | Zero-ML recommender code tour | ✅ Complete |
 | — | [Phase 2 Recommender Walkthrough](02-Phase2-MultiInterest-Recommender.md) | Multi-interest engine implementation | ✅ Complete |
 | — | [Code Summary & Test Plan](03-Code-Summary-and-Test-Plan.md) | Codebase summary and testing strategy | ✅ Complete |
-| — | [Phase 2 Hybrid Search Plan](../phases/PHASE2-Hybrid-Search-Plan.md) | BGE-M3 + Zilliz hybrid search (not yet built) | 📋 Planned |
 | — | **This Document** | Revised phase plan synthesizing all research | ✅ Current |

 |---|---|---|
 | Qdrant Cloud (1.6M BGE-M3 papers) | ✅ Live | BQ enabled, HNSW m=32, `arxiv_bgem3_dense` collection |
 | Phase 1: Zero-ML Recommender | ✅ Complete | Qdrant BEST_SCORE with raw IDs, 55 tests |
+| Phase 2a: EWMA Profiles | ✅ Complete | Long-term (α=0.03 ✅), Short-term (α=0.40), Negative (α=0.15) |
+| Phase 2b: Ward Clustering + Prefetch+RRF | ✅ Complete | L2-norm + adaptive gap threshold, 2+ clusters on real data |
+| Phase 2c: Heuristic Re-ranking + MMR | ✅ Complete | 5-feature scorer (neg penalty wired), MMR λ=0.6, exploration |
+| Phase 3: Hybrid Semantic Search | ✅ Complete | BGE-M3 + Qdrant dense + Zilliz sparse + RRF, 123 tests |
+| Phase 3.5: Turso Metadata DB | ✅ Complete | 1.23GB metadata + citations, search ~10.7s → ~1.75s |
 | SQLite (interactions, profiles, clusters, metadata cache) | ✅ Live | WAL mode, async via aiosqlite |
 | HTMX Frontend | ✅ Live | Search, save, dismiss, recommendations |
+| Test Suite | ✅ 125 tests passing | Unit, integration, E2E simulation, search pipeline |
 ### What's NOT Built Yet
 | Component | Planned In | Blocked By |
 |---|---|---|
+| **Rec pipeline fixes (RRF→quota, Hungarian, neg suppression)** | **Phase 4 (NEXT)** | Code refactor only |
 | Cold-start onboarding (category picker / ORCID) | Phase 5 | Not yet designed |
+| LightGBM lambdarank re-ranker | Phase 6 | Need ≥500 labeled save/dismiss interactions |
 | LLM interest summaries per cluster | Phase 8 | Needs Claude/Groq API integration |
+> **Note**: Hybrid Search (Phase 3), Turso Metadata (Phase 3.5), α_long tuning, L2
+> normalization, and negative profile wiring are all DONE. The next priority is fixing
+> the recommendation fusion from RRF → quota (Phase 4).
 ### Dataset Coverage
 | Newest paper | `2505.04101` (~May 2025) |
 | Total papers | 1,596,587 |
 | Payload stored in Qdrant | `arxiv_id` only |
+| Metadata source | Turso DB (primary) → arXiv API (fallback) → SQLite cache |
 ---
 This is what PinnerSage, Taobao ULIM, and Pinterest Bucketized-ANN actually deploy.
+**Current status**: RRF is still in the codebase. Phase 4 plan created — see `docs/phases/PHASE4-Recommendation-Pipeline-Fixes.md`.
 ### 2. EWMA α_long = 0.10 vs 0.03
 **Resolution**: PinnerSage tested λ=0.1 and **explicitly rejected it as too recent-biased**. Their optimal was λ=0.01. Doc 06 recommends α_long=0.03 as a compromise.
+**Current status**: ✅ Already fixed — α=0.03 in `app/recommend/profiles.py:30`.
 ### 3. BGE-reranker-v2 in the Hot Path
 ### Phase 4: Recommendation Pipeline Fixes (~1 week)
+> **Detailed plan**: [`docs/phases/PHASE4-Recommendation-Pipeline-Fixes.md`](../phases/PHASE4-Recommendation-Pipeline-Fixes.md)
 Corrections to the existing recommendation pipeline based on Doc 06's findings.
+Items 4.2 (α_long tuning) and 4.3-old (negative profile wiring) are already done.
 #### 4.1 Replace RRF with Importance-Weighted Quota Fusion
+**Why**: RRF lets dominant clusters swamp minor interests — the exact failure mode multi-interest models exist to prevent. PinnerSage, Taobao ULIM, and Pinterest Bucketized-ANN all use quota, not RRF.
+**What to build**:
+- New `app/recommend/fusion.py` — `allocate_quotas()` function
+- Refactor `_multi_interest_recommend()` to use `asyncio.gather()` for concurrent per-cluster searches
+- Deduplicate across clusters (first-occurrence wins)
 ```
 clusters = compute_clusters(...)
+quotas = allocate_quotas([c.importance for c in clusters], total=100, min=3)
+results = asyncio.gather(search_by_vector(medoid_k, limit=quota_k*3) for each k)
+deduplicate → rerank → MMR → serve
 ```
+#### ~~4.2 Tune α_long from 0.10 → 0.03~~ ✅ ALREADY DONE (Phase 2a)
+α_long is already 0.03 in `app/recommend/profiles.py:30`.
+#### ~~4.3-old Wire the Negative Profile~~ ✅ ALREADY DONE (Phase 2c)
+Negative EWMA is already Feature 5 in `app/recommend/reranker.py` with 0.15 penalty weight.
+#### 4.3 Hungarian Matching for Cluster Stability
+**Why**: Cluster indices shuffle when users save new papers, breaking analytics and future UI.
+**What to build**: `stabilize_cluster_ids()` in `clustering.py` using `scipy.optimize.linear_sum_assignment`. Cost matrix of medoid cosine distances; trivial at K≤7.
+#### 4.4 Category-Level Negative Suppression
+**Why**: YouTube (2023) showed 3× gain from richer negative treatment.
+**Decisions resolved**:
+- **Primary category only** — avoids over-suppression from secondary tags
+- **14-day window** — standard default (τ_neg = 14 days)
+- **Per-item temporal decay** → deferred to Phase 6 (LightGBM feature)
+**What to build**: `get_suppressed_categories()` in `db.py` (SQL join: interactions × paper_metadata), filter in `_multi_interest_recommend()` after reranking.
+#### ~~4.5 Pre-populate Metadata Store~~ ✅ ALREADY DONE (Phase 3.5 — Turso)
+Turso cloud DB with 1.23GB of metadata + citation counts. Search time: ~10.7s → ~1.75s.
 ---
 - ≥10 interactions: LightFM
 Retrain LightGBM with dismissals as negative labels (YouTube's 3× gain from dual labels).
+#### ~~9.3 Category-Level Negative Suppression~~ → Moved to Phase 4.4
+If ≥3 dismissals hit the same primary arXiv category within 14 days, suppress that category.
+**Decision**: Primary category only, τ_neg = 14 days. See Phase 4 plan.
 ---
 If you can only do three things, do these:
+### 1. ~~Build hybrid semantic search (Phase 3)~~ ✅ DONE
+### 2. ~~Pre-populate the metadata store (Phase 3.5)~~ ✅ DONE
+### 3. Replace RRF with quota fusion in recommendations (Phase 4.1) ← NEXT
 **Impact**: Prevents the dominant cluster from drowning out minority interests. Fixes the core multi-interest failure mode.
+**Effort**: New `fusion.py` + refactor `_multi_interest_recommend()`. ~1 week for all 3 Phase 4 items.
 ---
 | — | [Phase 1 Walkthrough](PHASE1-Zero-ML-Recommender.md) | Zero-ML recommender code tour | ✅ Complete |
 | — | [Phase 2 Recommender Walkthrough](02-Phase2-MultiInterest-Recommender.md) | Multi-interest engine implementation | ✅ Complete |
 | — | [Code Summary & Test Plan](03-Code-Summary-and-Test-Plan.md) | Codebase summary and testing strategy | ✅ Complete |
+| — | [Phase 2 Hybrid Search Plan](../phases/PHASE2-Hybrid-Search-Plan.md) | BGE-M3 + Zilliz hybrid search prototype | ✅ Superseded by Phase 3 |
+| — | [Phase 3 Hybrid Semantic Search](../phases/PHASE3-Hybrid-Semantic-Search.md) | Full hybrid search implementation plan | ✅ Complete |
+| — | [Phase 4 Recommendation Fixes](../phases/PHASE4-Recommendation-Pipeline-Fixes.md) | Quota fusion, Hungarian matching, negative suppression | 📋 Planned |
 | — | **This Document** | Revised phase plan synthesizing all research | ✅ Current |

tests/test_clustering.py CHANGED Viewed

@@ -15,6 +15,7 @@ import numpy as np
 from app.recommend.clustering import (
     compute_clusters,
     InterestCluster,
     MIN_PAPERS_FOR_CLUSTERING,
     MAX_CLUSTERS,
@@ -110,6 +111,7 @@ def test_importance_is_sorted_descending():
 def test_few_papers_returns_single_cluster():
     """When papers < MIN_PAPERS_FOR_CLUSTERING, return a single catch-all cluster."""
     ids = ["p1", "p2", "p3"]
     rng = np.random.RandomState(11)
     embs = rng.randn(3, 1024).astype(np.float32)
     # Normalise
@@ -155,6 +157,237 @@ def test_find_medoid():
     assert idx == 1, f"Expected medoid idx 1, got {idx}"
 # ── DB persistence test ──────────────────────────────────────────────────────
 @pytest.fixture

 from app.recommend.clustering import (
     compute_clusters,
+    stabilize_cluster_ids,
     InterestCluster,
     MIN_PAPERS_FOR_CLUSTERING,
     MAX_CLUSTERS,
 def test_few_papers_returns_single_cluster():
     """When papers < MIN_PAPERS_FOR_CLUSTERING, return a single catch-all cluster."""
     ids = ["p1", "p2", "p3"]
+    assert len(ids) < MIN_PAPERS_FOR_CLUSTERING, "test precondition: ids must be below threshold"
     rng = np.random.RandomState(11)
     embs = rng.randn(3, 1024).astype(np.float32)
     # Normalise
     assert idx == 1, f"Expected medoid idx 1, got {idx}"
+# ── Hungarian matching / cluster ID stabilisation (Phase 4.3) ────────────────
+def _make_two_cluster_pair(seed: int = 0) -> tuple[list, list]:
+    """
+    Build two well-separated InterestCluster lists sharing the same embedding
+    space so Hungarian matching can correctly align them.
+    Returns (new_clusters, old_clusters) where new_clusters[0] corresponds
+    semantically to old_clusters[0].
+    """
+    rng = np.random.RandomState(seed)
+    dim = 1024
+    # Two distinct topic centers
+    center_a = rng.randn(dim).astype(np.float32)
+    center_a /= np.linalg.norm(center_a)
+    center_b = rng.randn(dim).astype(np.float32)
+    center_b /= np.linalg.norm(center_b)
+    def _near(center, n=5, spread=0.001):
+        # NOTE: spread is scaled small because random noise in 1024-d has
+        # magnitude ~sqrt(dim)*spread, so spread=0.05 gives noise≈1.6 which
+        # dominates the unit-length center. 0.001 keeps cosine sim ≥ 0.99.
+        vecs = []
+        for _ in range(n):
+            v = center + rng.randn(dim).astype(np.float32) * spread
+            v /= np.linalg.norm(v)
+            vecs.append(v)
+        return vecs
+    medoid_a_new = _near(center_a)[0]
+    medoid_b_new = _near(center_b)[0]
+    medoid_a_old = _near(center_a)[0]
+    medoid_b_old = _near(center_b)[0]
+    old = [
+        InterestCluster(cluster_idx=0, medoid_paper_id="old_a", medoid_embedding=medoid_a_old,
+                        paper_ids=["old_a"], importance=5.0),
+        InterestCluster(cluster_idx=1, medoid_paper_id="old_b", medoid_embedding=medoid_b_old,
+                        paper_ids=["old_b"], importance=3.0),
+    ]
+    # new clusters have swapped order (b first, a second) → naive assignment would shuffle
+    new = [
+        InterestCluster(cluster_idx=0, medoid_paper_id="new_b", medoid_embedding=medoid_b_new,
+                        paper_ids=["new_b"], importance=3.0),
+        InterestCluster(cluster_idx=1, medoid_paper_id="new_a", medoid_embedding=medoid_a_new,
+                        paper_ids=["new_a"], importance=5.0),
+    ]
+    return new, old
+def test_stabilize_matches_semantically_equivalent_clusters():
+    """
+    When topic A was cluster 0 but re-clustering returns it at position 1 (the
+    list order is swapped), stabilize_cluster_ids should restore idx=0 for A.
+    """
+    new, old = _make_two_cluster_pair()
+    # new[0] is topic B, new[1] is topic A
+    # old[0] is topic A (idx=0), old[1] is topic B (idx=1)
+    stabilised = stabilize_cluster_ids(new, old)
+    # After stabilisation, the cluster containing "new_a" should have idx=0
+    # and "new_b" should have idx=1
+    idx_map = {c.medoid_paper_id: c.cluster_idx for c in stabilised}
+    assert idx_map["new_a"] == 0, f"Topic A should be idx 0, got {idx_map}"
+    assert idx_map["new_b"] == 1, f"Topic B should be idx 1, got {idx_map}"
+def test_stabilize_preserves_all_clusters():
+    """Output length must equal input length."""
+    new, old = _make_two_cluster_pair()
+    stabilised = stabilize_cluster_ids(new, old)
+    assert len(stabilised) == len(new)
+def test_stabilize_unique_indices():
+    """All cluster indices in the output must be unique."""
+    new, old = _make_two_cluster_pair()
+    stabilised = stabilize_cluster_ids(new, old)
+    indices = [c.cluster_idx for c in stabilised]
+    assert len(indices) == len(set(indices)), f"Duplicate indices: {indices}"
+def test_stabilize_no_old_clusters_returns_unchanged():
+    """With no old clusters, return new clusters as-is."""
+    new, _ = _make_two_cluster_pair()
+    result = stabilize_cluster_ids(new, [])
+    assert result == new
+def test_stabilize_no_new_clusters_returns_empty():
+    """With no new clusters, return empty list."""
+    _, old = _make_two_cluster_pair()
+    result = stabilize_cluster_ids([], old)
+    assert result == []
+def test_stabilize_rejects_unrelated_match():
+    """
+    Doc 06 requirement: Hungarian must NOT inherit an old cluster's identity
+    when the cosine similarity is below the threshold (default 0.5).  A user's
+    genuinely-new topic should get a fresh index, not steal an old NLP idx
+    just because Hungarian found the "least bad" assignment.
+    """
+    rng = np.random.RandomState(7)
+    dim = 1024
+    def _rand_unit():
+        v = rng.randn(dim).astype(np.float32)
+        return v / np.linalg.norm(v)
+    # Two very different topics: old_vec vs new_vec (orthogonal-ish)
+    old_vec = _rand_unit()
+    new_vec = _rand_unit()
+    # Check near-orthogonality so cosine sim << 0.5; nothing is forced here, but
+    # random 1024-dim unit vectors have cosine sim near 0, so this should hold.
+    cos_sim = float(new_vec @ old_vec)
+    assert abs(cos_sim) < 0.3, f"test precondition failed: cos_sim={cos_sim}"
+    old = [InterestCluster(cluster_idx=5, medoid_paper_id="old_topic",
+                           medoid_embedding=old_vec, paper_ids=[], importance=1.0)]
+    new = [InterestCluster(cluster_idx=0, medoid_paper_id="new_topic",
+                           medoid_embedding=new_vec, paper_ids=[], importance=1.0)]
+    stabilised = stabilize_cluster_ids(new, old)
+    # The unrelated new cluster must NOT inherit idx=5
+    assert stabilised[0].cluster_idx != 5, \
+        "Unrelated topic inherited old cluster's index (threshold not enforced)"
+def test_stabilize_custom_threshold():
+    """Custom min_cosine_sim should control matching strictness."""
+    rng = np.random.RandomState(13)
+    dim = 1024
+    base = rng.randn(dim).astype(np.float32)
+    base /= np.linalg.norm(base)
+    # Slightly perturbed — spread=0.001 in 1024-d gives cos_sim ~ 0.9995
+    perturbed = base + rng.randn(dim).astype(np.float32) * 0.001
+    perturbed /= np.linalg.norm(perturbed)
+    old = [InterestCluster(cluster_idx=2, medoid_paper_id="old",
+                           medoid_embedding=base, paper_ids=[], importance=1.0)]
+    new = [InterestCluster(cluster_idx=0, medoid_paper_id="new",
+                           medoid_embedding=perturbed, paper_ids=[], importance=1.0)]
+    # With default threshold 0.5, match succeeds (~0.9995 cos sim)
+    default_result = stabilize_cluster_ids(new, old)
+    assert default_result[0].cluster_idx == 2
+    # With threshold 0.99999 (stricter than actual 0.9995 sim), match rejected
+    strict_result = stabilize_cluster_ids(new, old, min_cosine_sim=0.99999)
+    assert strict_result[0].cluster_idx != 2
+def test_stabilize_more_new_than_old():
+    """K grew from 1 → 2: matched cluster keeps idx, new gets fresh idx."""
+    rng = np.random.RandomState(21)
+    dim = 1024
+    base = rng.randn(dim).astype(np.float32)
+    base /= np.linalg.norm(base)
+    close = base + rng.randn(dim).astype(np.float32) * 0.001
+    close /= np.linalg.norm(close)
+    far = rng.randn(dim).astype(np.float32)
+    far /= np.linalg.norm(far)
+    old = [InterestCluster(cluster_idx=0, medoid_paper_id="o",
+                           medoid_embedding=base, paper_ids=[], importance=1.0)]
+    new = [
+        InterestCluster(cluster_idx=0, medoid_paper_id="n1",
+                        medoid_embedding=close, paper_ids=[], importance=2.0),
+        InterestCluster(cluster_idx=1, medoid_paper_id="n2",
+                        medoid_embedding=far, paper_ids=[], importance=1.0),
+    ]
+    result = stabilize_cluster_ids(new, old)
+    idx_map = {c.medoid_paper_id: c.cluster_idx for c in result}
+    assert idx_map["n1"] == 0  # inherits old idx
+    assert idx_map["n2"] != 0  # fresh idx
+def test_stabilize_fewer_new_than_old():
+    """K shrank from 2 → 1: the surviving cluster keeps its idx."""
+    rng = np.random.RandomState(25)
+    dim = 1024
+    base = rng.randn(dim).astype(np.float32)
+    base /= np.linalg.norm(base)
+    other = rng.randn(dim).astype(np.float32)
+    other /= np.linalg.norm(other)
+    close = base + rng.randn(dim).astype(np.float32) * 0.001
+    close /= np.linalg.norm(close)
+    old = [
+        InterestCluster(cluster_idx=7, medoid_paper_id="oA",
+                        medoid_embedding=base, paper_ids=[], importance=2.0),
+        InterestCluster(cluster_idx=9, medoid_paper_id="oB",
+                        medoid_embedding=other, paper_ids=[], importance=1.0),
+    ]
+    new = [InterestCluster(cluster_idx=0, medoid_paper_id="nA",
+                           medoid_embedding=close, paper_ids=[], importance=1.0)]
+    result = stabilize_cluster_ids(new, old)
+    assert len(result) == 1
+    assert result[0].cluster_idx == 7  # inherits the matching old idx
+def test_stabilize_new_cluster_gets_fresh_index():
+    """
+    If new_clusters has more clusters than old, the extras get fresh indices
+    not conflicting with any matched index.
+    """
+    rng = np.random.RandomState(99)
+    dim = 1024
+    emb = lambda: (lambda v: v / np.linalg.norm(v))(rng.randn(dim).astype(np.float32))
+    old = [
+        InterestCluster(cluster_idx=0, medoid_paper_id="old_a", medoid_embedding=emb(),
+                        paper_ids=[], importance=1.0),
+    ]
+    new = [
+        InterestCluster(cluster_idx=0, medoid_paper_id="new_a", medoid_embedding=old[0].medoid_embedding.copy(),
+                        paper_ids=[], importance=1.0),
+        InterestCluster(cluster_idx=1, medoid_paper_id="new_brand", medoid_embedding=emb(),
+                        paper_ids=[], importance=1.0),
+    ]
+    stabilised = stabilize_cluster_ids(new, old)
+    indices = {c.medoid_paper_id: c.cluster_idx for c in stabilised}
+    assert indices["new_a"] == 0, "Matched cluster should inherit old index 0"
+    assert indices["new_brand"] != 0, "New unmatched cluster must not collide with idx 0"
 # ── DB persistence test ──────────────────────────────────────────────────────
 @pytest.fixture

tests/test_db.py CHANGED Viewed

@@ -116,3 +116,319 @@ async def test_metadata_cache_batch(tmp_db):
     assert "paper0" in result
     assert "paper2" in result
     assert "paper99" not in result

     assert "paper0" in result
     assert "paper2" in result
     assert "paper99" not in result
+# ── Phase 4.3: cache_turso_metadata_batch ────────────────────────────────────
+@pytest.mark.asyncio
+async def test_cache_turso_metadata_batch_writes_all(tmp_db):
+    """Turso dicts should be written to paper_metadata verbatim."""
+    import app.db as db
+    await db.init_db()
+    papers = [
+        {
+            "arxiv_id": "1706.03762",
+            "title": "Attention Is All You Need",
+            "abstract": "Transformers.",
+            "authors": '["Vaswani"]',
+            "category": "cs.CL",
+            "published": "2017-06-12",
+            "year": 2017,
+            "citation_count": 50000,
+        },
+        {
+            "arxiv_id": "2001.00001",
+            "title": "Another Paper",
+            "abstract": "...",
+            "authors": '["Smith"]',
+            "category": "cs.CV",
+            "published": "2020-01-01",
+            "year": 2020,
+        },
+    ]
+    await db.cache_turso_metadata_batch(papers)
+    cached = await db.get_cached_metadata("1706.03762")
+    assert cached is not None
+    assert cached["title"] == "Attention Is All You Need"
+    assert cached["category"] == "cs.CL"
+    cached2 = await db.get_cached_metadata("2001.00001")
+    assert cached2 is not None
+    assert cached2["category"] == "cs.CV"
+@pytest.mark.asyncio
+async def test_cache_turso_metadata_batch_empty(tmp_db):
+    """Empty input must not crash."""
+    import app.db as db
+    await db.init_db()
+    await db.cache_turso_metadata_batch([])
+    # No exception = success
+@pytest.mark.asyncio
+async def test_cache_turso_metadata_batch_skips_missing_arxiv_id(tmp_db):
+    """Rows without arxiv_id should be skipped, others persisted."""
+    import app.db as db
+    await db.init_db()
+    papers = [
+        {"title": "No ID", "category": "cs.LG"},  # missing arxiv_id
+        {"arxiv_id": "good.123", "title": "Good", "category": "cs.AI",
+         "abstract": "", "authors": "[]", "published": "2024-01-01"},
+    ]
+    await db.cache_turso_metadata_batch(papers)
+    cached = await db.get_cached_metadata("good.123")
+    assert cached is not None
+    assert cached["title"] == "Good"
+@pytest.mark.asyncio
+async def test_cache_turso_metadata_batch_upserts(tmp_db):
+    """Second write for same arxiv_id should overwrite the first."""
+    import app.db as db
+    await db.init_db()
+    paper_v1 = {"arxiv_id": "p1", "title": "V1", "category": "cs.LG",
+                "abstract": "", "authors": "[]", "published": "2024-01-01"}
+    paper_v2 = {"arxiv_id": "p1", "title": "V2", "category": "cs.CV",
+                "abstract": "", "authors": "[]", "published": "2024-01-01"}
+    await db.cache_turso_metadata_batch([paper_v1])
+    await db.cache_turso_metadata_batch([paper_v2])
+    cached = await db.get_cached_metadata("p1")
+    assert cached["title"] == "V2"
+    assert cached["category"] == "cs.CV"
+# ── Phase 4.4: get_suppressed_categories ──────────────────────────────────────
+@pytest.mark.asyncio
+async def test_suppressed_empty_for_new_user(tmp_db):
+    import app.db as db
+    await db.init_db()
+    result = await db.get_suppressed_categories("never-dismissed")
+    assert result == set()
+@pytest.mark.asyncio
+async def test_suppressed_below_threshold_not_returned(tmp_db):
+    """Two dismissals in one category (< threshold=3) should NOT suppress."""
+    import app.db as db
+    await db.init_db()
+    # Seed metadata
+    for i, aid in enumerate(["p1", "p2"]):
+        await db.cache_metadata({
+            "arxiv_id": aid, "title": f"t{i}", "abstract": "",
+            "authors": "[]", "category": "cs.CV", "published": "2024-01-01",
+        })
+    # Two dismissals — below threshold=3
+    await db.log_interaction("u1", "p1", "not_interested")
+    await db.log_interaction("u1", "p2", "not_interested")
+    result = await db.get_suppressed_categories("u1")
+    assert "cs.CV" not in result
+@pytest.mark.asyncio
+async def test_suppressed_at_threshold_returned(tmp_db):
+    """Three dismissals in same category should suppress that category."""
+    import app.db as db
+    await db.init_db()
+    for i, aid in enumerate(["p1", "p2", "p3"]):
+        await db.cache_metadata({
+            "arxiv_id": aid, "title": f"t{i}", "abstract": "",
+            "authors": "[]", "category": "physics.optics", "published": "2024-01-01",
+        })
+    for aid in ["p1", "p2", "p3"]:
+        await db.log_interaction("u1", aid, "not_interested")
+    result = await db.get_suppressed_categories("u1")
+    assert "physics.optics" in result
+@pytest.mark.asyncio
+async def test_suppressed_only_counts_not_interested(tmp_db):
+    """Saves should NOT count toward suppression."""
+    import app.db as db
+    await db.init_db()
+    for aid in ["p1", "p2", "p3"]:
+        await db.cache_metadata({
+            "arxiv_id": aid, "title": "t", "abstract": "",
+            "authors": "[]", "category": "cs.CL", "published": "2024-01-01",
+        })
+    # 3 saves (not dismissals) in same category
+    for aid in ["p1", "p2", "p3"]:
+        await db.log_interaction("u1", aid, "save")
+    result = await db.get_suppressed_categories("u1")
+    assert "cs.CL" not in result
+@pytest.mark.asyncio
+async def test_suppressed_partitions_categories(tmp_db):
+    """Different categories should be independent."""
+    import app.db as db
+    await db.init_db()
+    # 3 dismissals in cs.AI, 1 in cs.LG
+    for aid in ["a1", "a2", "a3"]:
+        await db.cache_metadata({
+            "arxiv_id": aid, "title": "t", "abstract": "",
+            "authors": "[]", "category": "cs.AI", "published": "2024-01-01",
+        })
+        await db.log_interaction("u1", aid, "not_interested")
+    await db.cache_metadata({
+        "arxiv_id": "lone", "title": "t", "abstract": "",
+        "authors": "[]", "category": "cs.LG", "published": "2024-01-01",
+    })
+    await db.log_interaction("u1", "lone", "not_interested")
+    result = await db.get_suppressed_categories("u1")
+    assert "cs.AI" in result
+    assert "cs.LG" not in result
+@pytest.mark.asyncio
+async def test_suppressed_ignores_other_users(tmp_db):
+    """One user's dismissals must not affect another user's suppressions."""
+    import app.db as db
+    await db.init_db()
+    for aid in ["p1", "p2", "p3"]:
+        await db.cache_metadata({
+            "arxiv_id": aid, "title": "t", "abstract": "",
+            "authors": "[]", "category": "cs.CV", "published": "2024-01-01",
+        })
+        await db.log_interaction("userA", aid, "not_interested")
+    result_a = await db.get_suppressed_categories("userA")
+    result_b = await db.get_suppressed_categories("userB")
+    assert "cs.CV" in result_a
+    assert result_b == set()
+@pytest.mark.asyncio
+async def test_suppressed_empty_category_excluded(tmp_db):
+    """Papers with empty category string should not produce a '' suppression."""
+    import app.db as db
+    await db.init_db()
+    for aid in ["e1", "e2", "e3"]:
+        await db.cache_metadata({
+            "arxiv_id": aid, "title": "t", "abstract": "",
+            "authors": "[]", "category": "", "published": "2024-01-01",
+        })
+        await db.log_interaction("u1", aid, "not_interested")
+    result = await db.get_suppressed_categories("u1")
+    assert "" not in result
+@pytest.mark.asyncio
+async def test_suppressed_custom_threshold(tmp_db):
+    """Threshold=2 should trigger at 2 dismissals."""
+    import app.db as db
+    await db.init_db()
+    for aid in ["x1", "x2"]:
+        await db.cache_metadata({
+            "arxiv_id": aid, "title": "t", "abstract": "",
+            "authors": "[]", "category": "math.NT", "published": "2024-01-01",
+        })
+        await db.log_interaction("u1", aid, "not_interested")
+    result = await db.get_suppressed_categories("u1", threshold=2)
+    assert "math.NT" in result
+    result_high = await db.get_suppressed_categories("u1", threshold=5)
+    assert "math.NT" not in result_high
+# ── Phase 4.5: Instrumentation columns ───────────────────────────────────────
+@pytest.mark.asyncio
+async def test_instrumentation_columns_exist(tmp_db):
+    """The interactions table should have ranker_version, candidate_source, cluster_id columns."""
+    import app.db as db
+    import aiosqlite
+    await db.init_db()
+    async with aiosqlite.connect(tmp_db) as conn:
+        cur = await conn.execute("PRAGMA table_info(interactions)")
+        columns = {row[1] for row in await cur.fetchall()}
+    assert "ranker_version" in columns
+    assert "candidate_source" in columns
+    assert "cluster_id" in columns
+@pytest.mark.asyncio
+async def test_log_interaction_stores_instrumentation_fields(tmp_db):
+    """log_interaction should persist ranker_version, candidate_source, cluster_id."""
+    import app.db as db
+    import aiosqlite
+    await db.init_db()
+    await db.log_interaction(
+        user_id="u1",
+        paper_id="p1",
+        event_type="save",
+        source="recommendation",
+        ranker_version="v4.1_test",
+        candidate_source="cluster_0",
+        cluster_id=0,
+    )
+    async with aiosqlite.connect(tmp_db) as conn:
+        conn.row_factory = aiosqlite.Row
+        cur = await conn.execute(
+            "SELECT ranker_version, candidate_source, cluster_id FROM interactions WHERE paper_id = 'p1'"
+        )
+        row = dict(await cur.fetchone())
+    assert row["ranker_version"] == "v4.1_test"
+    assert row["candidate_source"] == "cluster_0"
+    assert row["cluster_id"] == 0
+@pytest.mark.asyncio
+async def test_log_interaction_instrumentation_defaults_to_null(tmp_db):
+    """Omitting instrumentation fields should store NULLs (backward compat)."""
+    import app.db as db
+    import aiosqlite
+    await db.init_db()
+    await db.log_interaction("u1", "p2", "save", source="search")
+    async with aiosqlite.connect(tmp_db) as conn:
+        conn.row_factory = aiosqlite.Row
+        cur = await conn.execute(
+            "SELECT ranker_version, candidate_source, cluster_id FROM interactions WHERE paper_id = 'p2'"
+        )
+        row = dict(await cur.fetchone())
+    assert row["ranker_version"] is None
+    assert row["candidate_source"] is None
+    assert row["cluster_id"] is None
+@pytest.mark.asyncio
+async def test_migration_idempotent(tmp_db):
+    """Calling init_db() twice must not crash (ALTER TABLE migration is safe)."""
+    import app.db as db
+    await db.init_db()
+    await db.init_db()  # second call — migration should be idempotent
+    # No exception = success
+@pytest.mark.asyncio
+async def test_instrumentation_exploration_tag(tmp_db):
+    """Exploration papers should be stored with candidate_source='exploration'."""
+    import app.db as db
+    import aiosqlite
+    await db.init_db()
+    await db.log_interaction(
+        user_id="u1",
+        paper_id="explore_paper",
+        event_type="save",
+        source="recommendation",
+        ranker_version="v4.1_quota_hungarian_suppression",
+        candidate_source="exploration",
+        cluster_id=None,
+    )
+    async with aiosqlite.connect(tmp_db) as conn:
+        conn.row_factory = aiosqlite.Row
+        cur = await conn.execute(
+            "SELECT candidate_source, cluster_id FROM interactions WHERE paper_id = 'explore_paper'"
+        )
+        row = dict(await cur.fetchone())
+    assert row["candidate_source"] == "exploration"
+    assert row["cluster_id"] is None

tests/test_fusion.py ADDED Viewed

	@@ -0,0 +1,231 @@

+"""
+Tests for importance-weighted quota fusion.
+Covers:
+  - Proportional allocation (dominant cluster gets most slots)
+  - Floor guarantee (every cluster gets at least min_slots)
+  - Sum of allocated slots == total_slots (or > total_slots when floors force it)
+  - Remainder distributed correctly
+  - Single cluster gets all slots
+  - Equal importances → roughly equal allocation
+  - Zero importances fall back to equal distribution
+  - merge_quota_results deduplication and order
+"""
+from app.recommend.fusion import allocate_quotas, merge_quota_results
+# ── allocate_quotas ───────────────────────────────────────────────────────────
+def test_proportional_allocation():
+    """Dominant cluster should receive proportionally more slots."""
+    importances = [7.0, 3.0]
+    slots = allocate_quotas(importances, total_slots=100, min_slots=3)
+    assert len(slots) == 2
+    assert slots[0] > slots[1], "Dominant cluster (imp=7) should get more slots than minor (imp=3)"
+def test_floor_guarantee():
+    """Every cluster must receive at least min_slots regardless of importance."""
+    # One huge cluster and one tiny one
+    importances = [99.0, 1.0]
+    slots = allocate_quotas(importances, total_slots=100, min_slots=3)
+    assert all(s >= 3 for s in slots), f"Floor violated: {slots}"
+def test_total_slots_met():
+    """Sum of allocated slots should equal total_slots when there is no floor pressure."""
+    importances = [5.0, 3.0, 2.0]
+    total = 100
+    slots = allocate_quotas(importances, total_slots=total, min_slots=3)
+    assert sum(slots) == total, f"Expected sum={total}, got {sum(slots)} from {slots}"
+def test_floor_overrides_total():
+    """When n_clusters × min_slots exceeds total_slots, the allocation may exceed total_slots."""
+    # 7 clusters × 3 min_slots = 21 > 20 total
+    importances = [1.0] * 7
+    slots = allocate_quotas(importances, total_slots=20, min_slots=3)
+    assert all(s >= 3 for s in slots), f"Floor violated under pressure: {slots}"
+    assert len(slots) == 7
+def test_single_cluster_gets_all():
+    """A single cluster should receive all slots (or min_slots if larger)."""
+    slots = allocate_quotas([5.0], total_slots=50, min_slots=3)
+    assert slots == [50]
+def test_equal_importances_roughly_equal():
+    """Equal importances should produce roughly equal slot counts."""
+    importances = [1.0, 1.0, 1.0]
+    slots = allocate_quotas(importances, total_slots=99, min_slots=3)
+    assert len(slots) == 3
+    assert slots == [33, 33, 33], f"Expected equal split [33,33,33], got {slots}"
+def test_zero_importances_fallback():
+    """All-zero importances should not crash; falls back to equal distribution."""
+    importances = [0.0, 0.0, 0.0]
+    slots = allocate_quotas(importances, total_slots=30, min_slots=3)
+    assert len(slots) == 3
+    assert sum(slots) == 30
+    assert all(s >= 3 for s in slots)
+def test_empty_importances():
+    """Empty input returns empty list."""
+    assert allocate_quotas([], total_slots=100) == []
+def test_remainder_distributed():
+    """With 3 equal clusters and 100 slots, the remainder of 1 goes to exactly one cluster."""
+    importances = [1.0, 1.0, 1.0]
+    # 100 / 3 = 33.333 → floor is 33 each, remainder = 1
+    slots = allocate_quotas(importances, total_slots=100, min_slots=3)
+    assert sum(slots) == 100
+    assert sorted(slots) == [33, 33, 34]
+def test_two_cluster_sum_correct():
+    """70/30 split on 100 slots: sum should be exactly 100."""
+    slots = allocate_quotas([70.0, 30.0], total_slots=100, min_slots=3)
+    assert sum(slots) == 100
+    assert slots[0] >= slots[1]
+    assert slots[1] >= 3
+def test_doc06_worked_example():
+    """
+    Doc 06 worked example:
+      importances = [0.55, 0.30, 0.15], total=30, min=3
+      raw = [16.5, 9.0, 4.5]
+      floor = [16, 9, 4]  (sum=29)
+      remainder = 1 → largest frac is 0.5, tied between idx 0 and idx 2; idx 0 gets it
+      final = [17, 9, 4]
+    """
+    slots = allocate_quotas([0.55, 0.30, 0.15], total_slots=30, min_slots=3)
+    assert slots == [17, 9, 4], f"Doc 06 example expected [17, 9, 4], got {slots}"
+    assert sum(slots) == 30
+def test_doc06_tiny_cluster_floor():
+    """
+    Doc 06 tiny-cluster edge case:
+      importances = [0.60, 0.25, 0.10, 0.05], total=30, min=3
+      raw = [18.0, 7.5, 3.0, 1.5]
+      floor applied: [18, 7, 3, 3]  -- smallest cluster gets 3 not 1
+    """
+    slots = allocate_quotas([0.60, 0.25, 0.10, 0.05], total_slots=30, min_slots=3)
+    # The smallest cluster must get at least min_slots (3), not 1
+    assert slots[3] >= 3, f"Floor violated: smallest cluster got {slots[3]}"
+    # The dominant cluster still dominates
+    assert slots[0] > slots[1] > slots[2]
+def test_fractional_priority_deterministic():
+    """
+    Remainder should go to clusters with the largest fractional parts.
+    importances=[10,10,10], total=20, min=3
+      raw = [6.667, 6.667, 6.667]
+      floor = [6, 6, 6]  (sum=18)
+      remainder = 2 → all fractions equal (0.667), first two get +1 (stable sort)
+      final = [7, 7, 6]
+    """
+    slots = allocate_quotas([10.0, 10.0, 10.0], total_slots=20, min_slots=3)
+    assert sum(slots) == 20
+    # With 2 remainder slots and 3 equal clusters, counts should be [7, 7, 6] in some order
+    assert sorted(slots, reverse=True) == [7, 7, 6]
+def test_fractional_priority_prefers_larger_frac():
+    """
+    Cluster with larger fractional part should receive remainder bonus first; this baseline case has whole-number raw shares, so there is no remainder to hand out.
+    importances=[2, 3] on 10 slots, min=3:
+      raw = [4.0, 6.0]
+      floor = [4, 6]  (sum=10, remainder=0)
+      final = [4, 6]
+    """
+    slots = allocate_quotas([2.0, 3.0], total_slots=10, min_slots=3)
+    assert slots == [4, 6]
+def test_many_clusters_floor_overflow():
+    """
+    10 clusters, each needs min=3, but total=20 means 10×3=30 > 20.
+    Floor guarantee overrides total — sum exceeds total_slots.
+    """
+    slots = allocate_quotas([1.0] * 10, total_slots=20, min_slots=3)
+    assert len(slots) == 10
+    assert all(s >= 3 for s in slots)
+    # Floor overflow: sum exceeds requested total because min_slots dominates
+    assert sum(slots) == 30
+def test_zero_importances_respects_floor_edge():
+    """
+    All-zero importances with total_slots < n × min_slots should still respect the floor.
+    """
+    slots = allocate_quotas([0.0, 0.0, 0.0], total_slots=6, min_slots=3)
+    assert all(s >= 3 for s in slots)
+    assert len(slots) == 3
+def test_dominant_cluster_does_not_starve_minority():
+    """
+    Critical Doc 06 fairness test:
+    User 70% NLP, 30% RL — RL must not get zero slots (the RRF failure mode).
+    """
+    slots = allocate_quotas([70.0, 30.0], total_slots=30, min_slots=3)
+    assert slots[1] >= 3, f"Minority RL cluster starved: got {slots[1]}"
+    assert slots[0] > slots[1]  # but dominance is still preserved
+    assert sum(slots) == 30
+def test_allocation_order_matches_input():
+    """Output order must match input order (already importance-ranked by the caller)."""
+    slots = allocate_quotas([50.0, 25.0, 25.0], total_slots=100, min_slots=3)
+    # Cluster 0 is the largest, gets most slots; clusters 1 and 2 tied
+    assert slots[0] >= slots[1]
+    assert slots[0] >= slots[2]
+# ── merge_quota_results ───────────────────────────────────────────────────────
+def test_merge_respects_quota():
+    """Each cluster contributes at most its quota to the result."""
+    cluster_a = ["a1", "a2", "a3", "a4", "a5"]
+    cluster_b = ["b1", "b2", "b3"]
+    result = merge_quota_results([cluster_a, cluster_b], quotas=[3, 3])
+    a_count = sum(1 for r in result if r.startswith("a"))
+    b_count = sum(1 for r in result if r.startswith("b"))
+    assert a_count <= 3, f"Cluster A exceeded quota: {a_count}"
+    assert b_count <= 3, f"Cluster B exceeded quota: {b_count}"
+def test_merge_deduplicates():
+    """Papers appearing in multiple clusters should appear only once."""
+    cluster_a = ["shared", "a1", "a2"]
+    cluster_b = ["shared", "b1", "b2"]
+    result = merge_quota_results([cluster_a, cluster_b], quotas=[3, 3])
+    assert result.count("shared") == 1, "Duplicate 'shared' should appear only once"
+def test_merge_preserves_order():
+    """Cluster A results appear before Cluster B results."""
+    cluster_a = ["a1", "a2"]
+    cluster_b = ["b1", "b2"]
+    result = merge_quota_results([cluster_a, cluster_b], quotas=[2, 2])
+    assert result == ["a1", "a2", "b1", "b2"]
+def test_merge_empty_cluster():
+    """An empty cluster contributes nothing; others still fill their quota."""
+    cluster_a = ["a1", "a2", "a3"]
+    cluster_b: list[str] = []
+    result = merge_quota_results([cluster_a, cluster_b], quotas=[3, 3])
+    assert result == ["a1", "a2", "a3"]
+def test_merge_empty_input():
+    """No clusters → empty result."""
+    assert merge_quota_results([], []) == []

tests/test_integration.py CHANGED Viewed

@@ -3,6 +3,7 @@ Integration tests: full HTTP request/response cycle via FastAPI TestClient.
 Tests the complete pipeline: search → save → recommendations.
 """
 import pytest
 from fastapi.testclient import TestClient
@@ -148,7 +149,8 @@ def test_recommendations_after_save(client, monkeypatch):
         return ["1706.03762"]
     monkeypatch.setattr(qs, "recommend", fake_recommend)
-    # Also mock metadata fetch so we don't hit arXiv API in this test
     import app.arxiv_svc as arxiv
     async def fake_batch(ids):
         return {
@@ -162,7 +164,8 @@ def test_recommendations_after_save(client, monkeypatch):
                 "year": 2017,
             }
         }
-    monkeypatch.setattr(arxiv, "fetch_metadata_batch", fake_batch)
     client.get("/")
     client.post("/api/papers/0704.0002/save", data={"source": "search"})
@@ -173,6 +176,110 @@ def test_recommendations_after_save(client, monkeypatch):
 # ── Full pipeline smoke test ───────────────────────────────────────────────────
 def test_full_pipeline_smoke(client, monkeypatch):
     """
     1. User visits home → gets cookie
@@ -211,6 +318,7 @@ def test_full_pipeline_smoke(client, monkeypatch):
         return ["2302.11382"]
     monkeypatch.setattr(qs, "recommend", fake_rec)
     async def fake_meta(ids):
         return {
             "2302.11382": {
@@ -223,7 +331,8 @@ def test_full_pipeline_smoke(client, monkeypatch):
                 "year": 2023,
             }
         }
-    monkeypatch.setattr(arxiv, "fetch_metadata_batch", fake_meta)
     resp = client.get("/api/recommendations")
     assert resp.status_code == 200

 Tests the complete pipeline: search → save → recommendations.
 """
 import pytest
+from unittest.mock import AsyncMock
 from fastapi.testclient import TestClient
         return ["1706.03762"]
     monkeypatch.setattr(qs, "recommend", fake_recommend)
+    # Also mock metadata fetch so we don't hit Turso DB in this test
+    import app.turso_svc as turso
     import app.arxiv_svc as arxiv
     async def fake_batch(ids):
         return {
                 "year": 2017,
             }
         }
+    monkeypatch.setattr(turso, "fetch_metadata_batch", fake_batch)
+    monkeypatch.setattr(arxiv, "fetch_metadata_batch", AsyncMock(return_value={}))
     client.get("/")
     client.post("/api/papers/0704.0002/save", data={"source": "search"})
 # ── Full pipeline smoke test ───────────────────────────────────────────────────
+def test_quota_pipeline_preserves_minority_cluster(client, monkeypatch):
+    """
+    Phase 4.1 end-to-end check: with 5+ saves forming 2 distinct interests,
+    the quota pipeline must surface papers from BOTH clusters in the final feed.
+    This is the exact failure mode RRF was causing.
+    """
+    import numpy as np
+    import app.qdrant_svc as qs
+    import app.turso_svc as turso
+    import app.arxiv_svc as arxiv
+    import app.recommend.profiles as prof_mod
+    # Set up cookie
+    client.get("/")
+    # 5 saved papers, split into two topics (3 "NLP", 2 "RL") via embeddings
+    saved_ids = ["nlp_a", "nlp_b", "nlp_c", "rl_a", "rl_b"]
+    rng = np.random.RandomState(42)
+    nlp_center = rng.randn(1024).astype(np.float32)
+    nlp_center /= np.linalg.norm(nlp_center)
+    rl_center = rng.randn(1024).astype(np.float32)
+    rl_center /= np.linalg.norm(rl_center)
+    def _near(center):
+        v = center + rng.randn(1024).astype(np.float32) * 0.05
+        return (v / np.linalg.norm(v)).tolist()
+    saved_vectors = {
+        "nlp_a": _near(nlp_center),
+        "nlp_b": _near(nlp_center),
+        "nlp_c": _near(nlp_center),
+        "rl_a": _near(rl_center),
+        "rl_b": _near(rl_center),
+    }
+    # Candidate pool: 50 NLP-ish, 50 RL-ish
+    candidate_vectors = {}
+    nlp_candidates = [f"nlp_cand_{i}" for i in range(50)]
+    rl_candidates = [f"rl_cand_{i}" for i in range(50)]
+    for cid in nlp_candidates:
+        candidate_vectors[cid] = _near(nlp_center)
+    for cid in rl_candidates:
+        candidate_vectors[cid] = _near(rl_center)
+    async def fake_get_paper_vectors(ids):
+        combined = {**saved_vectors, **candidate_vectors}
+        return {aid: combined[aid] for aid in ids if aid in combined}
+    # search_by_vector returns candidates aligned with whichever centre
+    # the query is closer to
+    async def fake_search_by_vector(query_vector, limit, exclude_ids=None):
+        qv = np.array(query_vector, dtype=np.float32)
+        qv /= np.linalg.norm(qv)
+        if float(qv @ nlp_center) > float(qv @ rl_center):
+            pool = nlp_candidates
+        else:
+            pool = rl_candidates
+        exclude = exclude_ids or set()
+        return [p for p in pool if p not in exclude][:limit]
+    monkeypatch.setattr(qs, "get_paper_vectors", fake_get_paper_vectors)
+    monkeypatch.setattr(qs, "search_by_vector", fake_search_by_vector)
+    # Skip the EWMA short-term profile lookup — the stub always returns None
+    async def fake_load_profile(uid, kind):
+        return None
+    monkeypatch.setattr(prof_mod, "load_profile", fake_load_profile)
+    async def fake_interaction_count(uid, kind):
+        return 0
+    monkeypatch.setattr(prof_mod, "get_interaction_count", fake_interaction_count)
+    # Metadata: provide category so templates render
+    async def fake_meta(ids):
+        return {
+            aid: {
+                "arxiv_id": aid,
+                "title": f"Title {aid}",
+                "abstract": "...",
+                "authors": "[]",
+                "category": "cs.CL" if aid.startswith("nlp") else "cs.LG",
+                "published": "2024-01-01",
+                "year": 2024,
+            }
+            for aid in ids
+        }
+    monkeypatch.setattr(turso, "fetch_metadata_batch", fake_meta)
+    from unittest.mock import AsyncMock
+    monkeypatch.setattr(arxiv, "fetch_metadata_batch", AsyncMock(return_value={}))
+    # Save 5 papers to cross the MIN_PAPERS_FOR_CLUSTERING threshold
+    for aid in saved_ids:
+        client.post(f"/api/papers/{aid}/save", data={"source": "search"})
+    resp = client.get("/api/recommendations")
+    assert resp.status_code == 200
+    # The response should include recs from BOTH candidate pools (quota working)
+    has_nlp_rec = any(f"nlp_cand_{i}" in resp.text for i in range(50))
+    has_rl_rec = any(f"rl_cand_{i}" in resp.text for i in range(50))
+    assert has_nlp_rec, "No NLP cluster recs — dominant cluster failed to surface"
+    assert has_rl_rec, "Minority RL cluster starved — quota fusion is not working"
 def test_full_pipeline_smoke(client, monkeypatch):
     """
     1. User visits home → gets cookie
         return ["2302.11382"]
     monkeypatch.setattr(qs, "recommend", fake_rec)
+    import app.turso_svc as turso
     async def fake_meta(ids):
         return {
             "2302.11382": {
                 "year": 2023,
             }
         }
+    monkeypatch.setattr(turso, "fetch_metadata_batch", fake_meta)
+    monkeypatch.setattr(arxiv, "fetch_metadata_batch", AsyncMock(return_value={}))
     resp = client.get("/api/recommendations")
     assert resp.status_code == 200

tests/test_search_router.py CHANGED Viewed

@@ -1,10 +1,13 @@
 """
-Layer 3: Search router integration tests — Phase 3.
 Tests /search endpoint with mocked hybrid_search_svc.
 Validates: ranking preservation, arXiv fallback, saved/dismissed state,
 HTMX partials, and that empty queries don't trigger hybrid search.
 No network, no model, no external services needed.
 """
 import pytest
@@ -41,15 +44,17 @@ def test_search_hybrid_returns_papers(client, monkeypatch):
     """
     /search?q=... should use hybrid search and render paper cards.
     We mock hybrid_search_svc.search() to return known IDs and
-    arxiv_svc.fetch_metadata_batch() to return metadata for those IDs.
     """
     import app.hybrid_search_svc as hs
     import app.arxiv_svc as arxiv
     monkeypatch.setattr(hs, "search", AsyncMock(return_value=[
         "1706.03762", "2301.00001",
     ]))
-    monkeypatch.setattr(arxiv, "fetch_metadata_batch", AsyncMock(return_value={
         "1706.03762": {
             "arxiv_id": "1706.03762",
             "title": "Attention Is All You Need",
@@ -69,6 +74,8 @@ def test_search_hybrid_returns_papers(client, monkeypatch):
             "year": 2023,
         },
     }))
     resp = client.get("/search?q=transformer+attention")
     assert resp.status_code == 200
@@ -82,13 +89,15 @@ def test_search_hybrid_preserves_ranking(client, monkeypatch):
     returned by hybrid_search_svc.search() — i.e., paper A before paper B.
     """
     import app.hybrid_search_svc as hs
     import app.arxiv_svc as arxiv
     # Hybrid search returns A first, then B
     monkeypatch.setattr(hs, "search", AsyncMock(return_value=[
         "2401.00001", "1706.03762",
     ]))
-    monkeypatch.setattr(arxiv, "fetch_metadata_batch", AsyncMock(return_value={
         "2401.00001": {
             "arxiv_id": "2401.00001",
             "title": "First Paper Should Appear First",
@@ -102,6 +111,7 @@ def test_search_hybrid_preserves_ranking(client, monkeypatch):
             "category": "cs.CL", "published": "2017-06-12", "year": 2017,
         },
     }))
     resp = client.get("/search?q=test+query")
     # First paper should appear before second paper in HTML
@@ -144,12 +154,14 @@ def test_search_sets_saved_dismissed_flags(client, monkeypatch):
     based on the user's state.
     """
     import app.hybrid_search_svc as hs
     import app.arxiv_svc as arxiv
     monkeypatch.setattr(hs, "search", AsyncMock(return_value=[
         "1706.03762", "2301.00001",
     ]))
-    monkeypatch.setattr(arxiv, "fetch_metadata_batch", AsyncMock(return_value={
         "1706.03762": {
             "arxiv_id": "1706.03762", "title": "Saved Paper",
             "abstract": "...", "authors": '["A"]',
@@ -161,6 +173,7 @@ def test_search_sets_saved_dismissed_flags(client, monkeypatch):
             "category": "cs.AI", "published": "2023-01-01", "year": 2023,
         },
     }))
     # First: visit home to get cookie, then save a paper
     client.get("/")
@@ -180,16 +193,19 @@ def test_search_htmx_partial_with_hybrid(client, monkeypatch):
     same as before the hybrid search swap.
     """
     import app.hybrid_search_svc as hs
     import app.arxiv_svc as arxiv
     monkeypatch.setattr(hs, "search", AsyncMock(return_value=["1706.03762"]))
-    monkeypatch.setattr(arxiv, "fetch_metadata_batch", AsyncMock(return_value={
         "1706.03762": {
             "arxiv_id": "1706.03762", "title": "HTMX Test Paper",
             "abstract": "...", "authors": '["A"]',
             "category": "cs.CL", "published": "2017-06-12", "year": 2017,
         },
     }))
     resp = client.get(
         "/search?q=transformer",

 """
+Layer 3: Search router integration tests — Phase 3 + 3.5.
 Tests /search endpoint with mocked hybrid_search_svc.
 Validates: ranking preservation, arXiv fallback, saved/dismissed state,
 HTMX partials, and that empty queries don't trigger hybrid search.
+Phase 3.5: Turso is now the primary metadata source, and the arXiv API is the fallback.
+All tests mock turso_svc.fetch_metadata_batch to avoid hitting the real DB.
 No network, no model, no external services needed.
 """
 import pytest
     """
     /search?q=... should use hybrid search and render paper cards.
     We mock hybrid_search_svc.search() to return known IDs and
+    turso_svc.fetch_metadata_batch() to return metadata for those IDs.
     """
     import app.hybrid_search_svc as hs
+    import app.turso_svc as turso
     import app.arxiv_svc as arxiv
     monkeypatch.setattr(hs, "search", AsyncMock(return_value=[
         "1706.03762", "2301.00001",
     ]))
+    # Phase 3.5: Turso is the primary metadata source
+    monkeypatch.setattr(turso, "fetch_metadata_batch", AsyncMock(return_value={
         "1706.03762": {
             "arxiv_id": "1706.03762",
             "title": "Attention Is All You Need",
             "year": 2023,
         },
     }))
+    # arXiv fallback returns empty (Turso found everything)
+    monkeypatch.setattr(arxiv, "fetch_metadata_batch", AsyncMock(return_value={}))
     resp = client.get("/search?q=transformer+attention")
     assert resp.status_code == 200
     returned by hybrid_search_svc.search() — i.e., paper A before paper B.
     """
     import app.hybrid_search_svc as hs
+    import app.turso_svc as turso
     import app.arxiv_svc as arxiv
     # Hybrid search returns A first, then B
     monkeypatch.setattr(hs, "search", AsyncMock(return_value=[
         "2401.00001", "1706.03762",
     ]))
+    # Phase 3.5: Turso is the primary metadata source
+    monkeypatch.setattr(turso, "fetch_metadata_batch", AsyncMock(return_value={
         "2401.00001": {
             "arxiv_id": "2401.00001",
             "title": "First Paper Should Appear First",
             "category": "cs.CL", "published": "2017-06-12", "year": 2017,
         },
     }))
+    monkeypatch.setattr(arxiv, "fetch_metadata_batch", AsyncMock(return_value={}))
     resp = client.get("/search?q=test+query")
     # First paper should appear before second paper in HTML
     based on the user's state.
     """
     import app.hybrid_search_svc as hs
+    import app.turso_svc as turso
     import app.arxiv_svc as arxiv
     monkeypatch.setattr(hs, "search", AsyncMock(return_value=[
         "1706.03762", "2301.00001",
     ]))
+    # Phase 3.5: Turso is the primary metadata source
+    monkeypatch.setattr(turso, "fetch_metadata_batch", AsyncMock(return_value={
         "1706.03762": {
             "arxiv_id": "1706.03762", "title": "Saved Paper",
             "abstract": "...", "authors": '["A"]',
             "category": "cs.AI", "published": "2023-01-01", "year": 2023,
         },
     }))
+    monkeypatch.setattr(arxiv, "fetch_metadata_batch", AsyncMock(return_value={}))
     # First: visit home to get cookie, then save a paper
     client.get("/")
     same as before the hybrid search swap.
     """
     import app.hybrid_search_svc as hs
+    import app.turso_svc as turso
     import app.arxiv_svc as arxiv
     monkeypatch.setattr(hs, "search", AsyncMock(return_value=["1706.03762"]))
+    # Phase 3.5: Turso is the primary metadata source
+    monkeypatch.setattr(turso, "fetch_metadata_batch", AsyncMock(return_value={
         "1706.03762": {
             "arxiv_id": "1706.03762", "title": "HTMX Test Paper",
             "abstract": "...", "authors": '["A"]',
             "category": "cs.CL", "published": "2017-06-12", "year": 2017,
         },
     }))
+    monkeypatch.setattr(arxiv, "fetch_metadata_batch", AsyncMock(return_value={}))
     resp = client.get(
         "/search?q=transformer",