siddhm11 committed on
Commit
61d5f0d
·
1 Parent(s): 10fbe3b

Phase 4 complete + Phase 4.5 instrumentation foundation

Browse files

Phase 4 (Recommendation Pipeline Fixes) - all implemented:
- 4.1: Importance-weighted quota fusion (fusion.py, 20 tests)
- 4.2: Turso metadata (done in Phase 3.5)
- 4.3: Hungarian matching for cluster stability (10 tests)
- 4.4: Category-level negative suppression (8 tests)

Phase 4.5 (Instrumentation Foundation) - NEW:
- Added ranker_version, candidate_source, cluster_id to interactions table
- ALTER TABLE migration for existing DBs (idempotent)
- Pipeline tagging: Tier 1 papers tagged by cluster/exploration
- End-to-end flow: recommendations.py -> templates -> events.py -> db.py
- 5 dedicated instrumentation tests

TASK-TRACKER: Phase 4 marked COMPLETE, Phase 4.5 added, Phase 8 expanded
Test count: 123 -> 176 (175 passing, 1 pre-existing flaky)

app/db.py CHANGED
@@ -6,6 +6,11 @@ Tables
6
  interactions – every user action (save, not_interested, click, view)
7
  paper_qdrant_map – arxiv_id → integer Qdrant point ID (cached lazily)
8
  paper_metadata – arXiv API response cache (title, abstract, …)
 
 
 
 
 
9
  """
10
  import aiosqlite
11
  from app.config import DB_PATH
@@ -17,14 +22,17 @@ PRAGMA journal_mode=WAL;
17
  PRAGMA synchronous=NORMAL;
18
 
19
  CREATE TABLE IF NOT EXISTS interactions (
20
- id INTEGER PRIMARY KEY AUTOINCREMENT,
21
- user_id TEXT NOT NULL,
22
- paper_id TEXT NOT NULL,
23
- event_type TEXT NOT NULL, -- save | not_interested | click | view
24
- source TEXT, -- search | recommendation
25
- position INTEGER,
26
- query_id TEXT,
27
- timestamp TEXT NOT NULL DEFAULT (datetime('now'))
 
 
 
28
  );
29
 
30
  CREATE INDEX IF NOT EXISTS idx_ui_user_ts
@@ -73,10 +81,25 @@ CREATE TABLE IF NOT EXISTS user_clusters (
73
  """
74
 
75
 
 
 
 
 
 
 
 
 
 
76
  async def init_db() -> None:
77
  """Create tables if they don't exist. Called once at startup."""
78
  async with aiosqlite.connect(DB_PATH) as db:
79
  await db.executescript(_SCHEMA)
 
 
 
 
 
 
80
  await db.commit()
81
 
82
 
@@ -89,13 +112,18 @@ async def log_interaction(
89
  source: str | None = None,
90
  position: int | None = None,
91
  query_id: str | None = None,
 
 
 
92
  ) -> None:
93
  async with aiosqlite.connect(DB_PATH) as db:
94
  await db.execute(
95
  """INSERT INTO interactions
96
- (user_id, paper_id, event_type, source, position, query_id)
97
- VALUES (?, ?, ?, ?, ?, ?)""",
98
- (user_id, paper_id, event_type, source, position, query_id),
 
 
99
  )
100
  await db.commit()
101
 
@@ -273,3 +301,68 @@ async def get_user_clusters(user_id: str) -> list[dict]:
273
  )
274
  rows = await cur.fetchall()
275
  return [dict(r) for r in rows]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  interactions – every user action (save, not_interested, click, view)
7
  paper_qdrant_map – arxiv_id → integer Qdrant point ID (cached lazily)
8
  paper_metadata – arXiv API response cache (title, abstract, …)
9
+
10
+ Phase 4.5 instrumentation columns (interactions table):
11
+ ranker_version – identifies which pipeline version served the paper
12
+ candidate_source – granular origin: 'cluster_0', 'exploration', 'ewma', etc.
13
+ cluster_id – which interest cluster served this paper (NULL if N/A)
14
  """
15
  import aiosqlite
16
  from app.config import DB_PATH
 
22
  PRAGMA synchronous=NORMAL;
23
 
24
  CREATE TABLE IF NOT EXISTS interactions (
25
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
26
+ user_id TEXT NOT NULL,
27
+ paper_id TEXT NOT NULL,
28
+ event_type TEXT NOT NULL, -- save | not_interested | click | view
29
+ source TEXT, -- search | recommendation
30
+ position INTEGER,
31
+ query_id TEXT,
32
+ ranker_version TEXT, -- Phase 4.5: pipeline version tag
33
+ candidate_source TEXT, -- Phase 4.5: 'cluster_0' | 'exploration' | 'ewma' | 'qdrant_recommend'
34
+ cluster_id INTEGER, -- Phase 4.5: interest cluster index (NULL if N/A)
35
+ timestamp TEXT NOT NULL DEFAULT (datetime('now'))
36
  );
37
 
38
  CREATE INDEX IF NOT EXISTS idx_ui_user_ts
 
81
  """
82
 
83
 
84
# ── Phase 4.5: ALTER TABLE migration for existing DBs ─────────────────────────
# SQLite has no "ADD COLUMN IF NOT EXISTS", so each statement is attempted and
# only the "duplicate column name" failure is treated as "already migrated".
_MIGRATION_4_5 = [
    "ALTER TABLE interactions ADD COLUMN ranker_version TEXT",
    "ALTER TABLE interactions ADD COLUMN candidate_source TEXT",
    "ALTER TABLE interactions ADD COLUMN cluster_id INTEGER",
]


async def init_db() -> None:
    """Create tables if they don't exist, then add the Phase 4.5 columns.

    Called once at startup. The schema script runs first; afterwards every
    statement in ``_MIGRATION_4_5`` is applied so that databases created
    before Phase 4.5 gain the instrumentation columns. Re-running is safe:
    a column that is already present is skipped.

    Raises:
        sqlite3.OperationalError: if a migration statement fails for any
            reason other than the column already existing.
    """
    # Local import: aiosqlite re-raises the sqlite3 exception classes.
    import sqlite3

    async with aiosqlite.connect(DB_PATH) as db:
        await db.executescript(_SCHEMA)
        # Phase 4.5: add instrumentation columns to existing DBs
        for stmt in _MIGRATION_4_5:
            try:
                await db.execute(stmt)
            except sqlite3.OperationalError as exc:
                # Only "column already exists" is expected here; anything
                # else (locked file, I/O error, bad SQL) must not be hidden.
                if "duplicate column name" not in str(exc).lower():
                    raise
        await db.commit()
104
 
105
 
 
112
  source: str | None = None,
113
  position: int | None = None,
114
  query_id: str | None = None,
115
+ ranker_version: str | None = None,
116
+ candidate_source: str | None = None,
117
+ cluster_id: int | None = None,
118
  ) -> None:
119
  async with aiosqlite.connect(DB_PATH) as db:
120
  await db.execute(
121
  """INSERT INTO interactions
122
+ (user_id, paper_id, event_type, source, position, query_id,
123
+ ranker_version, candidate_source, cluster_id)
124
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
125
+ (user_id, paper_id, event_type, source, position, query_id,
126
+ ranker_version, candidate_source, cluster_id),
127
  )
128
  await db.commit()
129
 
 
301
  )
302
  rows = await cur.fetchall()
303
  return [dict(r) for r in rows]
304
+
305
+
306
+ # ── Phase 4.3: Category suppression helpers ───────────────────────────────────
307
+
308
async def cache_turso_metadata_batch(papers: list[dict]) -> None:
    """
    Write Turso paper dicts to the paper_metadata SQLite cache.

    Populates the table that the dismissal-category query joins against.
    Best-effort: a paper without an ``arxiv_id`` is skipped, and a row that
    cannot be written is logged and skipped rather than aborting the batch.

    Args:
        papers: paper dicts; the keys read are ``arxiv_id``, ``title``,
            ``abstract``, ``authors``, ``category`` and ``published``.
            Missing keys fall back to ``""`` (``"[]"`` for ``authors``).
    """
    # Local imports keep this block self-contained within the module.
    import json
    import logging
    import sqlite3

    if not papers:
        return

    log = logging.getLogger(__name__)
    async with aiosqlite.connect(DB_PATH) as conn:
        for paper in papers:
            arxiv_id = paper.get("arxiv_id")
            if not arxiv_id:
                continue
            try:
                authors = paper.get("authors", "[]")
                if isinstance(authors, (list, tuple)):
                    # sqlite3 cannot bind a list; the "[]" default suggests
                    # the column holds JSON text, so encode it the same way.
                    # NOTE(review): confirm against readers of this column.
                    authors = json.dumps(authors)
                await conn.execute(
                    """INSERT OR REPLACE INTO paper_metadata
                    (arxiv_id, title, abstract, authors, category, published)
                    VALUES (:arxiv_id, :title, :abstract, :authors, :category, :published)""",
                    {
                        "arxiv_id": arxiv_id,
                        "title": paper.get("title", ""),
                        "abstract": paper.get("abstract", ""),
                        "authors": authors,
                        "category": paper.get("category", ""),
                        "published": paper.get("published", ""),
                    },
                )
            except (sqlite3.Error, TypeError, ValueError, OverflowError) as exc:
                # Keep the batch going, but leave a trace of what was dropped.
                log.warning("paper_metadata cache skipped %r: %s", arxiv_id, exc)
        await conn.commit()
338
+
339
+
340
async def get_suppressed_categories(
    user_id: str,
    threshold: int = 3,
    window_days: int = 14,
) -> set[str]:
    """
    Return categories the user has strongly signalled disinterest in.

    A category is suppressed when the user has dismissed ≥ threshold
    *distinct* papers in that category within the last window_days days.
    Repeated ``not_interested`` events for the same paper count once, so a
    duplicated submission cannot suppress a whole category on its own.

    Only papers present in paper_metadata with a non-empty category take
    part, because the category comes from the join with that table.

    Args:
        user_id: user whose ``not_interested`` interactions are inspected.
        threshold: minimum number of distinct dismissed papers per category.
        window_days: look-back window in days, relative to SQLite's 'now'.

    Returns:
        Set of category strings; empty if no suppressions are found.
    """
    async with aiosqlite.connect(DB_PATH) as conn:
        cur = await conn.execute(
            """SELECT pm.category
            FROM interactions i
            JOIN paper_metadata pm ON i.paper_id = pm.arxiv_id
            WHERE i.user_id = ?
            AND i.event_type = 'not_interested'
            AND i.timestamp >= datetime('now', ? || ' days')
            AND pm.category != ''
            GROUP BY pm.category
            HAVING COUNT(DISTINCT i.paper_id) >= ?""",
            # The modifier is built as e.g. "-14" + " days" inside SQLite.
            (user_id, f"-{window_days}", threshold),
        )
        rows = await cur.fetchall()
    return {row[0] for row in rows}
app/recommend/clustering.py CHANGED
@@ -20,6 +20,7 @@ import json
20
  from dataclasses import dataclass, field
21
  import numpy as np
22
  from scipy.cluster.hierarchy import ward, fcluster
 
23
  from scipy.spatial.distance import pdist
24
 
25
  from app import db
@@ -183,6 +184,95 @@ def _find_medoid(embeddings: np.ndarray, centroid: np.ndarray) -> int:
183
  return int(np.argmin(distances))
184
 
185
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
186
  # ── Persistence ───────────────────────────────────────────────────────────────
187
 
188
  async def save_clusters_to_db(user_id: str, clusters: list[InterestCluster]) -> None:
 
20
  from dataclasses import dataclass, field
21
  import numpy as np
22
  from scipy.cluster.hierarchy import ward, fcluster
23
+ from scipy.optimize import linear_sum_assignment
24
  from scipy.spatial.distance import pdist
25
 
26
  from app import db
 
184
  return int(np.argmin(distances))
185
 
186
 
187
+ # ── Cluster ID stabilisation (Phase 4.2) ─────────────────────────────────────
188
+
189
# Hungarian matches below this cosine similarity are rejected as "unrelated".
# Doc 06 §"Clustering specifics": a genuinely new interest must not steal an
# old cluster's identity just because Hungarian found the least-bad assignment.
CLUSTER_MATCH_MIN_COSINE = 0.5


def stabilize_cluster_ids(
    new_clusters: list[InterestCluster],
    old_clusters: list[InterestCluster],
    min_cosine_sim: float = CLUSTER_MATCH_MIN_COSINE,
) -> list[InterestCluster]:
    """
    Preserve cluster identity across reclusters using the Hungarian algorithm.

    Without stabilisation, a recluster can shuffle cluster indices (NLP was
    0, now it's 2), breaking analytics and UI labels keyed on the index.

    Algorithm:
    1. Build cost matrix: cost[i][j] = 1 - cosine_sim(new_medoid_i, old_medoid_j)
    2. Solve with scipy linear_sum_assignment (handles rectangular matrices)
    3. Matched pairs with cosine_sim >= min_cosine_sim inherit the old idx
    4. Weak matches (cosine_sim < min_cosine_sim) and unmatched new clusters
       get the lowest index that is used by neither a matched pair nor ANY
       old cluster, so a new interest can never take over the index of an
       old cluster it was judged unrelated to.

    Args:
        new_clusters: freshly computed clusters (cluster_idx values ignored)
        old_clusters: clusters from the previous recluster (stable reference)
        min_cosine_sim: reject matches below this cosine similarity (default 0.5)

    Returns:
        New InterestCluster objects carrying stable cluster_idx values, in
        the same order as new_clusters. If either input list is empty,
        new_clusters is returned unchanged.
    """
    if not old_clusters or not new_clusters:
        return new_clusters

    new_embs = np.array([c.medoid_embedding for c in new_clusters], dtype=np.float32)
    old_embs = np.array([c.medoid_embedding for c in old_clusters], dtype=np.float32)

    # L2-normalise before cosine similarity; all-zero rows are left as zeros
    # (divisor forced to 1) so they yield similarity 0 instead of NaN.
    def _safe_norm(embs: np.ndarray) -> np.ndarray:
        norms = np.linalg.norm(embs, axis=1, keepdims=True)
        return embs / np.where(norms < 1e-10, 1.0, norms)

    new_embs = _safe_norm(new_embs)
    old_embs = _safe_norm(old_embs)

    # Cosine similarity → cost matrix (n_new × n_old)
    sim = new_embs @ old_embs.T
    cost = 1.0 - sim

    # Hungarian assignment — works on rectangular matrices
    row_ind, col_ind = linear_sum_assignment(cost)

    # Accept only pairs whose cosine similarity clears the threshold.
    # Weak matches would steal an old cluster's identity for an unrelated topic.
    new_to_stable: dict[int, int] = {}
    for r, c in zip(row_ind, col_ind):
        if float(sim[r, c]) >= min_cosine_sim:
            new_to_stable[int(r)] = old_clusters[int(c)].cluster_idx

    # Reserve every index of the previous generation, matched or not.
    # Reserving only the matched ones would let a rejected new cluster pick
    # up, via "next free index", the very identity it was refused.
    used_ids: set[int] = {c.cluster_idx for c in old_clusters}
    used_ids.update(new_to_stable.values())
    next_id = 0

    result: list[InterestCluster] = []
    for i, cluster in enumerate(new_clusters):
        if i in new_to_stable:
            stable_idx = new_to_stable[i]
        else:
            # No strong match — assign next free index
            while next_id in used_ids:
                next_id += 1
            stable_idx = next_id
            used_ids.add(stable_idx)
            next_id += 1

        result.append(InterestCluster(
            cluster_idx=stable_idx,
            medoid_paper_id=cluster.medoid_paper_id,
            medoid_embedding=cluster.medoid_embedding,
            paper_ids=cluster.paper_ids,
            importance=cluster.importance,
        ))

    return result
274
+
275
+
276
  # ── Persistence ───────────────────────────────────────────────────────────────
277
 
278
  async def save_clusters_to_db(user_id: str, clusters: list[InterestCluster]) -> None:
app/recommend/fusion.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Importance-weighted quota fusion for multi-interest recommendations.
3
+
4
+ Replaces RRF for the recommendation pipeline (not search).
5
+
6
+ RRF is correct for search (different retrievers, same query).
7
+ For recommendations (different cluster queries, same user), RRF lets
8
+ the dominant cluster drown minority interests. Quota ensures every
9
+ interest cluster gets a guaranteed floor of slots.
10
+
11
+ Reference: doc 06 §3.1 — "importance-weighted quota with a floor"
12
+ w_k = importance_k / sum(importance_k)
13
+ slot_k = max(floor(F * w_k), F_min) # F = total, F_min = 3
14
+ # distribute remainder by largest fractional part
15
+ """
16
+ from __future__ import annotations
17
+
18
+
19
def allocate_quotas(
    importances: list[float],
    total_slots: int,
    min_slots: int = 3,
) -> list[int]:
    """
    Allocate recommendation slots proportionally to cluster importances,
    with a guaranteed minimum per cluster.

    Negative, NaN and infinite importances are treated as zero weight, so a
    malformed score can neither inflate another cluster's share beyond
    total_slots nor raise while flooring.

    Args:
        importances: importance score per cluster, same order as clusters
        total_slots: total candidate slots to distribute (e.g. 100)
        min_slots: minimum slots guaranteed to every cluster (default 3)

    Returns:
        List of slot counts, same length and order as importances.
        sum(result) >= total_slots (may exceed if floor constraints force it).
    """
    import math

    n = len(importances)
    if n == 0:
        return []
    if n == 1:
        return [max(total_slots, min_slots)]

    # Sanitise: only finite, positive scores carry weight.
    weights = [
        imp if math.isfinite(imp) and imp > 0 else 0.0
        for imp in importances
    ]
    total_imp = sum(weights)

    if total_imp <= 0:
        # Degenerate: equal distribution (earlier clusters absorb the
        # leftover slots), then the floor guarantee.
        per, extra = divmod(total_slots, n)
        even = [per + 1 if i < extra else per for i in range(n)]
        return [max(r, min_slots) for r in even]

    # Proportional raw allocations
    raw = [w / total_imp * total_slots for w in weights]

    # Apply floor: max(floor(raw_i), min_slots)
    floored = [max(int(r), min_slots) for r in raw]

    remainder = total_slots - sum(floored)
    if remainder <= 0:
        # Floor guarantees already account for all slots (or more)
        return floored

    # Distribute remainder slots by largest fractional part of raw allocations
    by_fraction = sorted(range(n), key=lambda i: raw[i] % 1.0, reverse=True)
    for j in range(remainder):
        floored[by_fraction[j % n]] += 1

    return floored
71
+
72
+
73
def merge_quota_results(
    per_cluster_ids: list[list[str]],
    quotas: list[int],
) -> list[str]:
    """
    Merge per-cluster search results respecting quota allocations.

    Clusters are drained one after another in the order given (NOT
    interleaved): from each cluster, results are taken in list order until
    `quota_k` ids that no earlier cluster already contributed have been
    collected, or the cluster's list is exhausted. Duplicates are skipped
    and do not consume quota.

    Args:
        per_cluster_ids: list of arxiv_id lists, one per cluster (importance order)
        quotas: slot count for each cluster (same order)

    Returns:
        Merged list of arxiv_ids, deduplicated, quota-bounded per cluster.
        Pairs are formed with zip, so surplus entries in the longer of the
        two arguments are ignored.
    """
    seen: set[str] = set()
    merged: list[str] = []

    for cluster_ids, quota in zip(per_cluster_ids, quotas):
        taken = 0
        for aid in cluster_ids:
            if taken >= quota:
                break
            if aid in seen:
                continue
            seen.add(aid)
            merged.append(aid)
            taken += 1

    return merged
app/routers/events.py CHANGED
@@ -24,6 +24,9 @@ async def save_paper(
24
  source: str = Form(default="search"),
25
  position: int = Form(default=0),
26
  query_id: str = Form(default=""),
 
 
 
27
  user_id: str | None = Cookie(default=None, alias=COOKIE_NAME),
28
  ):
29
  user_id = user_id or str(uuid.uuid4())
@@ -35,6 +38,9 @@ async def save_paper(
35
  source=source,
36
  position=position or None,
37
  query_id=query_id or None,
 
 
 
38
  )
39
 
40
  us.record_positive(user_id, paper_id)
@@ -57,6 +63,9 @@ async def not_interested(
57
  source: str = Form(default="search"),
58
  position: int = Form(default=0),
59
  query_id: str = Form(default=""),
 
 
 
60
  user_id: str | None = Cookie(default=None, alias=COOKIE_NAME),
61
  ):
62
  user_id = user_id or str(uuid.uuid4())
@@ -68,6 +77,9 @@ async def not_interested(
68
  source=source,
69
  position=position or None,
70
  query_id=query_id or None,
 
 
 
71
  )
72
 
73
  us.record_negative(user_id, paper_id)
 
24
  source: str = Form(default="search"),
25
  position: int = Form(default=0),
26
  query_id: str = Form(default=""),
27
+ ranker_version: str = Form(default=""),
28
+ candidate_source: str = Form(default=""),
29
+ cluster_id: str = Form(default=""),
30
  user_id: str | None = Cookie(default=None, alias=COOKIE_NAME),
31
  ):
32
  user_id = user_id or str(uuid.uuid4())
 
38
  source=source,
39
  position=position or None,
40
  query_id=query_id or None,
41
+ ranker_version=ranker_version or None,
42
+ candidate_source=candidate_source or None,
43
+ cluster_id=int(cluster_id) if cluster_id else None,
44
  )
45
 
46
  us.record_positive(user_id, paper_id)
 
63
  source: str = Form(default="search"),
64
  position: int = Form(default=0),
65
  query_id: str = Form(default=""),
66
+ ranker_version: str = Form(default=""),
67
+ candidate_source: str = Form(default=""),
68
+ cluster_id: str = Form(default=""),
69
  user_id: str | None = Cookie(default=None, alias=COOKIE_NAME),
70
  ):
71
  user_id = user_id or str(uuid.uuid4())
 
77
  source=source,
78
  position=position or None,
79
  query_id=query_id or None,
80
+ ranker_version=ranker_version or None,
81
+ candidate_source=candidate_source or None,
82
+ cluster_id=int(cluster_id) if cluster_id else None,
83
  )
84
 
85
  us.record_negative(user_id, paper_id)
app/routers/recommendations.py CHANGED
@@ -6,16 +6,21 @@ GET /api/recommendations
6
  – Returns the recommendations partial HTML
7
 
8
  Recommendation pipeline (cascading fallback):
9
- Phase 2b: Multi-interest clustering → prefetch + RRF fusion (≥5 saves)
10
- Phase 2a: EWMA long-term vector → single vector search (≥3 saves)
11
- Phase 1: Qdrant BEST_SCORE Recommend API with raw IDs (≥1 save)
 
 
 
 
 
12
  """
13
- import json
14
  import uuid
15
  import numpy as np
16
  from fastapi import APIRouter, Request, Cookie
17
  from fastapi.responses import HTMLResponse
18
- from app import qdrant_svc, arxiv_svc, user_state as us
19
  from app.config import COOKIE_NAME, REC_LIMIT, REC_MIN_POSITIVES
20
  from app.templates_env import templates
21
  from app.recommend import profiles
@@ -23,16 +28,28 @@ from app.recommend.clustering import (
23
  compute_clusters,
24
  save_clusters_to_db,
25
  load_clusters_from_db,
 
26
  MIN_PAPERS_FOR_CLUSTERING,
27
  )
 
28
  from app.recommend.reranker import rerank_candidates
29
  from app.recommend.diversity import mmr_rerank, inject_exploration
30
 
31
  router = APIRouter(prefix="/api")
32
 
 
 
 
 
33
  # Minimum EWMA interactions before switching from ID-based to vector-based recs
34
  _MIN_EWMA_INTERACTIONS = 3
35
 
 
 
 
 
 
 
36
 
37
  @router.get("/recommendations", response_class=HTMLResponse)
38
  async def get_recommendations(
@@ -56,14 +73,27 @@ async def get_recommendations(
56
 
57
  seen = us.all_seen(user_id)
58
 
59
- # ── Tier 1: Multi-interest clustering + RRF (Phase 2b, ≥5 saves) ─────
60
- rec_arxiv_ids = await _multi_interest_recommend(user_id, state, seen, REC_LIMIT)
 
 
61
 
62
- # ── Tier 2: EWMA single-vector search (Phase 2a, ≥3 saves) ───────────
 
 
 
 
 
63
  if not rec_arxiv_ids:
64
  rec_arxiv_ids = await _ewma_recommend(user_id, seen, REC_LIMIT)
65
-
66
- # ── Tier 3: Qdrant Recommend API (Phase 1 fallback, ≥1 save) ─────────
 
 
 
 
 
 
67
  if not rec_arxiv_ids:
68
  rec_arxiv_ids = await qdrant_svc.recommend(
69
  positive_arxiv_ids=state.positive_list,
@@ -71,16 +101,43 @@ async def get_recommendations(
71
  seen_arxiv_ids=seen,
72
  limit=REC_LIMIT,
73
  )
 
 
 
 
 
 
74
 
75
  if not rec_arxiv_ids:
76
  return _empty_resp()
77
 
78
- meta = await arxiv_svc.fetch_metadata_batch(rec_arxiv_ids)
79
- papers = [
80
- {**meta[aid], "saved": False, "dismissed": False}
81
- for aid in rec_arxiv_ids
82
- if aid in meta
83
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
 
85
  resp = templates.TemplateResponse(
86
  request,
@@ -91,35 +148,34 @@ async def get_recommendations(
91
  return resp
92
 
93
 
94
- # ── Tier 1: Multi-interest clustering + prefetch RRF ─────────────────────────
95
-
96
- # Per-cluster candidate limits (descending by importance)
97
- _CLUSTER_LIMITS = [40, 30, 25, 20, 15, 15, 15]
98
-
99
 
100
  async def _multi_interest_recommend(
101
  user_id: str, state, seen: set[str], limit: int
102
- ) -> list[str]:
103
  """
104
- Full recommendation pipeline (Phase 2b + 2c):
105
  1. Ward clustering → identify distinct interests
106
- 2. Prefetch + RRF → retrieve ~100 candidates
107
- 3. Heuristic re-ranking → score candidates
108
- 4. MMR diversity → select top-k with diversity
109
- 5. Exploration injection → 1-2 serendipitous papers
110
-
111
- Only activates when the user has MIN_PAPERS_FOR_CLUSTERING saves.
112
- Returns [] to trigger fallback to Tier 2.
 
 
 
113
  """
114
  positives = state.positive_list
115
  if len(positives) < MIN_PAPERS_FOR_CLUSTERING:
116
- return []
117
 
118
  try:
119
  # Fetch embeddings for all saved papers
120
  vectors = await qdrant_svc.get_paper_vectors(positives)
121
  if len(vectors) < MIN_PAPERS_FOR_CLUSTERING:
122
- return []
123
 
124
  # Build aligned arrays (only papers we got vectors for)
125
  aligned_ids = [pid for pid in positives if pid in vectors]
@@ -129,38 +185,89 @@ async def _multi_interest_recommend(
129
 
130
  # ── Step 1: Compute interest clusters ─────────────────────────────
131
  clusters = compute_clusters(aligned_ids, aligned_embs)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
  await save_clusters_to_db(user_id, clusters)
133
 
134
- # ── Step 2: Multi-interest retrieval via prefetch + RRF ───────────
135
- interest_vectors = []
136
- for i, cluster in enumerate(clusters):
137
- per_cluster_limit = _CLUSTER_LIMITS[i] if i < len(_CLUSTER_LIMITS) else 15
138
- interest_vectors.append(
139
- (cluster.medoid_embedding.tolist(), per_cluster_limit)
140
- )
141
 
 
142
  st_vec = await profiles.load_profile(user_id, "short_term")
143
- st_list = st_vec.tolist() if st_vec is not None else None
144
 
145
- candidate_ids = await qdrant_svc.multi_interest_search(
146
- interest_vectors=interest_vectors,
147
- short_term_vector=st_list,
148
- exclude_ids=seen,
149
- total_limit=100, # retrieve wide, narrow with re-ranking
150
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
 
152
  if not candidate_ids:
153
- return []
154
 
155
- # ── Step 3: Re-rank candidates ────────────────────────────────────
156
- # Fetch embeddings + metadata for candidates
157
  cand_vectors = await qdrant_svc.get_paper_vectors(candidate_ids)
158
- cand_meta = await arxiv_svc.fetch_metadata_batch(candidate_ids)
159
-
160
- # Only process candidates we have both vectors and metadata for
 
 
 
 
 
 
 
 
 
 
161
  valid_ids = [cid for cid in candidate_ids if cid in cand_vectors and cid in cand_meta]
162
  if not valid_ids:
163
- return candidate_ids[:limit] # fallback: return raw retrieval
164
 
165
  valid_embs = np.array([cand_vectors[cid] for cid in valid_ids], dtype=np.float32)
166
  valid_meta = [cand_meta[cid] for cid in valid_ids]
@@ -168,6 +275,7 @@ async def _multi_interest_recommend(
168
  lt_vec = await profiles.load_profile(user_id, "long_term")
169
  neg_vec = await profiles.load_profile(user_id, "negative")
170
 
 
171
  reranked_ids, reranked_scores, reranked_embs = rerank_candidates(
172
  candidate_ids=valid_ids,
173
  candidate_embeddings=valid_embs,
@@ -177,7 +285,19 @@ async def _multi_interest_recommend(
177
  negative_vec=neg_vec,
178
  )
179
 
180
- # ── Step 4: MMR diversity enforcement ─────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
181
  query_vec = lt_vec if lt_vec is not None else aligned_embs.mean(axis=0)
182
  mmr_selected = mmr_rerank(
183
  query_embedding=query_vec,
@@ -188,18 +308,38 @@ async def _multi_interest_recommend(
188
  top_k=limit,
189
  )
190
 
191
- # ── Step 5: Exploration injection ─────────────────────────────────
192
  final = inject_exploration(
193
  selected_ids=mmr_selected,
194
  all_candidate_ids=reranked_ids,
195
  n_explore=2,
196
  )
197
-
198
- return final[:limit + 2] # allow slightly over limit for exploration
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
 
200
  except Exception as e:
201
  print(f"[recommendations] multi-interest search failed: {e}")
202
- return []
203
 
204
 
205
  # ── Tier 2: EWMA single-vector search ────────────────────────────────────────
@@ -227,5 +367,3 @@ async def _ewma_recommend(
227
  limit=limit,
228
  exclude_ids=seen,
229
  )
230
-
231
-
 
6
  – Returns the recommendations partial HTML
7
 
8
  Recommendation pipeline (cascading fallback):
9
+ Phase 2b / 4.1: Multi-interest clustering → quota fusion (≥5 saves)
10
+ Phase 2a: EWMA long-term vector → single vector search (≥3 saves)
11
+ Phase 1: Qdrant BEST_SCORE Recommend API with raw IDs (≥1 save)
12
+
13
+ Phase 4 changes vs Phase 2b:
14
+ - RRF replaced with importance-weighted quota fusion (doc 06 §3.1)
15
+ - Hungarian matching stabilises cluster IDs across reclusters (4.2)
16
+ - Category-level suppression filters strongly disliked topics (4.3)
17
  """
18
+ import asyncio
19
  import uuid
20
  import numpy as np
21
  from fastapi import APIRouter, Request, Cookie
22
  from fastapi.responses import HTMLResponse
23
+ from app import db, qdrant_svc, arxiv_svc, turso_svc, user_state as us
24
  from app.config import COOKIE_NAME, REC_LIMIT, REC_MIN_POSITIVES
25
  from app.templates_env import templates
26
  from app.recommend import profiles
 
28
  compute_clusters,
29
  save_clusters_to_db,
30
  load_clusters_from_db,
31
+ stabilize_cluster_ids,
32
  MIN_PAPERS_FOR_CLUSTERING,
33
  )
34
+ from app.recommend.fusion import allocate_quotas, merge_quota_results
35
  from app.recommend.reranker import rerank_candidates
36
  from app.recommend.diversity import mmr_rerank, inject_exploration
37
 
38
  router = APIRouter(prefix="/api")
39
 
40
+ # Phase 4.5: Pipeline version tag for instrumentation. Bump this on any
41
+ # change to the ranking logic so A/B attribution is possible.
42
+ _RANKER_VERSION = "v4.1_quota_hungarian_suppression"
43
+
44
  # Minimum EWMA interactions before switching from ID-based to vector-based recs
45
  _MIN_EWMA_INTERACTIONS = 3
46
 
47
+ # Candidate oversampling factor per cluster (fetch more than quota to handle dedup)
48
+ _OVERSAMPLE = 3
49
+
50
+ # Short-term session context: fixed supplementary pool size
51
+ _ST_SUPPLEMENT = 20
52
+
53
 
54
  @router.get("/recommendations", response_class=HTMLResponse)
55
  async def get_recommendations(
 
73
 
74
  seen = us.all_seen(user_id)
75
 
76
+ # Phase 4.5: paper_tags maps arxiv_id → instrumentation metadata
77
+ # populated by whichever tier serves the result.
78
+ paper_tags: dict[str, dict] = {}
79
+ rec_arxiv_ids: list[str] = []
80
 
81
+ # ── Tier 1: Multi-interest clustering + quota fusion (≥5 saves) ──────
82
+ rec_arxiv_ids, paper_tags = await _multi_interest_recommend(
83
+ user_id, state, seen, REC_LIMIT,
84
+ )
85
+
86
+ # ── Tier 2: EWMA single-vector search (≥3 saves) ──────────────────────
87
  if not rec_arxiv_ids:
88
  rec_arxiv_ids = await _ewma_recommend(user_id, seen, REC_LIMIT)
89
+ for aid in rec_arxiv_ids:
90
+ paper_tags[aid] = {
91
+ "ranker_version": _RANKER_VERSION,
92
+ "candidate_source": "ewma_longterm",
93
+ "cluster_id": "",
94
+ }
95
+
96
+ # ── Tier 3: Qdrant Recommend API (≥1 save fallback) ───────────────────
97
  if not rec_arxiv_ids:
98
  rec_arxiv_ids = await qdrant_svc.recommend(
99
  positive_arxiv_ids=state.positive_list,
 
101
  seen_arxiv_ids=seen,
102
  limit=REC_LIMIT,
103
  )
104
+ for aid in rec_arxiv_ids:
105
+ paper_tags[aid] = {
106
+ "ranker_version": _RANKER_VERSION,
107
+ "candidate_source": "qdrant_recommend",
108
+ "cluster_id": "",
109
+ }
110
 
111
  if not rec_arxiv_ids:
112
  return _empty_resp()
113
 
114
+ # Phase 3.5: Turso primary, arXiv API fallback
115
+ meta = await turso_svc.fetch_metadata_batch(rec_arxiv_ids)
116
+ missing = [aid for aid in rec_arxiv_ids if aid not in meta]
117
+ if missing:
118
+ try:
119
+ arxiv_meta = await arxiv_svc.fetch_metadata_batch(missing)
120
+ meta.update(arxiv_meta)
121
+ except Exception as e:
122
+ print(f"[recommendations] arXiv fallback for {len(missing)} IDs failed: {e}")
123
+
124
+ # Cache to SQLite so category suppression JOINs work (Phase 4.3)
125
+ await db.cache_turso_metadata_batch(list(meta.values()))
126
+
127
+ papers = []
128
+ for aid in rec_arxiv_ids:
129
+ if aid not in meta:
130
+ continue
131
+ tags = paper_tags.get(aid, {})
132
+ papers.append({
133
+ **meta[aid],
134
+ "saved": False,
135
+ "dismissed": False,
136
+ # Phase 4.5 instrumentation — embedded in card, flows back via HTMX
137
+ "ranker_version": tags.get("ranker_version", _RANKER_VERSION),
138
+ "candidate_source": tags.get("candidate_source", ""),
139
+ "cluster_id": tags.get("cluster_id", ""),
140
+ })
141
 
142
  resp = templates.TemplateResponse(
143
  request,
 
148
  return resp
149
 
150
 
151
+ # ── Tier 1: Multi-interest clustering + quota fusion ─────────────────────────
 
 
 
 
152
 
153
  async def _multi_interest_recommend(
154
  user_id: str, state, seen: set[str], limit: int
155
+ ) -> tuple[list[str], dict[str, dict]]:
156
  """
157
+ Full recommendation pipeline (Phase 2b + Phase 4 corrections):
158
  1. Ward clustering → identify distinct interests
159
+ 2. Quota allocation → per-cluster slot budgets (replaces RRF)
160
+ 3. Parallel per-cluster ANN searches → retrieve candidates
161
+ 4. Hungarian matching → stabilise cluster IDs across reclusters
162
+ 5. Category suppression → remove strongly disliked topics
163
+ 6. Heuristic re-ranking → score candidates
164
+ 7. MMR diversity → select top-k with diversity
165
+ 8. Exploration injection → serendipitous papers
166
+
167
+ Returns ([], {}) to trigger fallback to Tier 2.
168
+ Phase 4.5: second element is {arxiv_id: {ranker_version, candidate_source, cluster_id}}.
169
  """
170
  positives = state.positive_list
171
  if len(positives) < MIN_PAPERS_FOR_CLUSTERING:
172
+ return [], {}
173
 
174
  try:
175
  # Fetch embeddings for all saved papers
176
  vectors = await qdrant_svc.get_paper_vectors(positives)
177
  if len(vectors) < MIN_PAPERS_FOR_CLUSTERING:
178
+ return [], {}
179
 
180
  # Build aligned arrays (only papers we got vectors for)
181
  aligned_ids = [pid for pid in positives if pid in vectors]
 
185
 
186
  # ── Step 1: Compute interest clusters ─────────────────────────────
187
  clusters = compute_clusters(aligned_ids, aligned_embs)
188
+
189
+ # ── Step 4.2: Stabilise cluster IDs with Hungarian matching ───────
190
+ old_clusters_data = await load_clusters_from_db(user_id)
191
+ if old_clusters_data:
192
+ from app.recommend.clustering import InterestCluster
193
+ old_clusters = [
194
+ InterestCluster(
195
+ cluster_idx=row["cluster_idx"],
196
+ medoid_paper_id=row["medoid_paper_id"],
197
+ medoid_embedding=np.array(
198
+ vectors[row["medoid_paper_id"]], dtype=np.float32
199
+ ) if row["medoid_paper_id"] in vectors else np.zeros(1024, dtype=np.float32),
200
+ paper_ids=[],
201
+ importance=row["importance"],
202
+ )
203
+ for row in old_clusters_data
204
+ ]
205
+ clusters = stabilize_cluster_ids(clusters, old_clusters)
206
+
207
  await save_clusters_to_db(user_id, clusters)
208
 
209
+ # ── Step 2: Quota allocation ───────────────────────────────────────
210
+ importances = [c.importance for c in clusters]
211
+ quotas = allocate_quotas(importances, total_slots=100, min_slots=3)
 
 
 
 
212
 
213
+ # ── Step 3: Parallel per-cluster ANN searches ─────────────────────
214
  st_vec = await profiles.load_profile(user_id, "short_term")
 
215
 
216
+ search_tasks = [
217
+ qdrant_svc.search_by_vector(
218
+ query_vector=c.medoid_embedding.tolist(),
219
+ limit=quota * _OVERSAMPLE,
220
+ exclude_ids=seen,
221
+ )
222
+ for c, quota in zip(clusters, quotas)
223
+ ]
224
+ per_cluster_results = await asyncio.gather(*search_tasks)
225
+
226
+ # Phase 4.5: Build paper → cluster mapping BEFORE merge (so we know
227
+ # which cluster each paper was retrieved from).
228
+ paper_cluster_map: dict[str, int] = {}
229
+ for cluster, result_ids in zip(clusters, per_cluster_results):
230
+ for aid in result_ids:
231
+ if aid not in paper_cluster_map: # first-occurrence wins
232
+ paper_cluster_map[aid] = cluster.cluster_idx
233
+
234
+ # Apply quota merge (dedup globally, respect per-cluster quotas)
235
+ candidate_ids = merge_quota_results(list(per_cluster_results), quotas)
236
+
237
+ # Supplement with short-term session context
238
+ if st_vec is not None:
239
+ seen_so_far = seen | set(candidate_ids)
240
+ st_results = await qdrant_svc.search_by_vector(
241
+ query_vector=st_vec.tolist(),
242
+ limit=_ST_SUPPLEMENT,
243
+ exclude_ids=seen_so_far,
244
+ )
245
+ for aid in st_results:
246
+ if aid not in set(candidate_ids):
247
+ candidate_ids.append(aid)
248
+ paper_cluster_map[aid] = -1 # short-term supplement
249
 
250
  if not candidate_ids:
251
+ return [], {}
252
 
253
+ # ── Step 5: Fetch candidate vectors + metadata ────────────────────
 
254
  cand_vectors = await qdrant_svc.get_paper_vectors(candidate_ids)
255
+ cand_meta = await turso_svc.fetch_metadata_batch(candidate_ids)
256
+ cand_missing = [cid for cid in candidate_ids if cid not in cand_meta]
257
+ if cand_missing:
258
+ try:
259
+ arxiv_cand_meta = await arxiv_svc.fetch_metadata_batch(cand_missing)
260
+ cand_meta.update(arxiv_cand_meta)
261
+ except Exception as e:
262
+ print(f"[recommendations] arXiv fallback for {len(cand_missing)} IDs failed: {e}")
263
+
264
+ # Cache fetched metadata to SQLite for category suppression
265
+ await db.cache_turso_metadata_batch(list(cand_meta.values()))
266
+
267
+ # Only process candidates with both vectors and metadata
268
  valid_ids = [cid for cid in candidate_ids if cid in cand_vectors and cid in cand_meta]
269
  if not valid_ids:
270
+ return candidate_ids[:limit], {}
271
 
272
  valid_embs = np.array([cand_vectors[cid] for cid in valid_ids], dtype=np.float32)
273
  valid_meta = [cand_meta[cid] for cid in valid_ids]
 
275
  lt_vec = await profiles.load_profile(user_id, "long_term")
276
  neg_vec = await profiles.load_profile(user_id, "negative")
277
 
278
+ # ── Step 6: Heuristic re-ranking ──────────────────────────────────
279
  reranked_ids, reranked_scores, reranked_embs = rerank_candidates(
280
  candidate_ids=valid_ids,
281
  candidate_embeddings=valid_embs,
 
285
  negative_vec=neg_vec,
286
  )
287
 
288
+ # ── Step 4.3: Category suppression ────────────────────────────────
289
+ suppressed = await db.get_suppressed_categories(user_id)
290
+ if suppressed:
291
+ kept = [
292
+ i for i, cid in enumerate(reranked_ids)
293
+ if cand_meta.get(cid, {}).get("category", "") not in suppressed
294
+ ]
295
+ if kept:
296
+ reranked_ids = [reranked_ids[i] for i in kept]
297
+ reranked_scores = [reranked_scores[i] for i in kept]
298
+ reranked_embs = reranked_embs[kept]
299
+
300
+ # ── Step 7: MMR diversity enforcement ─────────────────────────────
301
  query_vec = lt_vec if lt_vec is not None else aligned_embs.mean(axis=0)
302
  mmr_selected = mmr_rerank(
303
  query_embedding=query_vec,
 
308
  top_k=limit,
309
  )
310
 
311
+ # ── Step 8: Exploration injection ─────────────────────────────────
312
  final = inject_exploration(
313
  selected_ids=mmr_selected,
314
  all_candidate_ids=reranked_ids,
315
  n_explore=2,
316
  )
317
+ final = final[:limit + 2]
318
+
319
+ # Phase 4.5: Build per-paper instrumentation tags
320
+ exploration_set = set(final) - set(mmr_selected)
321
+ paper_tags: dict[str, dict] = {}
322
+ for aid in final:
323
+ cluster_idx = paper_cluster_map.get(aid)
324
+ if aid in exploration_set:
325
+ source = "exploration"
326
+ elif cluster_idx == -1:
327
+ source = "short_term_supplement"
328
+ elif cluster_idx is not None:
329
+ source = f"cluster_{cluster_idx}"
330
+ else:
331
+ source = "tier1_unknown"
332
+ paper_tags[aid] = {
333
+ "ranker_version": _RANKER_VERSION,
334
+ "candidate_source": source,
335
+ "cluster_id": str(cluster_idx) if cluster_idx is not None and cluster_idx >= 0 else "",
336
+ }
337
+
338
+ return final, paper_tags
339
 
340
  except Exception as e:
341
  print(f"[recommendations] multi-interest search failed: {e}")
342
+ return [], {}
343
 
344
 
345
  # ── Tier 2: EWMA single-vector search ────────────────────────────────────────
 
367
  limit=limit,
368
  exclude_ids=seen,
369
  )
 
 
app/routers/saved.py CHANGED
@@ -3,12 +3,12 @@ Saved papers router.
3
 
4
  GET /saved
5
  – Shows all papers the user has currently saved (positive_list)
6
- – Metadata fetched via arXiv API + SQLite cache
7
  """
8
  import uuid
9
  from fastapi import APIRouter, Request, Cookie
10
  from fastapi.responses import HTMLResponse
11
- from app import arxiv_svc, user_state as us
12
  from app.config import COOKIE_NAME
13
  from app.templates_env import templates
14
 
@@ -27,7 +27,18 @@ async def saved_papers(
27
 
28
  papers = []
29
  if saved_ids:
30
- meta = await arxiv_svc.fetch_metadata_batch(saved_ids)
 
 
 
 
 
 
 
 
 
 
 
31
  papers = [
32
  {**meta[aid], "saved": True, "dismissed": False}
33
  for aid in saved_ids
 
3
 
4
  GET /saved
5
  – Shows all papers the user has currently saved (positive_list)
6
+ – Metadata fetched via Turso DB (Phase 3.5), arXiv API fallback
7
  """
8
  import uuid
9
  from fastapi import APIRouter, Request, Cookie
10
  from fastapi.responses import HTMLResponse
11
+ from app import arxiv_svc, db, turso_svc, user_state as us
12
  from app.config import COOKIE_NAME
13
  from app.templates_env import templates
14
 
 
27
 
28
  papers = []
29
  if saved_ids:
30
+ # Phase 3.5: Turso primary, arXiv API fallback
31
+ meta = await turso_svc.fetch_metadata_batch(saved_ids)
32
+ missing = [aid for aid in saved_ids if aid not in meta]
33
+ if missing:
34
+ try:
35
+ arxiv_meta = await arxiv_svc.fetch_metadata_batch(missing)
36
+ meta.update(arxiv_meta)
37
+ except Exception as e:
38
+ print(f"[saved] arXiv fallback for {len(missing)} IDs failed: {e}")
39
+ # Phase 4.3: Cache to SQLite so dismissal category JOINs work
40
+ await db.cache_turso_metadata_batch(list(meta.values()))
41
+
42
  papers = [
43
  {**meta[aid], "saved": True, "dismissed": False}
44
  for aid in saved_ids
app/routers/search.py CHANGED
@@ -14,7 +14,7 @@ Phase 3.5: Metadata now fetched from Turso cloud DB (fast, includes citations)
14
  import uuid
15
  from fastapi import APIRouter, Request, Cookie
16
  from fastapi.responses import HTMLResponse
17
- from app import arxiv_svc, turso_svc, user_state as us, hybrid_search_svc
18
  from app.config import COOKIE_NAME, ARXIV_MAX_RESULTS
19
  from app.templates_env import templates
20
 
@@ -53,6 +53,9 @@ async def search(
53
  except Exception as e:
54
  print(f"[search] arXiv fallback for {len(missing)} IDs failed: {e}")
55
 
 
 
 
56
  # Preserve ranking order from hybrid search
57
  papers = [meta[aid] for aid in arxiv_ids if aid in meta]
58
 
 
14
  import uuid
15
  from fastapi import APIRouter, Request, Cookie
16
  from fastapi.responses import HTMLResponse
17
+ from app import arxiv_svc, db, turso_svc, user_state as us, hybrid_search_svc
18
  from app.config import COOKIE_NAME, ARXIV_MAX_RESULTS
19
  from app.templates_env import templates
20
 
 
53
  except Exception as e:
54
  print(f"[search] arXiv fallback for {len(missing)} IDs failed: {e}")
55
 
56
+ # Phase 4.3: Cache to SQLite so dismissal category JOINs work
57
+ await db.cache_turso_metadata_batch(list(meta.values()))
58
+
59
  # Preserve ranking order from hybrid search
60
  papers = [meta[aid] for aid in arxiv_ids if aid in meta]
61
 
app/templates/index.html CHANGED
@@ -31,14 +31,17 @@
31
  <!-- Recommendations section -->
32
  <div>
33
  <h2 class="text-lg font-semibold mb-3">Recommended for You</h2>
34
- <div id="rec-section"
35
- hx-get="/api/recommendations"
36
- hx-trigger="load"
37
- hx-indicator="#rec-spinner"
38
- hx-swap="innerHTML">
39
- <div class="flex items-center gap-2 text-base-content/50">
40
- <span id="rec-spinner" class="htmx-indicator loading loading-spinner loading-sm"></span>
41
- <span>Loading recommendations…</span>
 
 
 
42
  </div>
43
  </div>
44
  </div>
 
31
  <!-- Recommendations section -->
32
  <div>
33
  <h2 class="text-lg font-semibold mb-3">Recommended for You</h2>
34
+ <div id="rec-section-wrapper" class="relative">
35
+ <span id="rec-spinner" class="htmx-indicator loading loading-spinner loading-sm absolute right-0 top-0"></span>
36
+ <div id="rec-section"
37
+ hx-get="/api/recommendations"
38
+ hx-trigger="load"
39
+ hx-indicator="#rec-spinner"
40
+ hx-swap="innerHTML">
41
+ <div class="flex items-center gap-2 text-base-content/50">
42
+ <span class="loading loading-spinner loading-sm"></span>
43
+ <span>Loading recommendations…</span>
44
+ </div>
45
  </div>
46
  </div>
47
  </div>
app/templates/partials/action_buttons.html CHANGED
@@ -2,12 +2,16 @@
2
  Action buttons for a paper card.
3
  Expects: paper_id (or paper.arxiv_id), saved (bool), dismissed (bool)
4
  Optional: source ("search" | "recommendation" | "saved"), position (int)
 
5
  These are returned directly by the /api/papers/{id}/save endpoint
6
  so they also work as a standalone partial.
7
  #}
8
  {% set pid = paper_id if paper_id is defined else paper.arxiv_id %}
9
  {% set is_saved = saved if saved is defined else (paper.saved | default(false)) %}
10
  {% set _source = source if source is defined else "search" %}
 
 
 
11
 
12
  {% if is_saved %}
13
  <!-- Already saved — show saved state, allow unsave via not-interested -->
@@ -19,7 +23,7 @@
19
  hx-post="/api/papers/{{ pid }}/not-interested"
20
  hx-target="#paper-{{ pid }}"
21
  hx-swap="outerHTML swap:200ms"
22
- hx-vals='{"source": "{{ _source }}"}'>
23
  Remove
24
  </button>
25
  </div>
@@ -28,9 +32,9 @@
28
  <!-- Save -->
29
  <button class="btn btn-primary btn-xs"
30
  hx-post="/api/papers/{{ pid }}/save"
31
- hx-target="#actions-{{ pid }}"
32
  hx-swap="innerHTML"
33
- hx-vals='{"source": "{{ _source }}", "position": "{{ position | default(0) }}"}'>
34
  ⭐ Save
35
  </button>
36
  <!-- Not interested (removes the whole card) -->
@@ -38,8 +42,9 @@
38
  hx-post="/api/papers/{{ pid }}/not-interested"
39
  hx-target="#paper-{{ pid }}"
40
  hx-swap="outerHTML swap:200ms"
41
- hx-vals='{"source": "{{ _source }}"}'>
42
  ✕ Not interested
43
  </button>
44
  </div>
45
  {% endif %}
 
 
2
  Action buttons for a paper card.
3
  Expects: paper_id (or paper.arxiv_id), saved (bool), dismissed (bool)
4
  Optional: source ("search" | "recommendation" | "saved"), position (int)
5
+ Phase 4.5: ranker_version, candidate_source, cluster_id (set by recommendations.py)
6
  These are returned directly by the /api/papers/{id}/save endpoint
7
  so they also work as a standalone partial.
8
  #}
9
  {% set pid = paper_id if paper_id is defined else paper.arxiv_id %}
10
  {% set is_saved = saved if saved is defined else (paper.saved | default(false)) %}
11
  {% set _source = source if source is defined else "search" %}
12
+ {% set _ranker_version = paper.ranker_version | default("") if paper is defined else "" %}
13
+ {% set _candidate_source = paper.candidate_source | default("") if paper is defined else "" %}
14
+ {% set _cluster_id = paper.cluster_id | default("") if paper is defined else "" %}
15
 
16
  {% if is_saved %}
17
  <!-- Already saved — show saved state, allow unsave via not-interested -->
 
23
  hx-post="/api/papers/{{ pid }}/not-interested"
24
  hx-target="#paper-{{ pid }}"
25
  hx-swap="outerHTML swap:200ms"
26
+ hx-vals='{"source": "{{ _source }}", "ranker_version": "{{ _ranker_version }}", "candidate_source": "{{ _candidate_source }}", "cluster_id": "{{ _cluster_id }}"}'>
27
  Remove
28
  </button>
29
  </div>
 
32
  <!-- Save -->
33
  <button class="btn btn-primary btn-xs"
34
  hx-post="/api/papers/{{ pid }}/save"
35
+ hx-target="[id='actions-{{ pid }}']"
36
  hx-swap="innerHTML"
37
+ hx-vals='{"source": "{{ _source }}", "position": "{{ position | default(0) }}", "ranker_version": "{{ _ranker_version }}", "candidate_source": "{{ _candidate_source }}", "cluster_id": "{{ _cluster_id }}"}'>
38
  ⭐ Save
39
  </button>
40
  <!-- Not interested (removes the whole card) -->
 
42
  hx-post="/api/papers/{{ pid }}/not-interested"
43
  hx-target="#paper-{{ pid }}"
44
  hx-swap="outerHTML swap:200ms"
45
+ hx-vals='{"source": "{{ _source }}", "ranker_version": "{{ _ranker_version }}", "candidate_source": "{{ _candidate_source }}", "cluster_id": "{{ _cluster_id }}"}'>
46
  ✕ Not interested
47
  </button>
48
  </div>
49
  {% endif %}
50
+
app/templates/partials/paper_card.html CHANGED
@@ -25,11 +25,14 @@
25
  {% endif %}
26
  </div>
27
 
28
- <!-- Meta: arXiv ID + year -->
29
  <div class="text-xs text-base-content/50">
30
  [{{ paper.arxiv_id }}]
31
  {% if paper.published %} · {{ paper.published[:4] }}{% endif %}
32
  {% if authors_list %} · {{ authors_list | join(", ") }}{% endif %}
 
 
 
33
  </div>
34
 
35
  <!-- Abstract (truncated) -->
 
25
  {% endif %}
26
  </div>
27
 
28
+ <!-- Meta: arXiv ID + year + citations -->
29
  <div class="text-xs text-base-content/50">
30
  [{{ paper.arxiv_id }}]
31
  {% if paper.published %} · {{ paper.published[:4] }}{% endif %}
32
  {% if authors_list %} · {{ authors_list | join(", ") }}{% endif %}
33
+ {% if paper.citation_count %}
34
+ · <span class="font-medium text-base-content/70" title="{{ paper.influential_citations|default(0) }} influential">📊 {{ paper.citation_count }} citations</span>
35
+ {% endif %}
36
  </div>
37
 
38
  <!-- Abstract (truncated) -->
app/turso_svc.py CHANGED
@@ -59,9 +59,11 @@ async def fetch_metadata_batch(arxiv_ids: list[str]) -> dict[str, dict]:
59
  pipeline_url = url.rstrip("/")
60
  # Convert to HTTP API URL format
61
  if pipeline_url.startswith("libsql://"):
62
- pipeline_url = pipeline_url.replace("libsql://", "https://")
63
- if not pipeline_url.startswith("https://"):
64
- pipeline_url = "https://" + pipeline_url.lstrip("https://").lstrip("http://")
 
 
65
 
66
  payload = {
67
  "requests": [
 
59
  pipeline_url = url.rstrip("/")
60
  # Convert to HTTP API URL format
61
  if pipeline_url.startswith("libsql://"):
62
+ pipeline_url = "https://" + pipeline_url[len("libsql://"):]
63
+ elif pipeline_url.startswith("http://"):
64
+ pipeline_url = "https://" + pipeline_url[len("http://"):]
65
+ elif not pipeline_url.startswith("https://"):
66
+ pipeline_url = "https://" + pipeline_url
67
 
68
  payload = {
69
  "requests": [
docs/TASK-TRACKER.md CHANGED
@@ -1,8 +1,8 @@
1
  # ResearchIT — Master Task Tracker
2
 
3
  > **Purpose**: Single source of truth for all completed, in-progress, and upcoming work.
4
- > **Last updated**: 2026-04-20
5
- > **Current phase**: Phase 3.5 (Turso Metadata DB) — COMPLETE ✔
6
 
7
  ---
8
 
@@ -241,21 +241,25 @@
241
 
242
  ---
243
 
244
- ## Phase 4: Recommendation Pipeline Fixes 📋 NOT STARTED
245
 
246
- > *Fix the known architectural debt in the recommendation pipeline.*
247
- > *Estimated effort: ~1 week*
248
 
249
  ### 4.1 — Replace RRF with Importance-Weighted Quota Fusion
250
- - [ ] Create `app/recommend/fusion.py` — quota allocation logic
251
  - `w_k = importance_k / sum(importance_k)`
252
  - `slot_k = max(floor(F × w_k), F_min=3)` — every cluster gets at least 3 slots
253
  - Distribute remainder by largest fractional part
254
- - [ ] Refactor `_multi_interest_recommend()` in `recommendations.py`
 
 
255
  - Replace `multi_interest_search()` with per-cluster separate ANN queries
256
- - Allocate feed slots proportionally
257
- - Deduplicate across clusters (assign to highest-ranked)
258
- - MMR over merged union
 
 
259
 
260
  ### 4.2 — Pre-populate Metadata Store ✅ DONE (via Turso)
261
  - [x] Bulk-loaded arXiv metadata from Kaggle to Turso cloud DB (Phase 3.5)
@@ -265,13 +269,60 @@
265
  - [x] **Impact**: Search time dropped from ~10.7s to ~1.75s on HF Spaces
266
 
267
  ### 4.3 — Hungarian Matching for Cluster Stability
268
- - [ ] Implement Hungarian matching in `clustering.py`
269
- - Match new cluster IDs to previous IDs by medoid similarity
270
- - Prevents cluster IDs from shuffling between reclusterings
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
271
 
272
- ### 4.4 — Wire Remaining Negative Signal Components
273
- - [ ] Per-item short-term decay: `score -= α × exp(-dt / τ_neg)` — needs per-item timestamp tracking
274
- - [ ] Category-level suppression: if ≥3 dismissals hit the same arXiv category within a week, suppress for 2 weeks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
275
 
276
  ---
277
 
@@ -306,7 +357,8 @@
306
 
307
  > *Replace heuristic scorer with a trained LightGBM lambdarank model.*
308
  > *Blocked by: ≥500 labeled interactions OR citation-graph bootstrap*
309
- > *Estimated effort: ~2-4 weeks*
 
310
 
311
  - [ ] Citation-graph pseudo-labels from unarXive 2022 (cited = relevance 2, co-cited = 1, random = 0)
312
  - [ ] Author-as-user simulation
@@ -329,11 +381,30 @@
329
 
330
  ## Phase 8: LLM Interest Summaries + Distilled Re-ranker 📋 NOT STARTED
331
 
332
- > *Estimated effort: ~2 weeks*
333
-
334
- - [ ] Claude/Groq interest summaries per cluster (human-readable descriptions)
335
- - [ ] Distill BGE-reranker-v2-m3 offline → TinyBERT-L2 student (FlashRank recipe)
336
- - [ ] Deploy student score as LightGBM feature on top-20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
337
 
338
  ---
339
 
@@ -380,18 +451,19 @@
380
  | Test File | Count | Status |
381
  |---|---|---|
382
  | `tests/test_profiles.py` | 11 | ✅ Passing |
383
- | `tests/test_clustering.py` | 10 | ✅ Passing |
384
  | `tests/test_reranker_diversity.py` | 13 | ✅ Passing |
385
- | `tests/test_db.py` | — | ✅ Passing |
 
386
  | `tests/test_qdrant_svc.py` | — | ✅ Passing |
387
  | `tests/test_arxiv_svc.py` | — | ✅ Passing |
388
- | `tests/test_integration.py` | — | ✅ Passing |
389
  | `tests/test_user_state.py` | — | ✅ Passing |
390
  | `tests/test_saved.py` | — | ✅ Passing |
391
  | `tests/test_hybrid_search.py` | 21 | ✅ Passing |
392
  | `tests/test_search_router.py` | 6 | ✅ Passing |
393
  | `tests/test_live_search.py` | 8 | ✅ Passing |
394
- | **Total** | **123** | ✅ |
395
  | `test_e2e_recs.py` (standalone) | 1 | ✅ E2E simulation |
396
 
397
  ---
@@ -404,8 +476,8 @@
404
  | L2-normalize before Ward clustering | ✅ Applied | `app/recommend/clustering.py` |
405
  | Medoid not centroid | ✅ Applied | `app/recommend/clustering.py` → `_find_medoid()` |
406
  | Negative EWMA wired into reranking | ✅ Applied | `app/recommend/reranker.py` → Feature 5 |
407
- | RRF → quota fusion for recommendations | [!] Backlog | Phase 4.1 |
408
- | Hungarian cluster matching | [!] Backlog | Phase 4.3 |
409
- | Per-item short-term negative decay | [!] Backlog | Phase 4.4 |
410
- | Category-level suppression | [!] Backlog | Phase 4.4 |
411
  | BGE-reranker NEVER in hot path | ✅ Followed | Heuristic scorer used instead |
 
1
  # ResearchIT — Master Task Tracker
2
 
3
  > **Purpose**: Single source of truth for all completed, in-progress, and upcoming work.
4
+ > **Last updated**: 2026-04-26
5
+ > **Current phase**: Phase 4.5 (Instrumentation Foundation) — COMPLETE ✔
6
 
7
  ---
8
 
 
241
 
242
  ---
243
 
244
+ ## Phase 4: Recommendation Pipeline Fixes ✅ COMPLETE
245
 
246
+ > *Fixed the known architectural debt in the recommendation pipeline.*
247
+ > *Detailed plan: `docs/phases/PHASE4-Recommendation-Pipeline-Fixes.md`*
248
 
249
  ### 4.1 — Replace RRF with Importance-Weighted Quota Fusion
250
+ - [x] Create `app/recommend/fusion.py` — quota allocation logic
251
  - `w_k = importance_k / sum(importance_k)`
252
  - `slot_k = max(floor(F × w_k), F_min=3)` — every cluster gets at least 3 slots
253
  - Distribute remainder by largest fractional part
254
+ - [x] Create `tests/test_fusion.py` — **20 unit tests** for quota allocation
255
+ - Proportionality, floor enforcement, total invariant, edge cases, Doc 06 worked examples
256
+ - [x] Refactor `_multi_interest_recommend()` in `recommendations.py`
257
  - Replace `multi_interest_search()` with per-cluster separate ANN queries
258
+ - Use `asyncio.gather()` for concurrent searches (~15ms wall-clock)
259
+ - Allocate feed slots proportionally via `allocate_quotas()`
260
+ - Deduplicate across clusters (first-occurrence = highest-ranked cluster wins)
261
+ - MMR over merged union (unchanged)
262
+ - [x] Keep `qdrant_svc.multi_interest_search()` in codebase (no deletion)
263
 
264
  ### 4.2 — Pre-populate Metadata Store ✅ DONE (via Turso)
265
  - [x] Bulk-loaded arXiv metadata from Kaggle to Turso cloud DB (Phase 3.5)
 
269
  - [x] **Impact**: Search time dropped from ~10.7s to ~1.75s on HF Spaces
270
 
271
  ### 4.3 — Hungarian Matching for Cluster Stability
272
+ - [x] Add `stabilize_cluster_ids()` function to `clustering.py`
273
+ - Uses `scipy.optimize.linear_sum_assignment` (already a dependency)
274
+ - Cost matrix: `1 - cosine_sim(new_medoid, old_medoid)` — trivial at K≤7
275
+ - Matched clusters keep old indices; new clusters get next available
276
+ - Min cosine threshold (0.5) rejects unrelated matches
277
+ - [x] Call between `compute_clusters()` and `save_clusters_to_db()` in recommendations.py
278
+ - [x] **10 tests** in `test_clustering.py` — perturbed clusters preserve indices,
279
+ unrelated match rejection, K growth/shrink, custom thresholds
280
+
281
+ ### 4.4 — Category-Level Negative Suppression
282
+ - [x] Add `get_suppressed_categories()` to `db.py`
283
+ - Joins `interactions` + `paper_metadata` to find categories with ≥3 dismissals
284
+ - **Primary category only** (decision: avoid over-suppression)
285
+ - **14-day window** (standard default, τ_neg = 14 days)
286
+ - [x] Add suppression filter in `_multi_interest_recommend()` after reranking
287
+ - [x] Cache Turso metadata to `paper_metadata` via `cache_turso_metadata_batch()`
288
+ - [x] **8 tests** in `test_db.py` — threshold, partitioning, user isolation, custom threshold
289
+ - [~] Per-item short-term decay → **deferred to Phase 6** (LightGBM feature)
290
+
291
+ **Gaps**: None.
292
+
293
+ ---
294
+
295
+ ## Phase 4.5: Instrumentation Foundation ✅ COMPLETE
296
 
297
+ > *Added telemetry columns to the interactions table so every saved/dismissed paper*
298
+ > *can be attributed to its pipeline tier, cluster origin, and ranker version.*
299
+ > *Doc 07 (ADR A4) identified this as the single most valuable early investment —*
300
+ > *retrofitting these fields after real user data exists is painful and blocks all*
301
+ > *later counterfactual evaluation.*
302
+
303
+ ### Schema changes
304
+ - [x] Add `ranker_version TEXT` to `interactions` table — pipeline version tag
305
+ - [x] Add `candidate_source TEXT` to `interactions` — e.g. `cluster_0`, `exploration`, `ewma_longterm`, `qdrant_recommend`, `short_term_supplement`
306
+ - [x] Add `cluster_id INTEGER` to `interactions` — interest cluster index (NULL if N/A)
307
+ - [x] ALTER TABLE migration for existing DBs (safe try/except, idempotent)
308
+
309
+ ### Pipeline tagging
310
+ - [x] Add `_RANKER_VERSION` constant to `recommendations.py`
311
+ - [x] Tag Tier 1 papers with cluster origin, exploration status, short-term supplement
312
+ - [x] Tag Tier 2 papers as `ewma_longterm`
313
+ - [x] Tag Tier 3 papers as `qdrant_recommend`
314
+ - [x] Build `paper_cluster_map` before quota merge (first-occurrence = cluster attribution)
315
+ - [x] Exploration papers tagged as `candidate_source='exploration'`
316
+
317
+ ### End-to-end flow
318
+ - [x] `recommendations.py` embeds tags in paper dicts
319
+ - [x] `action_buttons.html` includes tags in `hx-vals` JSON
320
+ - [x] `events.py` accepts `ranker_version`, `candidate_source`, `cluster_id` Form fields
321
+ - [x] `db.log_interaction()` stores all three new columns
322
+
323
+ **Files modified**: `app/db.py`, `app/routers/events.py`, `app/routers/recommendations.py`, `app/templates/partials/action_buttons.html`
324
+
325
+ **Gaps**: None. `propensity` and `policy_id` fields deferred until ε-greedy exploration (Phase 9).
326
 
327
  ---
328
 
 
357
 
358
  > *Replace heuristic scorer with a trained LightGBM lambdarank model.*
359
  > *Blocked by: ≥500 labeled interactions OR citation-graph bootstrap*
360
+ > *Estimated effort: ~2-4 weeks*
361
+ > *Architecture decision: one-stage LambdaMART first (Doc 07 ADR A3)*
362
 
363
  - [ ] Citation-graph pseudo-labels from unarXive 2022 (cited = relevance 2, co-cited = 1, random = 0)
364
  - [ ] Author-as-user simulation
 
381
 
382
  ## Phase 8: LLM Interest Summaries + Distilled Re-ranker 📋 NOT STARTED
383
 
384
+ > *Estimated effort: ~10-12 weeks (Doc 07)*
385
+ > *Detailed research plan: `docs/research/07-LLM-Summaries-Reranker-and-Scaling-Research.md`*
386
+ > *Entry criteria: Phase 7 eval producing stable nDCG@10; cluster stability Jaccard ≥0.7 over 7 days*
387
+
388
+ ### 8a — Claude-generated per-cluster interest summaries (Doc 07 §A)
389
+ - [ ] Cluster snapshot versioning (ADR A1)
390
+ - [ ] Content-addressed caching: `sha256(sorted(paper_ids) + prompt_version + model)`
391
+ - [ ] Shared summaries (not per-user) — Haiku 4.5 + Batch API (~$50-80/month @ 1K users)
392
+ - [ ] Nightly regeneration job with 7-day TTL + event-triggered refresh
393
+ - [ ] "You're reading about X" UI framing with sub-theme bullets
394
+ - [ ] Anthropic Citations API for hallucination prevention
395
+
396
+ ### 8b — Distilled cross-encoder reranker (Doc 07 §B)
397
+ - [ ] Deploy `cross-encoder/ms-marco-TinyBERT-L-2-v2` INT8 ONNX as MVP
398
+ - [ ] 6ms budget for 20 pairs on CPU (AVX-512 VNNI)
399
+ - [ ] TinyBERT score as LightGBM feature (Option C architecture)
400
+ - [ ] Custom distillation from BGE-reranker-v2-m3 only if held-out gap >3 nDCG
401
+ - [ ] MarginMSE loss + SciNCL citation-graph hard negatives
402
+
403
+ ### 8c — Use-cases and information-gain design doc (Doc 07 §C)
404
+ - [ ] 8 user personas (P1 cold-start through P8 stay-current)
405
+ - [ ] Information-gain table (save=3-5×, dismiss-as-label=−3-4×, passive skip=−0.1×)
406
+ - [ ] Mode-switching UI: "Stay Current" vs "Lit Review" toggle
407
+ - [ ] Failure mode detection rules (feed collapse, stale profile, filter bubble)
408
 
409
  ---
410
 
 
451
  | Test File | Count | Status |
452
  |---|---|---|
453
  | `tests/test_profiles.py` | 11 | ✅ Passing |
454
+ | `tests/test_clustering.py` | 21 | ✅ Passing (9 compute + 10 Hungarian + 2 persistence) |
455
  | `tests/test_reranker_diversity.py` | 13 | ✅ Passing |
456
+ | `tests/test_fusion.py` | 20 | ✅ Passing (Phase 4.1) |
457
+ | `tests/test_db.py` | 19 | ✅ Passing (includes 4 Turso cache + 8 suppression) |
458
  | `tests/test_qdrant_svc.py` | — | ✅ Passing |
459
  | `tests/test_arxiv_svc.py` | — | ✅ Passing |
460
+ | `tests/test_integration.py` | — | ✅ Passing (includes quota pipeline E2E) |
461
  | `tests/test_user_state.py` | — | ✅ Passing |
462
  | `tests/test_saved.py` | — | ✅ Passing |
463
  | `tests/test_hybrid_search.py` | 21 | ✅ Passing |
464
  | `tests/test_search_router.py` | 6 | ✅ Passing |
465
  | `tests/test_live_search.py` | 8 | ✅ Passing |
466
+ | **Total** | **171** | ✅ |
467
  | `test_e2e_recs.py` (standalone) | 1 | ✅ E2E simulation |
468
 
469
  ---
 
476
  | L2-normalize before Ward clustering | ✅ Applied | `app/recommend/clustering.py` |
477
  | Medoid not centroid | ✅ Applied | `app/recommend/clustering.py` → `_find_medoid()` |
478
  | Negative EWMA wired into reranking | ✅ Applied | `app/recommend/reranker.py` → Feature 5 |
479
+ | RRF → quota fusion for recommendations | ✅ Applied | `app/recommend/fusion.py` (Phase 4.1) |
480
+ | Hungarian cluster matching | ✅ Applied | `app/recommend/clustering.py` → `stabilize_cluster_ids()` (Phase 4.3) |
481
+ | Per-item short-term negative decay | [!] Backlog | Phase 6 (LightGBM feature) |
482
+ | Category-level suppression | ✅ Applied | `app/db.py` → `get_suppressed_categories()` (Phase 4.4) |
483
  | BGE-reranker NEVER in hot path | ✅ Followed | Heuristic scorer used instead |
docs/phases/PHASE3-Hybrid-Semantic-Search.md CHANGED
@@ -3,7 +3,7 @@
3
  > **Purpose**: Replace the Phase 1 placeholder arXiv keyword API search with real vector-based
4
  > semantic search using BGE-M3 encoding + Qdrant dense + Zilliz sparse + RRF fusion.
5
  >
6
- > **Status**: 📋 Not started
7
  > **Estimated effort**: ~2-3 weeks
8
  > **Predecessor**: Phase 2c (complete) — the recommendation pipeline
9
  > **Deployment target**: Hugging Face Spaces (Docker SDK, free tier: 16GB RAM, 2 vCPUs)
 
3
  > **Purpose**: Replace the Phase 1 placeholder arXiv keyword API search with real vector-based
4
  > semantic search using BGE-M3 encoding + Qdrant dense + Zilliz sparse + RRF fusion.
5
  >
6
+ > **Status**: ✅ Complete
7
  > **Estimated effort**: ~2-3 weeks
8
  > **Predecessor**: Phase 2c (complete) — the recommendation pipeline
9
  > **Deployment target**: Hugging Face Spaces (Docker SDK, free tier: 16GB RAM, 2 vCPUs)
docs/phases/PHASE4-Recommendation-Pipeline-Fixes.md ADDED
@@ -0,0 +1,603 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Phase 4 — Recommendation Pipeline Fixes
2
+
3
+ > **Purpose**: Fix the 3 remaining architectural faults identified by Doc 06 in the
4
+ > recommendation pipeline: replace RRF with importance-weighted quota fusion, add
5
+ > Hungarian matching for cluster stability, and wire category-level negative suppression.
6
+ >
7
+ > **Status**: 📋 Not started
8
+ > **Estimated effort**: ~1 week
9
+ > **Predecessor**: Phase 3.5 (complete) — Turso metadata DB
10
+ > **Deployment target**: Same — Hugging Face Spaces (no infra changes)
11
+
12
+ ---
13
+
14
+ ## Why This Matters
15
+
16
+ The recommendation engine works today — all 3 tiers cascade correctly, EWMA profiles
17
+ update, Ward clustering detects interests, and MMR enforces diversity. But Doc 06
18
+ identified three concrete faults that degrade quality for multi-interest users:
19
+
20
+ | # | Fault | Impact | Who gets hurt |
21
+ |---|---|---|---|
22
+ | **4.1** | RRF fuses interest clusters by consensus, not proportionally | Dominant cluster drowns minority interests | User who likes both NLP (70%) and RL (30%) never sees RL papers |
23
+ | **4.3** | Cluster indices shuffle on every recluster | Future analytics and UI labels break | Any user who saves a new paper |
24
+ | **4.4** | No category-level negative suppression | Dismissed topics keep reappearing | User who dismisses 5 physics papers still gets physics recs |
25
+
26
+ **What's already fixed (not Phase 4)**:
27
+ - ✅ α_long = 0.03 (was 0.10, fixed Phase 2a — PinnerSage rejected 0.10)
28
+ - ✅ L2-normalize before Ward (fixed Phase 2b — Doc 06 fault #4)
29
+ - ✅ Negative EWMA penalty in reranker (fixed Phase 2c — Feature 5, weight 0.15)
30
+ - ✅ Metadata store pre-populated (Phase 3.5 — Turso, 1.23GB)
31
+
32
+ ---
33
+
34
+ ## Current Architecture vs Target Architecture
35
+
36
+ ### Current Retrieval (Phase 2b — being fixed)
37
+
38
+ ```
39
+ Cluster medoids + short-term vector
40
+
41
+
42
+ Single Qdrant prefetch+RRF call
43
+ ├── Prefetch: medoid_1 (limit=40)
44
+ ├── Prefetch: medoid_2 (limit=30)
45
+ ├── Prefetch: medoid_3 (limit=25)
46
+ └── Prefetch: short_term (limit=25)
47
+
48
+
49
+ FusionQuery(fusion=Fusion.RRF)
50
+ │ ← papers near ALL cluster centroids get boosted
51
+ │ ← minority interests get drowned
52
+
53
+ ~100 candidates → rerank → MMR → serve
54
+ ```
55
+
56
+ **Problem**: RRF was designed for fusing *different retrievers on the same query*
57
+ (BM25 + vector). Here we're fusing *different queries for the same user*. Consensus
58
+ means "near the centroid of everything" — the exact failure multi-interest models
59
+ exist to prevent.
60
+
61
+ ### Target Retrieval (Phase 4)
62
+
63
+ ```
64
+ compute_clusters() → K clusters with importance scores
65
+
66
+
67
+ allocate_quotas([imp_1, imp_2, ...], total_slots=100, min_slots=3)
68
+ → [55, 30, 15] (proportional, each ≥ 3)
69
+
70
+
71
+ asyncio.gather( ← concurrent, ~15ms wall-clock
72
+ search_by_vector(medoid_1, limit=55×3), # 3× over-fetch for rerank headroom
73
+ search_by_vector(medoid_2, limit=30×3),
74
+ search_by_vector(medoid_3, limit=15×3),
75
+ search_by_vector(short_term, limit=25), # session boost
76
+ )
77
+
78
+
79
+ Deduplicate across clusters
80
+ (assign each paper to its highest-ranked cluster)
81
+
82
+
83
+ Category suppression: drop papers from suppressed categories
84
+
85
+
86
+ Rerank → MMR → exploration → serve
87
+ ```
88
+
89
+ **Evidence this is correct**:
90
+ - PinnerSage (KDD 2020): samples 3 medoids proportional to importance — no RRF
91
+ - Taobao ULIM (RecSys 2025): per-category parallel retrieval with quota — +5.54% clicks
92
+ - Pinterest Bucketized-ANN (SIGIR 2023): ensures minority items aren't dropped
93
+ - Twitter kNN-Embed: candidates per cluster proportional to mixture weight
94
+ - Bruch et al. (SIGIR 2022): RRF optimises Recall not nDCG — quota gives better nDCG
95
+
96
+ ---
97
+
98
+ ## 4.1 — Replace RRF with Importance-Weighted Quota Fusion
99
+
100
+ ### New File: `app/recommend/fusion.py`
101
+
102
+ Pure-math module with zero I/O dependencies. Contains one function:
103
+
104
+ ```python
105
+ def allocate_quotas(
106
+ importances: list[float],
107
+ total_slots: int = 100,
108
+ min_slots: int = 3,
109
+ ) -> list[int]:
110
+ """
111
+ Importance-weighted quota allocation with a minimum floor.
112
+
113
+ Each cluster gets feed slots proportional to its importance,
114
+ with a guaranteed minimum of `min_slots` to protect minority interests.
115
+
116
+ Algorithm:
117
+ 1. Normalise: w_k = importance_k / sum(importances)
118
+ 2. Raw allocation: raw_k = total_slots × w_k
119
+ 3. Apply floor: slot_k = max(floor(raw_k), min_slots)
120
+ 4. Distribute remainder by largest fractional part
121
+ 5. Guarantee: sum(slots) == total_slots
122
+
123
+ This is the Doc 06 formula verbatim:
124
+ slot_k = max(⌊F × w_k⌋, F_min=3)
125
+
126
+ Reference: PinnerSage (KDD 2020), Taobao ULIM (RecSys 2025),
127
+ Pinterest Bucketized-ANN (SIGIR 2023).
128
+ """
129
+ ```
130
+
131
+ **Worked example** (from Doc 06 §"Worked example"):
132
+ - 3 clusters with importances [0.55, 0.30, 0.15], total_slots=30
133
+ - Raw allocation: [16.5, 9.0, 4.5]
134
+ - Rounded down: [16, 9, 4] (all ≥ min_slots=3, so the minimum floor has no effect)
135
+ - Remainder: 30 - 29 = 1 slot → goes to cluster 0 (fractional part 0.5, tied with cluster 2; the tie goes to cluster 0 here, so the tie-break must be deterministic)
136
+ - Final: [17, 9, 4] — minority cluster gets 4 slots, not 0
137
+
138
+ **Edge case — tiny cluster**:
139
+ - 4 clusters with importances [0.60, 0.25, 0.10, 0.05], total_slots=30
140
+ - Raw allocation: [18.0, 7.5, 3.0, 1.5]
141
+ - Without floor: [18, 7, 3, 1] — smallest cluster gets 1 paper
142
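+ - With floor (min=3): [18, 7, 3, 3] — smallest cluster gets 3 papers; this sums to 31, so one slot must be taken back from a cluster above the floor (e.g. the largest → [17, 7, 3, 3]) to keep the total at 30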
143
+
144
+ ### Modified File: `app/routers/recommendations.py`
145
+
146
+ The `_multi_interest_recommend()` function changes its retrieval step:
147
+
148
+ **What gets removed**:
149
+ - The `_CLUSTER_LIMITS = [40, 30, 25, 20, 15, 15, 15]` hardcoded list
150
+ - The call to `qdrant_svc.multi_interest_search()` (the prefetch+RRF path)
151
+ - Building the `interest_vectors` list of `(medoid_embedding, limit)` tuples
152
+
153
+ **What replaces it**:
154
+ ```python
155
+ import asyncio
156
+ from app.recommend.fusion import allocate_quotas
157
+
158
+ # Step 2: Quota-based parallel retrieval (replaces RRF)
159
+ quotas = allocate_quotas(
160
+ importances=[c.importance for c in clusters],
161
+ total_slots=100, # wide retrieval net
162
+ min_slots=3, # every cluster gets at least 3 slots
163
+ )
164
+
165
+ # Launch concurrent ANN searches — one per cluster + session
166
+ search_coros = []
167
+ for cluster, quota in zip(clusters, quotas):
168
+ search_coros.append(
169
+ qdrant_svc.search_by_vector(
170
+ query_vector=cluster.medoid_embedding.tolist(),
171
+ limit=quota * 3, # 3× over-fetch for rerank headroom
172
+ exclude_ids=seen,
173
+ )
174
+ )
175
+ # Add short-term session vector if available
176
+ st_vec = await profiles.load_profile(user_id, "short_term")
177
+ if st_vec is not None:
178
+ search_coros.append(
179
+ qdrant_svc.search_by_vector(
180
+ query_vector=st_vec.tolist(),
181
+ limit=25,
182
+ exclude_ids=seen,
183
+ )
184
+ )
185
+
186
+ # Execute all searches concurrently (~15ms wall-clock)
187
+ per_cluster_results = await asyncio.gather(*search_coros)
188
+
189
+ # Deduplicate: first occurrence wins (highest-ranked cluster)
190
+ seen_in_results = set()
191
+ candidate_ids = []
192
+ for result_list in per_cluster_results:
193
+ for arxiv_id in result_list:
194
+ if arxiv_id not in seen_in_results:
195
+ seen_in_results.add(arxiv_id)
196
+ candidate_ids.append(arxiv_id)
197
+ ```
198
+
199
+ **Key design decisions**:
200
+
201
+ 1. **`asyncio.gather()` for concurrency** — Each `search_by_vector()` call takes ~5-15ms.
202
+ With `asyncio.gather()`, 3-7 concurrent queries run in ~15-25ms wall-clock — same as
203
+ the old single prefetch call.
204
+
205
+ 2. **3× over-fetch** — We fetch `quota × 3` candidates per cluster, then let the reranker
206
+ pick the best `quota` from each. This gives the heuristic scorer enough headroom to
207
+ find quality papers even if some candidates are poor matches.
208
+
209
+ 3. **First-occurrence deduplication** — Papers appearing in multiple cluster results are
210
+ assigned to whichever cluster ranked them highest (first encounter). This is simple,
211
+ deterministic, and matches the PinnerSage pattern.
212
+
213
+ 4. **`multi_interest_search()` is NOT deleted** — The function stays in `qdrant_svc.py`
214
+ for potential future use. We simply stop calling it from the recommendations router.
215
+
216
+ ### Latency Impact
217
+
218
+ | Stage | Before (RRF) | After (Quota) |
219
+ |---|---|---|
220
+ | Qdrant retrieval | ~15-25ms (1 prefetch call) | ~15-25ms (3-7 concurrent calls) |
221
+ | Dedup + quota | N/A | <1ms |
222
+ | Rerank + MMR | ~12ms | ~12ms (unchanged) |
223
+ | **Total pipeline** | ~30ms | ~30ms |
224
+
225
+ No latency regression. The concurrent gather matches the prefetch parallelism.
226
+
227
+ ---
228
+
229
+ ## 4.3 — Hungarian Matching for Cluster Stability
230
+
231
+ ### Why This Matters
232
+
233
+ When a user saves a new paper, `compute_clusters()` runs Ward clustering from scratch.
234
+ The cluster that was "NLP papers" yesterday might get `cluster_idx=2` today and
235
+ `cluster_idx=0` tomorrow. This breaks:
236
+
237
+ - Future analytics ("which cluster does the user engage with most?")
238
+ - Future UI labels ("Your Interest: Natural Language Processing")
239
+ - A/B test logs that reference cluster indices
240
+ - Doc 06 §"Clustering specifics" calls this "the real operational risk"
241
+
242
+ ### Modified File: `app/recommend/clustering.py`
243
+
244
+ Add a new function called between `compute_clusters()` and `save_clusters_to_db()`:
245
+
246
+ ```python
247
+ from scipy.optimize import linear_sum_assignment
248
+
249
+ def stabilize_cluster_ids(
250
+ new_clusters: list[InterestCluster],
251
+ old_clusters: list[dict] | None,
252
+ paper_vectors: dict[str, list[float]] | None = None,
253
+ ) -> list[InterestCluster]:
254
+ """
255
+ Remap new cluster indices to match previous clusters via Hungarian matching.
256
+
257
+ 1. Compute cost matrix: cost[i][j] = 1 - cosine_sim(new_medoid_i, old_medoid_j)
258
+ 2. Solve assignment with scipy.optimize.linear_sum_assignment
259
+ 3. Remap new cluster_idx to matched old cluster_idx
260
+ 4. Genuinely new clusters (no match) get next available index
261
+
262
+ At K ≤ 7 this is trivially fast (7×7 matrix).
263
+
264
+ Reference: Doc 06 §"Clustering specifics" — "persist cluster→medoid-paper-id
265
+ mapping across reclusterings and use Hungarian matching against previous medoids."
266
+ """
267
+ ```
268
+
269
+ **Algorithm walkthrough**:
270
+
271
+ 1. Load previous clusters from SQLite via `load_clusters_from_db(user_id)`
272
+ 2. If `old_clusters is None` (first time): no remapping needed, return as-is
273
+ 3. Build a cost matrix of shape `(K_new, K_old)`:
274
+ - For each pair, fetch the old medoid embedding from `paper_vectors`
275
+ - `cost[i][j] = 1 - cosine_similarity(new_medoid_i, old_medoid_j)`
276
+ 4. Run `scipy.optimize.linear_sum_assignment(cost_matrix)` — O(K³), trivial at K≤7
277
+ 5. For matched pairs `(new_i, old_j)` where `cost < 0.5` (cosine sim > 0.5):
278
+ assign `new_clusters[new_i].cluster_idx = old_clusters[old_j]['cluster_idx']`
279
+ 6. For unmatched new clusters: assign the next available index
280
+
281
+ **Where it's called** — in `_multi_interest_recommend()` in `recommendations.py`:
282
+
283
+ ```python
284
+ # Step 1: Compute interest clusters
285
+ clusters = compute_clusters(aligned_ids, aligned_embs)
286
+
287
+ # Step 1.5: Stabilise cluster IDs against previous run
288
+ old_clusters = await load_clusters_from_db(user_id)
289
+ clusters = stabilize_cluster_ids(clusters, old_clusters, vectors)
290
+
291
+ # Step 1.6: Persist (now with stable IDs)
292
+ await save_clusters_to_db(user_id, clusters)
293
+ ```
294
+
295
+ ### What Needs to Change
296
+
297
+ The old medoid embeddings need to be compared against new medoid embeddings. The old
298
+ medoid embeddings aren't stored in SQLite (only the `medoid_paper_id` is). Two options:
299
+
300
+ **Option A** (recommended): Use the `paper_vectors` dict that's already loaded at the
301
+ top of `_multi_interest_recommend()` (line 128: `vectors = await qdrant_svc.get_paper_vectors(positives)`).
302
+ Old medoid paper IDs are likely in this set since the medoid IS a saved paper. If not,
303
+ do a small `get_paper_vectors([old_medoid_id])` call.
304
+
305
+ **Option B**: Store medoid embeddings as BLOBs in `user_clusters` table. This adds a
306
+ 4KB column but avoids any Qdrant call. Overhead is negligible.
307
+
308
+ **Decision**: Option A — avoids schema migration and the vectors are already in memory.
309
+
310
+ ---
311
+
312
+ ## 4.4 — Category-Level Negative Suppression
313
+
314
+ ### Design Decisions (Per User Input)
315
+
316
+ 1. **Primary category only** — arXiv papers have multiple categories (e.g., `cs.CV`, `cs.AI`).
317
+ Suppression applies to the **primary category only** to avoid suffocating the recommendation
318
+ graph. A paper tagged `[cs.CV, cs.AI]` is only suppressed if `cs.CV` (primary) is
319
+ suppressed, not if `cs.AI` is.
320
+
321
+ 2. **τ_neg = 14 days** — Standard default from the literature. If a user dismisses ≥3 papers
322
+ from the same primary category within 14 days, that category is suppressed for 14 days
323
+ from the last dismissal.
324
+
325
+ ### ⚠️ Critical Implementation Detail: Category Format Mismatch
326
+
327
+ The arXiv API and Turso store categories in **different formats**:
328
+ - **arXiv API** (`arxiv_svc.py`): uses arXiv codes like `cs.CV`, `cs.CL`, `stat.ML`
329
+ - **Turso** (`turso_svc.py`): uses `primary_topic` which contains human-readable labels
330
+ like `"AI/ML"`, `"Computer Vision"`, `"NLP/Computational Linguistics"`
331
+ - Both write to `paper_metadata.category` via different paths
332
+
333
+ This means `paper_metadata.category` contains a **mix of both formats** depending on
334
+ which service populated it. The suppression logic must handle this:
335
+
336
+ ```python
337
+ # In the suppression filter, normalise category comparison:
338
+ # - Papers from arXiv have codes: "cs.CV"
339
+ # - Papers from Turso have labels: "Computer Vision"
340
+ # Both may appear in suppressed_cats, so we suppress on exact match
341
+ ```
342
+
343
+ **Resolution**: The `get_suppressed_categories()` query will return whatever format is
344
+ in the database. The filter in `recommendations.py` will compare candidate categories
345
+ (from Turso metadata) against the suppressed set. Since recommendations primarily use
346
+ Turso for metadata, the formats will match. For the rare arXiv-fallback case, we accept
347
+ the slight inconsistency — it's a minor gap that self-corrects as more Turso data is used.
348
+
349
+ ### What's Already Done
350
+
351
+ The EWMA negative profile is already wired as Feature 5 in `reranker.py`:
352
+ ```python
353
+ # Feature 5: cosine_sim_negative (0.15 penalty weight)
354
+ neg_penalty = cosine_sim(candidate, neg_profile) * 0.15
355
+ final_score -= neg_penalty
356
+ ```
357
+
358
+ This gives a "soft" directional signal: papers semantically similar to dismissed papers
359
+ get demoted. What's missing is the "hard" category-level suppression.
360
+
361
+ ### What's NOT Being Done (Deferred)
362
+
363
+ **Per-item temporal decay** (`score -= α × exp(-dt / τ)`) is deferred to Phase 6.
364
+ Reasoning:
365
+ - Requires per-dismissed-item timestamps matched against candidates
366
+ - Most naturally expressed as a LightGBM feature (`days_since_most_recent_similar_dismissal`)
367
+ - The EWMA negative penalty already covers the directional signal
368
+ - Adding hand-tuned temporal formulas when LightGBM is the next phase would create throwaway code
369
+
370
+ ### Modified File: `app/db.py`
371
+
372
+ Add one new helper function:
373
+
374
+ ```python
375
+ async def get_suppressed_categories(
376
+ user_id: str,
377
+ threshold: int = 3,
378
+ days: int = 14,
379
+ ) -> set[str]:
380
+ """
381
+ Find primary arXiv categories where the user has dismissed ≥ threshold
382
+ papers within the last `days` days.
383
+
384
+ Joins interactions (event_type='not_interested') against paper_metadata
385
+ to get the category of each dismissed paper.
386
+
387
+ Returns: set of category strings to suppress (e.g., {'cs.CV', 'physics.optics'})
388
+ """
389
+ async with aiosqlite.connect(DB_PATH) as db:
390
+ cur = await db.execute(
391
+ """SELECT pm.category, COUNT(*) as cnt
392
+ FROM interactions i
393
+ JOIN paper_metadata pm ON i.paper_id = pm.arxiv_id
394
+ WHERE i.user_id = ?
395
+ AND i.event_type = 'not_interested'
396
+ AND i.timestamp >= datetime('now', ?)
397
+ GROUP BY pm.category
398
+ HAVING cnt >= ?""",
399
+ (user_id, f"-{days} days", threshold),
400
+ )
401
+ rows = await cur.fetchall()
402
+ return {row[0] for row in rows if row[0]}
403
+ ```
404
+
405
+ **Data dependency**: This requires dismissed papers to have their metadata in
406
+ `paper_metadata`. Currently:
407
+ - Papers from **arXiv API** (`arxiv_svc.py`) are automatically cached via `db.cache_metadata()`
408
+ - Papers from **Turso** (`turso_svc.py`) are **NOT cached** to `paper_metadata`
409
+
410
+ This is a gap. When a user dismisses a paper whose metadata came from Turso (the common
411
+ case since Phase 3.5), the category won't be in `paper_metadata` and the suppression
412
+ join will miss it.
413
+
414
+ **Fix**: Add a `cache_turso_metadata()` helper in the recommendations router that writes
415
+ Turso-sourced paper dicts to `paper_metadata` after fetching. This is a small INSERT OR
416
+ IGNORE — ~1ms overhead for 20 papers. We should also add this to `search.py` and
417
+ `saved.py` so ALL metadata paths feed the cache.
418
+
419
+ ### Modified File: `app/routers/recommendations.py`
420
+
421
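+ In `_multi_interest_recommend()`, after re-ranking but before MMR (note: `cat.split()[0]` below only suits space-separated arXiv codes such as `cs.CV cs.AI`; a Turso label such as `"Computer Vision"` would be truncated to `"Computer"` and never match the suppressed set, so human-readable labels must be compared whole):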
422
+
423
+ ```python
424
+ # Step 3.5: Category suppression
425
+ suppressed_cats = await db.get_suppressed_categories(user_id)
426
+ if suppressed_cats:
427
+ # Filter out candidates whose primary category is suppressed
428
+ reranked_ids_filtered = []
429
+ reranked_scores_filtered = []
430
+ reranked_embs_list = []
431
+ for i, rid in enumerate(reranked_ids):
432
+ cat = cand_meta.get(rid, {}).get("category", "")
433
+ # Extract primary category (first in the list, or the whole string)
434
+ primary_cat = cat.split()[0] if cat else ""
435
+ if primary_cat not in suppressed_cats:
436
+ reranked_ids_filtered.append(rid)
437
+ reranked_scores_filtered.append(reranked_scores[i])
438
+ reranked_embs_list.append(reranked_embs[i])
439
+
440
+ if reranked_ids_filtered:
441
+ reranked_ids = reranked_ids_filtered
442
+ reranked_scores = reranked_scores_filtered
443
+ reranked_embs = np.array(reranked_embs_list, dtype=np.float32)
444
+ ```
445
+
446
+ ---
447
+
448
+ ## What Does NOT Change
449
+
450
+ These are explicitly out of scope for Phase 4:
451
+
452
+ | Component | Why it stays |
453
+ |---|---|
454
+ | **Search pipeline** (`search.py`, `hybrid_search_svc.py`) | RRF is correct for search (different retrievers, same query) |
455
+ | **α_long = 0.03** (`profiles.py`) | Already fixed in Phase 2a |
456
+ | **L2 normalization** (`clustering.py`) | Already applied before Ward in Phase 2b |
457
+ | **Negative EWMA Feature 5** (`reranker.py`) | Already wired in Phase 2c |
458
+ | **`qdrant_svc.multi_interest_search()`** | Kept in codebase, just no longer called by recs |
459
+ | **Per-item temporal decay** | Deferred to Phase 6 (LightGBM feature) |
460
+ | **Templates / UI** | No frontend changes |
461
+ | **Infrastructure** | Same deployment, same databases |
462
+
463
+ ---
464
+
465
+ ## Files Changed — Complete Map
466
+
467
+ | File | Action | Lines Changed (est.) | What Changes |
468
+ |---|---|---|---|
469
+ | `app/recommend/fusion.py` | **NEW** | ~60 | `allocate_quotas()` function |
470
+ | `app/routers/recommendations.py` | **MODIFY** | ~40 | Replace RRF call with quota + parallel search; add category suppression |
471
+ | `app/recommend/clustering.py` | **MODIFY** | ~50 | Add `stabilize_cluster_ids()` with Hungarian matching |
472
+ | `app/db.py` | **MODIFY** | ~20 | Add `get_suppressed_categories()` |
473
+ | `tests/test_fusion.py` | **NEW** | ~80 | Unit tests for quota allocation |
474
+ | `tests/test_clustering.py` | **MODIFY** | ~30 | Add test for Hungarian matching stability |
475
+ | `tests/test_search_router.py` | **NO CHANGE** | 0 | Search pipeline untouched |
476
+ | `tests/test_integration.py` | **NO CHANGE** | 0 | Integration tests use mocks, unaffected |
477
+
478
+ **Total new/modified production code**: ~170 lines
479
+ **Total new test code**: ~110 lines
480
+
481
+ ---
482
+
483
+ ## Implementation Order
484
+
485
+ Each step leaves the app in a working state. Tests pass after every step.
486
+
487
+ ### Step 1 — Create `fusion.py` + unit tests (~30 min)
488
+
489
+ Build `allocate_quotas()` in isolation with thorough unit tests:
490
+
491
+ - `test_basic_allocation` — 3 clusters, verify proportionality
492
+ - `test_floor_enforcement` — tiny cluster still gets `min_slots`
493
+ - `test_total_equals_requested` — sum always equals `total_slots`
494
+ - `test_single_cluster` — all slots go to the one cluster
495
+ - `test_equal_importances` — even split
496
+ - `test_many_clusters_with_floor` — 7 clusters, floor forces redistribution
497
+
498
+ ### Step 2 — Refactor `_multi_interest_recommend()` (~1 hour)
499
+
500
+ Replace the RRF call with quota + `asyncio.gather()`. Key changes:
501
+ 1. Remove `_CLUSTER_LIMITS` hardcoded list
502
+ 2. Import `allocate_quotas` from `fusion.py`
503
+ 3. Replace `multi_interest_search()` with per-cluster `search_by_vector()` calls
504
+ 4. Add deduplication logic
505
+ 5. Wire short-term vector as a separate search
506
+
507
+ **Test**: Run `python -m pytest tests/ -v` — all tests must pass.
508
+
509
+ ### Step 3 — Add Hungarian matching to `clustering.py` (~1 hour)
510
+
511
+ 1. Add `stabilize_cluster_ids()` function
512
+ 2. Call it in `_multi_interest_recommend()` between `compute_clusters()` and `save_clusters_to_db()`
513
+ 3. Add test: create clusters, slightly perturb, verify indices preserved
514
+
515
+ **Test**: Run `python -m pytest tests/test_clustering.py -v`
516
+
517
+ ### Step 4 — Add category suppression (~30 min)
518
+
519
+ 1. Add `get_suppressed_categories()` to `db.py`
520
+ 2. Add suppression filter in `_multi_interest_recommend()` after reranking
521
+ 3. Ensure Turso metadata is cached to `paper_metadata` for the join to work
522
+
523
+ **Test**: Run full `python -m pytest tests/ -v`
524
+
525
+ ### Step 5 — End-to-end verification (~30 min)
526
+
527
+ 1. Run `python test_e2e_recs.py` — verify recommendations generate correctly
528
+ 2. Verify latency stays comparable (~7-8s end-to-end including network I/O)
529
+ 3. Run full `python -m pytest tests/ -v` — 125+ tests, zero regressions
530
+
531
+ ---
532
+
533
+ ## Test Plan
534
+
535
+ ### New Unit Tests: `tests/test_fusion.py`
536
+
537
+ | Test | What it verifies |
538
+ |---|---|
539
+ | `test_basic_proportional_allocation` | 3 clusters with [0.5, 0.3, 0.2] → ~[50, 30, 20] slots |
540
+ | `test_floor_protects_minority` | Tiny importance still gets ≥ `min_slots` |
541
+ | `test_sum_always_equals_total` | No slots lost or gained during allocation |
542
+ | `test_single_cluster` | One cluster gets all slots |
543
+ | `test_equal_importances` | N clusters get total/N each |
544
+ | `test_remainder_distribution` | Remainder goes to largest fractional part |
545
+
546
+ ### New Unit Test: `tests/test_clustering.py`
547
+
548
+ | Test | What it verifies |
549
+ |---|---|
550
+ | `test_hungarian_preserves_indices` | Slight perturbation doesn't shuffle indices |
551
+
552
+ ### Regression
553
+
554
+ - All 125 existing tests must pass
555
+ - `test_e2e_recs.py` must complete successfully
556
+
557
+ ---
558
+
559
+ ## Risks and Mitigations
560
+
561
+ | Risk | Impact | Mitigation |
562
+ |---|---|---|
563
+ | **Concurrent searches slower than prefetch** | Higher latency | `asyncio.gather()` runs them truly concurrently. Each is ~5-15ms. Wall-clock ~ max(all), not sum(all). |
564
+ | **Floor forces too many slots** | With 7 clusters, floor=3 requires 21 minimum slots. If total<21... | `allocate_quotas()` will clamp: if `K × min_slots > total`, reduce floor proportionally. At `total_slots=100` and `MAX_CLUSTERS=7`, minimum is 21, well within budget. |
565
+ | **Hungarian matching with different K** | New clustering produces fewer/more clusters than before | Handle rectangular cost matrices. `linear_sum_assignment` natively supports non-square matrices. Unmatched new clusters get fresh indices. |
566
+ | **`paper_metadata` missing for suppression join** | `get_suppressed_categories()` returns empty set | **Real gap found** — Turso metadata is not cached to `paper_metadata`. Fix: add `cache_turso_metadata()` calls in search/rec/saved routers. |
567
+ | **Turso categories vs arXiv categories format** | Turso stores human-readable categories ("AI/ML"), arXiv uses codes ("cs.AI") | **Real gap found** — both formats coexist in `paper_metadata.category`. Suppression will work within each format. Cross-format inconsistency is minor and self-corrects as Turso dominates. |
568
+ | **`search_by_vector` already does 2× over-fetch internally** | Asking for `quota*3` then `search_by_vector` internally doubles it | **Real gap found** — `search_by_vector()` at line 234 already fetches `limit*2` when `exclude_ids` is set. So asking for `quota*3` will actually fetch `quota*6` from Qdrant. This is fine (more candidates for reranker) but should be noted for tuning. |
569
+
570
+ ---
571
+
572
+ ## Verification Checklist
573
+
574
+ Before declaring Phase 4 complete:
575
+
576
+ - [ ] `python -m pytest tests/ -v` — all tests pass (130+ including new tests)
577
+ - [ ] `test_fusion.py` — 6+ quota allocation tests pass
578
+ - [ ] `test_clustering.py` — Hungarian matching test passes
579
+ - [ ] `test_e2e_recs.py` — end-to-end recommendations generate correctly
580
+ - [ ] Recommendations include papers from minority clusters (quota working)
581
+ - [ ] Cluster indices remain stable across consecutive saves
582
+ - [ ] Category suppression activates after ≥3 dismissals of same category
583
+ - [ ] Search pipeline is completely unaffected (RRF still used for search)
584
+ - [ ] Latency comparable to Phase 3.5 baseline
585
+ - [ ] All 3 recommendation tiers still cascade correctly (Tier 1 → 2 → 3)
586
+
587
+ ---
588
+
589
+ ## References
590
+
591
+ - PinnerSage (Pal et al., KDD 2020) — Ward + medoid + importance sampling, no RRF
592
+ - Taobao ULIM (Meng et al., RecSys 2025) — quota allocation, +5.54% clicks
593
+ - Pinterest Bucketized-ANN (SIGIR 2023) — minority representation protection
594
+ - Twitter kNN-Embed (arXiv:2205.06205) — per-cluster proportional drawing
595
+ - Bruch et al. (SIGIR 2022) — RRF optimises Recall not nDCG
596
+ - YouTube (Xia et al., 2023) — 3× gain from richer negative treatment
597
+ - Doc 06 §"The fusion fault in Doc 03" — full RRF critique
598
+ - Doc 06 §"Clustering specifics" — Hungarian matching recommendation
599
+ - Doc 06 §"Negative signals" — three-layer negative design
600
+
601
+ ---
602
+
603
+ *Last updated: 2026-04-23*
docs/research/03-MultiInterest-Recommender-Architecture.md CHANGED
@@ -266,7 +266,7 @@ Each cluster gets feed slots proportional to its importance, with a floor of 3 t
266
 
267
  **Note:** RRF *is* correct for the search bar (fusing dense + sparse for the *same* query). Only the recommendation pipeline needs quota.
268
 
269
- **Status:** ⚠️ Code still uses RRF. Scheduled for Phase 4.
270
 
271
  ---
272
 
 
266
 
267
  **Note:** RRF *is* correct for the search bar (fusing dense + sparse for the *same* query). Only the recommendation pipeline needs quota.
268
 
269
+ **Status:** ⚠️ Code still uses RRF. Phase 4 planned — see `docs/phases/PHASE4-Recommendation-Pipeline-Fixes.md`.
270
 
271
  ---
272
 
docs/research/07-LLM-Summaries-Reranker-and-Scaling-Research.md ADDED
@@ -0,0 +1,426 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ResearchIT Phase 4 Implementation Plan and Phase 5 Preview — Research Report for Amin
2
+
3
+ This report synthesizes 2024–2026 sources (RecSys/SIGIR/KDD/NeurIPS/ACL/EMNLP papers, production blogs from Pinterest, Spotify, YouTube, Netflix, and documentation from BAAI, Jina, Mixedbread, Anthropic) into an implementation-ready plan. The headline recommendation is to run Phase 4a (Claude summaries) and 4d (use-cases doc) in parallel over weeks 1–3 after a one-week ADR sprint, then spend weeks 4–9 on 4b (distilled reranker) — total ~10–12 weeks for Phase 4 with buffer. Nearly every Phase 5 workstream (exploration, IPS, propensity logging, telemetry schema) must be architected *before* Phase 4 code lands, even though the workstreams themselves are gated on user-count thresholds. The single most valuable decision to make now is the telemetry event schema, because retrofitting propensity, policy-id, and position fields after you have real-user data is painful and blocks all later counterfactual evaluation.
4
+
5
+ ## A. Phase 4a — Claude-API-generated per-cluster interest summaries
6
+
7
+ ### A.1 Prompt engineering
8
+
9
+ The closest published analogue to Amin's use case is **Scholar Inbox** (Flicke et al., ACL 2025 Demo, arXiv 2504.08385), which generates 4-level hierarchical labels (field → subfield → subsubfield → method) from t-SNE paper clusters using Qwen; their appendix §6.1 contains the exact prompt. Microsoft's **TnT-LLM** (KDD 2024) and **TopicGPT** (Pham et al., NAACL 2024) converge on the same pattern: structured XML-tagged inputs, constrained vocabulary, and JSON output. The recommended template for ResearchIT:
10
+
11
+ ```
12
+ You are summarizing a research interest cluster for a specific user.
13
+
14
+ USER PROFILE CONTEXT (tone only, not content):
15
+ {short profile string}
16
+
17
+ CLUSTER MEDOID PAPER (most representative):
18
+ <medoid><title>{...}</title><abstract>{...}</abstract></medoid>
19
+
20
+ NEAREST NEIGHBOR PAPERS:
21
+ <papers>
22
+ <paper id="1"><title>...</title><abstract>...</abstract></paper>
23
+ ... (up to 20)
24
+ </papers>
25
+
26
+ TASK: Produce JSON {"label": "<1-sentence 'You're reading about X, particularly Y' framing>", "themes": [<≤5-word bullet>, ... up to 4]}
27
+
28
+ RULES:
29
+ - Every technical term in "label" and "themes" MUST appear verbatim in at least one provided title or abstract.
30
+ - Do NOT introduce methods, datasets, or concepts not present in inputs.
31
+ - If fewer than 3 papers share a theme, omit it.
32
+ - Prefer specific phrases ("retrieval-augmented generation evaluation") over generic ones ("NLP research").
33
+ - Output JSON only.
34
+ ```
35
+
36
+ **Start zero-shot with this constrained prompt; add 2–3 hand-written few-shot examples only to anchor the "You're reading about X" voice.** Spotify Research's Dec 2024 "Contextualized Recommendations Through Personalized Narratives Using LLMs" post found zero-shot adequate but converged on 3–5 "golden" style examples for tone. The Anthropic cookbook's `using_citations.ipynb` demonstrates the **Citations API**, which returns structured citation objects and explicitly "will not return citations pointing to documents or locations that were not provided as valid sources" — **use the Citations API for ResearchIT**, it eliminates the hallucination vector at the API level.
37
+
38
+ ### A.2 Regeneration frequency
39
+
40
+ The 2024–2026 literature (Google's arXiv 2510.20260 on "Balancing Fine-tuning and RAG for Dynamic LLM Recommendation Updates"; Spotify's production narratives cache per-item) strongly favors **event-triggered regeneration over fixed nightly cadence**. Concrete hybrid policy:
41
+
42
+ Regenerate when the medoid paper changes, when Jaccard distance between old and new paper-ID sets exceeds 0.3, or when a cluster is added/merged/split. Apply a **7-day TTL fallback** even when nothing changes (captures embedding/context drift). **Do not regenerate nightly** — it is roughly 7× the cost for negligible UX gain on Ward clusters whose membership is stable over the timescale of a single day.
43
+
44
+ ### A.3 Pricing (April 2026) and cost estimate
45
+
46
+ Verified current pricing from platform.claude.com/docs, cross-checked against Finout/MetaCTO/PE Collective reporting: **Haiku 4.5 at $1/$5 per MTok in/out**, **Sonnet 4.6 at $3/$15**, **Opus 4.7 at $5/$25** (released April 16, 2026, with a new tokenizer that can inflate token counts up to 35%). Cache reads are 10% of base input; cache writes 125% (5-minute) or 200% (1-hour). Batch API gives a flat 50% discount with ≤24h turnaround and stacks with caching. Haiku 3 is deprecated April 19, 2026 — do not build against it.
47
+
48
+ For 1,000 users × 5 clusters × 20-paper contexts (~6,000 input tokens each) regenerated weekly, monthly traffic is ~130M input + ~3.25M output tokens. Total monthly cost by model:
49
+
50
+ - **Haiku 4.5 + Batch API: ~$73/month; with prompt caching on stable prefix, ~$50–60/month**
51
+ - Sonnet 4.6 + Batch API: ~$220/month (~$150–180 with caching)
52
+ - Opus 4.6/4.7 + Batch API: ~$366/month (~$280 with caching)
53
+
54
+ **Recommendation: Haiku 4.5 + Batch API is the right default.** The task (label a cluster from provided abstracts) sits comfortably within Haiku's capability. Reserve Sonnet for offline A/B quality evaluation on a minority of calls. Skip Opus entirely for this task. Prompt caching savings are modest because each cluster's paper context is unique per cluster; the real economic lever is the **shared cross-user dedup** (§A.7), not prompt caching within a single call.
55
+
56
+ ### A.4 Content-addressed caching
57
+
58
+ Construct the cache key as `sha256(sorted(paper_ids) + prompt_version + model + schema_version)`. Sort paper IDs before hashing for order-independence; include prompt and model version so stale summaries don't survive a template change; **omit user ID** from the shared cache key (that's the entire point — §A.7). Use an immutable, content-addressed store (`summaries[hash] = {label, themes, generated_at, model, tokens_used}`) — never overwrite; let old entries age out on a 90-day LRU. This mirrors CDN asset hashing (`main.a3f2b1c9.js`) and matches the Anthropic Claude Code cache-invalidation discussion (issue #29230) recommending SHA-256 of all source files be part of the cache key.
59
+
60
+ Expected exact 20-paper dedup rate is low (papers are drawn from 3M+ arXiv), but a **two-tier cache** with a "narrow" key (medoid + top-5 neighbors) as fallback increases hit rate substantially.
61
+
62
+ ### A.5 Explainable-recommender UX in academic search
63
+
64
+ None of Scholar Inbox, Connected Papers, Elicit, ResearchRabbit, Semantic Scholar, Consensus, or Undermind currently displays a **personalized "You're reading about X" per-user cluster narrative**. Scholar Inbox's Scholar Map labels are the closest analogue but are global/shared across users. This means ResearchIT's Phase 4a is **genuinely novel UX for academic search**, and the right place to borrow heavily is Spotify (which reports up to 4× CTR on niche content when LLM narratives personalize discovery) and Wang et al.'s "LLMs for User Interest Exploration in Large-scale Recommendation Systems" (RecSys 2024, arXiv 2405.16363), an architecturally identical recipe (interest clusters + constrained LLM descriptions). Lubos et al.'s UMAP 2024 user study on "LLM-generated Explanations for Recommender Systems" confirms users rate LLM explanations highly for decision support.
65
+
66
+ UX recommendation: lead with the 1-sentence "You're reading about X, particularly Y" framing, then an expandable bullet list of 3–5 sub-themes, with **source paper titles as linkable chips** under each bullet (the Anthropic Citations / deterministic-quoting pattern, which kills trust issues by letting users verify). A subtle "regenerated on {date}" timestamp plus a manual refresh button gives users control.
67
+
68
+ ### A.6 Hallucination prevention
69
+
70
+ The 2024–2026 state-of-the-art for grounding evaluation is **MiniCheck** (Tang, Laban, Durrett, EMNLP 2024, arXiv 2404.10774) — a 770M-parameter fine-tuned Flan-T5 that matches GPT-4 fact-checking accuracy at ~400× lower cost. Ranked strongest-to-weakest, grounding techniques are: (1) deterministic quoting (surface verbatim source text in the UI); (2) **Anthropic Citations API** (native, recommended); (3) prompt-based "use only phrases from source" rules; (4) post-hoc NLI verification with MiniCheck-FT5; (5) constrained decoding (overkill for 1-sentence labels).
71
+
72
+ Recommended stack: Anthropic Citations API + explicit "verbatim-phrase" rule in prompt + post-hoc substring verification on noun phrases (reject and regenerate if >1 unsupported phrase). Run MiniCheck-FT5 offline on a sample as an ongoing faithfulness metric. Zhou et al. (Findings EMNLP 2023) "context-faithful prompting" shows instruction-only grounding measurably reduces hallucination but is not sufficient alone — combine with a verification layer.
73
+
74
+ ### A.7 Per-user vs shared summaries
75
+
76
+ **Use a hybrid two-stage design.** Stage 1 generates a **shared, content-addressed, public-paper-only** cluster description (the Claude call gets only paper titles/abstracts, never user profile text) — identical cluster content produces identical summary across users and days, enabling aggressive dedup. Stage 2 wraps the shared summary with per-user framing either via client-side string templating ("You're reading about {shared_label}") or via a lightweight per-user LLM pass cached at `(user_id, shared_hash)`.
77
+
78
+ This matches Spotify's item-level-narrative + per-user-context split and Google's arXiv 2510.20260 offline-bulk/online-lookup separation. **Privacy payoff:** shared summaries are pure functions of public arXiv content, so they can ride Anthropic's Batch API with ZDR safely, be logged freely, and be cached cross-user. User profile text never leaves your infrastructure (or does so only in a heavily-filtered form for Stage 2). This is the architectural decision (ADR A2) that must be made **before** building the caching layer, because switching from per-user to shared requires a full cache-schema migration post-launch.
79
+
80
+ ## B. Phase 4b — Distilled cross-encoder reranker
81
+
82
+ ### B.1 FlashRank recipe and student candidates
83
+
84
+ **FlashRank (PrithivirajDamodaran) does not train its own students** — it repackages existing open checkpoints as quantized ONNX. The default "Nano" is `ms-marco-TinyBERT-L-2-v2` (14M params, ~17MB fp32, ~6MB INT8), "Small" is `ms-marco-MiniLM-L-12-v2`, and "Medium" is `rank-T5-flan`. The engineering pattern to steal is ONNX + INT8 dynamic quantization + the `tokenizers` Rust library only (no PyTorch/transformers at runtime), keeping cold-start under 500ms on serverless.
85
+
86
+ For Amin's 6ms-for-20-pairs CPU budget (≈0.3ms/pair), **the only candidates that fit with headroom are 2-layer students**:
87
+
88
+ | Model | Params | INT8 CPU latency/pair | BEIR nDCG@10 |
89
+ |---|---|---|---|
90
+ | **ms-marco-TinyBERT-L-2-v2** | 14M | ~0.3–1.0ms | ~43–45 |
91
+ | ms-marco-MiniLM-L-4-v2 | 19M | ~1.5–2ms | ~46 |
92
+ | ms-marco-MiniLM-L-6-v2 | 22M | ~3–5ms (tight on budget) | ~48 |
93
+ | jina-reranker-v1-turbo-en | 38M | ~3–5ms | 49.60 (95% of jina-base) |
94
+ | jina-reranker-v1-tiny-en | 33M | ~2–3ms | 48.54 (92.5%) |
95
+ | mxbai-rerank-xsmall-v1 | 71M | ~8–12ms (over budget) | 43.9 |
96
+
97
+ Tonellotto et al.'s "Shallow Cross-Encoders" (SIGIR 2024, arXiv 2403.20222) found that at latency ≤10ms on CPU, TinyBERT-gBCE reaches nDCG@10 of 0.652 on TREC-DL-2019, a +51% gain over MonoBERT-Large (0.431). **The architectural choice (2L vs 12L) matters more than the teacher weights at tight latency.** Don't pick a bigger student.
98
+
99
+ ### B.2 Domain adaptation — how much does arXiv-specific fine-tuning buy?
100
+
101
+ **Typical gain from in-domain distillation at the 2-layer scale: +1 to +3 nDCG@10 points on SciDocs**, not 10. MedCPT (PubMed, Jin et al. arXiv 2307.00589) surpasses BM25 only after ~150M query-article pairs, showing diminishing returns for modest training budgets. The listwise-distillation paper arXiv 2505.19274 demonstrates that a general RankT5-3B teacher is competitive with in-domain rerankers on SciDocs/SciFact/NFCorpus, within noise. **No BGE-reranker-v2 checkpoint fine-tuned on scientific text exists on Hugging Face as of April 2026** (searched).
102
+
103
+ ### B.3 Distillation objectives
104
+
105
+ The 2025 reproducibility study (arXiv 2603.03010) benchmarks nine loss functions across nine backbones with SPLADE-v3 top-1000 candidates. Average rank across out-of-domain BEIR:
106
+
107
+ 1. InfoNCE (rank 1.83)
108
+ 2. **MarginMSE** (2.17) — Hofstätter-style pairwise distillation
109
+ 3. DistillRankNet (3.61)
110
+ 4. ADR-MSE (3.66)
111
+ 5. Hinge (3.99)
112
+ 6. BCE (5.74) — significantly worse than every other
113
+
114
+ Critically, "**MarginMSE with BM25-mined negatives is statistically equivalent to InfoNCE with ColBERTv2 hard negatives**" — loss formulation matters more than negative-pool quality. BAAI/BGE uses MarginMSE + self-knowledge-distillation from ensembles. Jina uses explicit KL on logits from the full-size teacher. Yang, He, Yang's SIGIR 2024 paper proposes CKL (contrastively-weighted KL) outperforming MarginMSE+plain KL on MS MARCO + BEIR zero-shot, but the gap is small.
115
+
116
+ **Recommended loss:** `L = α·MarginMSE(student, teacher, pos, neg) + β·KL(σ(student/T), σ(teacher/T)) + γ·BCE(pos, 1)` with α=1.0, β=0.5, γ=0.1, T=1.0. MarginMSE alone is a fine MVP.
117
+
118
+ ### B.4 Integration architecture
119
+
120
+ Three options: (A) TinyBERT score as one feature in a second LightGBM pass; (B) TinyBERT as a direct re-ranker on top-20 replacing LightGBM at that stage; (C) two-stage LightGBM with TinyBERT in between. Bing's LambdaMART over hundreds of features (including BERT scores), Pinterest's TransActV2 feeding neural scores into GBDT, Google/DeepMind's DASALC+TFR-BERT, and TREC TOT 2025 (arXiv 2601.15518) all converge on **the neural score as one feature among many in a final LambdaMART**, not as a terminal reranker.
121
+
122
+ **Recommendation: Option C (≈Option A).** Keep the upstream LightGBM-lambdarank, score the top-20 with TinyBERT (~0.3ms/pair × 20 = ~6ms), and feed the student scores back into a second LightGBM pass that has access to the full personalization feature set. **Do not do Option B** — replacing LightGBM with TinyBERT at top-20 throws away user features, citation-graph features, and temporal decay that LightGBM already incorporates. Engineered features for LightGBM-2: `tinybert_score`, `tinybert_rank_position`, `tinybert_score_normalized_within_query`, and the interaction `tinybert_score − bm25_score`.
123
+
124
+ ### B.5 Hard negative mining
125
+
126
+ The 2024 standard is **NV-Retriever** (arXiv 2407.15831) "positive-aware" filtering: mine top-100 ANN neighbors, then filter with the teacher cross-encoder, dropping candidates whose teacher score is within 0.3 of the positive (likely false negatives or duplicates). For academic papers, supplement with SPECTER/SciNCL citation-graph negatives: **SPECTER** uses 2 "citation-of-citation" hard negatives per query; **SciNCL** (Ostendorff et al.) improves on this by sampling from a continuous citation embedding space (PyTorch-BigGraph over S2ORC) with controlled distance margins (k_min=3998, k_max=4000 on a 52M-node graph), delivering +1.8 points on SciDocs. Recommended mix per (seed, positive): 3 SciNCL-style citation-of-citation negatives, 5 teacher-filtered ANN negatives (top 10–100 with teacher score below 95th percentile), 2 random in-batch. Critically, re-score all candidates with BGE-reranker-v2-m3 and **drop any within 0.3 teacher-score of the positive**.
127
+
128
+ ### B.6 Evaluation and distillation quality gap
129
+
130
+ Typical retention rates from the 2024–2025 literature: jina-v1-base → jina-v1-turbo retains 95% (52.45 → 49.60); TinyBERT-4L retains ~96.8% of BERT-base on GLUE; MiniLM-L6 → MiniLM-L2 rerank retains ~85–90%. **For Amin's ~20× compression from BGE-reranker-v2-m3 (278M) → TinyBERT-L2 (14M), expect 82–88% retention of nDCG@10.** If below 80%, something is wrong (bad negatives, insufficient data, teacher-label leakage into eval).
131
+
132
+ Run evaluations on SciDocs (focusing on Co-view / Co-read / Cite / Co-cite tasks), SciRepEval proximity tasks, the BEIR scientific subset (NFCorpus, SciDocs, SciFact, TREC-COVID), and held-out unarXive 2024–2026 queries with citation-graph ground truth. **CPU latency protocol: 50 warmup inferences discarded, 1000 measured inferences at seq_len=128, batch=20; report P50/P95/P99, not mean** (Pinterest standard).
133
+
134
+ ### B.7 Off-the-shelf scientific-domain rerankers
135
+
136
+ **There is no well-maintained small (<50M param) scientific-domain cross-encoder reranker on Hugging Face as of April 2026 that beats MS MARCO-trained TinyBERT on SciDocs at the 6ms budget.** SPECTER/SPECTER2/SciNCL are bi-encoders (embedders), not rerankers. MedCPT is biomedical-specific. Third-party SciBERT cross-encoders exist but are not validated at MS-MARCO MiniLM-L6 quality. No BAAI bge-reranker fine-tuned on a scientific corpus has been published.
137
+
138
+ **Decision tree:**
139
+
140
+ - **If Amin already has a pseudo-label pipeline producing >200K (query, doc, teacher_score) triples** → distill TinyBERT-L-2 from bge-reranker-v2-m3 on arXiv data. Expect +1–3 nDCG over off-the-shelf.
141
+ - **If Amin wants MVP now** → deploy `cross-encoder/ms-marco-TinyBERT-L-2-v2` with INT8 ONNX (HF already ships `onnx/model_qint8_avx512_vnni.onnx`), measure on held-out eval. If gap vs teacher is <3 nDCG@10, ship; distill later if needed.
142
+
143
+ **Strong recommendation: go off-the-shelf first.** Distillation is ~2–4 weeks of solo-dev work and the marginal gain at 2-layer scale is usually small. Time is better spent on hard-negative mining and LightGBM-2 feature engineering.
144
+
145
+ ### B.8 ONNX / FastAPI hot path
146
+
147
+ Latency ranking for BERT-base-class inference on x86 with AVX-512 VNNI:
148
+
149
+ - PyTorch eager fp32: baseline (1.0×)
150
+ - PyTorch INT8 dynamic CPU: 0.4×
151
+ - ONNX Runtime fp32: 0.3×
152
+ - **ONNX Runtime + INT8 dynamic AVX-512 VNNI: 0.15–0.25× (up to ~6× faster than PyTorch eager fp32; ~1.2–2× faster than ORT fp32)**
153
+ - torch.compile: 1.5–2× over eager but still behind ONNX on CPU
154
+
155
+ For TinyBERT-L-2-v2 on Render's standard ~2 vCPU x86: fp32 PyTorch seq=128 ≈3–5ms/pair; **INT8 ONNX ≈0.3–1.0ms/pair single-thread; batched 20 pairs ≈2–4ms total wall-clock on AVX-512 VNNI hardware** (2–3× slower without VNNI). Production code pattern:
156
+
157
+ ```python
158
+ import numpy as np
+ import onnxruntime as ort
159
+ from tokenizers import Tokenizer
160
+
161
+ sess_options = ort.SessionOptions()
162
+ sess_options.intra_op_num_threads = 2 # match Render vCPUs
163
+ sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
164
+ sess_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
165
+ sess_options.add_session_config_entry("session.intra_op.allow_spinning", "0")
166
+
167
+ session = ort.InferenceSession(
168
+ "model_qint8_avx512_vnni.onnx", sess_options, providers=["CPUExecutionProvider"]
169
+ )
170
+ tokenizer = Tokenizer.from_pretrained("cross-encoder/ms-marco-TinyBERT-L-2-v2")
171
+ tokenizer.enable_truncation(max_length=128)
172
+ tokenizer.enable_padding(length=128)
173
+
174
+ def score_pairs(query, docs):
175
+ enc = tokenizer.encode_batch([(query, d) for d in docs])
176
+ return session.run(None, {
177
+ "input_ids": np.array([e.ids for e in enc], dtype=np.int64),
178
+ "attention_mask": np.array([e.attention_mask for e in enc], dtype=np.int64),
179
+ "token_type_ids": np.array([e.type_ids for e in enc], dtype=np.int64),
180
+ })[0].squeeze(-1).tolist()
181
+ ```
182
+
183
+ Critical tips: pin padding length to enable kernel fusion; use `tokenizers` (Rust, ~0.1ms for 20 pairs) not `transformers.AutoTokenizer` (~5ms); cache sessions globally; disable thread spinning; skip QAT (dynamic INT8 costs <0.5 nDCG).
184
+
185
+ ### B.9 Latency scaling top-20 → top-50 → top-100
186
+
187
+ Linearity is approximately valid but with caveats. K=20→50 ≈ 2.5× latency (6ms → 15ms) with modest sub-linear batching gains of 5–10% from amortized Python/tokenization overhead. K=100 ≈ 4.5× rather than 5×. Memory pressure kicks in at K≥64 with seq_len=512 but not at seq_len=128. Render's 2-vCPU boxes saturate at intra_op_num_threads=2.
188
+
189
+ | K | Strategy | Expected latency |
190
+ |---|---|---|
191
+ | 20 | single batch of 20 | 2–6ms |
192
+ | 50 | single batch of 50 | 6–15ms |
193
+ | 100 | 2 batches of 50, pipelined | 12–25ms |
194
+ | 200 | upgrade to MiniLM-L-4 or go async | 30–50ms |
195
+
196
+ **Beyond K=50, the right move is NOT to batch harder but to prune harder upstream** — make LightGBM-1 more selective. Pinterest and Bing aggressively trim before the expensive stage.
197
+
198
+ ## C. Phase 4d — Use-cases and information-gain design doc
199
+
200
+ ### C.1 User personas
201
+
202
+ Foundational literature: Bates's "berrypicking" (Online Review 1989) — real scholarly search is iterative, multi-source, goal-mutating, not one-shot. Ellis/Wilson's six activities (starting, chaining, browsing, differentiating, monitoring, extracting) map cleanly: monitoring = stay-current mode; chaining+differentiating+extracting = literature-review mode. Al-Shboul & Abrizah (2014, Journal of Academic Librarianship) is the explicit persona-template reference. Gordon et al. (Taylor & Francis 2020/2021) quantify scholarly pain: only 15.4% of physicists feel successful at staying current; 28.6% feel unsuccessful. Mysore et al. (CHIIR 2023) and Soufan, Ruthven, Azzopardi (CHIIR 2024) empirically confirm berrypicking in modern AI/ML workflows. Niwanputri et al. (SIGIR ICTIR 2025) "Untangling Cognitive Processes in Academic Information Searching" is the 2025 SIGIR anchor. **Scholar Inbox (Flicke et al. 2025, arXiv 2504.08385)** is the closest comparable system — they released an 800k-rating dataset and use an active-learning rating onboarding pattern.
203
+
204
+ Drop-in persona cards for the doc:
205
+
206
+ | # | Persona | Profile state | Mode | Day-1 signal | UX demand |
207
+ |---|---|---|---|---|---|
208
+ | P1 | Brand-new (cold start) | Empty EWMA | Exploration-forced | Categories + 5–10 ratings | Active-learning onboarding (Scholar Inbox) |
209
+ | P2 | PhD student, active | 50–500 interactions, 2–4 tight clusters | Stay-current/deep | Daily skim, narrow topic | Don't flood with diversity early |
210
+ | P3 | Senior researcher/PI | 1k+ interactions, 8–15 clusters | Mixed monitoring | Scan many, save few, dismiss often | No single cluster >40% |
211
+ | P4 | Cross-disciplinary | Multiple distinct medoids | Parallel stay-current | Per-cluster cadence diverges | Cluster-balanced delivery |
212
+ | P5 | Lapsed (3-mo gap) | α_long preserved, α_short stale | Re-orient | High dismissal first 3 sessions | "What changed" framing |
213
+ | P6 | Cold-restart pivot | Has history, wants new field | Explicit pivot | System seeds new cluster | "Start new interest" UI |
214
+ | P7 | Literature-review session | Any profile + deep-session intent | Deep single-cluster | Many click-throughs, long dwells | Suppress MMR, amplify depth |
215
+ | P8 | Stay-current daily | Any profile, 10-min daily | Monitoring | Fast skim, binary save/dismiss | Strong MMR, proportional cluster coverage |
216
+
217
+ ### C.2 Information gain per interaction
218
+
219
+ Foundational: Joachims (KDD 2002) clicks as relative pairwise preferences; Joachims et al. (TOIS 2007) eye-tracking validates ~80% reliability for "click i, skip i−1" pairs; Yi et al. (RecSys 2014) dwell time ≥30s as valid-engagement threshold; Xie et al. (WWW 2023) "valid read" = click + sufficient dwell; Yin et al. (WSDM 2013) "Silence is also evidence" — short dwell after click is negative, not missing. **The central paper** is Wang et al. RecSys 2023 (arXiv 2308.12256): dislike as feature only → −0.34% dislike rate (not significant); dislike as **feature AND training label** → −2.44% dislike rate, **−9.60% repeated dislike on same creator**, −2.05% dismissing users, and counterfactually **60.8% reduction in similar-content recommendations versus 22% when dislike is feature-only**. Implicit skip as negative label delivered +0.40% user enjoyment, +0.61% DAU≥1h.
220
+
221
+ Drop-in information-gain table (normalized to click = 1.0 baseline):
222
+
223
+ | Interaction | Sign | Relative strength | EWMA update | ~Bits info |
224
+ |---|---|---|---|---|
225
+ | Explicit category at onboarding | + | 5–10× | α_long seed | 3–5 |
226
+ | Save / bookmark | + | 3–5× | α_short + α_long | ~2 |
227
+ | Click-through to arXiv (no dwell) | + | 1.0× | α_short | ~0.5 |
228
+ | Long dwell (>30s) on abstract | + | 2–3× | α_short elevated | ~1 |
229
+ | Short dwell (<5s) after click | − weak | −0.5× | small α_neg | ~0.3 |
230
+ | Share / export to bib | + | 4–6× | α_short + α_long strong | ~2–3 |
231
+ | Dismiss (feature only) | − | −1× | Layer-1 only | ~0.3 |
232
+ | **Dismiss (feature + training label + similar suppression)** | − | **−3× to −4×** | All three layers | ~1.5–2 |
233
+ | "Don't recommend cluster" mute | − | −10× | Hard filter persistent | 3+ |
234
+ | Passive skip / scroll-past | − very weak | −0.1× | Aggregate only | ~0.05 |
235
+ | Revisit saved paper | + | 2× | α_long | ~1 |
236
+
237
+ **Product principles derived:** every save must move the EWMA profile measurably (if α_short=0.40 doesn't produce a visible medoid shift after one save, the profile is broken); dismissals must be 1-click because their information value far exceeds a passive skip (−1× feature-only, up to −3× to −4× with all three layers, versus −0.1× for a skip in the table above); dwell must be normalized per device/context; explicit negatives must enter both the LightGBM feature vector AND the training label — feature-only is essentially wasted.
238
+
239
+ ### C.3 Longitudinal journeys
240
+
241
+ Time-drift literature (Koren KDD 2009 timeSVD++; Mansoury CIKM 2020 feedback loop; TDLRP-MF MDPI Systems 2025; TransActV2 arXiv 2506.02267) validates Amin's α_short/α_long split. The temporal-drift papers consistently show α_short ≈ 10× α_long is healthy; Amin's 13× ratio is in range. Per-persona day-1/7/30/90 table: P1 progresses from explicit ratings + popularity-biased exploration to 1–2 tight clusters by day 7, to 2–4 stable medoids at 30 days, to indistinguishable from P2 at 90 days. P5 on return at d=90 starts with stale α_short; decay the α_long (long-term) profile by a factor of (1−α_long)^90 ≈ 0.065 (taking α_long ≈ 0.03) to partially refresh. P7 is session-scoped only (MMR λ down, cluster depth up, session-TTL long). P8 is steady monitoring at 10-min daily, evolving slowly in α_long regime.
242
+
243
+ ### C.4 Instrumentation priorities
244
+
245
+ Production references: Spotify Event Delivery Infrastructure (8M events/s, schema-first, session-context qualifies every signal); Pinterest TransActV2 (arXiv 2506.02267, real-time top-100 sequence, **p99 latency as production-critical metric not mean**); YouTube Covington RecSys 2016 + Wang 2023 (80B signals/day, separate logging for watch/search/subscribe/dismiss/satisfaction); OpenTelemetry Weaver (2025) for schema-first telemetry with SDK generation. The schema must be frozen before any real-user logging (ADR A4) because post-launch migrations are painful.
246
+
247
+ Minimum event families to log: session_start/end + mode_declared; feed_request/served with slot_index, cluster_id, medoid_id, popularity_prior_weight, mmr_lambda, exploration_flag; positive (click, dwell_end, save, bookmark, share, export_bib, revisit) with dwell_ms, scroll_depth, device_context; negative (dismiss, mute_cluster, hide_author, explicit_dislike) with reason_code, layer_applied; profile ops (ewma_step, cluster_rebuild, medoid_shift) with α_used, silhouette_delta; model ops with per-stage latency; health/error events (empty_candidate_set, stale_profile_warning, popularity_fallback_triggered). **Log p50/p95/p99 latency percentiles per stage.** Nightly aggregations for SLO dashboards: personalized-to-popularity ratio (target ≥0.85 after day 7), cluster-share Gini (alert >0.7), exploration-slot fire rate (target 1/10 ±50% drift alert), per-cluster dismiss rate (>35% → mute candidate), save-to-click ratio, α_short day-over-day distance (alert if zero for 14 days), time-between-sessions (detects lapsed users).
248
+
249
+ ### C.5 Product principles
250
+
251
+ Netflix North-Star thinking (Gibson Biddle) suggests **"saves per active week"** as ResearchIT's primary leading indicator — tied to customer value, directly moves α_long, not gameable by dismissals. Spotify contextual-session principle: a skip in stay-current mode ≠ a skip in lit-review mode. Pinterest tail-latency principle: operational metrics on p99 not mean. Stated principles for ResearchIT: every save must measurably move the profile; dismissals are always 1-click, always logged as both feature and label; three-layer negatives have distinct half-lives (session/α_neg=0.15/persistent-until-unmuted); context qualifies every signal; exploration is a budget not an afterthought; cluster balance beats global top-K for cross-disciplinary users; cold-start is active not passive (Scholar Inbox pattern); latency SLOs on p99; stale profiles must announce themselves; never dark-launch a ranker change without a popularity-baseline A/B.
252
+
253
+ ### C.6 Mode-switching / intent-conditioned recommendation
254
+
255
+ Broder (SIGIR Forum 2002) navigational/informational/transactional extends to informational-narrow (lit-review) vs informational-broad (stay-current). **Jannach, Mobasher et al. TORS 2024 (arXiv 2406.16350) "A Survey on Intent-Aware Recommender Systems"** is the 2024 anchor — categorizes diversification-based, intent-prediction, and latent-intent modeling; identifies gap of offline-only evaluation. RecSys 2024 reproducibility study "A Worrying Reproducibility Study of Intent-Aware Recommendation Models" is cautionary: most intent-aware claims don't replicate. **Industry validates explicit mode switching over fully-latent intent** (Pinterest Homefeed vs Related-Pins vs Search; Spotify Deep-Focus vs What's-New).
256
+
257
+ Recommendation: start with an **explicit two-mode toggle** in UI ("Stay Current" / "Lit Review"): stay-current has high MMR λ, per-cluster quota on, small popularity prior, 10-min session TTL; lit-review has low MMR λ, high single-cluster depth, citation-chain exposure, 60-min session TTL. Add latent intent fallback: if session shows 3 consecutive clicks into one cluster with long dwells, quietly switch to lit-review. Defer sophisticated latent-intent models.
258
+
259
+ ### C.7 Failure modes and detection
260
+
261
+ Chaney, Stewart, Engelhardt (RecSys 2018) prove feedback loops amplify homogeneity; Mansoury et al. (CIKM 2020) quantify bubble intensification across rounds; Nguyen et al. (WWW 2014) first longitudinal filter-bubble measurement; Tang et al. (arXiv 2508.11239, Aug 2025) "Mitigating Filter Bubble from Community Detection" defines filter-bubble index = fraction of recs inside user's own community — **directly operationalizable using Ward clusters as the Louvain analog**. Drop-in detection rules:
262
+
263
+ | Failure | Detection rule | Mitigation |
264
+ |---|---|---|
265
+ | Feed collapse | 7-day rolling cluster-share Gini >0.7 OR top-cluster share >0.6 | Force MMR λ up; inject exploration; cap per-cluster at 40% |
266
+ | Stale profile | α_short unchanged for 14 days AND last session >30 days | "Refresh interests" card; boost popularity prior; Scholar-Inbox-style re-prompt |
267
+ | Cluster fragmentation | Cluster count >K_max OR >40% clusters with <3 neighbors | Lower Ward threshold; merge |
268
+ | Cluster over-merging | Silhouette week-over-week Δ <−0.15 | Raise Ward threshold; split top-variance cluster |
269
+ | Filter bubble | Filter-bubble index >0.95 for 30 days | Cross-cluster sampling; raise exploration budget |
270
+ | Popularity collapse | popularity_fallback >0.2 DAU/day | Ranker may be broken; verify LightGBM not degenerate |
271
+ | Latency regression | p99 > SLO for 1h | Standard SRE playbook |
272
+ | Dismissal ineffective | In-cluster rec rate within 7 days of dismiss > baseline | Verify three-layer pipeline; check layer-2 re-training |
273
+ | Feedback-loop amplification | Avg pairwise served-item similarity trending up 4+ weeks | CD-CGCN community-aware negative sampling |
274
+ | Cold-start stuck | Personalized score share <0.3 at day 7 | Push active-learning prompts; lower warm threshold |
275
+
276
+ ## D. Phase 5 preview at Phase-4-level detail
277
+
278
+ ### D.1 Epsilon-greedy exploration
279
+
280
+ **Spotify BaRT** (McInerney, Lacker, Hansen, Higley, Bouchard, Gruson, Mehrotra; RecSys 2018; DOI 10.1145/3240323.3240354) is the canonical reference. Two-stage contextual bandit over Home shelves (rows + explanations) and cards (playlists). Reward = factorization machine over user × item × explanation × context features predicting a binary stream event (≥30s listen). Epsilon-greedy per-slot: with probability ε pick uniformly among candidates, otherwise argmax. Conditional exploration separates "explore the item" from "explore the explanation" sharing one reward model — this keeps propensities tractable. Training uses counterfactual risk minimization with IPS on logs. Heavier exploration for new users, lighter for established.
281
+
282
+ **Pinterest "Warmer for Less"** (arXiv 2512.17277, Dec 2025) targets industrial cold-start items: **targeted lightweight augmentations (~+5% params) to the main model can match heavier bespoke approaches**. Strongly validates leaning on BGE-M3 content embeddings + light corrections for new arXiv papers rather than a separate CF/graph cold-start pipeline.
283
+
284
+ Literature consensus on exploration budget clusters at **5–15%, with 10% as default**. For ResearchIT:
285
+
286
+ - **Pre-launch → 100 users: ε-greedy at ε=0.10, slot-reservation pattern** (reserve 1/10 feed slots for exploration candidates — cleaner and lower-variance than per-slot coin flips).
287
+ - **100–500 users: stratified exploration** (ε distributed over arXiv primary categories the user hasn't engaged with × medoid-to-item cosine uncertainty).
288
+ - **500–1K users, >1K eng/week: Beta-Bernoulli Thompson sampling at category level.**
289
+ - **>5K users, >10K eng/week: neural-linear bandit (mtNLB-style, KDD 2024 DOI 10.1145/3637528.3671649) reusing LightGBM scorer as representation — only if ε-greedy shows regret plateau.**
290
+
291
+ Thompson vs ε-greedy: Chapelle & Li (NeurIPS 2011) and Vermorel & Mohri (2005) show vanilla ε-greedy routinely matches or beats TS/UCB at small N. TS at item level across 1.6M items with <1K users is infeasible; TS at category or cluster level is tractable. Other contextual bandit references: LinUCB (Li, Chu, Langford, Schapire WWW 2010); NeuralUCB (Zhou ICML 2020); NeUClust (Atalar et al. arXiv 2410.14586, Oct 2024) — contextual-combinatorial for list recommendations; ENR (CIKM 2023) epistemic neural nets for scalable TS; Ban/Qi/He WebConf 2024 tutorial.
292
+
293
+ ### D.2 LightFM collaborative filtering
294
+
295
+ LightFM (Kula 2015, arXiv 1507.08439) is legacy-but-still-competitive; in 2026 it remains perfect for Render's CPU-only deployment because every user/item embedding is a sum of feature embeddings (including a unique-ID feature), enabling **strong cold-start with metadata — exactly ResearchIT's setting**. Alternatives: implicit ALS (industrial baseline but no cold-items); LightGCN (SIGIR 2020 arXiv 2002.02126, ~16% avg lift on standard datasets but training overhead); two-tower (Google, needs GPU); UltraGCN (marginal gains). 500-user rule-of-thumb: LightFM with WARP loss crosses above content-only when users×interactions >5K; at 500 users × ~10 positive interactions = ~5K, exactly threshold.
296
+
297
+ **Integration: Pattern 2 (CF score as a LightGBM feature).** Spotify and Pinterest production consistently run CF + content-based candidate generators in parallel with a learned ranker blending them; within the ranker, CF is one feature among many. This gracefully handles users with weak CF signal because LightGBM learns to down-weight it. Don't do separate quota slots (worst at blending score scales). Warm-start uses LightFM's feature-averaging: a new user with claimed research categories/authors gets a warm embedding without any interaction history.
298
+
299
+ ### D.3 Dismissal-labeled LightGBM retraining
300
+
301
+ **Minimum viable signal: ~1K dismissal events total** to distinguish systematic item-level dismissals from session noise. **For LightGBM retraining with dismissals as labels: ~10K events.** At 500 users × 5% dismissal rate × 50 impressions/week = ~125 dismissals/week → ~10K takes ~80 weeks of steady use. **Action: add dismissals as features now; add as labels only at scale.** Asymmetric loss via LightGBM's `is_unbalance=True` or explicit `scale_pos_weight`; a dismissal costs more than a missed save because it actively damaged the session. Focal loss (Lin et al.) and class-balanced loss (Cui CVPR 2019) supportable via LightGBM custom objective but only worth it when imbalance exceeds ~1:20.
302
+
303
+ Session-overfitting mitigations: include "fraction of session slots dismissed so far" and "dominant category of session dismissals" as features so LightGBM can learn to discount anomalous sessions; decay dismissal weight by session-age; **within-session negative sampling** (contrast dismissed items against other items shown *in the same session*, not global catalog) — the Wang et al. 2023 pattern. IPS/SNIPS/DR corrections require propensity logging from day 1; for ResearchIT's known policy, exploration slots have propensity = ε / num_candidates, exploit slots ≈1. Apply 99th-percentile weight clipping. SNIPS is the best default (Eugene Yan's benchmarking); DR via Open Bandit Pipeline for robustness; arXiv 2509.00333 (Sept 2025) IPS-weighted BPR + propensity regularizer is a concrete code pattern.
304
+
305
+ ### D.4 Other Phase 5+ previews
306
+
307
+ **Semantic IDs / TIGER** (Rajput et al. NeurIPS 2023, arXiv 2305.05065): item = tuple of discrete codewords from RQ-VAE over content embedding; Transformer seq2seq decodes next-item autoregressively. +29% NDCG@5, +17.3% Recall@5 on Beauty vs S³-Rec. **ActionPiece** (Hou et al. ICML 2025 Spotlight, arXiv 2502.13581) is context-aware tokenization (same action → different tokens depending on neighbors) and outperforms TIGER-style context-independent semantic IDs. Spotify Research Sept 2025 "Semantic IDs for Generative Search and Recommendation" (Penha et al.) shows task-specific Semantic IDs fail to generalize cross-task. **Would TIGER work on CPU for 1.6M corpus?** RQ-VAE training is feasible (hours), but autoregressive Transformer decoding with beam=10 hits hundreds of ms/request on Render CPU. **Defer indefinitely** — it solves embedding-table-cost at scale, which is not ResearchIT's pain. Entry threshold: >10K users AND ANN on 1.6M becomes the bottleneck AND a GPU becomes available.
308
+
309
+ **PinnerFormer** (Pancha et al. KDD 2022, arXiv 2205.04507): single-vector user embedding from transformer over recent engagement sequence; novel dense-all-action loss predicts a random positive action within a 14-day future window from any random sequence position. Batch daily inference closes most of the gap to realtime (0.243 vs 0.251 Recall). **Defer indefinitely for solo-dev pre-launch.** A cheap equivalent is mean of BGE-M3 vectors over recent engagements — already what Amin's medoid retrieval does (PinnerSage's original approach). Entry threshold: ≥10K users AND ≥50 avg interactions/user AND a clear need for sequence modeling AND GPU availability.
310
+
311
+ **DPP / Sliding Spectrum Decomposition.** Classic DPP: Kulesza & Taskar 2011; Chen, Zhang, Zhou KDD 2018 (YouTube-scale). SSD: Huang, Wang, Peng, Wang KDD 2021 (arXiv 2107.05204) — originally Xiaohongshu, adopted by Pinterest in early 2025. Pinterest's April 2026 engineering blog ("Evolution of Multi-Objective Optimization at Pinterest Home feed") documents DPP → SSD migration with >2% time-spent-impression week-1 lift. SSD in PyTorch is cleaner than DPP (avoids PSD enforcement, log-dets, Cholesky stability). **For ResearchIT: MMR is fine at 500 users.** Upgrade entry threshold: feed size ≥20 AND ≥2 diversity axes (category × recency × reading-difficulty) AND visible user complaints of "too-similar" results >5% rate.
312
+
313
+ **Calibration of LightGBM scores.** Default binary log-loss training is often near-calibrated; miscalibration mostly appears with `lambdarank`/`rank_xendcg` objectives — in which case calibration is **essential before multi-objective fusion or thresholding**. Platt scaling (sigmoid(a·score + b)) is small-data-friendly and parametric; isotonic regression is non-parametric and needs at least ~1K calibration points; beta calibration (Kull, Silva Filho, Flach AISTATS 2017) sits between the two. LinkedIn's in-model isotonic calibration layer and Google's "Scale Calibration of Deep Ranking Models" (Yan et al. KDD 2022) are recent pointers. **For ResearchIT:** fit isotonic regression on a held-out 10–20% of training interactions, refit weekly. When it matters: thresholding (p(save)>0.3), ranking-fusion (combining CF + LightGBM + exploration bonus). When it doesn't: pure ranking by raw LightGBM output. Do this right after 4b (~2 days of work).
314
+
315
+ **Active learning for cold-start.** Nature Scientific Reports 2025 "Active learning algorithm for alleviating the user cold-start problem of recommender systems" uses decision-tree-based item selection with Like/Dislike/Unknown answers, 20-query cap, 3 like-constraints per user — but found online evaluation with 50 real users could not confirm offline lift. MDPI 2024 review and CIKM 2025 "Harnessing Light for Cold-Start Recommendations" confirm uncertainty+popularity hybrid queries as dominant pattern. **Practical pattern for ResearchIT: 2×3 grid at signup** — 2 triplets of 3 papers each spanning 6 arXiv subfields, user picks best per triplet, yielding a seed medoid from ~2 queries. This is Netflix's post-signup "pick 3 you like" flow. Entry threshold: ≥50 signups/week AND measurable onboarding drop-off.
316
+
317
+ ### D.5 Scaling infrastructure (SQLite → Supabase)
318
+
319
+ SQLite's single-writer lock ceiling: **~50 writes/second with WAL on SSD, ~10 in default mode**. Any long-running INSERT blocks all other writes. FTS5 shares this limit. For ResearchIT at 500 users × 50 events/session × a few sessions/week, this is still fine. It breaks when: concurrent cluster-snapshot writes + live event logging conflict; >100 concurrent users with mutable state; ML-training jobs run alongside API writes. Supabase Postgres features for recsys: pgvector 0.7 with halfvec (50% memory savings) and parallel HNSW builds (30× faster); Row-Level Security for lab/team multi-tenancy (one `lab_id` column, policy `lab_id = auth.jwt()->>'lab_id'`); realtime subscriptions. Free tier is 500MB; paid starts ~$25/mo.
320
+
321
+ **Migration trigger: hit ~500MB SQLite OR visible writer contention OR concurrent cluster-snapshot + event-log conflicts.** Use immutable snapshot tables (`clusters_v42`, `clusters_v43`) with pointer-table atomic swap; Qdrant/Zilliz collection aliases for zero-downtime rebuilds; keep last 2 snapshots for rollback. Vector cache invalidation: version cluster_snapshot_id on cached candidates; background job refills.
322
+
323
+ ### D.6 A/B testing at ~500 users
324
+
325
+ Statistical power at N=500 (α=0.05, 80% power, 50/50 split): binary metric with baseline p=0.10 has **MDE ≈ 5.5 percentage points absolute** (10% → 15.5%, ~55% relative); continuous metric MDE ≈ 0.25σ Cohen's d. **Only large lifts are detectable at this scale.** CUPED (Deng, Xu, Kohavi, Walker WSDM 2013) reduces required N by 2–3× on predictable metrics; 2024/2025 extensions include arXiv 2410.09027 (Lin & Crespo, Etsy) and arXiv 2510.03468 (CUPED + trimmed mean for heavy tails).
326
+
327
+ **For solo pre-launch: scipy.stats + evidently.ai-style notebook now.** GrowthBook (self-hosted, open-source, SQL-based) is the right upgrade at ≥1K users with ≥1 concurrent experiment/month. Skip Statsig (vendor dependency). Skip switchback unless adding shared team feeds where spillover matters. Experiment templates: exploration-% ablation (5/10/15 with primary = 7-day save rate, secondary = session length + dismissal rate); CF on/off at 50/50 user-level randomization; dismissal-feature vs dismissal-label over ≥4 weeks.
328
+
329
+ ### D.7 Multi-tenancy / group recommendation
330
+
331
+ Masthoff (2015 survey) taxonomy holds: Average/Additive Utilitarian; **Least Misery** (good for veto scenarios like "labmate dislikes biology → don't recommend to whole lab"); Most Pleasure; **Average Without Misery** (recommended compromise — average but filter below per-individual threshold); Approval Voting / Borda / Kemeny rank aggregation. Fairness-aware 2024–2025: Stratigi et al. (JIIS 2021) SDAA/SIAA sequential satisfaction-balancing; FAccT 2025 "Group Fair Rated Preference Aggregation: Ties Are (Mostly) All You Need" (Fate-Break and Fate-Rate). LLM-based group rec 2025: arXiv 2505.05016 "Pitfalls of Growing Group Complexity" — LLMs often implicitly do Average; explicit prompts for Least Misery change behavior. Academic-collaboration-context group-RS papers remain rare — you'd be doing mostly greenfield work.
332
+
333
+ **Recommendation for ResearchIT**: **Average-Without-Misery with a tunable misery threshold**, with per-lab data isolation enforced via Postgres RLS. The lab profile surfaces only aggregate signals (counts, category histograms) — never individual read/save events — unless members have explicitly opted in; GDPR consent language must be explicit because "labmate X saved this" is a personal-data disclosure. Entry threshold: real user demand (multiple lab opt-ins requested) post-launch; **not in Phase 5 core scope**.
334
+
335
+ ## E. Offline evaluation scale-up
336
+
337
+ **Regression testing in CI.** Frozen eval set as a Git-LFS artifact with version-pinned manifest (split date, author allowlist, citation-pair count, dataset hash) — never mutate without bumping `eval_set_v1.0.parquet → v1.1.parquet`. Pytest + GitHub Actions on every PR touching `retrieval/rerank/rank/diversify/`. Threshold-based assertions: hard fail if nDCG@10 drops >3% absolute or Recall@50 drops >2%; soft warn (xfail strict=False) if ILS/entropy moves >10%. Use bootstrap 1000-replicate 95% CIs to fail only when the baseline is excluded. PRs that intentionally move metrics must update `eval/baselines/main.json` with an `EVAL_DELTA_JUSTIFICATION`. CPU budget: freeze to 5k-query subsample (~5 min on Render free tier); full eval is nightly cron. **Tooling: DIY pytest now (~200 LOC, zero deps). Evidently AI** (open-source) has a built-in GitHub Action wrapping Python tests and failing CI on threshold violations with 15+ ranking metrics. DeepEval is overkill for ranking.
338
+
339
+ **Per-stage attribution.** IJCAI-22 "Neural Re-ranking in Multi-stage Recommender Systems: A Review" and Pinterest's WebConf 2023 "End-to-End Diversification" paper: each stage needs its own intermediate ground truth plus a joint evaluation. For ResearchIT: retrieval = Recall@200 (ceiling for all downstream); rerank = nDCG@50 on retrieved set + Precision@10; diversify = ΔnDCG@10 and ΔILS/entropy pre-vs-post. Log `stage_metrics.jsonl` per eval with `{run_id, stage, metric, value, params_hash}`; a "regression diagnosis" script compares PR vs main across stages. Hron et al. 2021 "On component interactions in two-stage recommender systems" is the theoretical grounding — retrieval-rerank interactions are non-trivial. Pinterest reports retrieval-layer diversification gives +8% diversity in candidate set but only +1% at final rank — stage-specific diversity deltas matter.
340
+
341
+ **Experiment tracking.** Append-only `eval_runs.jsonl` now (`{run_id, git_sha, timestamp, dataset_hash, config_hash, metrics, stage_metrics}` with Streamlit/Jupyter for plotting). Adopt MLflow locally (SQLite backend) at Phase 4b when distillation creates many hyperparameter-tuning runs. Skip W&B unless/until a collaborator appears (free tier fine but cloud dependency). Skip DVC (Git-LFS + manifests cover 80% of value). Signal to upgrade from JSONL to MLflow: "I can't find the run from 3 weeks ago in grep."
342
+
343
+ **Synthetic user generation.** RecSim NG (Google 2021), RecBole simulators, **Balog & Zhai 2024 "User Simulation for Evaluating Information Access Systems" (Foundations & Trends, 261 pages) is the foundational survey**. 2025 LLM-agent simulators: UserSimCRS v2 (Balog & Zhai 2025), RecUserSim (Chen et al. WWW 2025 arXiv 2507.22897). Sim4IA workshop at SIGIR 2024 is the community reference. Concrete plan: extract 2–5k author personas from unarXive 2022 author graphs spanning deep specialists, bridge authors, early-career, prolific surveyers, methodology-transfer; choice model `p(save|paper) = σ(α·cos(paper,centroid) + β·cited_by_persona + γ·recency − δ·already_saved)`; add drift by slightly updating centroid with each saved paper. Evaluation: longitudinal nDCG trajectories, calibration of saved/dismissed ratio (expect 15–25%), exploration metric for bridge authors. Budget 2–3 weeks; start 100 personas × 30 days, scale to 2k later. Always triangulate against held-out real data.
344
+
345
+ **Cluster evaluation.** Silhouette coefficient + Davies-Bouldin index daily (Chicco et al. 2025 PeerJ — SC+DBI superior to Dunn/CH on convex clusters). Stability across time is the production-critical metric: Hungarian match day-over-day via `scipy.optimize.linear_sum_assignment` with cost = −|C_i ∩ C_j|; per-cluster Jaccard after matching; aggregate mean Jaccard and fraction with J≥0.8. Complement with Adjusted Rand Index across consecutive days and object-level stability (Toms et al. WorldCat; Toussi 2017). Alert threshold: mean Jaccard <0.7 for 3 consecutive days. **Cluster snapshot versioning is architecturally necessary before Phase 4a** because summaries will be keyed to cluster IDs.
346
+
347
+ **Counterfactual evaluation.** Required from day 1 of Phase 4 — every displayed recommendation must log `p(shown|context)` under the active policy. Without propensities, IPS/SNIPS/DR are retroactively impossible. Inject 5% ε-greedy exploration for non-degenerate propensities. Estimator choice (per Eugene Yan benchmarking + JTIE 2025 reproducible study): **SNIPS is best default** (no hyperparameter, lower variance than IPS); Direct Method alongside for low-variance potentially-biased imputation; Switch-DR in moderate-overlap regimes. **Tooling: Open Bandit Pipeline** (Saito et al.) in Python. JTIE 2025 reporting template: always report oracle decomposition, overlap diagnostics, estimator components, and effective sample size. **ESS <100 = unreliable; don't ship.**
348
+
349
+ ## F. Planning and requirements
350
+
351
+ ### F.1 Architectural decisions blocking Phase 4 start (ADR sprint, week 0)
352
+
353
+ These seven decisions must be captured as ADRs *before* any Phase 4 code lands:
354
+
355
+ - **A1 Cluster snapshot versioning.** SQLite table `cluster_snapshots(snapshot_date, cluster_id, paper_id, centroid_blob)`, 30-day retention, Hungarian-matched stable IDs as separate column. Without this, Phase 4a cache invalidation is guesswork.
356
+ - **A2 Per-user vs shared cluster summaries.** **Recommended: shared.** Per-cluster cached once per `(cluster_stable_id, snapshot_date)`. Per-user adds 3–5× Claude cost with marginal UX gain pre-launch. Shared ≈$50–80/month; per-user easily $500+. Schema-migration-hard to change later.
357
+ - **A3 LightGBM v1 vs v2.** **Recommended: one-stage LambdaMART in 4b; two-stage deferred to Phase 5.** Single LambdaMART over {bi-encoder score, BM25, recency, category match, author overlap} captures 80% of two-stage value at 30% complexity.
358
+ - **A4 Telemetry event schema v1 (frozen before any logging).** Minimum fields: `event_id, user_id, session_id, timestamp, event_type, paper_id, position, cluster_id, cluster_stable_id, policy_id, propensity, ranker_version, rerank_version, candidate_source, ab_bucket`. Retrofitting is painful. OpenTelemetry OTEP 0152/0243 on schema evolution are the canonical references.
359
+ - **A5 Eval-set version pinning + baseline format.** `eval/baselines/main.json`, `eval/eval_set_v1.0.parquet`; PRs that move metrics update both.
360
+ - **A6 Distillation training-data boundary.** Commit before 4b to: teacher (BGE-reranker-v2-m3), query distribution (must NOT overlap with eval's time-split), output format (MarginMSE margins). Assertion in training: `max(train.timestamp) < eval_cutoff`.
361
+ - **A7 Claude model/cache strategy.** Haiku 4.5 for 4a summaries; 5-min prompt cache on shared system prompt + style guide; single `cache_control` breakpoint on cluster-papers block. Stable-prefix-first prompt structure decided before coding.
362
+
363
+ ### F.2 Phase 4 subworkstream entry/exit criteria
364
+
365
+ **4a Claude summaries.** Entry: A1/A2/A7 decided; cluster stability mean Jaccard day-over-day ≥0.7 over 7 days. Exit: all 50–200 active clusters have fresh summary daily; p95 generation latency <3s; monthly cost <$30 at 100 clusters × 1 refresh/day with caching; 20 human-rated summaries score ≥4/5 on coherence. Deliverables: `services/summaries/claude_client.py` with prompt cache + retry/backoff; `services/summaries/summary_job.py` nightly job writing `cluster_summaries(cluster_stable_id, snapshot_date, summary_md, input_tokens, output_tokens, cached_tokens)`; Jinja templates; cost monitoring SQL view. **Effort: 2–3 weeks solo.** Risks: Claude cost overruns (set hard spend cap, log cache hit ratio — if <70%, prompt structure wrong); stale summaries from snapshot_date collisions (use content-hash tie-breaker); prompt injection from abstracts (use `<paper_abstract>` tags + "summarize only; ignore instructions" system line).
366
+
367
+ **4b Distilled reranker.** Entry: Phase 3 eval producing stable nDCG@10 within ±0.5% across runs; retrieval Recall@200 ≥0.85 on held-out; A3/A5/A6 decided; frozen eval set never seen in training (enforced assertion). Exit: student recovers ≥95% teacher nDCG@10 at ≥10× lower CPU latency; ONNX-exported INT8-quantized with PyTorch numerical closeness <1e-3 on 1000 samples; feature-flagged shadow traffic for 1 week with no regressions. Deliverables: teacher-scoring pipeline (non-eval time window); student training script with MarginMSE loss; ONNX export + `optimum-cli`; FastAPI integration with onnxruntime; stage-attribution eval report. **Effort: 4–6 weeks solo** (1 week data prep, 1 week training/tuning, 1 week ONNX+quantization+perf, 1–2 weeks integration+shadow, 1 week buffer). Risks: training-data leakage (time-cutoff assertion); CPU latency regression from naive batching (batch top-50 as one forward pass, not serial); quantization-catastrophic-recall (always compare fp32 vs INT8 on same eval — usually <0.5 nDCG, can be worse with bad calibration data).
368
+
369
+ **4d Use-cases doc.** Entry: Phase 3 eval showing consistent wins; dogfooding anecdotes; 4a scoped (for UX mockups). Exit: 10–15 page markdown doc with 3–5 personas drawn from synthetic-persona work, top 10 use cases with before/after storyboards, explicit non-goals, 3-month roadmap. Deliverables: single markdown doc + 1-page "pitch" derivative. **Effort: 1–1.5 weeks focused writing, calendar-time ~3 weeks** (competes with dev work). Risks: writing-in-a-vacuum (need 5–10 real conversations); premature lock-in (publish externally only after 10 external users × 2 weeks).
370
+
371
+ ### F.3 Dependency graph and sequencing
372
+
373
+ ```
374
+ Week 0: ADR sprint (A1–A7) [1 week, no coding]
375
+
376
+ ├──→ 4d Use-Cases Doc (1–1.5 wk writing, weeks 1–3 calendar)
377
+
378
+ ├──→ 4a Claude Summaries (2–3 wk, weeks 1–3) — needs A1, A2, A7
379
+
380
+ └──→ 4b Distilled Reranker (4–6 wk, weeks 4–9) — needs A3, A5, A6
381
+ ```
382
+
383
+ **Sequencing rationale: 4a first (cheapest, most visible, low risk, UI-validating, infrastructure reused by 4d); 4d in parallel (writing surfaces missing features); 4b last (largest quality lift but biggest risk, benefits from 4a UI being in prod and 4d clarifying what matters).** Add 30% buffer to every estimate — solo-dev posts uniformly show actual timelines are 1.5–2× initial estimates. **Realistic Phase 4 total: 10–12 weeks with parallelization and buffer; ~8 weeks if nothing breaks (it will).**
384
+
385
+ Week-by-week plan:
386
+
387
+ | Week | 4a | 4b | 4d | Cross-cutting |
388
+ |---|---|---|---|---|
389
+ | 0 | — | — | — | ADR sprint A1–A7 |
390
+ | 1 | Claude client + cache | Teacher scoring script | Persona draft | CI regression harness v1 |
391
+ | 2 | Nightly summary job + DB | 500k-pair sampling + MarginMSE training | Use case storyboards | Synthetic persona sim v0 |
392
+ | 3 | UI integration + human eval | Training runs (MLflow) | External review + polish | Stage-attribution diagnostic |
393
+ | 4 | Cost polish; freeze | ONNX + INT8 export | done | — |
394
+ | 5 | monitoring buffer | CPU perf optimization | — | Cluster stability alerts live |
395
+ | 6 | — | FastAPI integration + flag | — | — |
396
+ | 7 | — | Shadow traffic + debug | — | — |
397
+ | 8 | — | Full rollout + eval report | — | Phase 4 retrospective |
398
+ | 9–10 | — | buffer | — | Plan Phase 5 entry threshold review |
399
+
400
+ ### F.4 "Good enough" exit criteria
401
+
402
+ 4a: summaries ship to 100% of clusters, cost within budget, no correctness incidents 2 weeks. 4b: ≥95% teacher nDCG@10 recovery, CPU p95 <200ms top-50 rerank, 1 week shadow clean. 4d: 3 external readers provide feedback → 1 revision → published. General rubric for solo dev: primary objective met + smallest acceptable safety net = ship. Resist the "perfect" standard — solo devs chasing "done" on every phase never launch. Log tech debt in `TODO.md`; every 6–8 weeks, 2-week refactoring cycle (Matt Robertson solo-dev pattern).
403
+
404
+ ### F.5 Phase 5 entry thresholds
405
+
406
+ | Workstream | Entry threshold | Rationale |
407
+ |---|---|---|
408
+ | ε-greedy exploration | **Day 1 of Phase 4 (even with 1 user)** | Required architectural decision, not future workstream — without exploration no propensities, without propensities no retrospective IPS |
409
+ | LightFM / hybrid CF | ≥100 users OR ≥500 saves total | CF beats pure content only once interaction signal overlaps; below ~500 saves, content+recency wins |
410
+ | Dismissal retraining (as labels) | ≥5K dismissal events AND propensity-logged | Fewer means IPS variance explodes (ESS<100); propensities must come from day 1 or impossible to apply later |
411
+ | Semantic IDs (TIGER) | ≥10K users AND ANN bottleneck measurable AND GPU available | Solves embedding-table-cost at scale — not ResearchIT's pain at 10K users × 1.6M papers |
412
+ | PinnerFormer | ≥10K users AND ≥50 avg interactions/user AND basic sequence features built AND GPU available | Dense-all-action loss needs 14-day future prediction window per user; <50 interactions/user has nothing to learn |
413
+ | DPP / SSD diversity | "Too-similar results" complaints under MMR >5% of user feedback | 500+ LOC complexity not worth it until MMR visibly fails |
414
+ | Calibration (isotonic) | Before any multi-objective score fusion | ~2 days of work; schedule right after 4b |
415
+ | Active learning onboarding | ≥50 signups/week AND measurable funnel drop-off | Scientific Reports 2025 study could not reproduce its offline lift in an online test with 50 real users |
416
+ | SQLite → Supabase | ~500MB DB OR writer contention OR cluster-job + event-log collisions | SQLite fine for ResearchIT workload until one of these fires |
417
+ | GrowthBook (from scipy) | ≥1K users AND ≥1 concurrent experiment/month | scipy + notebook covers pre-launch |
418
+ | Lab/group profiles | Multiple explicit lab opt-in requests post-launch | Not in Phase 5 core; greenfield for academic context |
419
+
420
+ ### F.6 Cross-cutting risks
421
+
422
+ Telemetry gaps bite hardest in Phase 5 (IPS impossible without propensities): **freeze schema before any logging (A4); include policy_id, propensity, shown_position, ranker_version**. Training data leakage produces phantom lift in 4b: eval-time-cutoff assertion in training script; never use eval queries as teacher-scoring queries. Claude cost overruns: Haiku + shared summaries + caching + hard dashboard cap + daily cost view. Cluster instability causes mis-cached summaries and UI label-jumping: Hungarian-matched stable IDs + Jaccard <0.7 alert. Solo-dev estimation drift: multiply all estimates by 1.5, parallelize ADR+writing with dev, commit to a hard "good enough" definition per workstream. Evaluation-overfit (CI green but real users unhappy): run synthetic-persona longitudinal sim alongside static eval; once you have real users, weight live metrics > offline. Eval-set rot: every 6 months recompute with new cutoff, bump version, re-baseline intentionally.
423
+
424
+ ## Conclusion
425
+
426
+ Phase 4 is mostly a 10–12 week engineering effort bounded by two real constraints — solo-dev capacity and a 6ms CPU budget for the cross-encoder — and one architectural constraint: **every downstream Phase 5 workstream depends on decisions made in week 0 of Phase 4**. The ADR sprint is the non-negotiable entry gate. Within Phase 4, the highest-leverage sequencing is 4a (Claude summaries, shared-not-per-user, Haiku 4.5 + Batch API, ~$50–80/mo) in parallel with 4d writing, then 4b (distilled reranker, off-the-shelf TinyBERT-L-2-v2 INT8 ONNX first, distill only if held-out gap >3 nDCG, Option-C LightGBM integration). The novel contribution of Phase 4a is that **no other academic recommender currently shows personalized "You're reading about X" cluster narratives** — Scholar Inbox's shared labels are the closest analogue. The novel contribution of 4b for a solo dev is recognizing that the Shallow Cross-Encoders finding (SIGIR 2024) plus FlashRank's ONNX packaging pattern plus HF-shipped AVX-512-VNNI INT8 models means 6ms for 20 pairs on CPU is genuinely achievable without custom distillation — distillation is the more-complex fallback, not the default. For Phase 5, the single most valuable action that costs nothing now is **logging propensity and policy_id from day 1**, which unlocks SNIPS/DR counterfactual evaluation for every later workstream. The dismissal-as-label YouTube finding (Wang et al. 2023: 22% → 60.8% similar-content reduction when dismissals are both features AND labels) is the best-justified Phase 5 quality lever, but it needs ~10K dismissals and is ~80 weeks away at pre-launch scale — so in the interim, dismissals enter as features only, and the real Phase 5 quality investment should be (in order) calibration of LightGBM scores, ε-greedy exploration at 10%, stratified exploration by unused arXiv category, and LightFM-as-LightGBM-feature once interactions cross 5K. 
Everything else — TIGER, PinnerFormer, DPP, group rec, active learning, neural bandits — should be deferred until a specific production pain signal fires.
docs/walkthroughs/02-Phase2-MultiInterest-Recommender.md CHANGED
@@ -17,7 +17,7 @@ EWMA profiles update (background, non-blocking)
17
 
18
  Ward clustering → K distinct interest medoids (auto K per user)
19
 
20
- Qdrant prefetch + RRF fusion (~15-25ms, single API call)
21
 
22
  Heuristic re-ranking of ~100 candidates (~1-2ms)
23
 
 
17
 
18
  Ward clustering → K distinct interest medoids (auto K per user)
19
 
20
+ Qdrant prefetch + RRF fusion (~15-25ms, single API call) [⚠️ Replaced by Quota Fusion in Phase 4]
21
 
22
  Heuristic re-ranking of ~100 candidates (~1-2ms)
23
 
docs/walkthroughs/03-Code-Summary-and-Test-Plan.md CHANGED
@@ -36,7 +36,7 @@ The current application is a fully functional FastAPI + HTMX research paper disc
36
 
37
  ## 2. Comprehensive Testing Plan
38
 
39
- The current test suite has **86 passing tests** executing via `pytest`. Our testing strategy is split into three layers: Automated, Manual, and Analytics-based evaluation.
40
 
41
  ### A. Automated Testing (Current & Ongoing)
42
 
 
36
 
37
  ## 2. Comprehensive Testing Plan
38
 
39
+ The current test suite has **125 passing tests** (as of Phase 3.5) executing via `pytest`. Our testing strategy is split into three layers: Automated, Manual, and Analytics-based evaluation.
40
 
41
  ### A. Automated Testing (Current & Ongoing)
42
 
docs/walkthroughs/04-Next-Steps-and-Phase-Plan.md CHANGED
@@ -15,29 +15,27 @@
15
  |---|---|---|
16
  | Qdrant Cloud (1.6M BGE-M3 papers) | ✅ Live | BQ enabled, HNSW m=32, `arxiv_bgem3_dense` collection |
17
  | Phase 1: Zero-ML Recommender | ✅ Complete | Qdrant BEST_SCORE with raw IDs, 55 tests |
18
- | Phase 2a: EWMA Profiles | ✅ Complete | Long-term (α=0.10), Short-term (α=0.40), Negative (α=0.15) |
19
- | Phase 2b: Ward Clustering + Prefetch+RRF | ✅ Complete | Adaptive gap-based threshold, 2+ clusters detected on real data |
20
- | Phase 2c: Heuristic Re-ranking + MMR | ✅ Complete | 4-feature scorer, MMR λ=0.6, exploration injection |
 
 
21
  | SQLite (interactions, profiles, clusters, metadata cache) | ✅ Live | WAL mode, async via aiosqlite |
22
  | HTMX Frontend | ✅ Live | Search, save, dismiss, recommendations |
23
- | Test Suite | ✅ 88 tests passing | Unit, integration, and E2E simulation |
24
 
25
  ### What's NOT Built Yet
26
 
27
  | Component | Planned In | Blocked By |
28
  |---|---|---|
29
- | **Hybrid Search (BGE-M3 encode + Zilliz sparse)** | **Phase 3 (NEXT)** | BGE-M3 model loading (~570MB, ~15s cold start) |
30
- | Recommendation fixes (RRF→quota, α tuning) | Phase 4 | Code refactor only |
31
- | LightGBM lambdarank re-ranker | Phase 6 | Need ≥500 labeled save/dismiss interactions |
32
  | Cold-start onboarding (category picker / ORCID) | Phase 5 | Not yet designed |
33
- | Negative profile used in retrieval | Phase 4 | Stored but not wired |
34
- | Pre-populated metadata store | Phase 4 | arXiv API is the latency bottleneck (~7.6s cold) |
35
  | LLM interest summaries per cluster | Phase 8 | Needs Claude/Groq API integration |
36
 
37
- > **Note on search architecture:** The current arXiv keyword API search was always a Phase 1 placeholder.
38
- > The entire point of building 1.6M BGE-M3 embeddings in Qdrant (with BQ + HNSW) is to power
39
- > vector-based semantic search. Replacing the arXiv API with Qdrant dense + Zilliz sparse
40
- > hybrid search is the **#1 priority** for the next phase.
41
 
42
  ### Dataset Coverage
43
 
@@ -47,7 +45,7 @@
47
  | Newest paper | `2505.04101` (~May 2025) |
48
  | Total papers | 1,596,587 |
49
  | Payload stored in Qdrant | `arxiv_id` only |
50
- | Metadata source | arXiv API (live) → SQLite cache |
51
 
52
  ---
53
 
@@ -126,7 +124,7 @@ The 6 research documents contain several contradictions. Here is each one and it
126
 
127
  This is what PinnerSage, Taobao ULIM, and Pinterest Bucketized-ANN actually deploy.
128
 
129
- **Current status**: RRF is still in the codebase. Needs to be replaced.
130
 
131
  ### 2. EWMA α_long = 0.10 vs 0.03
132
 
@@ -136,7 +134,7 @@ This is what PinnerSage, Taobao ULIM, and Pinterest Bucketized-ANN actually depl
136
 
137
  **Resolution**: PinnerSage tested λ=0.1 and **explicitly rejected it as too recent-biased**. Their optimal was λ=0.01. Doc 06 recommends α_long=0.03 as a compromise.
138
 
139
- **Current status**: α=0.10 is in the codebase. Should be tuned down to 0.03.
140
 
141
  ### 3. BGE-reranker-v2 in the Hot Path
142
 
@@ -231,46 +229,52 @@ Final results → fetch metadata → render
231
 
232
  ### Phase 4: Recommendation Pipeline Fixes (~1 week)
233
 
 
 
234
  Corrections to the existing recommendation pipeline based on Doc 06's findings.
 
235
 
236
  #### 4.1 Replace RRF with Importance-Weighted Quota Fusion
237
- **Why**: RRF lets dominant clusters swamp minor interests — the exact failure mode multi-interest models exist to prevent.
238
 
239
- **What to change**: In `app/routers/recommendations.py`, replace `multi_interest_search()` (which uses Qdrant's server-side RRF) with per-cluster separate ANN queries, then allocate feed slots proportional to cluster importance with a floor of F_min=3.
 
 
 
240
 
241
- **New flow**:
242
  ```
243
  clusters = compute_clusters(...)
244
- weights = normalize_importance(clusters)
245
- for each cluster k:
246
- slots_k = max(floor(total_slots × weight_k), 3)
247
- candidates_k = qdrant search with medoid_k (limit = slots_k × 3)
248
- rerank within cluster_k via LightGBM / heuristic
249
- take top slots_k
250
- deduplicate across clusters (assign to highest-ranked)
251
- MMR over the merged union
252
  ```
253
 
254
- #### 4.2 Tune α_long from 0.10 → 0.03
255
- **Why**: PinnerSage explicitly rejected 0.10 as too recent-biased.
256
 
257
- **What to change**: Single constant in `app/recommend/profiles.py`.
258
 
259
- #### 4.3 Wire the Negative Profile into Re-ranking
260
- **Why**: Currently computed and stored but never used. YouTube showed a 3× gain from using dislikes as both features and labels.
261
 
262
- **What to add**: In `app/recommend/reranker.py`, add a negative-similarity penalty:
263
- ```python
264
- neg_penalty = cosine_sim(candidate, neg_profile) * penalty_weight
265
- final_score = base_score - neg_penalty
266
- ```
 
 
 
 
 
 
 
 
 
267
 
268
- #### 4.4 Pre-populate Metadata Store
269
- **Why**: The arXiv API is the #1 latency bottleneck (~7.6 seconds cold for 50 papers).
270
 
271
- **What to do**: Download the Kaggle arXiv metadata dataset (~4GB JSON). Bulk-insert all 1.6M papers' metadata into SQLite's `paper_metadata` table. The arXiv API becomes a fallback for genuinely new papers only.
272
 
273
- **Impact**: Metadata fetch drops from ~7,600ms to <5ms.
274
 
275
  ---
276
 
@@ -379,8 +383,9 @@ Add LightFM hybrid model with switching strategy:
379
  - ≥10 interactions: LightFM
380
  Retrain LightGBM with dismissals as negative labels (YouTube's 3× gain from dual labels).
381
 
382
- #### 9.3 Category-Level Negative Suppression
383
- If ≥3 dismissals hit the same arXiv category within a week, suppress that category for 2 weeks.
 
384
 
385
  ---
386
 
@@ -388,17 +393,13 @@ If ≥3 dismissals hit the same arXiv category within a week, suppress that cate
388
 
389
  If you can only do three things, do these:
390
 
391
- ### 1. Build hybrid semantic search (Phase 3)
392
- **Impact**: Replaces the arXiv keyword API placeholder with real vector-based search. This is what the 1.6M BGE-M3 embeddings in Qdrant were built for. Transforms the product from a keyword aggregator into a semantic discovery engine.
393
- **Effort**: 4 new service files + router swap. ~2-3 weeks.
394
 
395
- ### 2. Pre-populate the metadata store (Phase 4.4)
396
- **Impact**: Drops cold metadata fetch from 7,600ms to <5ms. Single biggest latency win.
397
- **Effort**: Download Kaggle dataset, write a bulk-insert script, run once.
398
 
399
- ### 3. Replace RRF with quota fusion in recommendations (Phase 4.1)
400
  **Impact**: Prevents the dominant cluster from drowning out minority interests. Fixes the core multi-interest failure mode.
401
- **Effort**: Refactor `_multi_interest_recommend()` in recommendations.py.
402
 
403
  ---
404
 
@@ -415,5 +416,7 @@ If you can only do three things, do these:
415
  | — | [Phase 1 Walkthrough](PHASE1-Zero-ML-Recommender.md) | Zero-ML recommender code tour | ✅ Complete |
416
  | — | [Phase 2 Recommender Walkthrough](02-Phase2-MultiInterest-Recommender.md) | Multi-interest engine implementation | ✅ Complete |
417
  | — | [Code Summary & Test Plan](03-Code-Summary-and-Test-Plan.md) | Codebase summary and testing strategy | ✅ Complete |
418
- | — | [Phase 2 Hybrid Search Plan](../phases/PHASE2-Hybrid-Search-Plan.md) | BGE-M3 + Zilliz hybrid search (not yet built) | 📋 Planned |
 
 
419
  | — | **This Document** | Revised phase plan synthesizing all research | ✅ Current |
 
15
  |---|---|---|
16
  | Qdrant Cloud (1.6M BGE-M3 papers) | ✅ Live | BQ enabled, HNSW m=32, `arxiv_bgem3_dense` collection |
17
  | Phase 1: Zero-ML Recommender | ✅ Complete | Qdrant BEST_SCORE with raw IDs, 55 tests |
18
+ | Phase 2a: EWMA Profiles | ✅ Complete | Long-term (α=0.03 ✅), Short-term (α=0.40), Negative (α=0.15) |
19
+ | Phase 2b: Ward Clustering + Prefetch+RRF | ✅ Complete | L2-norm + adaptive gap threshold, 2+ clusters on real data |
20
+ | Phase 2c: Heuristic Re-ranking + MMR | ✅ Complete | 5-feature scorer (neg penalty wired), MMR λ=0.6, exploration |
21
+ | Phase 3: Hybrid Semantic Search | ✅ Complete | BGE-M3 + Qdrant dense + Zilliz sparse + RRF, 123 tests |
22
+ | Phase 3.5: Turso Metadata DB | ✅ Complete | 1.23GB metadata + citations, search ~10.7s → ~1.75s |
23
  | SQLite (interactions, profiles, clusters, metadata cache) | ✅ Live | WAL mode, async via aiosqlite |
24
  | HTMX Frontend | ✅ Live | Search, save, dismiss, recommendations |
25
+ | Test Suite | ✅ 125 tests passing | Unit, integration, E2E simulation, search pipeline |
26
 
27
  ### What's NOT Built Yet
28
 
29
  | Component | Planned In | Blocked By |
30
  |---|---|---|
31
+ | **Rec pipeline fixes (RRF→quota, Hungarian, neg suppression)** | **Phase 4 (NEXT)** | Code refactor only |
 
 
32
  | Cold-start onboarding (category picker / ORCID) | Phase 5 | Not yet designed |
33
+ | LightGBM lambdarank re-ranker | Phase 6 | Need ≥500 labeled save/dismiss interactions |
 
34
  | LLM interest summaries per cluster | Phase 8 | Needs Claude/Groq API integration |
35
 
36
+ > **Note**: Hybrid Search (Phase 3), Turso Metadata (Phase 3.5), α_long tuning, L2
37
+ > normalization, and negative profile wiring are all DONE. The next priority is fixing
38
+ > the recommendation fusion from RRF → quota (Phase 4).
 
39
 
40
  ### Dataset Coverage
41
 
 
45
  | Newest paper | `2505.04101` (~May 2025) |
46
  | Total papers | 1,596,587 |
47
  | Payload stored in Qdrant | `arxiv_id` only |
48
+ | Metadata source | Turso DB (primary) → arXiv API (fallback) → SQLite cache |
49
 
50
  ---
51
 
 
124
 
125
  This is what PinnerSage, Taobao ULIM, and Pinterest Bucketized-ANN actually deploy.
126
 
127
+ **Current status**: RRF is still in the codebase. Phase 4 plan created — see `docs/phases/PHASE4-Recommendation-Pipeline-Fixes.md`.
128
 
129
  ### 2. EWMA α_long = 0.10 vs 0.03
130
 
 
134
 
135
  **Resolution**: PinnerSage tested λ=0.1 and **explicitly rejected it as too recent-biased**. Their optimal was λ=0.01. Doc 06 recommends α_long=0.03 as a compromise.
136
 
137
+ **Current status**: ✅ Already fixed — α=0.03 in `app/recommend/profiles.py:30`.
138
 
139
  ### 3. BGE-reranker-v2 in the Hot Path
140
 
 
229
 
230
  ### Phase 4: Recommendation Pipeline Fixes (~1 week)
231
 
232
+ > **Detailed plan**: [`docs/phases/PHASE4-Recommendation-Pipeline-Fixes.md`](../phases/PHASE4-Recommendation-Pipeline-Fixes.md)
233
+
234
  Corrections to the existing recommendation pipeline based on Doc 06's findings.
235
+ Items 4.2 (α_long tuning) and 4.3-old (negative profile wiring) are already done.
236
 
237
  #### 4.1 Replace RRF with Importance-Weighted Quota Fusion
238
+ **Why**: RRF lets dominant clusters swamp minor interests — the exact failure mode multi-interest models exist to prevent. PinnerSage, Taobao ULIM, and Pinterest Bucketized-ANN all use quota, not RRF.
239
 
240
+ **What to build**:
241
+ - New `app/recommend/fusion.py` — `allocate_quotas()` function
242
+ - Refactor `_multi_interest_recommend()` to use `asyncio.gather()` for concurrent per-cluster searches
243
+ - Deduplicate across clusters (first-occurrence wins)
244
 
 
245
  ```
246
  clusters = compute_clusters(...)
247
+ quotas = allocate_quotas([c.importance for c in clusters], total_slots=100, min_slots=3)
248
+ results = asyncio.gather(search_by_vector(medoid_k, limit=quota_k*3) for each k)
249
+ deduplicate → rerank → MMR → serve
 
 
 
 
 
250
  ```
251
 
252
+ #### ~~4.2 Tune α_long from 0.10 → 0.03~~ ✅ ALREADY DONE (Phase 2a)
 
253
 
254
+ α_long is already 0.03 in `app/recommend/profiles.py:30`.
255
 
256
+ #### ~~4.3-old Wire the Negative Profile~~ ✅ ALREADY DONE (Phase 2c)
 
257
 
258
+ Negative EWMA is already Feature 5 in `app/recommend/reranker.py` with 0.15 penalty weight.
259
+
260
+ #### 4.3 Hungarian Matching for Cluster Stability
261
+ **Why**: Cluster indices shuffle when users save new papers, breaking analytics and future UI.
262
+
263
+ **What to build**: `stabilize_cluster_ids()` in `clustering.py` using `scipy.optimize.linear_sum_assignment`. Cost matrix of medoid cosine distances; trivial at K≤7.
264
+
265
+ #### 4.4 Category-Level Negative Suppression
266
+ **Why**: YouTube (2023) showed 3× gain from richer negative treatment.
267
+
268
+ **Decisions resolved**:
269
+ - **Primary category only** — avoids over-suppression from secondary tags
270
+ - **14-day window** — standard default (τ_neg = 14 days)
271
+ - **Per-item temporal decay** → deferred to Phase 6 (LightGBM feature)
272
 
273
+ **What to build**: `get_suppressed_categories()` in `db.py` (SQL join: interactions × paper_metadata), filter in `_multi_interest_recommend()` after reranking.
 
274
 
275
+ #### ~~4.5 Pre-populate Metadata Store~~ ✅ ALREADY DONE (Phase 3.5 Turso)
276
 
277
+ Turso cloud DB with 1.23GB of metadata + citation counts. Search time: ~10.7s → ~1.75s.
278
 
279
  ---
280
 
 
383
  - ≥10 interactions: LightFM
384
  Retrain LightGBM with dismissals as negative labels (YouTube's 3× gain from dual labels).
385
 
386
+ #### ~~9.3 Category-Level Negative Suppression~~ → Moved to Phase 4.4
387
+ If ≥3 dismissals hit the same primary arXiv category within 14 days, suppress that category.
388
+ **Decision**: Primary category only, τ_neg = 14 days. See Phase 4 plan.
389
 
390
  ---
391
 
 
393
 
394
  If you can only do three things, do these:
395
 
396
+ ### 1. ~~Build hybrid semantic search (Phase 3)~~ ✅ DONE
 
 
397
 
398
+ ### 2. ~~Pre-populate the metadata store (Phase 3.5)~~ ✅ DONE
 
 
399
 
400
+ ### 3. Replace RRF with quota fusion in recommendations (Phase 4.1) ← NEXT
401
  **Impact**: Prevents the dominant cluster from drowning out minority interests. Fixes the core multi-interest failure mode.
402
+ **Effort**: New `fusion.py` + refactor `_multi_interest_recommend()`. ~1 week for all 3 Phase 4 items.
403
 
404
  ---
405
 
 
416
  | — | [Phase 1 Walkthrough](PHASE1-Zero-ML-Recommender.md) | Zero-ML recommender code tour | ✅ Complete |
417
  | — | [Phase 2 Recommender Walkthrough](02-Phase2-MultiInterest-Recommender.md) | Multi-interest engine implementation | ✅ Complete |
418
  | — | [Code Summary & Test Plan](03-Code-Summary-and-Test-Plan.md) | Codebase summary and testing strategy | ✅ Complete |
419
+ | — | [Phase 2 Hybrid Search Plan](../phases/PHASE2-Hybrid-Search-Plan.md) | BGE-M3 + Zilliz hybrid search prototype | Superseded by Phase 3 |
420
+ | — | [Phase 3 Hybrid Semantic Search](../phases/PHASE3-Hybrid-Semantic-Search.md) | Full hybrid search implementation plan | ✅ Complete |
421
+ | — | [Phase 4 Recommendation Fixes](../phases/PHASE4-Recommendation-Pipeline-Fixes.md) | Quota fusion, Hungarian matching, negative suppression | 📋 Planned |
422
  | — | **This Document** | Revised phase plan synthesizing all research | ✅ Current |
tests/test_clustering.py CHANGED
@@ -15,6 +15,7 @@ import numpy as np
15
 
16
  from app.recommend.clustering import (
17
  compute_clusters,
 
18
  InterestCluster,
19
  MIN_PAPERS_FOR_CLUSTERING,
20
  MAX_CLUSTERS,
@@ -110,6 +111,7 @@ def test_importance_is_sorted_descending():
110
  def test_few_papers_returns_single_cluster():
111
  """When papers < MIN_PAPERS_FOR_CLUSTERING, return a single catch-all cluster."""
112
  ids = ["p1", "p2", "p3"]
 
113
  rng = np.random.RandomState(11)
114
  embs = rng.randn(3, 1024).astype(np.float32)
115
  # Normalise
@@ -155,6 +157,237 @@ def test_find_medoid():
155
  assert idx == 1, f"Expected medoid idx 1, got {idx}"
156
 
157
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
158
  # ── DB persistence test ──────────────────────────────────────────────────────
159
 
160
  @pytest.fixture
 
15
 
16
  from app.recommend.clustering import (
17
  compute_clusters,
18
+ stabilize_cluster_ids,
19
  InterestCluster,
20
  MIN_PAPERS_FOR_CLUSTERING,
21
  MAX_CLUSTERS,
 
111
  def test_few_papers_returns_single_cluster():
112
  """When papers < MIN_PAPERS_FOR_CLUSTERING, return a single catch-all cluster."""
113
  ids = ["p1", "p2", "p3"]
114
+ assert len(ids) < MIN_PAPERS_FOR_CLUSTERING, "test precondition: ids must be below threshold"
115
  rng = np.random.RandomState(11)
116
  embs = rng.randn(3, 1024).astype(np.float32)
117
  # Normalise
 
157
  assert idx == 1, f"Expected medoid idx 1, got {idx}"
158
 
159
 
160
+ # ── Hungarian matching / cluster ID stabilisation (Phase 4.2) ────────────────
161
+
162
+ def _make_two_cluster_pair(seed: int = 0) -> tuple[list, list]:
163
+ """
164
+ Build two well-separated InterestCluster lists sharing the same embedding
165
+ space so Hungarian matching can correctly align them.
166
+
167
+ Returns (new_clusters, old_clusters) where new_clusters[0] corresponds
168
+ semantically to old_clusters[0].
169
+ """
170
+ rng = np.random.RandomState(seed)
171
+ dim = 1024
172
+
173
+ # Two distinct topic centers
174
+ center_a = rng.randn(dim).astype(np.float32)
175
+ center_a /= np.linalg.norm(center_a)
176
+ center_b = rng.randn(dim).astype(np.float32)
177
+ center_b /= np.linalg.norm(center_b)
178
+
179
+ def _near(center, n=5, spread=0.001):
180
+ # NOTE: spread is scaled small because random noise in 1024-d has
181
+ # magnitude ~sqrt(dim)*spread, so spread=0.05 gives noise≈1.6 which
182
+ # dominates the unit-length center. 0.001 keeps cosine sim ≥ 0.99.
183
+ vecs = []
184
+ for _ in range(n):
185
+ v = center + rng.randn(dim).astype(np.float32) * spread
186
+ v /= np.linalg.norm(v)
187
+ vecs.append(v)
188
+ return vecs
189
+
190
+ medoid_a_new = _near(center_a)[0]
191
+ medoid_b_new = _near(center_b)[0]
192
+ medoid_a_old = _near(center_a)[0]
193
+ medoid_b_old = _near(center_b)[0]
194
+
195
+ old = [
196
+ InterestCluster(cluster_idx=0, medoid_paper_id="old_a", medoid_embedding=medoid_a_old,
197
+ paper_ids=["old_a"], importance=5.0),
198
+ InterestCluster(cluster_idx=1, medoid_paper_id="old_b", medoid_embedding=medoid_b_old,
199
+ paper_ids=["old_b"], importance=3.0),
200
+ ]
201
+ # new clusters have swapped order (b first, a second) → naive assignment would shuffle
202
+ new = [
203
+ InterestCluster(cluster_idx=0, medoid_paper_id="new_b", medoid_embedding=medoid_b_new,
204
+ paper_ids=["new_b"], importance=3.0),
205
+ InterestCluster(cluster_idx=1, medoid_paper_id="new_a", medoid_embedding=medoid_a_new,
206
+ paper_ids=["new_a"], importance=5.0),
207
+ ]
208
+ return new, old
209
+
210
+
211
+ def test_stabilize_matches_semantically_equivalent_clusters():
212
+ """
213
+ When topic A was cluster 0 and remains cluster 0 after recluster (just
214
+ re-ordered by importance), stabilize_cluster_ids should restore idx=0 for A.
215
+ """
216
+ new, old = _make_two_cluster_pair()
217
+ # new[0] is topic B, new[1] is topic A
218
+ # old[0] is topic A (idx=0), old[1] is topic B (idx=1)
219
+ stabilised = stabilize_cluster_ids(new, old)
220
+
221
+ # After stabilisation, the cluster containing "new_a" should have idx=0
222
+ # and "new_b" should have idx=1
223
+ idx_map = {c.medoid_paper_id: c.cluster_idx for c in stabilised}
224
+ assert idx_map["new_a"] == 0, f"Topic A should be idx 0, got {idx_map}"
225
+ assert idx_map["new_b"] == 1, f"Topic B should be idx 1, got {idx_map}"
226
+
227
+
228
+ def test_stabilize_preserves_all_clusters():
229
+ """Output length must equal input length."""
230
+ new, old = _make_two_cluster_pair()
231
+ stabilised = stabilize_cluster_ids(new, old)
232
+ assert len(stabilised) == len(new)
233
+
234
+
235
+ def test_stabilize_unique_indices():
236
+ """All cluster indices in the output must be unique."""
237
+ new, old = _make_two_cluster_pair()
238
+ stabilised = stabilize_cluster_ids(new, old)
239
+ indices = [c.cluster_idx for c in stabilised]
240
+ assert len(indices) == len(set(indices)), f"Duplicate indices: {indices}"
241
+
242
+
243
+ def test_stabilize_no_old_clusters_returns_unchanged():
244
+ """With no old clusters, return new clusters as-is."""
245
+ new, _ = _make_two_cluster_pair()
246
+ result = stabilize_cluster_ids(new, [])
247
+ assert result == new
248
+
249
+
250
+ def test_stabilize_no_new_clusters_returns_empty():
251
+ """With no new clusters, return empty list."""
252
+ _, old = _make_two_cluster_pair()
253
+ result = stabilize_cluster_ids([], old)
254
+ assert result == []
255
+
256
+
257
+ def test_stabilize_rejects_unrelated_match():
258
+ """
259
+ Doc 06 requirement: Hungarian must NOT inherit an old cluster's identity
260
+ when the cosine similarity is below the threshold (default 0.5). A user's
261
+ genuinely-new topic should get a fresh index, not steal an old NLP idx
262
+ just because Hungarian found the "least bad" assignment.
263
+ """
264
+ rng = np.random.RandomState(7)
265
+ dim = 1024
266
+
267
+ def _rand_unit():
268
+ v = rng.randn(dim).astype(np.float32)
269
+ return v / np.linalg.norm(v)
270
+
271
+ # Two very different topics: old_topic_vec vs new_topic_vec (orthogonal-ish)
272
+ old_vec = _rand_unit()
273
+ new_vec = _rand_unit()
274
+ # Verify near-orthogonality as a test precondition, so cosine sim << 0.5
275
+ # (random 1024-dim unit vectors already average near 0, so this should hold)
276
+ cos_sim = float(new_vec @ old_vec)
277
+ assert abs(cos_sim) < 0.3, f"test precondition failed: cos_sim={cos_sim}"
278
+
279
+ old = [InterestCluster(cluster_idx=5, medoid_paper_id="old_topic",
280
+ medoid_embedding=old_vec, paper_ids=[], importance=1.0)]
281
+ new = [InterestCluster(cluster_idx=0, medoid_paper_id="new_topic",
282
+ medoid_embedding=new_vec, paper_ids=[], importance=1.0)]
283
+
284
+ stabilised = stabilize_cluster_ids(new, old)
285
+ # The unrelated new cluster must NOT inherit idx=5
286
+ assert stabilised[0].cluster_idx != 5, \
287
+ "Unrelated topic inherited old cluster's index (threshold not enforced)"
288
+
289
+
290
+ def test_stabilize_custom_threshold():
291
+ """Custom min_cosine_sim should control matching strictness."""
292
+ rng = np.random.RandomState(13)
293
+ dim = 1024
294
+ base = rng.randn(dim).astype(np.float32)
295
+ base /= np.linalg.norm(base)
296
+ # Slightly perturbed — spread=0.001 in 1024-d gives cos_sim ~ 0.9995
297
+ perturbed = base + rng.randn(dim).astype(np.float32) * 0.001
298
+ perturbed /= np.linalg.norm(perturbed)
299
+
300
+ old = [InterestCluster(cluster_idx=2, medoid_paper_id="old",
301
+ medoid_embedding=base, paper_ids=[], importance=1.0)]
302
+ new = [InterestCluster(cluster_idx=0, medoid_paper_id="new",
303
+ medoid_embedding=perturbed, paper_ids=[], importance=1.0)]
304
+
305
+ # With default threshold 0.5, match succeeds (~0.9995 cos sim)
306
+ default_result = stabilize_cluster_ids(new, old)
307
+ assert default_result[0].cluster_idx == 2
308
+
309
+ # With threshold 0.99999 (stricter than actual 0.9995 sim), match rejected
310
+ strict_result = stabilize_cluster_ids(new, old, min_cosine_sim=0.99999)
311
+ assert strict_result[0].cluster_idx != 2
312
+
313
+
314
+ def test_stabilize_more_new_than_old():
315
+ """K grew from 1 → 2: matched cluster keeps idx, new gets fresh idx."""
316
+ rng = np.random.RandomState(21)
317
+ dim = 1024
318
+
319
+ base = rng.randn(dim).astype(np.float32)
320
+ base /= np.linalg.norm(base)
321
+ close = base + rng.randn(dim).astype(np.float32) * 0.001
322
+ close /= np.linalg.norm(close)
323
+ far = rng.randn(dim).astype(np.float32)
324
+ far /= np.linalg.norm(far)
325
+
326
+ old = [InterestCluster(cluster_idx=0, medoid_paper_id="o",
327
+ medoid_embedding=base, paper_ids=[], importance=1.0)]
328
+ new = [
329
+ InterestCluster(cluster_idx=0, medoid_paper_id="n1",
330
+ medoid_embedding=close, paper_ids=[], importance=2.0),
331
+ InterestCluster(cluster_idx=1, medoid_paper_id="n2",
332
+ medoid_embedding=far, paper_ids=[], importance=1.0),
333
+ ]
334
+ result = stabilize_cluster_ids(new, old)
335
+ idx_map = {c.medoid_paper_id: c.cluster_idx for c in result}
336
+ assert idx_map["n1"] == 0 # inherits old idx
337
+ assert idx_map["n2"] != 0 # fresh idx
338
+
339
+
340
+ def test_stabilize_fewer_new_than_old():
341
+ """K shrank from 2 → 1: the surviving cluster keeps its idx."""
342
+ rng = np.random.RandomState(25)
343
+ dim = 1024
344
+ base = rng.randn(dim).astype(np.float32)
345
+ base /= np.linalg.norm(base)
346
+ other = rng.randn(dim).astype(np.float32)
347
+ other /= np.linalg.norm(other)
348
+ close = base + rng.randn(dim).astype(np.float32) * 0.001
349
+ close /= np.linalg.norm(close)
350
+
351
+ old = [
352
+ InterestCluster(cluster_idx=7, medoid_paper_id="oA",
353
+ medoid_embedding=base, paper_ids=[], importance=2.0),
354
+ InterestCluster(cluster_idx=9, medoid_paper_id="oB",
355
+ medoid_embedding=other, paper_ids=[], importance=1.0),
356
+ ]
357
+ new = [InterestCluster(cluster_idx=0, medoid_paper_id="nA",
358
+ medoid_embedding=close, paper_ids=[], importance=1.0)]
359
+
360
+ result = stabilize_cluster_ids(new, old)
361
+ assert len(result) == 1
362
+ assert result[0].cluster_idx == 7 # inherits the matching old idx
363
+
364
+
365
+ def test_stabilize_new_cluster_gets_fresh_index():
366
+ """
367
+ If new_clusters has more clusters than old, the extras get fresh indices
368
+ not conflicting with any matched index.
369
+ """
370
+ rng = np.random.RandomState(99)
371
+ dim = 1024
372
+
373
+ emb = lambda: (lambda v: v / np.linalg.norm(v))(rng.randn(dim).astype(np.float32))
374
+
375
+ old = [
376
+ InterestCluster(cluster_idx=0, medoid_paper_id="old_a", medoid_embedding=emb(),
377
+ paper_ids=[], importance=1.0),
378
+ ]
379
+ new = [
380
+ InterestCluster(cluster_idx=0, medoid_paper_id="new_a", medoid_embedding=old[0].medoid_embedding.copy(),
381
+ paper_ids=[], importance=1.0),
382
+ InterestCluster(cluster_idx=1, medoid_paper_id="new_brand", medoid_embedding=emb(),
383
+ paper_ids=[], importance=1.0),
384
+ ]
385
+ stabilised = stabilize_cluster_ids(new, old)
386
+ indices = {c.medoid_paper_id: c.cluster_idx for c in stabilised}
387
+ assert indices["new_a"] == 0, "Matched cluster should inherit old index 0"
388
+ assert indices["new_brand"] != 0, "New unmatched cluster must not collide with idx 0"
389
+
390
+
391
  # ── DB persistence test ──────────────────────────────────────────────────────
392
 
393
  @pytest.fixture
tests/test_db.py CHANGED
@@ -116,3 +116,319 @@ async def test_metadata_cache_batch(tmp_db):
116
  assert "paper0" in result
117
  assert "paper2" in result
118
  assert "paper99" not in result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
  assert "paper0" in result
117
  assert "paper2" in result
118
  assert "paper99" not in result
119
+
120
+
121
+ # ── Phase 4.3: cache_turso_metadata_batch ────────────────────────────────────
122
+
123
+ @pytest.mark.asyncio
124
+ async def test_cache_turso_metadata_batch_writes_all(tmp_db):
125
+ """Turso dicts should be written to paper_metadata verbatim."""
126
+ import app.db as db
127
+ await db.init_db()
128
+ papers = [
129
+ {
130
+ "arxiv_id": "1706.03762",
131
+ "title": "Attention Is All You Need",
132
+ "abstract": "Transformers.",
133
+ "authors": '["Vaswani"]',
134
+ "category": "cs.CL",
135
+ "published": "2017-06-12",
136
+ "year": 2017,
137
+ "citation_count": 50000,
138
+ },
139
+ {
140
+ "arxiv_id": "2001.00001",
141
+ "title": "Another Paper",
142
+ "abstract": "...",
143
+ "authors": '["Smith"]',
144
+ "category": "cs.CV",
145
+ "published": "2020-01-01",
146
+ "year": 2020,
147
+ },
148
+ ]
149
+ await db.cache_turso_metadata_batch(papers)
150
+
151
+ cached = await db.get_cached_metadata("1706.03762")
152
+ assert cached is not None
153
+ assert cached["title"] == "Attention Is All You Need"
154
+ assert cached["category"] == "cs.CL"
155
+
156
+ cached2 = await db.get_cached_metadata("2001.00001")
157
+ assert cached2 is not None
158
+ assert cached2["category"] == "cs.CV"
159
+
160
+
161
+ @pytest.mark.asyncio
162
+ async def test_cache_turso_metadata_batch_empty(tmp_db):
163
+ """Empty input must not crash."""
164
+ import app.db as db
165
+ await db.init_db()
166
+ await db.cache_turso_metadata_batch([])
167
+ # No exception = success
168
+
169
+
170
+ @pytest.mark.asyncio
171
+ async def test_cache_turso_metadata_batch_skips_missing_arxiv_id(tmp_db):
172
+ """Rows without arxiv_id should be skipped, others persisted."""
173
+ import app.db as db
174
+ await db.init_db()
175
+ papers = [
176
+ {"title": "No ID", "category": "cs.LG"}, # missing arxiv_id
177
+ {"arxiv_id": "good.123", "title": "Good", "category": "cs.AI",
178
+ "abstract": "", "authors": "[]", "published": "2024-01-01"},
179
+ ]
180
+ await db.cache_turso_metadata_batch(papers)
181
+ cached = await db.get_cached_metadata("good.123")
182
+ assert cached is not None
183
+ assert cached["title"] == "Good"
184
+
185
+
186
+ @pytest.mark.asyncio
187
+ async def test_cache_turso_metadata_batch_upserts(tmp_db):
188
+ """Second write for same arxiv_id should overwrite the first."""
189
+ import app.db as db
190
+ await db.init_db()
191
+ paper_v1 = {"arxiv_id": "p1", "title": "V1", "category": "cs.LG",
192
+ "abstract": "", "authors": "[]", "published": "2024-01-01"}
193
+ paper_v2 = {"arxiv_id": "p1", "title": "V2", "category": "cs.CV",
194
+ "abstract": "", "authors": "[]", "published": "2024-01-01"}
195
+ await db.cache_turso_metadata_batch([paper_v1])
196
+ await db.cache_turso_metadata_batch([paper_v2])
197
+ cached = await db.get_cached_metadata("p1")
198
+ assert cached["title"] == "V2"
199
+ assert cached["category"] == "cs.CV"
200
+
201
+
202
+ # ── Phase 4.3: get_suppressed_categories ──────────────────────────────────────
203
+
204
+ @pytest.mark.asyncio
205
+ async def test_suppressed_empty_for_new_user(tmp_db):
206
+ import app.db as db
207
+ await db.init_db()
208
+ result = await db.get_suppressed_categories("never-dismissed")
209
+ assert result == set()
210
+
211
+
212
+ @pytest.mark.asyncio
213
+ async def test_suppressed_below_threshold_not_returned(tmp_db):
214
+ """Two dismissals in one category (< threshold=3) should NOT suppress."""
215
+ import app.db as db
216
+ await db.init_db()
217
+ # Seed metadata
218
+ for i, aid in enumerate(["p1", "p2"]):
219
+ await db.cache_metadata({
220
+ "arxiv_id": aid, "title": f"t{i}", "abstract": "",
221
+ "authors": "[]", "category": "cs.CV", "published": "2024-01-01",
222
+ })
223
+ # Two dismissals — below threshold=3
224
+ await db.log_interaction("u1", "p1", "not_interested")
225
+ await db.log_interaction("u1", "p2", "not_interested")
226
+
227
+ result = await db.get_suppressed_categories("u1")
228
+ assert "cs.CV" not in result
229
+
230
+
231
+ @pytest.mark.asyncio
232
+ async def test_suppressed_at_threshold_returned(tmp_db):
233
+ """Three dismissals in same category should suppress that category."""
234
+ import app.db as db
235
+ await db.init_db()
236
+ for i, aid in enumerate(["p1", "p2", "p3"]):
237
+ await db.cache_metadata({
238
+ "arxiv_id": aid, "title": f"t{i}", "abstract": "",
239
+ "authors": "[]", "category": "physics.optics", "published": "2024-01-01",
240
+ })
241
+ for aid in ["p1", "p2", "p3"]:
242
+ await db.log_interaction("u1", aid, "not_interested")
243
+
244
+ result = await db.get_suppressed_categories("u1")
245
+ assert "physics.optics" in result
246
+
247
+
248
+ @pytest.mark.asyncio
249
+ async def test_suppressed_only_counts_not_interested(tmp_db):
250
+ """Saves should NOT count toward suppression."""
251
+ import app.db as db
252
+ await db.init_db()
253
+ for aid in ["p1", "p2", "p3"]:
254
+ await db.cache_metadata({
255
+ "arxiv_id": aid, "title": "t", "abstract": "",
256
+ "authors": "[]", "category": "cs.CL", "published": "2024-01-01",
257
+ })
258
+ # 3 saves (not dismissals) in same category
259
+ for aid in ["p1", "p2", "p3"]:
260
+ await db.log_interaction("u1", aid, "save")
261
+
262
+ result = await db.get_suppressed_categories("u1")
263
+ assert "cs.CL" not in result
264
+
265
+
266
+ @pytest.mark.asyncio
267
+ async def test_suppressed_partitions_categories(tmp_db):
268
+ """Different categories should be independent."""
269
+ import app.db as db
270
+ await db.init_db()
271
+ # 3 dismissals in cs.AI, 1 in cs.LG
272
+ for aid in ["a1", "a2", "a3"]:
273
+ await db.cache_metadata({
274
+ "arxiv_id": aid, "title": "t", "abstract": "",
275
+ "authors": "[]", "category": "cs.AI", "published": "2024-01-01",
276
+ })
277
+ await db.log_interaction("u1", aid, "not_interested")
278
+ await db.cache_metadata({
279
+ "arxiv_id": "lone", "title": "t", "abstract": "",
280
+ "authors": "[]", "category": "cs.LG", "published": "2024-01-01",
281
+ })
282
+ await db.log_interaction("u1", "lone", "not_interested")
283
+
284
+ result = await db.get_suppressed_categories("u1")
285
+ assert "cs.AI" in result
286
+ assert "cs.LG" not in result
287
+
288
+
289
+ @pytest.mark.asyncio
290
+ async def test_suppressed_ignores_other_users(tmp_db):
291
+ """One user's dismissals must not affect another user's suppressions."""
292
+ import app.db as db
293
+ await db.init_db()
294
+ for aid in ["p1", "p2", "p3"]:
295
+ await db.cache_metadata({
296
+ "arxiv_id": aid, "title": "t", "abstract": "",
297
+ "authors": "[]", "category": "cs.CV", "published": "2024-01-01",
298
+ })
299
+ await db.log_interaction("userA", aid, "not_interested")
300
+
301
+ result_a = await db.get_suppressed_categories("userA")
302
+ result_b = await db.get_suppressed_categories("userB")
303
+ assert "cs.CV" in result_a
304
+ assert result_b == set()
305
+
306
+
307
+ @pytest.mark.asyncio
308
+ async def test_suppressed_empty_category_excluded(tmp_db):
309
+ """Papers with empty category string should not produce a '' suppression."""
310
+ import app.db as db
311
+ await db.init_db()
312
+ for aid in ["e1", "e2", "e3"]:
313
+ await db.cache_metadata({
314
+ "arxiv_id": aid, "title": "t", "abstract": "",
315
+ "authors": "[]", "category": "", "published": "2024-01-01",
316
+ })
317
+ await db.log_interaction("u1", aid, "not_interested")
318
+
319
+ result = await db.get_suppressed_categories("u1")
320
+ assert "" not in result
321
+
322
+
323
+ @pytest.mark.asyncio
324
+ async def test_suppressed_custom_threshold(tmp_db):
325
+ """Threshold=2 should trigger at 2 dismissals."""
326
+ import app.db as db
327
+ await db.init_db()
328
+ for aid in ["x1", "x2"]:
329
+ await db.cache_metadata({
330
+ "arxiv_id": aid, "title": "t", "abstract": "",
331
+ "authors": "[]", "category": "math.NT", "published": "2024-01-01",
332
+ })
333
+ await db.log_interaction("u1", aid, "not_interested")
334
+
335
+ result = await db.get_suppressed_categories("u1", threshold=2)
336
+ assert "math.NT" in result
337
+
338
+ result_high = await db.get_suppressed_categories("u1", threshold=5)
339
+ assert "math.NT" not in result_high
340
+
341
+
342
+ # ── Phase 4.5: Instrumentation columns ───────────────────────────────────────
343
+
344
+ @pytest.mark.asyncio
345
+ async def test_instrumentation_columns_exist(tmp_db):
346
+ """The interactions table should have ranker_version, candidate_source, cluster_id columns."""
347
+ import app.db as db
348
+ import aiosqlite
349
+ await db.init_db()
350
+ async with aiosqlite.connect(tmp_db) as conn:
351
+ cur = await conn.execute("PRAGMA table_info(interactions)")
352
+ columns = {row[1] for row in await cur.fetchall()}
353
+ assert "ranker_version" in columns
354
+ assert "candidate_source" in columns
355
+ assert "cluster_id" in columns
356
+
357
+
358
+ @pytest.mark.asyncio
359
+ async def test_log_interaction_stores_instrumentation_fields(tmp_db):
360
+ """log_interaction should persist ranker_version, candidate_source, cluster_id."""
361
+ import app.db as db
362
+ import aiosqlite
363
+ await db.init_db()
364
+ await db.log_interaction(
365
+ user_id="u1",
366
+ paper_id="p1",
367
+ event_type="save",
368
+ source="recommendation",
369
+ ranker_version="v4.1_test",
370
+ candidate_source="cluster_0",
371
+ cluster_id=0,
372
+ )
373
+ async with aiosqlite.connect(tmp_db) as conn:
374
+ conn.row_factory = aiosqlite.Row
375
+ cur = await conn.execute(
376
+ "SELECT ranker_version, candidate_source, cluster_id FROM interactions WHERE paper_id = 'p1'"
377
+ )
378
+ row = dict(await cur.fetchone())
379
+ assert row["ranker_version"] == "v4.1_test"
380
+ assert row["candidate_source"] == "cluster_0"
381
+ assert row["cluster_id"] == 0
382
+
383
+
384
+ @pytest.mark.asyncio
385
+ async def test_log_interaction_instrumentation_defaults_to_null(tmp_db):
386
+ """Omitting instrumentation fields should store NULLs (backward compat)."""
387
+ import app.db as db
388
+ import aiosqlite
389
+ await db.init_db()
390
+ await db.log_interaction("u1", "p2", "save", source="search")
391
+ async with aiosqlite.connect(tmp_db) as conn:
392
+ conn.row_factory = aiosqlite.Row
393
+ cur = await conn.execute(
394
+ "SELECT ranker_version, candidate_source, cluster_id FROM interactions WHERE paper_id = 'p2'"
395
+ )
396
+ row = dict(await cur.fetchone())
397
+ assert row["ranker_version"] is None
398
+ assert row["candidate_source"] is None
399
+ assert row["cluster_id"] is None
400
+
401
+
402
+ @pytest.mark.asyncio
403
+ async def test_migration_idempotent(tmp_db):
404
+ """Calling init_db() twice must not crash (ALTER TABLE migration is safe)."""
405
+ import app.db as db
406
+ await db.init_db()
407
+ await db.init_db() # second call — migration should be idempotent
408
+ # No exception = success
409
+
410
+
411
+ @pytest.mark.asyncio
412
+ async def test_instrumentation_exploration_tag(tmp_db):
413
+ """Exploration papers should be stored with candidate_source='exploration'."""
414
+ import app.db as db
415
+ import aiosqlite
416
+ await db.init_db()
417
+ await db.log_interaction(
418
+ user_id="u1",
419
+ paper_id="explore_paper",
420
+ event_type="save",
421
+ source="recommendation",
422
+ ranker_version="v4.1_quota_hungarian_suppression",
423
+ candidate_source="exploration",
424
+ cluster_id=None,
425
+ )
426
+ async with aiosqlite.connect(tmp_db) as conn:
427
+ conn.row_factory = aiosqlite.Row
428
+ cur = await conn.execute(
429
+ "SELECT candidate_source, cluster_id FROM interactions WHERE paper_id = 'explore_paper'"
430
+ )
431
+ row = dict(await cur.fetchone())
432
+ assert row["candidate_source"] == "exploration"
433
+ assert row["cluster_id"] is None
434
+
tests/test_fusion.py ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Tests for importance-weighted quota fusion.
3
+
4
+ Covers:
5
+ - Proportional allocation (dominant cluster gets most slots)
6
+ - Floor guarantee (every cluster gets at least min_slots)
7
+ - Total slots == sum of allocated slots (or >= when floors force it)
8
+ - Remainder distributed correctly
9
+ - Single cluster gets all slots
10
+ - Equal importances → roughly equal allocation
11
+ - Zero importances fall back to equal distribution
12
+ - merge_quota_results deduplication and order
13
+ """
14
+ from app.recommend.fusion import allocate_quotas, merge_quota_results
15
+
16
+
17
+ # ── allocate_quotas ───────────────────────────────────────────────────────────
18
+
19
def test_proportional_allocation():
    """A 7:3 importance split over 100 slots must favour the heavier cluster."""
    result = allocate_quotas([7.0, 3.0], total_slots=100, min_slots=3)
    assert len(result) == 2
    dominant, minor = result
    assert dominant > minor, "Dominant cluster (imp=7) should get more slots than minor (imp=3)"
25
+
26
+
27
def test_floor_guarantee():
    """No cluster may drop below min_slots, however small its importance."""
    # A 99:1 split: the tiny cluster's proportional share alone would be ~1 slot.
    slots = allocate_quotas([99.0, 1.0], total_slots=100, min_slots=3)
    assert not any(s < 3 for s in slots), f"Floor violated: {slots}"
33
+
34
+
35
def test_total_slots_met():
    """Without floor pressure the allocation must add up to exactly total_slots."""
    total = 100
    slots = allocate_quotas([5.0, 3.0, 2.0], total_slots=total, min_slots=3)
    allocated = sum(slots)
    assert allocated == total, f"Expected sum={total}, got {allocated} from {slots}"
41
+
42
+
43
def test_floor_overrides_total():
    """The per-cluster floor wins even when the floors alone exceed total_slots."""
    # Seven clusters at 3 slots each need 21, one more than the 20 requested.
    n_clusters = 7
    slots = allocate_quotas([1.0] * n_clusters, total_slots=20, min_slots=3)
    assert not any(s < 3 for s in slots), f"Floor violated under pressure: {slots}"
    assert len(slots) == n_clusters
50
+
51
+
52
def test_single_cluster_gets_all():
    """With only one cluster, the whole 50-slot budget goes to it."""
    assert allocate_quotas([5.0], total_slots=50, min_slots=3) == [50]
56
+
57
+
58
def test_equal_importances_roughly_equal():
    """Three equally important clusters split 99 slots into 33 each."""
    slots = allocate_quotas([1.0] * 3, total_slots=99, min_slots=3)
    assert len(slots) == 3
    expected = [33, 33, 33]
    assert slots == expected, f"Expected equal split [33,33,33], got {slots}"
64
+
65
+
66
def test_zero_importances_fallback():
    """All-zero importances must not crash; the 30 slots are still fully handed out."""
    allocation = allocate_quotas([0.0] * 3, total_slots=30, min_slots=3)
    assert len(allocation) == 3
    assert sum(allocation) == 30
    assert not any(count < 3 for count in allocation)
73
+
74
+
75
def test_empty_importances():
    """No clusters in, no quotas out."""
    allocation = allocate_quotas([], total_slots=100)
    assert allocation == []
78
+
79
+
80
def test_remainder_distributed():
    """100 slots over three equal clusters: one cluster absorbs the leftover slot."""
    # 100 / 3 = 33.333, so each cluster floors to 33 and a single slot remains.
    allocation = allocate_quotas([1.0] * 3, total_slots=100, min_slots=3)
    assert sum(allocation) == 100
    # Which cluster receives the extra slot is not pinned, only the multiset.
    assert sorted(allocation) == [33, 33, 34]
87
+
88
+
89
def test_two_cluster_sum_correct():
    """A 70/30 split of 100 slots sums to exactly 100 and keeps both ordering and floor."""
    allocation = allocate_quotas([70.0, 30.0], total_slots=100, min_slots=3)
    assert sum(allocation) == 100
    major, minor = allocation[0], allocation[1]
    assert major >= minor
    assert minor >= 3
95
+
96
+
97
def test_doc06_worked_example():
    """
    Reproduce the worked example from Doc 06.

    Inputs: importances = [0.55, 0.30, 0.15], total = 30, min = 3.
      raw shares  -> [16.5, 9.0, 4.5]
      floored     -> [16, 9, 4], which sums to 29
      leftover    -> 1 slot, awarded to index 0 (largest fractional part, 0.5)
      expected    -> [17, 9, 4]
    """
    expected = [17, 9, 4]
    slots = allocate_quotas([0.55, 0.30, 0.15], total_slots=30, min_slots=3)
    assert slots == expected, f"Doc 06 example expected [17, 9, 4], got {slots}"
    assert sum(slots) == 30
109
+
110
+
111
def test_doc06_tiny_cluster_floor():
    """
    Doc 06 edge case: a very small cluster is lifted to the floor.

    Inputs: importances = [0.60, 0.25, 0.10, 0.05], total = 30, min = 3.
    The raw shares are [18.0, 7.5, 3.0, 1.5]; the last cluster's share is
    below min_slots, so it must be raised to at least 3.
    """
    slots = allocate_quotas([0.60, 0.25, 0.10, 0.05], total_slots=30, min_slots=3)
    smallest = slots[3]
    # Floor check first: the 5% cluster may not be left with its raw share.
    assert smallest >= 3, f"Floor violated: smallest cluster got {smallest}"
    # The first three clusters must stay strictly ordered by importance.
    assert slots[0] > slots[1] > slots[2]
123
+
124
+
125
def test_fractional_priority_deterministic():
    """
    Leftover slots go to the clusters with the largest fractional parts.

    Inputs: importances = [10, 10, 10], total = 20, min = 3.
      raw shares -> [6.667, 6.667, 6.667]
      floored    -> [6, 6, 6], which sums to 18
      leftover   -> 2 slots; all fractions tie at 0.667, so two clusters get +1
    The test only pins the resulting multiset {7, 7, 6}, not which clusters win.
    """
    allocation = allocate_quotas([10.0] * 3, total_slots=20, min_slots=3)
    assert sum(allocation) == 20
    descending = sorted(allocation, reverse=True)
    assert descending == [7, 7, 6]
138
+
139
+
140
def test_fractional_priority_prefers_larger_frac():
    """
    Exact-division baseline for the fractional-remainder logic.

    Inputs: importances = [2, 3], total = 10, min = 3.
      raw shares -> [4.0, 6.0]
      floored    -> [4, 6], which already sums to 10 (remainder = 0)
      expected   -> [4, 6]

    NOTE: despite the test name, no fractional part is involved here. The
    shares divide evenly, so this only verifies that no spurious remainder
    slot is handed out. Remainder priority by largest fraction is exercised
    by test_doc06_worked_example.
    """
    slots = allocate_quotas([2.0, 3.0], total_slots=10, min_slots=3)
    # Check the shape first so a wrong-length result fails with a clear cause.
    assert len(slots) == 2
    assert slots == [4, 6]
150
+
151
+
152
def test_many_clusters_floor_overflow():
    """
    Ten clusters with min=3 need 30 slots, but only 20 are requested.

    The floor guarantee takes precedence, so the allocation sums to 30
    rather than the requested 20.
    """
    n_clusters = 10
    allocation = allocate_quotas([1.0] * n_clusters, total_slots=20, min_slots=3)
    assert len(allocation) == n_clusters
    assert not any(count < 3 for count in allocation)
    # min_slots dominates, so the total overshoots the request.
    assert sum(allocation) == 30
162
+
163
+
164
def test_zero_importances_respects_floor_edge():
    """
    All-zero importances combined with total < n * min still honour the floor.
    """
    # Three clusters at min 3 need 9 slots; only 6 are requested.
    allocation = allocate_quotas([0.0] * 3, total_slots=6, min_slots=3)
    assert not any(count < 3 for count in allocation)
    assert len(allocation) == 3
171
+
172
+
173
def test_dominant_cluster_does_not_starve_minority():
    """
    Doc 06 fairness check for a 70% NLP / 30% RL user.

    The RL cluster must keep at least the floor (the RRF failure mode was
    giving it nothing), while NLP remains the larger allocation.
    """
    slots = allocate_quotas([70.0, 30.0], total_slots=30, min_slots=3)
    nlp_slots, rl_slots = slots[0], slots[1]
    assert rl_slots >= 3, f"Minority RL cluster starved: got {slots[1]}"
    assert nlp_slots > rl_slots  # dominance is preserved
    assert sum(slots) == 30
182
+
183
+
184
def test_allocation_order_matches_input():
    """Quotas come back positionally aligned with the importances passed in."""
    allocation = allocate_quotas([50.0, 25.0, 25.0], total_slots=100, min_slots=3)
    # Index 0 carries the largest importance; indices 1 and 2 are tied.
    largest = allocation[0]
    assert largest >= allocation[1]
    assert largest >= allocation[2]
190
+
191
+
192
+ # ── merge_quota_results ───────────────────────────────────────────────────────
193
+
194
def test_merge_respects_quota():
    """No cluster may contribute more items than its quota allows."""
    cluster_a = ["a1", "a2", "a3", "a4", "a5"]
    cluster_b = ["b1", "b2", "b3"]
    merged = merge_quota_results([cluster_a, cluster_b], quotas=[3, 3])

    # Items are attributed to their cluster by ID prefix.
    a_count = len([item for item in merged if item.startswith("a")])
    b_count = len([item for item in merged if item.startswith("b")])
    assert a_count <= 3, f"Cluster A exceeded quota: {a_count}"
    assert b_count <= 3, f"Cluster B exceeded quota: {b_count}"
203
+
204
+
205
def test_merge_deduplicates():
    """A paper returned by two clusters shows up only once after merging."""
    per_cluster = [
        ["shared", "a1", "a2"],
        ["shared", "b1", "b2"],
    ]
    merged = merge_quota_results(per_cluster, quotas=[3, 3])
    assert merged.count("shared") == 1, "Duplicate 'shared' should appear only once"
211
+
212
+
213
def test_merge_preserves_order():
    """Merged output lists the first cluster's items ahead of the second's."""
    first = ["a1", "a2"]
    second = ["b1", "b2"]
    merged = merge_quota_results([first, second], quotas=[2, 2])
    assert merged == ["a1", "a2", "b1", "b2"]
219
+
220
+
221
def test_merge_empty_cluster():
    """A cluster with no results adds nothing; the other cluster still fills its quota."""
    populated = ["a1", "a2", "a3"]
    empty: list[str] = []
    merged = merge_quota_results([populated, empty], quotas=[3, 3])
    assert merged == ["a1", "a2", "a3"]
227
+
228
+
229
def test_merge_empty_input():
    """Merging zero clusters with zero quotas yields an empty list."""
    merged = merge_quota_results([], [])
    assert merged == []
tests/test_integration.py CHANGED
@@ -3,6 +3,7 @@ Integration tests: full HTTP request/response cycle via FastAPI TestClient.
3
  Tests the complete pipeline: search → save → recommendations.
4
  """
5
  import pytest
 
6
  from fastapi.testclient import TestClient
7
 
8
 
@@ -148,7 +149,8 @@ def test_recommendations_after_save(client, monkeypatch):
148
  return ["1706.03762"]
149
  monkeypatch.setattr(qs, "recommend", fake_recommend)
150
 
151
- # Also mock metadata fetch so we don't hit arXiv API in this test
 
152
  import app.arxiv_svc as arxiv
153
  async def fake_batch(ids):
154
  return {
@@ -162,7 +164,8 @@ def test_recommendations_after_save(client, monkeypatch):
162
  "year": 2017,
163
  }
164
  }
165
- monkeypatch.setattr(arxiv, "fetch_metadata_batch", fake_batch)
 
166
 
167
  client.get("/")
168
  client.post("/api/papers/0704.0002/save", data={"source": "search"})
@@ -173,6 +176,110 @@ def test_recommendations_after_save(client, monkeypatch):
173
 
174
  # ── Full pipeline smoke test ───────────────────────────────────────────────────
175
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
  def test_full_pipeline_smoke(client, monkeypatch):
177
  """
178
  1. User visits home → gets cookie
@@ -211,6 +318,7 @@ def test_full_pipeline_smoke(client, monkeypatch):
211
  return ["2302.11382"]
212
  monkeypatch.setattr(qs, "recommend", fake_rec)
213
 
 
214
  async def fake_meta(ids):
215
  return {
216
  "2302.11382": {
@@ -223,7 +331,8 @@ def test_full_pipeline_smoke(client, monkeypatch):
223
  "year": 2023,
224
  }
225
  }
226
- monkeypatch.setattr(arxiv, "fetch_metadata_batch", fake_meta)
 
227
 
228
  resp = client.get("/api/recommendations")
229
  assert resp.status_code == 200
 
3
  Tests the complete pipeline: search → save → recommendations.
4
  """
5
  import pytest
6
+ from unittest.mock import AsyncMock
7
  from fastapi.testclient import TestClient
8
 
9
 
 
149
  return ["1706.03762"]
150
  monkeypatch.setattr(qs, "recommend", fake_recommend)
151
 
152
+ # Also mock metadata fetch so we don't hit Turso DB in this test
153
+ import app.turso_svc as turso
154
  import app.arxiv_svc as arxiv
155
  async def fake_batch(ids):
156
  return {
 
164
  "year": 2017,
165
  }
166
  }
167
+ monkeypatch.setattr(turso, "fetch_metadata_batch", fake_batch)
168
+ monkeypatch.setattr(arxiv, "fetch_metadata_batch", AsyncMock(return_value={}))
169
 
170
  client.get("/")
171
  client.post("/api/papers/0704.0002/save", data={"source": "search"})
 
176
 
177
  # ── Full pipeline smoke test ───────────────────────────────────────────────────
178
 
179
def test_quota_pipeline_preserves_minority_cluster(client, monkeypatch):
    """
    Phase 4.1 end-to-end check for quota fusion.

    Five saved papers form two distinct interests (3 "NLP", 2 "RL"). After the
    saves, the recommendations response must mention candidates from BOTH
    pools — the minority interest being dropped is the failure mode that RRF
    was causing.
    """
    import numpy as np
    import app.qdrant_svc as qs
    import app.turso_svc as turso
    import app.arxiv_svc as arxiv
    import app.recommend.profiles as prof_mod
    from unittest.mock import AsyncMock

    # Visiting the home page sets the user cookie.
    client.get("/")

    # ── Synthetic embedding space: two unit-norm topic centres ────────────────
    rng = np.random.RandomState(42)
    nlp_center = rng.randn(1024).astype(np.float32)
    nlp_center /= np.linalg.norm(nlp_center)
    rl_center = rng.randn(1024).astype(np.float32)
    rl_center /= np.linalg.norm(rl_center)

    def _near(center):
        """Return a unit-norm vector (as a list) lightly perturbed around *center*."""
        jittered = center + rng.randn(1024).astype(np.float32) * 0.05
        return (jittered / np.linalg.norm(jittered)).tolist()

    # Saved papers: three around the NLP centre, two around the RL centre.
    saved_ids = ["nlp_a", "nlp_b", "nlp_c", "rl_a", "rl_b"]
    saved_vectors = {
        pid: _near(nlp_center if pid.startswith("nlp") else rl_center)
        for pid in saved_ids
    }

    # Candidate pool: 50 papers near each centre.
    nlp_candidates = [f"nlp_cand_{i}" for i in range(50)]
    rl_candidates = [f"rl_cand_{i}" for i in range(50)]
    candidate_vectors = {cid: _near(nlp_center) for cid in nlp_candidates}
    candidate_vectors.update((cid, _near(rl_center)) for cid in rl_candidates)

    known_vectors = {**saved_vectors, **candidate_vectors}

    async def fake_get_paper_vectors(ids):
        return {aid: known_vectors[aid] for aid in ids if aid in known_vectors}

    async def fake_search_by_vector(query_vector, limit, exclude_ids=None):
        # Serve candidates from whichever centre the query vector is closer to.
        qv = np.array(query_vector, dtype=np.float32)
        qv /= np.linalg.norm(qv)
        closer_to_nlp = float(qv @ nlp_center) > float(qv @ rl_center)
        pool = nlp_candidates if closer_to_nlp else rl_candidates
        skipped = exclude_ids or set()
        return [p for p in pool if p not in skipped][:limit]

    monkeypatch.setattr(qs, "get_paper_vectors", fake_get_paper_vectors)
    monkeypatch.setattr(qs, "search_by_vector", fake_search_by_vector)

    # No stored profile and no interactions: skips the EWMA short-term lookup.
    async def fake_load_profile(uid, kind):
        return None

    async def fake_interaction_count(uid, kind):
        return 0

    monkeypatch.setattr(prof_mod, "load_profile", fake_load_profile)
    monkeypatch.setattr(prof_mod, "get_interaction_count", fake_interaction_count)

    def _record(aid):
        """Build a minimal metadata record (with category) so templates render."""
        return {
            "arxiv_id": aid,
            "title": f"Title {aid}",
            "abstract": "...",
            "authors": "[]",
            "category": "cs.CL" if aid.startswith("nlp") else "cs.LG",
            "published": "2024-01-01",
            "year": 2024,
        }

    async def fake_meta(ids):
        return {aid: _record(aid) for aid in ids}

    monkeypatch.setattr(turso, "fetch_metadata_batch", fake_meta)
    monkeypatch.setattr(arxiv, "fetch_metadata_batch", AsyncMock(return_value={}))

    # Five saves cross the MIN_PAPERS_FOR_CLUSTERING threshold.
    for aid in saved_ids:
        client.post(f"/api/papers/{aid}/save", data={"source": "search"})

    resp = client.get("/api/recommendations")
    assert resp.status_code == 200

    # Both candidate pools must be represented in the rendered response.
    has_nlp_rec = any(cid in resp.text for cid in nlp_candidates)
    has_rl_rec = any(cid in resp.text for cid in rl_candidates)
    assert has_nlp_rec, "No NLP cluster recs — dominant cluster failed to surface"
    assert has_rl_rec, "Minority RL cluster starved — quota fusion is not working"
281
+
282
+
283
  def test_full_pipeline_smoke(client, monkeypatch):
284
  """
285
  1. User visits home → gets cookie
 
318
  return ["2302.11382"]
319
  monkeypatch.setattr(qs, "recommend", fake_rec)
320
 
321
+ import app.turso_svc as turso
322
  async def fake_meta(ids):
323
  return {
324
  "2302.11382": {
 
331
  "year": 2023,
332
  }
333
  }
334
+ monkeypatch.setattr(turso, "fetch_metadata_batch", fake_meta)
335
+ monkeypatch.setattr(arxiv, "fetch_metadata_batch", AsyncMock(return_value={}))
336
 
337
  resp = client.get("/api/recommendations")
338
  assert resp.status_code == 200
tests/test_search_router.py CHANGED
@@ -1,10 +1,13 @@
1
  """
2
- Layer 3: Search router integration tests — Phase 3.
3
 
4
  Tests /search endpoint with mocked hybrid_search_svc.
5
  Validates: ranking preservation, arXiv fallback, saved/dismissed state,
6
  HTMX partials, and that empty queries don't trigger hybrid search.
7
 
 
 
 
8
  No network, no model, no external services needed.
9
  """
10
  import pytest
@@ -41,15 +44,17 @@ def test_search_hybrid_returns_papers(client, monkeypatch):
41
  """
42
  /search?q=... should use hybrid search and render paper cards.
43
  We mock hybrid_search_svc.search() to return known IDs and
44
- arxiv_svc.fetch_metadata_batch() to return metadata for those IDs.
45
  """
46
  import app.hybrid_search_svc as hs
 
47
  import app.arxiv_svc as arxiv
48
 
49
  monkeypatch.setattr(hs, "search", AsyncMock(return_value=[
50
  "1706.03762", "2301.00001",
51
  ]))
52
- monkeypatch.setattr(arxiv, "fetch_metadata_batch", AsyncMock(return_value={
 
53
  "1706.03762": {
54
  "arxiv_id": "1706.03762",
55
  "title": "Attention Is All You Need",
@@ -69,6 +74,8 @@ def test_search_hybrid_returns_papers(client, monkeypatch):
69
  "year": 2023,
70
  },
71
  }))
 
 
72
 
73
  resp = client.get("/search?q=transformer+attention")
74
  assert resp.status_code == 200
@@ -82,13 +89,15 @@ def test_search_hybrid_preserves_ranking(client, monkeypatch):
82
  returned by hybrid_search_svc.search() — i.e., paper A before paper B.
83
  """
84
  import app.hybrid_search_svc as hs
 
85
  import app.arxiv_svc as arxiv
86
 
87
  # Hybrid search returns A first, then B
88
  monkeypatch.setattr(hs, "search", AsyncMock(return_value=[
89
  "2401.00001", "1706.03762",
90
  ]))
91
- monkeypatch.setattr(arxiv, "fetch_metadata_batch", AsyncMock(return_value={
 
92
  "2401.00001": {
93
  "arxiv_id": "2401.00001",
94
  "title": "First Paper Should Appear First",
@@ -102,6 +111,7 @@ def test_search_hybrid_preserves_ranking(client, monkeypatch):
102
  "category": "cs.CL", "published": "2017-06-12", "year": 2017,
103
  },
104
  }))
 
105
 
106
  resp = client.get("/search?q=test+query")
107
  # First paper should appear before second paper in HTML
@@ -144,12 +154,14 @@ def test_search_sets_saved_dismissed_flags(client, monkeypatch):
144
  based on the user's state.
145
  """
146
  import app.hybrid_search_svc as hs
 
147
  import app.arxiv_svc as arxiv
148
 
149
  monkeypatch.setattr(hs, "search", AsyncMock(return_value=[
150
  "1706.03762", "2301.00001",
151
  ]))
152
- monkeypatch.setattr(arxiv, "fetch_metadata_batch", AsyncMock(return_value={
 
153
  "1706.03762": {
154
  "arxiv_id": "1706.03762", "title": "Saved Paper",
155
  "abstract": "...", "authors": '["A"]',
@@ -161,6 +173,7 @@ def test_search_sets_saved_dismissed_flags(client, monkeypatch):
161
  "category": "cs.AI", "published": "2023-01-01", "year": 2023,
162
  },
163
  }))
 
164
 
165
  # First: visit home to get cookie, then save a paper
166
  client.get("/")
@@ -180,16 +193,19 @@ def test_search_htmx_partial_with_hybrid(client, monkeypatch):
180
  same as before the hybrid search swap.
181
  """
182
  import app.hybrid_search_svc as hs
 
183
  import app.arxiv_svc as arxiv
184
 
185
  monkeypatch.setattr(hs, "search", AsyncMock(return_value=["1706.03762"]))
186
- monkeypatch.setattr(arxiv, "fetch_metadata_batch", AsyncMock(return_value={
 
187
  "1706.03762": {
188
  "arxiv_id": "1706.03762", "title": "HTMX Test Paper",
189
  "abstract": "...", "authors": '["A"]',
190
  "category": "cs.CL", "published": "2017-06-12", "year": 2017,
191
  },
192
  }))
 
193
 
194
  resp = client.get(
195
  "/search?q=transformer",
 
1
  """
2
+ Layer 3: Search router integration tests — Phase 3 + 3.5.
3
 
4
  Tests /search endpoint with mocked hybrid_search_svc.
5
  Validates: ranking preservation, arXiv fallback, saved/dismissed state,
6
  HTMX partials, and that empty queries don't trigger hybrid search.
7
 
8
+ Phase 3.5: Turso is now the primary metadata source, arXiv API is fallback.
9
+ All tests mock turso_svc.fetch_metadata_batch to avoid hitting the real DB.
10
+
11
  No network, no model, no external services needed.
12
  """
13
  import pytest
 
44
  """
45
  /search?q=... should use hybrid search and render paper cards.
46
  We mock hybrid_search_svc.search() to return known IDs and
47
+ turso_svc.fetch_metadata_batch() to return metadata for those IDs.
48
  """
49
  import app.hybrid_search_svc as hs
50
+ import app.turso_svc as turso
51
  import app.arxiv_svc as arxiv
52
 
53
  monkeypatch.setattr(hs, "search", AsyncMock(return_value=[
54
  "1706.03762", "2301.00001",
55
  ]))
56
+ # Phase 3.5: Turso is the primary metadata source
57
+ monkeypatch.setattr(turso, "fetch_metadata_batch", AsyncMock(return_value={
58
  "1706.03762": {
59
  "arxiv_id": "1706.03762",
60
  "title": "Attention Is All You Need",
 
74
  "year": 2023,
75
  },
76
  }))
77
+ # arXiv fallback returns empty (Turso found everything)
78
+ monkeypatch.setattr(arxiv, "fetch_metadata_batch", AsyncMock(return_value={}))
79
 
80
  resp = client.get("/search?q=transformer+attention")
81
  assert resp.status_code == 200
 
89
  returned by hybrid_search_svc.search() — i.e., paper A before paper B.
90
  """
91
  import app.hybrid_search_svc as hs
92
+ import app.turso_svc as turso
93
  import app.arxiv_svc as arxiv
94
 
95
  # Hybrid search returns A first, then B
96
  monkeypatch.setattr(hs, "search", AsyncMock(return_value=[
97
  "2401.00001", "1706.03762",
98
  ]))
99
+ # Phase 3.5: Turso is the primary metadata source
100
+ monkeypatch.setattr(turso, "fetch_metadata_batch", AsyncMock(return_value={
101
  "2401.00001": {
102
  "arxiv_id": "2401.00001",
103
  "title": "First Paper Should Appear First",
 
111
  "category": "cs.CL", "published": "2017-06-12", "year": 2017,
112
  },
113
  }))
114
+ monkeypatch.setattr(arxiv, "fetch_metadata_batch", AsyncMock(return_value={}))
115
 
116
  resp = client.get("/search?q=test+query")
117
  # First paper should appear before second paper in HTML
 
154
  based on the user's state.
155
  """
156
  import app.hybrid_search_svc as hs
157
+ import app.turso_svc as turso
158
  import app.arxiv_svc as arxiv
159
 
160
  monkeypatch.setattr(hs, "search", AsyncMock(return_value=[
161
  "1706.03762", "2301.00001",
162
  ]))
163
+ # Phase 3.5: Turso is the primary metadata source
164
+ monkeypatch.setattr(turso, "fetch_metadata_batch", AsyncMock(return_value={
165
  "1706.03762": {
166
  "arxiv_id": "1706.03762", "title": "Saved Paper",
167
  "abstract": "...", "authors": '["A"]',
 
173
  "category": "cs.AI", "published": "2023-01-01", "year": 2023,
174
  },
175
  }))
176
+ monkeypatch.setattr(arxiv, "fetch_metadata_batch", AsyncMock(return_value={}))
177
 
178
  # First: visit home to get cookie, then save a paper
179
  client.get("/")
 
193
  same as before the hybrid search swap.
194
  """
195
  import app.hybrid_search_svc as hs
196
+ import app.turso_svc as turso
197
  import app.arxiv_svc as arxiv
198
 
199
  monkeypatch.setattr(hs, "search", AsyncMock(return_value=["1706.03762"]))
200
+ # Phase 3.5: Turso is the primary metadata source
201
+ monkeypatch.setattr(turso, "fetch_metadata_batch", AsyncMock(return_value={
202
  "1706.03762": {
203
  "arxiv_id": "1706.03762", "title": "HTMX Test Paper",
204
  "abstract": "...", "authors": '["A"]',
205
  "category": "cs.CL", "published": "2017-06-12", "year": 2017,
206
  },
207
  }))
208
+ monkeypatch.setattr(arxiv, "fetch_metadata_batch", AsyncMock(return_value={}))
209
 
210
  resp = client.get(
211
  "/search?q=transformer",