siddhm11 committed on
Commit
3f58d41
·
1 Parent(s): d33f7fa

Phase 6.5 Day 1: Real Qdrant cosine scores (A1) + verification timestamp (A2)

Browse files

A1 — Feature 0 fix (qdrant_cosine_score):
- recommendations.py: Switch search_by_vector() -> search_by_vector_with_scores()
in both per-cluster and short-term supplement paths
- Build qdrant_score_map from actual cosine scores in same pass as paper_cluster_map
- Delete fake rank-decay approximation (1.0 - rank * 0.01)
- Feature 0 now receives real cosines instead of synthetic linear sequence

A2 — Live verification:
- PHASE6-Reranker-Framing.md: Tick all 6.3 checklist items, add verification timestamp
(2026-05-03: model_loaded=true, n_trees=141, fallback_active=false)

Test fix:
- test_integration.py: Update quota pipeline test mock from search_by_vector to
search_by_vector_with_scores (returns list[dict] not list[str])

Tests: 203 passed, 0 failures

app/routers/recommendations.py CHANGED
@@ -256,38 +256,49 @@ async def _multi_interest_recommend(
256
  st_vec = await profiles.load_profile(user_id, "short_term")
257
 
258
  search_tasks = [
259
- qdrant_svc.search_by_vector(
260
  query_vector=c.medoid_embedding.tolist(),
261
  limit=quota * _OVERSAMPLE,
262
  exclude_ids=seen,
263
  )
264
  for c, quota in zip(clusters, quotas)
265
  ]
266
- per_cluster_results = await asyncio.gather(*search_tasks)
267
 
268
- # Phase 4.5: Build paper β†’ cluster mapping BEFORE merge (so we know
269
- # which cluster each paper was retrieved from).
270
  paper_cluster_map: dict[str, int] = {}
271
- for cluster, result_ids in zip(clusters, per_cluster_results):
272
- for aid in result_ids:
 
 
273
  if aid not in paper_cluster_map: # first-occurrence wins
274
  paper_cluster_map[aid] = cluster.cluster_idx
 
 
 
275
 
276
- # Apply quota merge (dedup globally, respect per-cluster quotas)
277
- candidate_ids = merge_quota_results(list(per_cluster_results), quotas)
 
 
 
278
 
279
  # Supplement with short-term session context
280
  if st_vec is not None:
281
  seen_so_far = seen | set(candidate_ids)
282
- st_results = await qdrant_svc.search_by_vector(
283
  query_vector=st_vec.tolist(),
284
  limit=_ST_SUPPLEMENT,
285
  exclude_ids=seen_so_far,
286
  )
287
- for aid in st_results:
 
288
  if aid not in set(candidate_ids):
289
  candidate_ids.append(aid)
290
  paper_cluster_map[aid] = -1 # short-term supplement
 
 
291
 
292
  if not candidate_ids:
293
  return [], {}
@@ -326,17 +337,8 @@ async def _multi_interest_recommend(
326
  user_total_saves = len(state.positive_list)
327
  user_total_dismissals = len(state.negative_list)
328
 
329
- # Build qdrant_score_map from per_cluster_results
330
- # per_cluster_results is list[list[str]] β€” we need scores too.
331
- # Use the paper_cluster_map to approximate: score = 1.0 - (rank / total)
332
- # for now, as the current retrieval path returns only IDs.
333
- # TODO: Phase 6.2+ switch to search_by_vector_with_scores()
334
- qdrant_score_map: dict[str, float] = {}
335
- for cluster_ids in per_cluster_results:
336
- for rank, aid in enumerate(cluster_ids):
337
- if aid not in qdrant_score_map:
338
- # Approximate score from rank position (higher rank = higher score)
339
- qdrant_score_map[aid] = max(0.0, 1.0 - rank * 0.01)
340
 
341
  qdrant_scores = np.asarray(
342
  [qdrant_score_map.get(cid, 0.0) for cid in valid_ids],
 
256
  st_vec = await profiles.load_profile(user_id, "short_term")
257
 
258
  search_tasks = [
259
+ qdrant_svc.search_by_vector_with_scores(
260
  query_vector=c.medoid_embedding.tolist(),
261
  limit=quota * _OVERSAMPLE,
262
  exclude_ids=seen,
263
  )
264
  for c, quota in zip(clusters, quotas)
265
  ]
266
+ per_cluster_scored = await asyncio.gather(*search_tasks)
267
 
268
+ # Build paper β†’ cluster map AND real qdrant_score_map in one pass.
269
+ # Phase 6.5 A1: replaces the old rank-based linear decay approximation.
270
  paper_cluster_map: dict[str, int] = {}
271
+ qdrant_score_map: dict[str, float] = {}
272
+ for cluster, scored_results in zip(clusters, per_cluster_scored):
273
+ for hit in scored_results:
274
+ aid = hit["arxiv_id"]
275
  if aid not in paper_cluster_map: # first-occurrence wins
276
  paper_cluster_map[aid] = cluster.cluster_idx
277
+ # Keep highest cosine if a paper appears in multiple clusters
278
+ if aid not in qdrant_score_map or hit["score"] > qdrant_score_map[aid]:
279
+ qdrant_score_map[aid] = float(hit["score"])
280
 
281
+ # merge_quota_results expects list[list[str]] β€” extract IDs
282
+ per_cluster_ids = [
283
+ [h["arxiv_id"] for h in scored] for scored in per_cluster_scored
284
+ ]
285
+ candidate_ids = merge_quota_results(per_cluster_ids, quotas)
286
 
287
  # Supplement with short-term session context
288
  if st_vec is not None:
289
  seen_so_far = seen | set(candidate_ids)
290
+ st_scored = await qdrant_svc.search_by_vector_with_scores(
291
  query_vector=st_vec.tolist(),
292
  limit=_ST_SUPPLEMENT,
293
  exclude_ids=seen_so_far,
294
  )
295
+ for hit in st_scored:
296
+ aid = hit["arxiv_id"]
297
  if aid not in set(candidate_ids):
298
  candidate_ids.append(aid)
299
  paper_cluster_map[aid] = -1 # short-term supplement
300
+ if aid not in qdrant_score_map:
301
+ qdrant_score_map[aid] = float(hit["score"])
302
 
303
  if not candidate_ids:
304
  return [], {}
 
337
  user_total_saves = len(state.positive_list)
338
  user_total_dismissals = len(state.negative_list)
339
 
340
+ # qdrant_score_map was built above from real cosine scores
341
+ # (Phase 6.5 A1 β€” replaces the old rank-based approximation)
 
 
 
 
 
 
 
 
 
342
 
343
  qdrant_scores = np.asarray(
344
  [qdrant_score_map.get(cid, 0.0) for cid in valid_ids],
docs/phases/PHASE6-Reranker-Framing.md CHANGED
@@ -757,16 +757,17 @@ Two lines, verbatim:
757
  - [ ] Commit: "Phase 6.2: per-candidate cluster identity through reranker"
758
 
759
  ### Phase 6.3 β€” Deployment verification + Bug B
760
- - [ ] Decide deployment strategy: E.1.a (commit) vs E.1.b (snapshot_download). Recommend E.1.a.
761
- - [ ] Verify `models/reranker-phase6/production_model/reranker_v1.txt` is in working tree, not gitignored, not dockerignored
762
- - [ ] Push to HF Space; wait for build; check build logs for "[reranker] LightGBM model loaded"
763
- - [ ] Add `/healthz/reranker` route (Section E.2)
764
- - [ ] Add `_rr.is_model_loaded()`, `_rr.get_loaded_model_path()`, `_rr.get_num_trees()` accessors
765
- - [ ] `curl https://siddhm11-researchit.hf.space/healthz/reranker` β†’ confirm `model_loaded: true, n_trees: 141`
766
- - [ ] Add per-request `reranker.features` log line with `feature_nonzero_rate`
767
- - [ ] Fix Bug B: medoid_embedding_blob fallback in cluster reload (Section E.4)
768
- - [ ] Add `medoid_embedding_blob BLOB` column to clusters table on Turso (one-line ALTER)
769
- - [ ] Update CLAUDE.md / model card to reflect deployment story
 
770
 
771
  ### Phase 6 documentation
772
  - [ ] Write `docs/phases/PHASE6.md` retraining decision (Section F.4)
 
757
  - [ ] Commit: "Phase 6.2: per-candidate cluster identity through reranker"
758
 
759
  ### Phase 6.3 β€” Deployment verification + Bug B
760
+ - [x] Decide deployment strategy: E.1.a (commit) vs E.1.b (snapshot_download). Used E.1.a.
761
+ - [x] Verify `models/reranker-phase6/production_model/reranker_v1.txt` is in working tree, not gitignored, not dockerignored
762
+ - [x] Push to HF Space; wait for build; check build logs for "[reranker] LightGBM model loaded"
763
+ - [x] Add `/healthz/reranker` route (Section E.2)
764
+ - [x] Add `_rr.is_model_loaded()`, `_rr.get_loaded_model_path()`, `_rr.get_num_trees()` accessors
765
+ - [x] `curl https://siddhm11-researchit.hf.space/healthz/reranker` → confirm `model_loaded: true, n_trees: 141`
766
+ > *Verified live on 2026-05-03: `model_loaded=true, n_trees=141, fallback_active=false, feature_count=37, feature_schema_hash=5d0b3de7b0c1`.*
767
+ - [x] Add per-request `reranker.features` log line with `feature_nonzero_rate`
768
+ - [x] Fix Bug B: medoid_embedding_blob fallback in cluster reload (Section E.4)
769
+ - [x] Add `medoid_embedding_blob BLOB` column to clusters table (SQLite ALTER migration)
770
+ - [x] Update CLAUDE.md / model card to reflect deployment story
771
 
772
  ### Phase 6 documentation
773
  - [ ] Write `docs/phases/PHASE6.md` retraining decision (Section F.4)
docs/phases/PHASE6.5-Implementation-Plan.md ADDED
@@ -0,0 +1,389 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Phase 6.5 — Implementation Plan
2
+
3
+ > **Source:** `docs/phases/PHASE6.5-Instrumentation-Framing.md`
4
+ > **Timeline:** 5 days (each day leaves the app in a working state)
5
+ > **Prerequisite for:** Phase 7 (Evaluation Framework)
6
+
7
+ ---
8
+
9
+ ## Day 1: Phase 6 Hot-fix (A1 + A2)
10
+
11
+ ### A1: Real Qdrant Cosine Scores (Feature 0 fix)
12
+
13
+ **Problem:** `recommendations.py:329-339` fakes Qdrant scores with linear rank decay (`1.0 - rank * 0.01`). Feature 0 is the model's #5 most important feature — it should be real cosines from Qdrant.
14
+
15
+ **Root cause:** The search calls use `search_by_vector()` (returns `list[str]`) instead of `search_by_vector_with_scores()` (returns `list[dict]` with `{"arxiv_id": str, "score": float}`).
16
+
17
+ ---
18
+
19
+ #### [MODIFY] [recommendations.py](file:///c:/Users/siddh/ResearchIT-Final/app/routers/recommendations.py)
20
+
21
+ **Change 1 — Per-cluster searches (lines 258-266):**
22
+ Switch from `search_by_vector()` to `search_by_vector_with_scores()`:
23
+
24
+ ```diff
25
+ - search_tasks = [
26
+ - qdrant_svc.search_by_vector(
27
+ - query_vector=c.medoid_embedding.tolist(),
28
+ - limit=quota * _OVERSAMPLE,
29
+ - exclude_ids=seen,
30
+ - )
31
+ - for c, quota in zip(clusters, quotas)
32
+ - ]
33
+ - per_cluster_results = await asyncio.gather(*search_tasks)
34
+ + search_tasks = [
35
+ + qdrant_svc.search_by_vector_with_scores(
36
+ + query_vector=c.medoid_embedding.tolist(),
37
+ + limit=quota * _OVERSAMPLE,
38
+ + exclude_ids=seen,
39
+ + )
40
+ + for c, quota in zip(clusters, quotas)
41
+ + ]
42
+ + per_cluster_scored = await asyncio.gather(*search_tasks)
43
+ ```
44
+
45
+ **Change 2 — Build `paper_cluster_map` AND `qdrant_score_map` in one pass (lines 268-277):**
46
+
47
+ ```diff
48
+ - paper_cluster_map: dict[str, int] = {}
49
+ - for cluster, result_ids in zip(clusters, per_cluster_results):
50
+ - for aid in result_ids:
51
+ - if aid not in paper_cluster_map:
52
+ - paper_cluster_map[aid] = cluster.cluster_idx
53
+ -
54
+ - candidate_ids = merge_quota_results(list(per_cluster_results), quotas)
55
+ + paper_cluster_map: dict[str, int] = {}
56
+ + qdrant_score_map: dict[str, float] = {}
57
+ + for cluster, scored_results in zip(clusters, per_cluster_scored):
58
+ + for hit in scored_results:
59
+ + aid = hit["arxiv_id"]
60
+ + if aid not in paper_cluster_map:
61
+ + paper_cluster_map[aid] = cluster.cluster_idx
62
+ + # Keep highest cosine if paper appears in multiple clusters
63
+ + if aid not in qdrant_score_map or hit["score"] > qdrant_score_map[aid]:
64
+ + qdrant_score_map[aid] = float(hit["score"])
65
+ +
66
+ + # merge_quota_results expects list[list[str]] β€” extract IDs
67
+ + per_cluster_ids = [[h["arxiv_id"] for h in scored] for scored in per_cluster_scored]
68
+ + candidate_ids = merge_quota_results(per_cluster_ids, quotas)
69
+ ```
70
+
71
+ **Change 3 — Short-term supplement search (lines 280-290):**
72
+ Also switch to scored search:
73
+
74
+ ```diff
75
+ - st_results = await qdrant_svc.search_by_vector(
76
+ + st_scored = await qdrant_svc.search_by_vector_with_scores(
77
+ query_vector=st_vec.tolist(),
78
+ limit=_ST_SUPPLEMENT,
79
+ exclude_ids=seen_so_far,
80
+ )
81
+ - for aid in st_results:
82
+ - if aid not in set(candidate_ids):
83
+ - candidate_ids.append(aid)
84
+ + for hit in st_scored:
85
+ + aid = hit["arxiv_id"]
86
+ + if aid not in set(candidate_ids):
87
+ + candidate_ids.append(aid)
88
+ + if aid not in qdrant_score_map:
89
+ + qdrant_score_map[aid] = float(hit["score"])
90
+ paper_cluster_map[aid] = -1 # short-term supplement
91
+ ```
92
+
93
+ **Change 4 — Delete fake score block (lines 329-339):**
94
+ The entire synthetic-decay block becomes dead code. Delete it:
95
+
96
+ ```diff
97
+ - # Build qdrant_score_map from per_cluster_results
98
+ - # per_cluster_results is list[list[str]] β€” we need scores too.
99
+ - # Use the paper_cluster_map to approximate: score = 1.0 - (rank / total)
100
+ - # for now, as the current retrieval path returns only IDs.
101
+ - # TODO: Phase 6.2+ switch to search_by_vector_with_scores()
102
+ - qdrant_score_map: dict[str, float] = {}
103
+ - for cluster_ids in per_cluster_results:
104
+ - for rank, aid in enumerate(cluster_ids):
105
+ - if aid not in qdrant_score_map:
106
+ - # Approximate score from rank position (higher rank = higher score)
107
+ - qdrant_score_map[aid] = max(0.0, 1.0 - rank * 0.01)
108
+ ```
109
+
110
+ The existing `qdrant_scores = np.asarray(...)` on lines 341-344 stays as-is — it reads from `qdrant_score_map`, which now has real cosines.
111
+
112
+ ### A2: Verify `/healthz/reranker` live
113
+
114
+ > ✅ **Already done.** Verified 2026-05-03: `model_loaded: true, n_trees: 141, fallback_active: false`.
115
+
116
+ Just need to add the timestamp to `PHASE6-Reranker-Framing.md`.
117
+
118
+ ---
119
+
120
+ ## Day 2: B1 — `query_id` Linkage
121
+
122
+ ### What it enables
123
+ Per-feed CTR: "out of 30 papers shown in this request, how many got saved?"
124
+
125
+ ### Current state verified
126
+ - `interactions` table already has a `query_id TEXT` column ✅ (line 31 in DDL)
127
+ - `db.log_interaction()` already accepts `query_id` ✅ (line 135)
128
+ - `events.py` already accepts and forwards `query_id` via `Form(default="")` ✅ (line 26)
129
+ - **Missing:** `recommendations.py` never generates or passes `query_id`. Search router never generates one either. Templates don't carry it.
130
+
131
+ ---
132
+
133
+ #### [MODIFY] [recommendations.py](file:///c:/Users/siddh/ResearchIT-Final/app/routers/recommendations.py)
134
+
135
+ **1. Generate `query_id` at the top of `get_recommendations()` (line 59):**
136
+
137
+ ```python
138
+ query_id = str(uuid.uuid4())
139
+ ```
140
+
141
+ **2. Thread `query_id` into `paper_tags` in all 3 tiers and the trending fallback:**
142
+
143
+ - Tier 1: In `_multi_interest_recommend()` return value, add `"query_id": query_id` to each tag dict (lines 455-458)
144
+ - Tier 2: EWMA fallback tags (lines 116-120) — add `"query_id": query_id`
145
+ - Tier 3: Qdrant recommend tags (lines 131-135) — add `"query_id": query_id`
146
+ - Trending fallback (lines 85-87) — add `"query_id": query_id`
147
+
148
+ **3. Embed `query_id` + `position` into paper dicts (line 153-166):**
149
+
150
+ ```python
151
+ for idx, aid in enumerate(rec_arxiv_ids):
152
+ ...
153
+ papers.append({
154
+ **meta[aid],
155
+ "saved": False,
156
+ "dismissed": False,
157
+ "ranker_version": tags.get("ranker_version", _RANKER_VERSION),
158
+ "candidate_source": tags.get("candidate_source", ""),
159
+ "cluster_id": tags.get("cluster_id", ""),
160
+ "query_id": tags.get("query_id", ""), # NEW
161
+ "position": idx, # NEW
162
+ })
163
+ ```
164
+
165
+ > [!IMPORTANT]
166
+ > The `_multi_interest_recommend` signature needs updating to accept `query_id` as a parameter, since it's where the Tier 1 paper_tags are built. Alternatively, we generate `query_id` inside it and return it alongside the tags. I'll use the approach of passing it as a param.
167
+
168
+ ---
169
+
170
+ #### [MODIFY] [search.py](file:///c:/Users/siddh/ResearchIT-Final/app/routers/search.py)
171
+
172
+ **Generate `query_id` per search and embed in paper dicts (line 70-77):**
173
+
174
+ ```python
175
+ query_id = str(uuid.uuid4()) # generated once per /search request
176
+
177
+ for idx, p in enumerate(papers):
178
+ p["saved"] = p["arxiv_id"] in saved_ids
179
+ p["dismissed"] = p["arxiv_id"] in dismissed_ids
180
+ p["query_id"] = query_id # NEW
181
+ p["position"] = idx # NEW
182
+ ```
183
+
184
+ ---
185
+
186
+ #### [MODIFY] [action_buttons.html](file:///c:/Users/siddh/ResearchIT-Final/app/templates/partials/action_buttons.html)
187
+
188
+ **Add `query_id` and `position` to ALL three `hx-vals` JSON blobs:**
189
+
190
+ Add to template header:
191
+ ```jinja2
192
+ {% set _query_id = paper.query_id | default("") if paper is defined else "" %}
193
+ {% set _position = paper.position | default(0) if paper is defined else 0 %}
194
+ ```
195
+
196
+ Add to each `hx-vals`:
197
+ ```
198
+ "query_id": "{{ _query_id }}", "position": "{{ _position }}"
199
+ ```
200
+
201
+ The save button (line 37) already has `position` — update it to use `_position`. The not-interested buttons (lines 26 and 45) need `query_id` and `position` added.
202
+
203
+ ---
204
+
205
+ ## Day 3: B2 — Propensity Logging
206
+
207
+ ### What it enables
208
+ Counterfactual evaluation (SNIPS estimator) — "what would have happened with ranker B?"
209
+
210
+ ---
211
+
212
+ #### [MODIFY] [db.py](file:///c:/Users/siddh/ResearchIT-Final/app/db.py)
213
+
214
+ **1. Migration (after `_MIGRATION_6_3`):**
215
+ ```python
216
+ _MIGRATION_6_5 = [
217
+ "ALTER TABLE interactions ADD COLUMN propensity REAL",
218
+ "ALTER TABLE interactions ADD COLUMN policy_id TEXT",
219
+ ]
220
+ ```
221
+
222
+ **2. Run in `init_db()`.**
223
+
224
+ **3. Extend `log_interaction()` signature (line 129-149):**
225
+ Add `propensity: float | None = None` and `policy_id: str | None = None` kwargs. Extend the INSERT.
226
+
227
+ ---
228
+
229
+ #### [MODIFY] [recommendations.py](file:///c:/Users/siddh/ResearchIT-Final/app/routers/recommendations.py)
230
+
231
+ **Compute propensity after `inject_exploration()` (line 443):**
232
+
233
+ ```python
234
+ # Exploration papers: uniformly sampled from pool
235
+ explore_pool_size = max(1, len(reranked_ids) - len(mmr_selected))
236
+ explore_propensity = len(exploration_set) / explore_pool_size if explore_pool_size > 0 else 0.0
237
+
238
+ # Exploitation (MMR-selected): deterministic β†’ propensity = 1.0
239
+ for aid in final:
240
+ paper_tags[aid]["propensity"] = (
241
+ explore_propensity if aid in exploration_set else 1.0
242
+ )
243
+ paper_tags[aid]["policy_id"] = _RANKER_VERSION
244
+ ```
245
+
246
+ Thread `propensity` and `policy_id` into template context the same way as `query_id`.
247
+
248
+ ---
249
+
250
+ #### [MODIFY] [search.py](file:///c:/Users/siddh/ResearchIT-Final/app/routers/search.py)
251
+
252
+ Search is fully deterministic → `propensity = 1.0` for all results.
253
+
254
+ ---
255
+
256
+ #### [MODIFY] [action_buttons.html](file:///c:/Users/siddh/ResearchIT-Final/app/templates/partials/action_buttons.html)
257
+
258
+ Add `propensity` and `policy_id` to `hx-vals`.
259
+
260
+ ---
261
+
262
+ #### [MODIFY] [events.py](file:///c:/Users/siddh/ResearchIT-Final/app/routers/events.py)
263
+
264
+ Add `propensity: float = Form(default=0.0)` and `policy_id: str = Form(default="")` to both endpoints. Forward to `db.log_interaction()`.
265
+
266
+ ---
267
+
268
+ ## Day 4: B3 — Cluster Snapshot Versioning
269
+
270
+ ### What it enables
271
+ Cluster history, debugging "why did recs shift?", content-addressed key for Phase 8a LLM summary cache.
272
+
273
+ ---
274
+
275
+ #### [MODIFY] [db.py](file:///c:/Users/siddh/ResearchIT-Final/app/db.py)
276
+
277
+ **1. Add `cluster_snapshots` DDL to `_SCHEMA`:**
278
+ ```sql
279
+ CREATE TABLE IF NOT EXISTS cluster_snapshots (
280
+ user_id TEXT NOT NULL,
281
+ snapshot_id TEXT NOT NULL,
282
+ cluster_idx INTEGER NOT NULL,
283
+ medoid_paper_id TEXT NOT NULL,
284
+ importance REAL NOT NULL,
285
+ paper_ids TEXT NOT NULL,
286
+ medoid_embedding_blob BLOB,
287
+ snapshot_date TEXT NOT NULL DEFAULT (datetime('now')),
288
+ paper_ids_hash TEXT NOT NULL,
289
+ PRIMARY KEY (user_id, snapshot_id, cluster_idx)
290
+ );
291
+ CREATE INDEX IF NOT EXISTS idx_snap_user_date ON cluster_snapshots(user_id, snapshot_date DESC);
292
+ CREATE INDEX IF NOT EXISTS idx_snap_hash ON cluster_snapshots(paper_ids_hash);
293
+ ```
294
+
295
+ **2. Add `save_cluster_snapshot()` and `prune_old_snapshots()` functions.**
296
+
297
+ ---
298
+
299
+ #### [MODIFY] [recommendations.py](file:///c:/Users/siddh/ResearchIT-Final/app/routers/recommendations.py)
300
+
301
+ After `save_clusters_to_db(user_id, clusters)` (line ~253), call `db.save_cluster_snapshot()`.
302
+
303
+ ---
304
+
305
+ #### [MODIFY] [main.py](file:///c:/Users/siddh/ResearchIT-Final/app/main.py)
306
+
307
+ Call `db.prune_old_snapshots(retention_days=30)` in the lifespan handler after `init_db()`.
308
+
309
+ ---
310
+
311
+ ## Day 5: B4 — Semantic Scholar Author Import
312
+
313
+ ### What it enables
313
+ "Paste S2 URL → 20 implicit saves" — replaces manual seed search friction.
315
+
316
+ ---
317
+
318
+ #### [NEW] [s2_svc.py](file:///c:/Users/siddh/ResearchIT-Final/app/s2_svc.py)
319
+
320
+ Functions:
321
+ - `parse_author_input(text) → str | None` — accepts S2 URL, raw S2 ID, or ORCID
322
+ - `resolve_orcid(orcid) → str | None` — resolves ORCID via S2 author search
323
+ - `fetch_author_arxiv_papers(author_id, limit=50) → list[str]` — returns arXiv IDs
324
+
325
+ ---
326
+
327
+ #### [MODIFY] [config.py](file:///c:/Users/siddh/ResearchIT-Final/app/config.py)
328
+
329
+ Add `S2_API_KEY = os.getenv("S2_API_KEY", "")` — key already in `.env`.
330
+
331
+ ---
332
+
333
+ #### [MODIFY] [onboarding.py](file:///c:/Users/siddh/ResearchIT-Final/app/routers/onboarding.py)
334
+
335
+ Add `POST /api/onboarding/import-author` endpoint.
336
+
337
+ ---
338
+
339
+ #### [NEW] Template partials for import step
340
+
341
+ - `partials/import_author.html` — the import form step
342
+ - `partials/import_success.html` — success confirmation
343
+ - `partials/import_error.html` — error message
344
+
345
+ ---
346
+
347
+ ## Verification Plan
348
+
349
+ ### Automated Tests
350
+
351
+ After each day:
352
+
353
+ ```bash
354
+ python -m pytest tests/ -v --tb=short
355
+ ```
356
+
357
+ **New test files:**
358
+ - Day 1: Add `test_qdrant_scores_are_real_cosines` to `tests/test_phase6_feature_wiring.py`
359
+ - Day 2: Create `tests/test_instrumentation.py` — `test_query_id_round_trips`
360
+ - Day 3: Add `test_propensity_sums_correctly` to instrumentation tests
361
+ - Day 4: Add `test_snapshot_appended_on_each_recluster`, `test_prune_respects_retention`
362
+ - Day 5: Add `test_s2_import_saves_papers_with_correct_source_tag`
363
+
364
+ ### Manual Verification
365
+
366
+ - Day 1: `curl -s https://siddhm11-researchit.hf.space/healthz/reranker` — confirm model still loaded after code change
367
+ - Day 5: Test author import with real S2 profile URL
368
+
369
+ ---
370
+
371
+ ## Documentation Updates (after all days)
372
+
373
+ - [ ] CLAUDE.md: Add Rule 3.11 — "Every interaction must carry `query_id`, `propensity`, and `policy_id`"
374
+ - [ ] TASK-TRACKER.md: Add Phase 6.5 section with checklist
375
+ - [ ] README.md: Update test count
376
+ - [ ] PHASE6-Reranker-Framing.md: Add live verification timestamp
377
+
378
+ ---
379
+
380
+ ## Open Questions
381
+
382
+ > [!IMPORTANT]
383
+ > **Q1:** The framing doc proposes `_RANKER_VERSION` as the `policy_id`. Currently it's `"v4.1_quota_hungarian_suppression"`. Should we also bump this to `"v6.5_lightgbm_real_cosines"` when Day 1 lands? It would make A/B-style log analysis cleaner.
384
+
385
+ > [!IMPORTANT]
386
+ > **Q2:** Day 5 (S2 author import) requires `httpx` as a dependency. It's already used by `turso_svc.py`, so no new install needed — just confirming.
387
+
388
+ > [!NOTE]
389
+ > **Q3:** The framing doc suggests cluster snapshot pruning at startup. For a simple MVP this is fine. Phase 7 can upgrade to APScheduler if needed.
docs/phases/PHASE6.5-Instrumentation-Framing.md ADDED
@@ -0,0 +1,939 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # PHASE 6.5 — Instrumentation Framing
2
+
3
+ > **Status:** 📋 Proposed (not started)
4
+ > **Scope:** Phase 6 hot-fix (Day 1) + Phase 6.5 instrumentation (Days 2–4) + Phase 5.1 cold-start completion (Day 5, parallel)
5
+ > **Prerequisite for:** Phase 7 (Evaluation Framework)
6
+ > **Supersedes:** Open items at the end of `PHASE6-Reranker-Framing.md` (Section E.1.a, E.2 verification, ADR A1/A4 deferrals)
7
+ > **Owner:** Amin
8
+ > **Authoring date:** 2026-05-03
9
+
10
+ ---
11
+
12
+ ## TL;DR
13
+
14
+ Phase 6 is **substantively complete** but has two open flags. Phase 7 (evaluation framework) cannot be built cleanly on top of the current schema — three pieces of telemetry are missing. This doc bundles three coherent units of work:
15
+
16
+ | Bucket | Identity | Days | Why it's separate |
17
+ | ------------------------- | --------------------------------- | ---- | ------------------------------------------------------------------------------------------------------------------ |
18
+ | **Phase 6 Hot-fix** | Close out Phase 6 cleanly | 1 | Two correctness/verification items left over from PHASE6-Reranker-Framing. Belongs to Phase 6, not later. |
19
+ | **Phase 6.5** | Telemetry foundation | 3 | Mirrors the Phase 4.5 precedent: a small instrumentation phase that exists *because* the next phase needs it. |
20
+ | **Phase 5.1 (side-quest)** | Cold-start completion | 1 | Author-import was the deferred Layer 2 from Phase 5's three-layer onboarding plan. Sits beside, not inside, 6.5. |
21
+
22
+ Total: **5 working days**. After this, Phase 7 starts on a clean substrate where all the prerequisite plumbing is already in production.
23
+
24
+ ---
25
+
26
+ ## 1. Why this doc exists (the reasoning)
27
+
28
+ The instinct is to fold all five days of work into Phase 7 — it's all "stuff that helps evaluation," after all. That instinct is wrong, and the reason matters.
29
+
30
+ **Phases in this project have always had one identity.** Look at the existing pattern:
31
+
32
+ - Phase 4 = quota fusion
33
+ - Phase 4.5 = instrumentation only (`ranker_version`, `candidate_source`, `cluster_id`)
34
+ - Phase 5 = onboarding
35
+ - Phase 6 = LightGBM reranker integration
36
+ - Phase 7 = **evaluation framework** (per master roadmap: nDCG@10, Recall@50, HR@10, ILS, category entropy, time-split eval, regression CI)
37
+
38
+ Phase 4.5 is the precedent. When instrumentation needed to land between Phase 4 and Phase 5, it didn't get folded into either — it got its own micro-phase precisely because it was load-bearing for everything downstream and had a single identity. The framing doc for Phase 6 (Part H) was also explicit about what Phase 6 is NOT — and "evaluation harness" was carved into Phase 7 deliberately.
39
+
40
+ **What happens if we fold everything into Phase 7?** Phase 7's master-roadmap budget is ~1 week. Adding ~3 days of prerequisite infrastructure either:
41
+
42
+ 1. Bloats Phase 7 to 2+ weeks, or
43
+ 2. Forces shortcuts on the actual harness work (offline regression, time-split eval, frozen `eval/eval_set_v1.0.parquet`, CI gates on >3% nDCG@10 drops) — which is a meaty deliverable in its own right.
44
+
45
+ **What happens if we leave the Phase 6 closeout for later?** The biggest item in the closeout is the `qdrant_cosine_score` fix — and that's a model-correctness bug. Feature 0 is the reranker's #5 most-important feature by training importance, and right now it's being fed synthetic linear decay (`1.0 - rank * 0.01`) instead of actual cosines. Every day it sits unfixed, the model is performing below its training-distribution capability. This belongs to Phase 6, full stop.
46
+
47
+ **What happens if the cold-start work waits?** B4 (S2 author import) is the single biggest cold-start lift available — replacing "manually save 5 papers" with "paste your S2 URL → 20 saves." It's a Phase 5 completion, not a Phase 7 input. It can run in parallel with Phase 6.5 work because it touches a different code path (onboarding router, no schema changes to `interactions`).
48
+
49
+ **The structural answer:** three identities → three buckets. This doc unifies them under one plan with one timeline.
50
+
51
+ ---
52
+
53
+ ## 2. Phase 6 Audit — current status
54
+
55
+ Cross-checked against `PHASE6-Reranker-Framing.md` (Parts A–G) and current code. Audit was performed 2026-05-03.
56
+
57
+ ### ✅ Phase 6.1 — Simplification Pass: DONE
58
+
59
+ In `app/routers/recommendations.py`:
60
+
61
+ - `suppressed` and `onboarding_categories` loaded **before** the rerank call
62
+ - `qdrant_score_map` built from `per_cluster_results`
63
+ - `user_total_saves` / `user_total_dismissals` computed and passed
64
+ - `is_suppressed_arr` and `onboarding_match_arr` computed per-candidate
65
+ - `rerank_candidates` called with the full Phase 6 kwarg signature
66
+
67
+ ### ✅ Phase 6.2 — Per-Candidate Plumbing: DONE
68
+
69
+ - `paper_cluster_map` is built before the merge — first-occurrence wins, exactly per spec
70
+ - `per_candidate_importance` is a `(N,)` array, not a scalar
71
+ - `per_candidate_medoids` is a `(N, 1024)` stack, not broadcast
72
+ - `app/recommend/reranker.py:287–298` slot 24 correctly handles both 1D (broadcast) and 2D (per-candidate) medoid shapes
73
+ - `test_phase6_feature_wiring.py::test_per_candidate_cluster_importance` and `test_per_candidate_medoid_distance` exist
74
+
75
+ ### ✅ Phase 6.3 — Deployment Verification: DONE (code), ⚠️ UNVERIFIED (live)
76
+
77
+ - `/healthz/reranker` endpoint exists in `app/routers/health.py`
78
+ - `is_model_loaded()`, `get_loaded_model_path()`, `get_num_trees()` accessors exist in `reranker.py`
79
+ - Per-request feature activation logging at `reranker.py:432–438`
80
+ - Bug B fix: `medoid_embedding_blob BLOB` column added via migration in `db.py:128`
81
+ - Hungarian fallback now prefers live vector β†’ persisted blob β†’ skip with warning
82
+
83
+ ### ⚠️ Two flags from Phase 6 (handled in §3 below)
84
+
85
+ 1. **`qdrant_scores` are still rank-approximated, not real cosines.** `recommendations.py:316–325` uses synthetic linear decay because the call site is still on `search_by_vector()` (returns `list[str]`) instead of `search_by_vector_with_scores()` (returns `[{"arxiv_id": ..., "score": ...}]`). The scored function already exists in `qdrant_svc.py:265` — the swap is mechanical.
86
+ 2. **`/healthz/reranker` not curl-verified live.** The endpoint exists in code. Production status is unknown — the Space could be silently running the heuristic fallback if the model file isn't being copied into the Docker image.
87
+
88
+ ### βœ… Phase 6.4 β€” Retraining: correctly deferred
89
+
90
+ Documented in `PHASE6-Reranker-Framing.md` Section F.4, gated on a synthetic simulator OR 100 real users with ≥10 saves each.
91
+
92
+ ### Verdict
93
+
94
+ Phase 6 is substantively complete. The two flags above are polish, not blockers — but the qdrant-scores fix is feeding the model wrong data for one of its top-importance features and should ship as part of Phase 6 closeout rather than being deferred.
95
+
96
+ ---
97
+
98
+ ## 3. Bucket 1 β€” Phase 6 Hot-fix (Day 1)
99
+
100
+ ### 3.1 β€” A1: Real Qdrant Scores (the lying-feature-0 fix)
101
+
102
+ **The problem.** In `recommendations.py:248`, the per-cluster search calls `qdrant_svc.search_by_vector()`, which returns `list[str]` — arXiv IDs only, no scores. Then around line 316, scores are faked by linear decay from rank position:
103
+
104
+ ```python
105
+ qdrant_score_map[aid] = max(0.0, 1.0 - rank * 0.01)
106
+ ```
107
+
108
+ A paper at rank 0 gets score 1.0, rank 50 gets 0.50, rank 100 gets 0.0. This bears almost no relationship to actual cosine similarity, where a top result might be 0.85 and rank 50 might be 0.78 — a much tighter band. Feature 0 (`qdrant_cosine_score`) is the model's #5 most-important feature by training importance. Feeding it a synthetic linear sequence caps how much the model can help.
109
+
110
+ **The fix.** Switch to `search_by_vector_with_scores()` (already exists at `qdrant_svc.py:265`), and build `qdrant_score_map` from actual cosines as part of the same loop that builds `paper_cluster_map`.
111
+
112
+ **Code change** in `app/routers/recommendations.py`, the `_multi_interest_recommend()` flow around line 245:
113
+
114
+ ```python
115
+ # OLD
116
+ search_tasks = [
117
+ qdrant_svc.search_by_vector(
118
+ query_vector=c.medoid_embedding.tolist(),
119
+ limit=quota * _OVERSAMPLE,
120
+ exclude_ids=seen,
121
+ )
122
+ for c, quota in zip(clusters, quotas)
123
+ ]
124
+ per_cluster_results = await asyncio.gather(*search_tasks)
125
+
126
+ # Phase 4.5: Build paper β†’ cluster mapping BEFORE merge
127
+ paper_cluster_map: dict[str, int] = {}
128
+ for cluster, result_ids in zip(clusters, per_cluster_results):
129
+ for aid in result_ids:
130
+ if aid not in paper_cluster_map:
131
+ paper_cluster_map[aid] = cluster.cluster_idx
132
+
133
+ # Apply quota merge
134
+ candidate_ids = merge_quota_results(list(per_cluster_results), quotas)
135
+ ```
136
+
137
+ becomes:
138
+
139
+ ```python
140
+ # NEW β€” fetch scores alongside IDs
141
+ search_tasks = [
142
+ qdrant_svc.search_by_vector_with_scores(
143
+ query_vector=c.medoid_embedding.tolist(),
144
+ limit=quota * _OVERSAMPLE,
145
+ exclude_ids=seen,
146
+ )
147
+ for c, quota in zip(clusters, quotas)
148
+ ]
149
+ per_cluster_scored = await asyncio.gather(*search_tasks)
150
+
151
+ # Build paper β†’ cluster map AND real qdrant_score_map in one pass
152
+ paper_cluster_map: dict[str, int] = {}
153
+ qdrant_score_map: dict[str, float] = {}
154
+ for cluster, scored_results in zip(clusters, per_cluster_scored):
155
+ for hit in scored_results:
156
+ aid = hit["arxiv_id"]
157
+ if aid not in paper_cluster_map:
158
+ paper_cluster_map[aid] = cluster.cluster_idx
159
+ # Keep highest cosine if paper appears in multiple clusters
160
+ if aid not in qdrant_score_map or hit["score"] > qdrant_score_map[aid]:
161
+ qdrant_score_map[aid] = float(hit["score"])
162
+
163
+ # merge_quota_results expects list[list[str]] β€” extract IDs
164
+ per_cluster_ids = [[hit["arxiv_id"] for hit in scored] for scored in per_cluster_scored]
165
+ candidate_ids = merge_quota_results(per_cluster_ids, quotas)
166
+ ```
167
+
168
+ Then **delete** the synthetic-score block (current `recommendations.py:313–325`):
169
+
170
+ ```python
171
+ # DELETE β€” qdrant_score_map is now built from real cosines above
172
+ # qdrant_score_map: dict[str, float] = {}
173
+ # for cluster_ids in per_cluster_results:
174
+ # for rank, aid in enumerate(cluster_ids):
175
+ # if aid not in qdrant_score_map:
176
+ # qdrant_score_map[aid] = max(0.0, 1.0 - rank * 0.01)
177
+ ```
178
+
179
+ **Don't forget the short-term supplement search.** The path around line 263 (which pulls extra papers from `state.short_term_centroid` to fill the feed) does the same synthetic-decay trick. The same swap applies, with `paper_cluster_map[aid] = -1` (signalling "not from a long-term cluster") and `qdrant_score_map` populated from real scores.
180
+
181
+ **Test** (add to `tests/test_phase6_feature_wiring.py`):
182
+
183
+ ```python
184
+ def test_qdrant_scores_are_real_cosines_not_rank_proxies():
185
+ """Feature 0 should be actual cosine similarities β€” not a perfect linear
186
+ sequence from rank 0 β†’ N."""
187
+ # Mock search_by_vector_with_scores to return realistic clustered scores:
188
+ # e.g. [0.91, 0.89, 0.87, 0.86, 0.84, 0.83, ...] not [1.0, 0.99, 0.98, ...]
189
+ fake_hits = [
190
+ {"arxiv_id": f"24{i:02d}.{i:05d}", "score": 0.92 - 0.005 * i + (0.01 if i % 3 == 0 else 0)}
191
+ for i in range(20)
192
+ ]
193
+ # ... call _multi_interest_recommend, capture qdrant_score_map
194
+ # ... assert all values in [0.5, 1.0] (realistic cosine band, not 0.0–1.0 sweep)
195
+ # ... assert NOT a perfect linear sequence (variance > 0 in successive diffs)
196
+ diffs = [s2 - s1 for s1, s2 in zip(scores[:-1], scores[1:])]
197
+ assert max(diffs) - min(diffs) > 0.001, "scores look synthetically linear"
198
+ ```
199
+
200
+ **Estimated effort:** 2 hours (including the test).
201
+
202
+ ---
203
+
204
+ ### 3.2 β€” A2: Verify `/healthz/reranker` Live
205
+
206
+ **Not a code change** β€” a 5-minute verification command:
207
+
208
+ ```bash
209
+ curl -s https://siddhm11-researchit.hf.space/healthz/reranker | python -m json.tool
210
+ ```
211
+
212
+ **Three possible outcomes:**
213
+
214
+ | Response | Meaning | Action |
215
+ | --------------------------------------------------------------------- | ---------------------------------------------------- | ----------------------------------------------------------------------------------------------------- |
216
+ | `model_loaded: true, n_trees: 141, fallback_active: false` | βœ… Production is using LightGBM | Tick the box in TASK-TRACKER. Add timestamp to PHASE6-Reranker-Framing.md. |
217
+ | `model_loaded: false, fallback_active: true` | ⚠️ Space is silently using the heuristic | Debug per checklist below. |
218
+ | 404 or 500 | Endpoint isn't deployed yet | Push the latest commit; HF Spaces will rebuild. |
219
+
220
+ **If the model isn't loading, debug in this order:**
221
+
222
+ 1. **Is the model file in the Git repo?**
223
+ ```bash
224
+ git ls-files | grep reranker_v1.txt
225
+ ```
226
+ If empty: check `.gitignore` for any pattern that might catch it (e.g. `*.txt` in a subtree, or a too-broad `models/` rule). The current `.gitignore` looks safe but worth double-checking β€” the file is `models/reranker-phase6/production_model/reranker_v1.txt`.
227
+
228
+ 2. **Is the model file being copied into the Docker image?**
229
+ Check `Dockerfile` for `COPY models/ models/` or `COPY . .`. Check `.dockerignore` for any pattern that excludes `models/` or `*.txt`.
230
+
231
+ 3. **Does the path search in `reranker.py:35–44` find it from HF Spaces' working directory?** If HF Spaces runs from `/app` instead of the repo root, the relative paths might miss. Set `RERANKER_MODEL_PATH` explicitly in HF Secrets:
232
+ ```
233
+ RERANKER_MODEL_PATH=/app/models/reranker-phase6/production_model/reranker_v1.txt
234
+ ```
235
+
236
+ 4. **Check the build logs** for the line `[reranker] LightGBM model loaded from <path> (n_trees=141)`. If that line is missing, the loader is silently failing β€” turn on DEBUG logging in `reranker.py` to see why.
237
+
238
+ **If it's working**, update `PHASE6-Reranker-Framing.md` with a one-liner under Section E:
239
+
240
+ > *Verified live at 2026-MM-DD: `model_loaded=true, n_trees=141, fallback_active=false`.*
241
+
242
+ **Estimated effort:** 30 minutes including any Docker fixes.
243
+
244
+ ---
245
+
246
+ ## 4. Bucket 2 β€” Phase 6.5: Instrumentation Foundation (Days 2–4)
247
+
248
+ This is the new phase. Single identity: **telemetry schema and storage foundations that Phase 7 will sit on top of.** Three pieces of work, each a day, each independently shippable, each leaves the app in a working state.
249
+
250
+ ### 4.1 β€” B1: query_id Linkage (Day 2)
251
+
252
+ **Why this matters more than it sounds.** Right now, interaction logs look like this:
253
+
254
+ ```
255
+ user_id=u1, paper_id=2401.001, event=save, source=recommendation, candidate_source=cluster_0
256
+ user_id=u1, paper_id=2401.002, event=save, source=recommendation, candidate_source=cluster_1
257
+ ```
258
+
259
+ You can count saves but you cannot answer:
260
+
261
+ - *"Out of the 30 papers we showed in this single feed request, how many got saved?"* (CTR per query)
262
+ - *"Did this user save the paper from the same feed they saw it in, or come back 3 days later?"* (intra-session vs return)
263
+ - *"When ranker version changed, did CTR for the same user change?"* (ranker A/B comparison)
264
+
265
+ Without `query_id`, every interaction floats free of the request that generated it. Phase 7 evaluation cannot compute even the most basic feed-level metric.
266
+
267
+ **The fix in 4 steps:**
268
+
269
+ #### Step 1: Generate `query_id` in `recommendations.py`
270
+
271
+ At the top of `get_recommendations()`:
272
+
273
+ ```python
274
+ import uuid
275
+ query_id = str(uuid.uuid4())
276
+ ```
277
+
278
+ When building `paper_tags` (the per-paper instrumentation dict already used by Phase 4.5):
279
+
280
+ ```python
281
+ paper_tags[aid] = {
282
+ "ranker_version": _RANKER_VERSION,
283
+ "candidate_source": source,
284
+ "cluster_id": str(cluster_idx) if cluster_idx is not None and cluster_idx >= 0 else "",
285
+ "query_id": query_id, # NEW
286
+ "position": str(position), # NEW β€” index in final ranked list (0-based)
287
+ }
288
+ ```
289
+
290
+ #### Step 2: Same plumbing in `search.py`
291
+
292
+ Generate one `query_id` per `/search` request, attach to every paper card. Same shape as recommendations β€” different `source` value (`"search"` not `"recommendation"`) but same fields.
293
+
294
+ #### Step 3: Template plumbing
295
+
296
+ In `app/templates/partials/action_buttons.html`, extend the `hx-vals` JSON:
297
+
298
+ ```html
299
+ hx-vals='{
300
+ "source": "{{ _source }}",
301
+ "position": "{{ position | default(0) }}",
302
+ "ranker_version": "{{ _ranker_version }}",
303
+ "candidate_source": "{{ _candidate_source }}",
304
+ "cluster_id": "{{ _cluster_id }}",
305
+ "query_id": "{{ paper.query_id | default('') }}"
306
+ }'
307
+ ```
308
+
309
+ (The Jinja templates that currently render paper cards need the per-card `query_id` and `position` available in their context β€” pass them in via the loop variable when rendering the feed.)
310
+
311
+ #### Step 4: events.py forwards the field
312
+
313
+ `db.log_interaction()` already accepts a `query_id` parameter. Just ensure `events.py` forwards the Form field:
314
+
315
+ ```python
316
+ @router.post("/api/events")
317
+ async def log_event(
318
+ paper_id: str = Form(...),
319
+ event_type: str = Form(...),
320
+ source: str = Form(default=""),
321
+ position: int = Form(default=0),
322
+ ranker_version: str = Form(default=""),
323
+ candidate_source: str = Form(default=""),
324
+ cluster_id: str = Form(default=""),
325
+ query_id: str = Form(default=""), # NEW
326
+ user_id: str | None = Cookie(default=None, alias=COOKIE_NAME),
327
+ ):
328
+ await db.log_interaction(
329
+ user_id=user_id,
330
+ paper_id=paper_id,
331
+ event_type=event_type,
332
+ source=source,
333
+ position=position,
334
+ ranker_version=ranker_version,
335
+ candidate_source=candidate_source,
336
+ cluster_id=cluster_id or None,
337
+ query_id=query_id or None, # NEW
338
+ )
339
+ ```
340
+
341
+ **What this enables in Phase 7.** A single SQL query gives per-feed CTR by ranker version:
342
+
343
+ ```sql
344
+ SELECT
345
+ query_id,
346
+ ranker_version,
347
+ COUNT(*) FILTER (WHERE event_type = 'save') * 1.0 / COUNT(DISTINCT paper_id) AS save_rate
348
+ FROM interactions
349
+ WHERE source = 'recommendation'
350
+ GROUP BY query_id, ranker_version;
351
+ ```
352
+
353
+ **Test** (add to `tests/test_instrumentation.py`):
354
+
355
+ ```python
356
+ async def test_query_id_round_trips_from_request_to_db():
357
+ """A single /api/recommendations call should generate one query_id;
358
+ every paper card returned should carry it; saving any paper should
359
+ persist that exact query_id in interactions."""
360
+ resp = await client.get("/api/recommendations", cookies={"uid": "test-user"})
361
+ # Parse out query_id values from the rendered cards
362
+ query_ids = re.findall(r'"query_id":\s*"([0-9a-f-]{36})"', resp.text)
363
+ assert len(set(query_ids)) == 1, "all cards should share one query_id"
364
+ qid = query_ids[0]
365
+
366
+ # Save the first paper
367
+ paper_id = re.search(r'data-paper-id="([^"]+)"', resp.text).group(1)
368
+ await client.post("/api/events", data={
369
+ "paper_id": paper_id, "event_type": "save",
370
+ "source": "recommendation", "query_id": qid,
371
+ })
372
+ rows = await db.fetch_all("SELECT query_id FROM interactions WHERE paper_id = ?", paper_id)
373
+ assert rows[0]["query_id"] == qid
374
+ ```
375
+
376
+ **Estimated effort:** 3 hours.
377
+
378
+ ---
379
+
380
+ ### 4.2 β€” B2: Propensity Logging (Day 3)
381
+
382
+ **Why this is non-negotiable per the project's own framing doc.** ADR A4 in `PHASE6-Reranker-Framing.md` says verbatim:
383
+
384
+ > *Telemetry gaps bite hardest in Phase 5 (IPS impossible without propensities): freeze schema before any logging (A4); include policy_id, propensity, shown_position, ranker_version*
385
+
386
+ You already have `policy_id` in spirit (`ranker_version`) and `shown_position` (`position`). What's missing is `propensity` β€” the probability that the active policy chose to show this paper to this user in this slot.
387
+
388
+ Without propensity, **counterfactual evaluation is mathematically impossible**. You can never retrospectively answer "what would have happened if we'd used a different ranker?" because you cannot reweight observed clicks correctly. The column itself is cheap to add at any time, but a propensity can only be recorded at serving time and cannot be backfilled: adding it once the table holds 50K rows leaves every one of those rows unusable for IPS, whereas adding it to an empty table is 4 hours of work with nothing lost.
389
+
390
+ #### Schema migration
391
+
392
+ Add to `app/db.py`:
393
+
394
+ ```python
395
+ _MIGRATION_B2 = [
396
+ "ALTER TABLE interactions ADD COLUMN propensity REAL",
397
+ "ALTER TABLE interactions ADD COLUMN policy_id TEXT",
398
+ ]
399
+ ```
400
+
401
+ (`policy_id` is a synonym for `ranker_version` but more honest about what it represents — the identifier of the *full pipeline configuration* that chose to show this paper, including MMR λ, exploration rate ε, and any feature-flag state. Some systems keep both: `ranker_version` for the model file hash, `policy_id` for the pipeline hash. For now they can be the same value, but the column is there when you need to differentiate.)
402
+
403
+ Run the migration via the existing migration runner pattern in `db.py:128`:
404
+
405
+ ```python
406
+ async def _apply_migrations(conn):
407
+ # ... existing migrations ...
408
+ for sql in _MIGRATION_B2:
409
+ try:
410
+ await conn.execute(sql)
411
+ except aiosqlite.OperationalError as e:
412
+ if "duplicate column" not in str(e).lower():
413
+ raise
414
+ await conn.commit()
415
+ ```
416
+
417
+ Update `db.log_interaction()`:
418
+
419
+ ```python
420
+ async def log_interaction(
421
+ user_id: str,
422
+ paper_id: str,
423
+ event_type: str,
424
+ *,
425
+ source: str = "",
426
+ position: int = 0,
427
+ ranker_version: str | None = None,
428
+ candidate_source: str | None = None,
429
+ cluster_id: str | None = None,
430
+ query_id: str | None = None,
431
+ propensity: float | None = None, # NEW
432
+ policy_id: str | None = None, # NEW
433
+ ):
434
+ # ... INSERT statement extended with propensity, policy_id ...
435
+ ```
436
+
437
+ #### The propensity computation
438
+
439
+ In `recommendations.py`, after the final feed is built but before tags are returned, compute per-paper propensity. The math depends on which slot the paper occupies:
440
+
441
+ ```python
442
+ # Phase 6.5+B2: compute per-paper propensity
443
+ N_FINAL = len(final)
444
+ N_EXPLORE = len(exploration_set) # the Ξ΅ papers MMR didn't pick
445
+ N_EXPLOIT = N_FINAL - N_EXPLORE
446
+
447
+ # Exploration papers: uniformly sampled from `reranked_ids` not in mmr_selected
448
+ explore_pool_size = max(1, len(reranked_ids) - len(mmr_selected))
449
+ explore_propensity = N_EXPLORE / explore_pool_size if explore_pool_size > 0 else 0.0
450
+
451
+ # Exploitation papers: deterministically selected by MMR β†’ propensity = 1.0
452
+ # (this is the "logging policy = serving policy" case β€” IPS weight will be 1)
453
+
454
+ for aid in final:
455
+ paper_tags[aid]["propensity"] = (
456
+ explore_propensity if aid in exploration_set else 1.0
457
+ )
458
+ paper_tags[aid]["policy_id"] = _RANKER_VERSION # or compute pipeline hash
459
+ ```
460
+
461
+ Plumb through templates (add `propensity` and `policy_id` to `hx-vals` like with `query_id`), and store in `events.py`.
462
+
463
+ **For search**, propensity is `1.0` for every result (search is fully deterministic β€” no exploration). Set it explicitly so the column is always populated:
464
+
465
+ ```python
466
+ # search.py
467
+ paper_tags[aid]["propensity"] = 1.0
468
+ paper_tags[aid]["policy_id"] = _SEARCH_POLICY_ID
469
+ ```
470
+
471
+ #### Why this earns its day
472
+
473
+ Phase 7 evaluation will eventually want to test "ranker B vs ranker A" without a full A/B test (you don't have user volume for that). With propensity logging, you can use **SNIPS** (Self-Normalized Inverse Propensity Scoring) on existing logs to estimate "what would CTR have been if we'd used ranker B?" — purely from data ranker A already collected. The estimator is:
474
+
475
+ ```
476
+               Σ_i ( r_i × π_B(a_i | x_i) / π_A(a_i | x_i) )
477
+ SNIPS(π_B) = ─────────────────────────────────────────────────
478
+               Σ_i ( π_B(a_i | x_i) / π_A(a_i | x_i) )
479
+ ```
480
+
481
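+ where `r_i` is the observed reward for logged impression `i` (e.g. 1 for a save, 0 otherwise), `a_i` is the paper that was shown, `x_i` is the request context, `π_A` is the logging policy (your current ranker, propensity stored at log time) and `π_B` is the candidate policy you want to evaluate. Without `π_A` stored at log time, the importance weights `π_B / π_A` have no denominator and the estimator cannot be computed.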
482
+
483
+ **Test:**
484
+
485
+ ```python
486
+ async def test_propensity_sums_correctly_across_exploration_and_exploitation():
487
+ """For a feed of N papers with K exploration slots, the sum of propensities
488
+ over ALL candidates in the explore pool should equal K (each paper had K/|pool|
489
+ chance, summed over |pool| papers = K)."""
490
+ # Mock a recommendation flow with N=30, K_explore=2, pool_size=50
491
+ # Capture propensity values
492
+ explore_props = [p["propensity"] for p in tagged if p["aid"] in exploration_set]
493
+ assert all(0 < p <= 1 for p in explore_props)
494
+ # Each exploration paper has propensity = K/pool = 2/50 = 0.04
495
+ assert all(abs(p - 0.04) < 1e-6 for p in explore_props)
496
+ # Exploitation papers all have propensity = 1.0
497
+ exploit_props = [p["propensity"] for p in tagged if p["aid"] not in exploration_set]
498
+ assert all(p == 1.0 for p in exploit_props)
499
+ ```
500
+
501
+ **Estimated effort:** 4 hours.
502
+
503
+ ---
504
+
505
+ ### 4.3 β€” B3: Cluster Snapshot Versioning (Day 4)
506
+
507
+ **The current problem.** `db.save_user_clusters()` (around `db.py:235`) does:
508
+
509
+ ```python
510
+ await conn.execute("DELETE FROM user_clusters WHERE user_id = ?", (user_id,))
511
+ for c in clusters:
512
+ await conn.execute("INSERT INTO user_clusters ...")
513
+ ```
514
+
515
+ Every recluster, the previous cluster state is **destroyed**. You cannot answer:
516
+
517
+ - *"What clusters did this user have a week ago?"* β€” for debugging "why did the recs suddenly shift?"
518
+ - *"When did cluster 2 form?"* β€” for cluster lifecycle analytics
519
+ - Phase 8a's content-addressed LLM-summary cache key needs `(cluster_stable_id, snapshot_date)` per ADR A1 β€” and the snapshot_date doesn't exist as a concept yet
520
+
521
+ This implements **ADR A1** from `PHASE6-Reranker-Framing.md`.
522
+
523
+ #### Schema
524
+
525
+ ```sql
526
+ CREATE TABLE IF NOT EXISTS cluster_snapshots (
527
+ user_id TEXT NOT NULL,
528
+ snapshot_id TEXT NOT NULL, -- UUID, one per recluster event
529
+ cluster_idx INTEGER NOT NULL, -- stable index after Hungarian
530
+ medoid_paper_id TEXT NOT NULL,
531
+ importance REAL NOT NULL,
532
+ paper_ids TEXT NOT NULL, -- JSON array
533
+ medoid_embedding_blob BLOB,
534
+ snapshot_date TEXT NOT NULL DEFAULT (datetime('now')),
535
+ paper_ids_hash TEXT NOT NULL, -- sha256(sorted(paper_ids))[:16]
536
+ PRIMARY KEY (user_id, snapshot_id, cluster_idx)
537
+ );
538
+ CREATE INDEX IF NOT EXISTS idx_snap_user_date ON cluster_snapshots(user_id, snapshot_date DESC);
539
+ CREATE INDEX IF NOT EXISTS idx_snap_hash ON cluster_snapshots(paper_ids_hash);
540
+ ```
541
+
542
+ `paper_ids_hash` is the content-addressing key — Phase 8a will use this to dedupe LLM-summary generation across users. If two different users have a cluster with identical paper sets, they share one cached summary. The truncation to 16 hex characters (64 bits) is enough entropy at our scale (low birthday-collision risk for <100M clusters).
543
+
544
+ #### Write side
545
+
546
+ Add a new function in `db.py`:
547
+
548
+ ```python
549
+ import json
550
+ import hashlib
551
+ import uuid
552
+
553
+ async def save_cluster_snapshot(user_id: str, clusters: list[dict]) -> str:
554
+ """Append a new snapshot. Returns the snapshot_id (one per recluster event)."""
555
+ snapshot_id = str(uuid.uuid4())
556
+ async with aiosqlite.connect(DB_PATH) as conn:
557
+ for c in clusters:
558
+ paper_ids = json.loads(c["paper_ids"]) if isinstance(c["paper_ids"], str) else c["paper_ids"]
559
+ paper_ids_hash = hashlib.sha256(
560
+ json.dumps(sorted(paper_ids)).encode()
561
+ ).hexdigest()[:16]
562
+ await conn.execute(
563
+ """INSERT INTO cluster_snapshots
564
+ (user_id, snapshot_id, cluster_idx, medoid_paper_id,
565
+ importance, paper_ids, medoid_embedding_blob, paper_ids_hash)
566
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?)""",
567
+ (user_id, snapshot_id, c["cluster_idx"], c["medoid_paper_id"],
568
+ c["importance"], json.dumps(paper_ids),
569
+ c.get("medoid_embedding_blob"), paper_ids_hash),
570
+ )
571
+ await conn.commit()
572
+ return snapshot_id
573
+ ```
574
+
575
+ In `recommendations.py`, **after** `save_clusters_to_db(user_id, clusters)` (the existing call that maintains the "current state" view), add:
576
+
577
+ ```python
578
+ snapshot_id = await db.save_cluster_snapshot(user_id, [
579
+ {
580
+ "cluster_idx": c.cluster_idx,
581
+ "medoid_paper_id": c.medoid_paper_id,
582
+ "importance": c.importance,
583
+ "paper_ids": json.dumps(c.paper_ids),
584
+ "medoid_embedding_blob": c.medoid_embedding.astype(np.float32).tobytes(),
585
+ }
586
+ for c in clusters
587
+ ])
588
+ ```
589
+
590
+ Crucially: keep `save_clusters_to_db` doing exactly what it does today. `cluster_snapshots` is **purely additive history** β€” current-state queries still hit `user_clusters`, retrospective queries hit `cluster_snapshots`. No existing code path changes behaviour.
591
+
592
+ #### Retention policy
593
+
594
+ A nightly cleanup keeps the last 30 days per user (anything older is unlikely to be useful for debugging and bloats the snapshots table without bound):
595
+
596
+ ```python
597
+ async def prune_old_snapshots(retention_days: int = 30):
598
+ async with aiosqlite.connect(DB_PATH) as conn:
599
+ await conn.execute(
600
+ "DELETE FROM cluster_snapshots WHERE snapshot_date < datetime('now', ?)",
601
+ (f"-{retention_days} days",),
602
+ )
603
+ await conn.commit()
604
+ ```
605
+
606
+ For now, call it on startup (FastAPI lifespan handler). In Phase 7 you'll add a proper APScheduler cron.
607
+
608
+ **Tests:**
609
+
610
+ ```python
611
+ async def test_snapshot_appended_on_each_recluster():
612
+ """Two reclusters of the same user should produce two distinct snapshot_ids
613
+ and 2N rows in cluster_snapshots (where N = number of clusters)."""
614
+ user_id = "test-user"
615
+ clusters_v1 = [_make_cluster(idx=0, papers=["a", "b"])]
616
+ clusters_v2 = [_make_cluster(idx=0, papers=["a", "b", "c"])]
617
+ sid1 = await db.save_cluster_snapshot(user_id, clusters_v1)
618
+ sid2 = await db.save_cluster_snapshot(user_id, clusters_v2)
619
+ assert sid1 != sid2
620
+ rows = await db.fetch_all(
621
+ "SELECT snapshot_id, paper_ids_hash FROM cluster_snapshots WHERE user_id = ? ORDER BY snapshot_date",
622
+ user_id,
623
+ )
624
+ assert len(rows) == 2
625
+ assert rows[0]["paper_ids_hash"] != rows[1]["paper_ids_hash"] # content-addressed
626
+
627
+ async def test_prune_respects_retention():
628
+ """Snapshots older than retention_days should be deleted; newer ones kept."""
629
+ # Insert one snapshot dated 45 days ago, one dated 5 days ago
630
+ # Run prune_old_snapshots(retention_days=30)
631
+ # Assert only the recent one remains
632
+ ```
633
+
634
+ **Estimated effort:** 6 hours.
635
+
636
+ ---
637
+
638
+ ## 5. Bucket 3 β€” Phase 5.1: Cold-Start Completion (Day 5, parallel)
639
+
640
+ This sits **outside Phase 6.5** but ships as part of the same 5-day push. Single identity: **complete Layer 2 of Phase 5's three-layer onboarding plan, which was deferred at the time.** The original Phase 5 plan called for: (Layer 1) category selection, (Layer 2) author-paper import, (Layer 3) seed paper search. Layer 2 was cut for time. This is it.
641
+
642
+ ### 5.1 β€” B4: Semantic Scholar Author Import
643
+
644
+ **The user-visible win.** Before B4: a new user lands on `/onboarding`, picks 3 categories, then has to manually search for and save 5 seed papers β€” friction that bleeds users at the conversion step. After B4: paste your S2 author URL, the system pulls your authored papers, and you have 20 implicit "saves" instantly. First feed is genuinely personalized within seconds of arrival.
645
+
646
+ This is also the only piece of work in the 5-day push that touches user experience directly. The other four days are all infrastructure. It's worth shipping in the same window so the user-facing improvement masks the otherwise-invisible plumbing changes.
647
+
648
+ #### S2 API endpoint
649
+
650
+ ```
651
+ GET https://api.semanticscholar.org/graph/v1/author/{author_id}/papers
652
+ ?fields=externalIds,title,year,citationCount
653
+ &limit=100
654
+ ```
655
+
656
+ `externalIds.ArXiv` gives you the arXiv ID directly β€” no DOI translation needed. `S2_API_KEY` env var already exists (it's used in Phase 6 reranker training scripts).
657
+
658
+ #### The flow
659
+
660
+ **1. New onboarding step** (insert between "categories" and "seed papers" in the existing onboarding wizard):
661
+
662
+ ```
663
+ Step 2 of 3: Import your work (optional)
664
+
665
+ [ Paste your Semantic Scholar profile URL or ORCID ]
666
+ [ Import ]
667
+
668
+ [ Skip β€” I'll search for seed papers manually ]
669
+ ```
670
+
671
+ **2. New service file** `app/s2_svc.py`:
672
+
673
+ ```python
674
+ """Semantic Scholar API client for author paper import."""
675
+ import re
676
+ import httpx
677
+ from app import config
678
+
679
+ S2_BASE = "https://api.semanticscholar.org/graph/v1"
680
+
681
+
682
+ def parse_author_input(text: str) -> str | None:
683
+ """Accept S2 URL, raw S2 ID, or ORCID. Return S2 author ID or None."""
684
+ text = text.strip()
685
+ # S2 URL: https://www.semanticscholar.org/author/Name/12345678
686
+ m = re.search(r"semanticscholar\.org/author/[^/]+/(\d+)", text)
687
+ if m:
688
+ return m.group(1)
689
+ # Raw S2 ID
690
+ if text.isdigit():
691
+ return text
692
+ # ORCID: 0000-0002-1825-0097
693
+ if re.match(r"^\d{4}-\d{4}-\d{4}-\d{3}[\dX]$", text):
694
+ # Resolve ORCID β†’ S2 ID via S2's author search
695
+ return None # caller should call resolve_orcid()
696
+ return None
697
+
698
+
699
+ async def resolve_orcid(orcid: str) -> str | None:
700
+ """Resolve ORCID β†’ S2 author ID via S2's author search."""
701
+ headers = {"x-api-key": config.S2_API_KEY} if config.S2_API_KEY else {}
702
+ async with httpx.AsyncClient(timeout=10) as client:
703
+ resp = await client.get(
704
+ f"{S2_BASE}/author/search",
705
+ params={"query": f"ORCID:{orcid}", "limit": 1, "fields": "authorId"},
706
+ headers=headers,
707
+ )
708
+ resp.raise_for_status()
709
+ data = resp.json().get("data", [])
710
+ return data[0]["authorId"] if data else None
711
+
712
+
713
+ async def fetch_author_arxiv_papers(author_id: str, limit: int = 50) -> list[str]:
714
+ """Return arxiv_ids of papers authored by this S2 author, most-recent first."""
715
+ headers = {"x-api-key": config.S2_API_KEY} if config.S2_API_KEY else {}
716
+ async with httpx.AsyncClient(timeout=15) as client:
717
+ resp = await client.get(
718
+ f"{S2_BASE}/author/{author_id}/papers",
719
+ params={"fields": "externalIds,year", "limit": limit},
720
+ headers=headers,
721
+ )
722
+ resp.raise_for_status()
723
+ data = resp.json()
724
+ arxiv_ids = []
725
+ # Sort by year descending so we keep most-recent papers if we hit limit
726
+ papers = sorted(
727
+ data.get("data", []),
728
+ key=lambda p: p.get("year") or 0,
729
+ reverse=True,
730
+ )
731
+ for paper in papers:
732
+ ext = paper.get("externalIds") or {}
733
+ if arxiv_id := ext.get("ArXiv"):
734
+ arxiv_ids.append(str(arxiv_id)) # CLAUDE.md rule: arxiv_ids always strings
735
+ return arxiv_ids
736
+ ```
737
+
738
+ **3. New router endpoint** in `app/routers/onboarding.py`:
739
+
740
+ ```python
741
+ @router.post("/api/onboarding/import-author", response_class=HTMLResponse)
742
+ async def import_author(
743
+ request: Request,
744
+ author_input: str = Form(...),
745
+ user_id: str | None = Cookie(default=None, alias=COOKIE_NAME),
746
+ ):
747
+ user_id = user_id or str(uuid.uuid4())
748
+
749
+ # Parse: accept S2 URL, S2 ID, or ORCID
750
+ s2_author_id = s2_svc.parse_author_input(author_input)
751
+ if not s2_author_id:
752
+ # Try ORCID resolution
753
+ if re.match(r"^\d{4}-\d{4}-\d{4}-\d{3}[\dX]$", author_input.strip()):
754
+ s2_author_id = await s2_svc.resolve_orcid(author_input.strip())
755
+ if not s2_author_id:
756
+ return templates.TemplateResponse(
757
+ request, "partials/import_error.html",
758
+ {"error": "Could not parse input. Try a Semantic Scholar URL or ORCID."},
759
+ status_code=400,
760
+ )
761
+
762
+ # Fetch from S2 with timeout + graceful fallback
763
+ try:
764
+ arxiv_ids = await s2_svc.fetch_author_arxiv_papers(s2_author_id, limit=50)
765
+ except httpx.HTTPError as e:
766
+ log.warning("s2 author fetch failed: %s", e)
767
+ return templates.TemplateResponse(
768
+ request, "partials/import_error.html",
769
+ {"error": "Semantic Scholar is temporarily unavailable. Try seed search instead."},
770
+ status_code=503,
771
+ )
772
+
773
+ if not arxiv_ids:
774
+ return templates.TemplateResponse(
775
+ request, "partials/import_error.html",
776
+ {"error": "No arXiv papers found for this author. Try seed search instead."},
777
+ )
778
+
779
+ # Save each as a seed (triggers EWMA, clustering on next request)
780
+ saved_count = 0
781
+ for aid in arxiv_ids:
782
+ await db.log_interaction(
783
+ user_id=user_id,
784
+ paper_id=aid,
785
+ event_type="save",
786
+ source="onboarding_author_import",
787
+ )
788
+ us.record_positive(user_id, aid)
789
+ # Background: fetch vector + update EWMA (don't block the response)
790
+ asyncio.create_task(_update_profile_on_save(user_id, aid))
791
+ saved_count += 1
792
+
793
+ response = templates.TemplateResponse(
794
+ request, "partials/import_success.html",
795
+ {"saved_count": saved_count, "next_step": "seed_search"},
796
+ )
797
+ response.set_cookie(COOKIE_NAME, user_id, max_age=COOKIE_MAX_AGE)
798
+ return response
799
+ ```
800
+
801
+ **4. Tag the imports specially** — `source="onboarding_author_import"` distinguishes these from normal saves and from `source="onboarding_seed_search"`. Phase 7 evaluation can then ask: *"Do users who used author-import have higher week-1 retention than users who used only seed search?"*
802
+
803
+ #### Edge cases
804
+
805
+ | Case | Solution |
806
+ | ------------------------------------------------------------ | ------------------------------------------------------------------------------------- |
807
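+ | Author has 200 papers                                        | Cap at 50 (50 is plenty for clustering). Note: the S2 `limit` is applied before the year sort, so to guarantee the 50 most-recent, fetch the full list, year-sort, then truncate |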
808
+ | Author has 0 arXiv papers (e.g. pure CS-conference profile)  | Show "No arXiv papers found — try seed search instead"                                |
809
+ | User pastes ORCID instead of S2 URL | Resolved via S2's author search by ORCID |
810
+ | User pastes a paper URL by mistake                           | `parse_author_input` returns None → friendly error                                   |
811
+ | S2 API rate limit hit                                        | Graceful 503 → fall back to manual seed search                                        |
812
+ | User imports, then dislikes everything                       | Negative EWMA self-corrects within 5–10 dismissals                                    |
813
+ | User has S2 ID but multiple disambiguated profiles           | Out of scope — they pick the right one when copying their URL                         |
814
+
815
+ **Test:**
816
+
817
+ ```python
818
+ async def test_s2_import_saves_papers_with_correct_source_tag():
819
+ # Mock fetch_author_arxiv_papers to return ["2401.001", "2401.002"]
820
+ # POST /api/onboarding/import-author with a fake S2 URL
821
+ rows = await db.fetch_all(
822
+ "SELECT paper_id, source FROM interactions WHERE user_id = ?", user_id,
823
+ )
824
+ assert {r["paper_id"] for r in rows} == {"2401.001", "2401.002"}
825
+ assert all(r["source"] == "onboarding_author_import" for r in rows)
826
+ ```
827
+
828
+ **Estimated effort:** 5 hours.
829
+
830
+ ---
831
+
832
+ ## 6. What Phase 7 inherits
833
+
834
+ After these 5 days, Phase 7 starts on a substrate where every prerequisite is already in production:
835
+
836
+ | Capability | Before this push | After this push |
837
+ | --------------------------------------- | ----------------------------------------- | ------------------------------------------------ |
838
+ | Feature 0 in LightGBM                   | ❌ rank-proxy lie                         | ✅ actual cosine                                 |
839
+ | Production model verified live          | ❓ unverified                             | ✅ green checkmark with timestamp                |
840
+ | Per-feed CTR measurable                 | ❌ no `query_id`                          | ✅ one SQL query away                            |
841
+ | Counterfactual eval (SNIPS) possible    | ❌ no propensity                          | ✅ schema ready, propensities flowing            |
842
+ | Cluster history queryable               | ❌ destroyed on each recluster            | ✅ 30 days kept, content-addressed               |
843
+ | Cold-start onboarding                   | ❌ manual 5-paper search only             | ✅ paste S2 URL → up to 50 implicit saves       |
844
+
845
+ Phase 7's evaluation framework now has a real substrate. Without these, Phase 7 would have to spend its first week building this infrastructure anyway — better to do it deliberately as a pre-Phase-7 push than under deadline pressure.
846
+
847
+ ---
848
+
849
+ ## 7. Acceptance criteria
850
+
851
+ ### Bucket 1 — Phase 6 Hot-fix done when:
852
+
853
+ - [ ] `qdrant_score_map` is populated from `search_by_vector_with_scores()` in both the per-cluster path and the short-term supplement path
854
+ - [ ] Synthetic-decay block (current `recommendations.py:313–325`) is deleted
855
+ - [ ] `test_qdrant_scores_are_real_cosines_not_rank_proxies` passes
856
+ - [ ] `curl https://siddhm11-researchit.hf.space/healthz/reranker` returns `model_loaded: true, n_trees: 141, fallback_active: false`
857
+ - [ ] PHASE6-Reranker-Framing.md updated with verification timestamp
858
+
859
+ ### Bucket 2 — Phase 6.5 done when:
860
+
861
+ - [ ] `query_id` is generated per request in `recommendations.py` and `search.py` and round-trips through templates → events → DB
862
+ - [ ] `interactions` table has `propensity REAL` and `policy_id TEXT` columns
863
+ - [ ] Every interaction logged from a recommendation/search request has non-null `propensity` and `policy_id`
864
+ - [ ] `cluster_snapshots` table exists with the schema in §4.3
865
+ - [ ] Every recluster appends a new snapshot (verified by `test_snapshot_appended_on_each_recluster`)
866
+ - [ ] `prune_old_snapshots(retention_days=30)` is registered in the FastAPI lifespan handler
867
+ - [ ] All new tests pass; total test count in `README.md` updated
868
+
869
+ ### Bucket 3 — Phase 5.1 done when:
870
+
871
+ - [ ] `app/s2_svc.py` exists and `fetch_author_arxiv_papers` returns arxiv IDs (verified against a real S2 author profile)
872
+ - [ ] `/api/onboarding/import-author` accepts S2 URL, S2 ID, and ORCID input forms
873
+ - [ ] Imported papers are saved with `source="onboarding_author_import"`
874
+ - [ ] Background EWMA update fires for each imported paper
875
+ - [ ] All 6 edge cases in §5.1 are handled with graceful UX
876
+
877
+ ---
878
+
879
+ ## 8. Sequencing & timeline
880
+
881
+ ### Recommended order
882
+
883
+ ```
884
+ Day 1 (~3h) Bucket 1: A1 (real Qdrant scores) + A2 (curl /healthz)
885
+ Day 2 (~3h) Bucket 2.B1: query_id linkage
886
+ Day 3 (~4h) Bucket 2.B2: propensity logging
887
+ Day 4 (~6h) Bucket 2.B3: cluster snapshot versioning
888
+ Day 5 (~5h) Bucket 3.B4: S2 author import
889
+ ```
890
+
891
+ Each day leaves the app in a working state. No big-bang refactors. No day depends on a later day's work.
892
+
893
+ ### Parallelization options
894
+
895
+ If you have stretches where you want to context-switch:
896
+
897
+ - **Day 5 (B4) can run anytime** — it's onboarding code, doesn't touch the recommendation pipeline or schema. Could ship before Day 1 if you want a user-visible win first.
898
+ - **Day 1 should land before Day 2–4** — once `query_id` and `propensity` start flowing, you want feature 0 to already be real cosines so your first logged interactions are clean training data for any future retrain.
899
+ - **Days 2–4 should ship as a block** — the three pieces compound. Shipping B1 without B2 means logs have feed identity but no eval lever; shipping B2 without B1 means propensities can't be grouped by feed; shipping B3 without either means snapshots exist but you can't correlate them to actions.
900
+
901
+ ### What this defers (intentionally)
902
+
903
+ | Item | Why deferred |
904
+ | ----------------------------------------------------- | --------------------------------------------------------------------------------------------- |
905
+ | Track C: Full ORCID/Scholar import with disambiguation | B4 captures ~80% of the value. Full version waits until there's user-data evidence it's needed. |
906
+ | Track D: Cluster summary cards (Phase 8a preview) | Needs Phase 7 evaluation infrastructure to measure whether it actually helps users. |
907
+ | Phase 6.4 reranker retraining                         | Already gated on synthetic simulator OR 100 real users with ≥10 saves each. Unchanged.        |
908
+
909
+ ---
910
+
911
+ ## 9. Documentation updates needed
912
+
913
+ After this push lands:
914
+
915
+ - [ ] Add line to `CLAUDE.md` non-negotiable rules: *"Rule 9: Every interaction logged from a recommendation/search request must carry `query_id`, `propensity`, and `policy_id`. These are load-bearing for Phase 7 evaluation."*
916
+ - [ ] Update `PHASE6-Reranker-Framing.md` Section E with the live verification timestamp
917
+ - [ ] Update `TASK-TRACKER.md`:
918
+ - Tick `[x] [reranker] LightGBM model loaded (verified live YYYY-MM-DD)`
919
+ - Tick `[x] [reranker] qdrant_cosine_score uses real cosines`
920
+ - Add new section `Phase 6.5 — Instrumentation Foundation` with checklist from §7
921
+ - [ ] Update `README.md` test count
922
+ - [ ] Update `docs/walkthroughs/04-Next-Steps-and-Phase-Plan.md`: insert Phase 6.5 between Phase 6 and Phase 7 in the master roadmap; note Phase 5.1 as a parallel side-quest
923
+ - [ ] Mark ADR A1 (cluster snapshot versioning) and ADR A4 (telemetry schema) as **Decided + Implemented** in the Phase 6 framing doc's ADR table
924
+
925
+ ---
926
+
927
+ ## 10. Out of scope (explicit)
928
+
929
+ To keep this doc focused, the following are **not** part of this push:
930
+
931
+ - Building the actual evaluation harness (offline regression, time-split eval, frozen `eval/eval_set_v1.0.parquet`, CI gates) — that's Phase 7 itself.
932
+ - LLM cluster summaries (Phase 8a) — depends on `paper_ids_hash` from B3, but the LLM call path itself is Phase 8.
933
+ - Reranker retraining (Phase 6.4) — gated on user-volume thresholds, unchanged.
934
+ - Google Scholar import — no public API, would need scraping. Defer until S2 import shows real adoption.
935
+ - Per-paper relevance dial in author import (not all of someone's authored papers represent current interest) — out of scope; let the EWMA negative path handle it organically.
936
+
937
+ ---
938
+
939
+ *End of framing doc.*
tests/test_integration.py CHANGED
@@ -231,20 +231,29 @@ def test_quota_pipeline_preserves_minority_cluster(client, monkeypatch):
231
  combined = {**saved_vectors, **candidate_vectors}
232
  return {aid: combined[aid] for aid in ids if aid in combined}
233
 
234
- # search_by_vector returns candidates aligned with whichever centre
235
- # the query is closer to
236
- async def fake_search_by_vector(query_vector, limit, exclude_ids=None):
237
  qv = np.array(query_vector, dtype=np.float32)
238
  qv /= np.linalg.norm(qv)
239
  if float(qv @ nlp_center) > float(qv @ rl_center):
240
  pool = nlp_candidates
 
241
  else:
242
  pool = rl_candidates
 
243
  exclude = exclude_ids or set()
244
- return [p for p in pool if p not in exclude][:limit]
 
 
 
 
 
 
 
245
 
246
  monkeypatch.setattr(qs, "get_paper_vectors", fake_get_paper_vectors)
247
- monkeypatch.setattr(qs, "search_by_vector", fake_search_by_vector)
248
 
249
  # Skip EWMA short-term lookup β€” returns None
250
  async def fake_load_profile(uid, kind):
 
231
  combined = {**saved_vectors, **candidate_vectors}
232
  return {aid: combined[aid] for aid in ids if aid in combined}
233
 
234
+ # search_by_vector_with_scores returns candidates with cosine scores,
235
+ # aligned with whichever centre the query is closer to
236
+ async def fake_search_by_vector_with_scores(query_vector, limit, exclude_ids=None):
237
  qv = np.array(query_vector, dtype=np.float32)
238
  qv /= np.linalg.norm(qv)
239
  if float(qv @ nlp_center) > float(qv @ rl_center):
240
  pool = nlp_candidates
241
+ center = nlp_center
242
  else:
243
  pool = rl_candidates
244
+ center = rl_center
245
  exclude = exclude_ids or set()
246
+ results = []
247
+ for p in pool:
248
+ if p not in exclude:
249
+ # Compute realistic cosine score
250
+ pv = np.array(candidate_vectors[p], dtype=np.float32)
251
+ score = float(qv @ pv / (np.linalg.norm(qv) * np.linalg.norm(pv) + 1e-10))
252
+ results.append({"arxiv_id": p, "score": score})
253
+ return results[:limit]
254
 
255
  monkeypatch.setattr(qs, "get_paper_vectors", fake_get_paper_vectors)
256
+ monkeypatch.setattr(qs, "search_by_vector_with_scores", fake_search_by_vector_with_scores)
257
 
258
  # Skip EWMA short-term lookup β€” returns None
259
  async def fake_load_profile(uid, kind):