Spaces:
Running
Phase 6.5 Day 1: Real Qdrant cosine scores (A1) + verification timestamp (A2)
Browse files
A1 — Feature 0 fix (qdrant_cosine_score):
- recommendations.py: Switch search_by_vector() -> search_by_vector_with_scores()
in both per-cluster and short-term supplement paths
- Build qdrant_score_map from actual cosine scores in same pass as paper_cluster_map
- Delete fake rank-decay approximation (1.0 - rank * 0.01)
- Feature 0 now receives real cosines instead of synthetic linear sequence
A2 — Live verification:
- PHASE6-Reranker-Framing.md: Tick all 6.3 checklist items, add verification timestamp
(2026-05-03: model_loaded=true, n_trees=141, fallback_active=false)
Test fix:
- test_integration.py: Update quota pipeline test mock from search_by_vector to
search_by_vector_with_scores (returns list[dict] not list[str])
Tests: 203 passed, 0 failures
|
@@ -256,38 +256,49 @@ async def _multi_interest_recommend(
|
|
| 256 |
st_vec = await profiles.load_profile(user_id, "short_term")
|
| 257 |
|
| 258 |
search_tasks = [
|
| 259 |
-
qdrant_svc.
|
| 260 |
query_vector=c.medoid_embedding.tolist(),
|
| 261 |
limit=quota * _OVERSAMPLE,
|
| 262 |
exclude_ids=seen,
|
| 263 |
)
|
| 264 |
for c, quota in zip(clusters, quotas)
|
| 265 |
]
|
| 266 |
-
|
| 267 |
|
| 268 |
-
#
|
| 269 |
-
#
|
| 270 |
paper_cluster_map: dict[str, int] = {}
|
| 271 |
-
|
| 272 |
-
|
|
|
|
|
|
|
| 273 |
if aid not in paper_cluster_map: # first-occurrence wins
|
| 274 |
paper_cluster_map[aid] = cluster.cluster_idx
|
|
|
|
|
|
|
|
|
|
| 275 |
|
| 276 |
-
#
|
| 277 |
-
|
|
|
|
|
|
|
|
|
|
| 278 |
|
| 279 |
# Supplement with short-term session context
|
| 280 |
if st_vec is not None:
|
| 281 |
seen_so_far = seen | set(candidate_ids)
|
| 282 |
-
|
| 283 |
query_vector=st_vec.tolist(),
|
| 284 |
limit=_ST_SUPPLEMENT,
|
| 285 |
exclude_ids=seen_so_far,
|
| 286 |
)
|
| 287 |
-
for
|
|
|
|
| 288 |
if aid not in set(candidate_ids):
|
| 289 |
candidate_ids.append(aid)
|
| 290 |
paper_cluster_map[aid] = -1 # short-term supplement
|
|
|
|
|
|
|
| 291 |
|
| 292 |
if not candidate_ids:
|
| 293 |
return [], {}
|
|
@@ -326,17 +337,8 @@ async def _multi_interest_recommend(
|
|
| 326 |
user_total_saves = len(state.positive_list)
|
| 327 |
user_total_dismissals = len(state.negative_list)
|
| 328 |
|
| 329 |
-
#
|
| 330 |
-
#
|
| 331 |
-
# Use the paper_cluster_map to approximate: score = 1.0 - (rank / total)
|
| 332 |
-
# for now, as the current retrieval path returns only IDs.
|
| 333 |
-
# TODO: Phase 6.2+ switch to search_by_vector_with_scores()
|
| 334 |
-
qdrant_score_map: dict[str, float] = {}
|
| 335 |
-
for cluster_ids in per_cluster_results:
|
| 336 |
-
for rank, aid in enumerate(cluster_ids):
|
| 337 |
-
if aid not in qdrant_score_map:
|
| 338 |
-
# Approximate score from rank position (higher rank = higher score)
|
| 339 |
-
qdrant_score_map[aid] = max(0.0, 1.0 - rank * 0.01)
|
| 340 |
|
| 341 |
qdrant_scores = np.asarray(
|
| 342 |
[qdrant_score_map.get(cid, 0.0) for cid in valid_ids],
|
|
|
|
| 256 |
st_vec = await profiles.load_profile(user_id, "short_term")
|
| 257 |
|
| 258 |
search_tasks = [
|
| 259 |
+
qdrant_svc.search_by_vector_with_scores(
|
| 260 |
query_vector=c.medoid_embedding.tolist(),
|
| 261 |
limit=quota * _OVERSAMPLE,
|
| 262 |
exclude_ids=seen,
|
| 263 |
)
|
| 264 |
for c, quota in zip(clusters, quotas)
|
| 265 |
]
|
| 266 |
+
per_cluster_scored = await asyncio.gather(*search_tasks)
|
| 267 |
|
| 268 |
+
# Build paper → cluster map AND real qdrant_score_map in one pass.
|
| 269 |
+
# Phase 6.5 A1: replaces the old rank-based linear decay approximation.
|
| 270 |
paper_cluster_map: dict[str, int] = {}
|
| 271 |
+
qdrant_score_map: dict[str, float] = {}
|
| 272 |
+
for cluster, scored_results in zip(clusters, per_cluster_scored):
|
| 273 |
+
for hit in scored_results:
|
| 274 |
+
aid = hit["arxiv_id"]
|
| 275 |
if aid not in paper_cluster_map: # first-occurrence wins
|
| 276 |
paper_cluster_map[aid] = cluster.cluster_idx
|
| 277 |
+
# Keep highest cosine if a paper appears in multiple clusters
|
| 278 |
+
if aid not in qdrant_score_map or hit["score"] > qdrant_score_map[aid]:
|
| 279 |
+
qdrant_score_map[aid] = float(hit["score"])
|
| 280 |
|
| 281 |
+
# merge_quota_results expects list[list[str]] — extract IDs
|
| 282 |
+
per_cluster_ids = [
|
| 283 |
+
[h["arxiv_id"] for h in scored] for scored in per_cluster_scored
|
| 284 |
+
]
|
| 285 |
+
candidate_ids = merge_quota_results(per_cluster_ids, quotas)
|
| 286 |
|
| 287 |
# Supplement with short-term session context
|
| 288 |
if st_vec is not None:
|
| 289 |
seen_so_far = seen | set(candidate_ids)
|
| 290 |
+
st_scored = await qdrant_svc.search_by_vector_with_scores(
|
| 291 |
query_vector=st_vec.tolist(),
|
| 292 |
limit=_ST_SUPPLEMENT,
|
| 293 |
exclude_ids=seen_so_far,
|
| 294 |
)
|
| 295 |
+
for hit in st_scored:
|
| 296 |
+
aid = hit["arxiv_id"]
|
| 297 |
if aid not in set(candidate_ids):
|
| 298 |
candidate_ids.append(aid)
|
| 299 |
paper_cluster_map[aid] = -1 # short-term supplement
|
| 300 |
+
if aid not in qdrant_score_map:
|
| 301 |
+
qdrant_score_map[aid] = float(hit["score"])
|
| 302 |
|
| 303 |
if not candidate_ids:
|
| 304 |
return [], {}
|
|
|
|
| 337 |
user_total_saves = len(state.positive_list)
|
| 338 |
user_total_dismissals = len(state.negative_list)
|
| 339 |
|
| 340 |
+
# qdrant_score_map was built above from real cosine scores
|
| 341 |
+
# (Phase 6.5 A1 β replaces the old rank-based approximation)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 342 |
|
| 343 |
qdrant_scores = np.asarray(
|
| 344 |
[qdrant_score_map.get(cid, 0.0) for cid in valid_ids],
|
|
@@ -757,16 +757,17 @@ Two lines, verbatim:
|
|
| 757 |
- [ ] Commit: "Phase 6.2: per-candidate cluster identity through reranker"
|
| 758 |
|
| 759 |
### Phase 6.3 — Deployment verification + Bug B
|
| 760 |
-
- [
|
| 761 |
-
- [
|
| 762 |
-
- [
|
| 763 |
-
- [
|
| 764 |
-
- [
|
| 765 |
-
- [
|
| 766 |
-
|
| 767 |
-
- [
|
| 768 |
-
- [
|
| 769 |
-
- [
|
|
|
|
| 770 |
|
| 771 |
### Phase 6 documentation
|
| 772 |
- [ ] Write `docs/phases/PHASE6.md` retraining decision (Section F.4)
|
|
|
|
| 757 |
- [ ] Commit: "Phase 6.2: per-candidate cluster identity through reranker"
|
| 758 |
|
| 759 |
### Phase 6.3 — Deployment verification + Bug B
|
| 760 |
+
- [x] Decide deployment strategy: E.1.a (commit) vs E.1.b (snapshot_download). Used E.1.a.
|
| 761 |
+
- [x] Verify `models/reranker-phase6/production_model/reranker_v1.txt` is in working tree, not gitignored, not dockerignored
|
| 762 |
+
- [x] Push to HF Space; wait for build; check build logs for "[reranker] LightGBM model loaded"
|
| 763 |
+
- [x] Add `/healthz/reranker` route (Section E.2)
|
| 764 |
+
- [x] Add `_rr.is_model_loaded()`, `_rr.get_loaded_model_path()`, `_rr.get_num_trees()` accessors
|
| 765 |
+
- [x] `curl https://siddhm11-researchit.hf.space/healthz/reranker` → confirm `model_loaded: true, n_trees: 141`
|
| 766 |
+
> *Verified live at 2026-05-03: `model_loaded=true, n_trees=141, fallback_active=false, feature_count=37, feature_schema_hash=5d0b3de7b0c1`.*
|
| 767 |
+
- [x] Add per-request `reranker.features` log line with `feature_nonzero_rate`
|
| 768 |
+
- [x] Fix Bug B: medoid_embedding_blob fallback in cluster reload (Section E.4)
|
| 769 |
+
- [x] Add `medoid_embedding_blob BLOB` column to clusters table (SQLite ALTER migration)
|
| 770 |
+
- [x] Update CLAUDE.md / model card to reflect deployment story
|
| 771 |
|
| 772 |
### Phase 6 documentation
|
| 773 |
- [ ] Write `docs/phases/PHASE6.md` retraining decision (Section F.4)
|
|
@@ -0,0 +1,389 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Phase 6.5 — Implementation Plan
|
| 2 |
+
|
| 3 |
+
> **Source:** `docs/phases/PHASE6.5-Instrumentation-Framing.md`
|
| 4 |
+
> **Timeline:** 5 days (each day leaves the app in a working state)
|
| 5 |
+
> **Prerequisite for:** Phase 7 (Evaluation Framework)
|
| 6 |
+
|
| 7 |
+
---
|
| 8 |
+
|
| 9 |
+
## Day 1: Phase 6 Hot-fix (A1 + A2)
|
| 10 |
+
|
| 11 |
+
### A1: Real Qdrant Cosine Scores (Feature 0 fix)
|
| 12 |
+
|
| 13 |
+
**Problem:** `recommendations.py:329-339` fakes Qdrant scores with linear rank decay (`1.0 - rank * 0.01`). Feature 0 is the model's #5 most important feature — it should be real cosines from Qdrant.
|
| 14 |
+
|
| 15 |
+
**Root cause:** The search calls use `search_by_vector()` (returns `list[str]`) instead of `search_by_vector_with_scores()` (returns `list[dict]` with `{"arxiv_id": str, "score": float}`).
|
| 16 |
+
|
| 17 |
+
---
|
| 18 |
+
|
| 19 |
+
#### [MODIFY] [recommendations.py](file:///c:/Users/siddh/ResearchIT-Final/app/routers/recommendations.py)
|
| 20 |
+
|
| 21 |
+
**Change 1 — Per-cluster searches (line 258-266):**
|
| 22 |
+
Switch from `search_by_vector()` to `search_by_vector_with_scores()`:
|
| 23 |
+
|
| 24 |
+
```diff
|
| 25 |
+
- search_tasks = [
|
| 26 |
+
- qdrant_svc.search_by_vector(
|
| 27 |
+
- query_vector=c.medoid_embedding.tolist(),
|
| 28 |
+
- limit=quota * _OVERSAMPLE,
|
| 29 |
+
- exclude_ids=seen,
|
| 30 |
+
- )
|
| 31 |
+
- for c, quota in zip(clusters, quotas)
|
| 32 |
+
- ]
|
| 33 |
+
- per_cluster_results = await asyncio.gather(*search_tasks)
|
| 34 |
+
+ search_tasks = [
|
| 35 |
+
+ qdrant_svc.search_by_vector_with_scores(
|
| 36 |
+
+ query_vector=c.medoid_embedding.tolist(),
|
| 37 |
+
+ limit=quota * _OVERSAMPLE,
|
| 38 |
+
+ exclude_ids=seen,
|
| 39 |
+
+ )
|
| 40 |
+
+ for c, quota in zip(clusters, quotas)
|
| 41 |
+
+ ]
|
| 42 |
+
+ per_cluster_scored = await asyncio.gather(*search_tasks)
|
| 43 |
+
```
|
| 44 |
+
|
| 45 |
+
**Change 2 — Build `paper_cluster_map` AND `qdrant_score_map` in one pass (line 268-277):**
|
| 46 |
+
|
| 47 |
+
```diff
|
| 48 |
+
- paper_cluster_map: dict[str, int] = {}
|
| 49 |
+
- for cluster, result_ids in zip(clusters, per_cluster_results):
|
| 50 |
+
- for aid in result_ids:
|
| 51 |
+
- if aid not in paper_cluster_map:
|
| 52 |
+
- paper_cluster_map[aid] = cluster.cluster_idx
|
| 53 |
+
-
|
| 54 |
+
- candidate_ids = merge_quota_results(list(per_cluster_results), quotas)
|
| 55 |
+
+ paper_cluster_map: dict[str, int] = {}
|
| 56 |
+
+ qdrant_score_map: dict[str, float] = {}
|
| 57 |
+
+ for cluster, scored_results in zip(clusters, per_cluster_scored):
|
| 58 |
+
+ for hit in scored_results:
|
| 59 |
+
+ aid = hit["arxiv_id"]
|
| 60 |
+
+ if aid not in paper_cluster_map:
|
| 61 |
+
+ paper_cluster_map[aid] = cluster.cluster_idx
|
| 62 |
+
+ # Keep highest cosine if paper appears in multiple clusters
|
| 63 |
+
+ if aid not in qdrant_score_map or hit["score"] > qdrant_score_map[aid]:
|
| 64 |
+
+ qdrant_score_map[aid] = float(hit["score"])
|
| 65 |
+
+
|
| 66 |
+
+ # merge_quota_results expects list[list[str]] — extract IDs
|
| 67 |
+
+ per_cluster_ids = [[h["arxiv_id"] for h in scored] for scored in per_cluster_scored]
|
| 68 |
+
+ candidate_ids = merge_quota_results(per_cluster_ids, quotas)
|
| 69 |
+
```
|
| 70 |
+
|
| 71 |
+
**Change 3 — Short-term supplement search (line 280-290):**
|
| 72 |
+
Also switch to scored search:
|
| 73 |
+
|
| 74 |
+
```diff
|
| 75 |
+
- st_results = await qdrant_svc.search_by_vector(
|
| 76 |
+
+ st_scored = await qdrant_svc.search_by_vector_with_scores(
|
| 77 |
+
query_vector=st_vec.tolist(),
|
| 78 |
+
limit=_ST_SUPPLEMENT,
|
| 79 |
+
exclude_ids=seen_so_far,
|
| 80 |
+
)
|
| 81 |
+
- for aid in st_results:
|
| 82 |
+
- if aid not in set(candidate_ids):
|
| 83 |
+
- candidate_ids.append(aid)
|
| 84 |
+
+ for hit in st_scored:
|
| 85 |
+
+ aid = hit["arxiv_id"]
|
| 86 |
+
+ if aid not in set(candidate_ids):
|
| 87 |
+
+ candidate_ids.append(aid)
|
| 88 |
+
+ if aid not in qdrant_score_map:
|
| 89 |
+
+ qdrant_score_map[aid] = float(hit["score"])
|
| 90 |
+
paper_cluster_map[aid] = -1 # short-term supplement
|
| 91 |
+
```
|
| 92 |
+
|
| 93 |
+
**Change 4 — Delete fake score block (line 329-339):**
|
| 94 |
+
The entire synthetic-decay block becomes dead code. Delete it:
|
| 95 |
+
|
| 96 |
+
```diff
|
| 97 |
+
- # Build qdrant_score_map from per_cluster_results
|
| 98 |
+
- # per_cluster_results is list[list[str]] β we need scores too.
|
| 99 |
+
- # Use the paper_cluster_map to approximate: score = 1.0 - (rank / total)
|
| 100 |
+
- # for now, as the current retrieval path returns only IDs.
|
| 101 |
+
- # TODO: Phase 6.2+ switch to search_by_vector_with_scores()
|
| 102 |
+
- qdrant_score_map: dict[str, float] = {}
|
| 103 |
+
- for cluster_ids in per_cluster_results:
|
| 104 |
+
- for rank, aid in enumerate(cluster_ids):
|
| 105 |
+
- if aid not in qdrant_score_map:
|
| 106 |
+
- # Approximate score from rank position (higher rank = higher score)
|
| 107 |
+
- qdrant_score_map[aid] = max(0.0, 1.0 - rank * 0.01)
|
| 108 |
+
```
|
| 109 |
+
|
| 110 |
+
The existing `qdrant_scores = np.asarray(...)` on line 341-344 stays as-is — it reads from `qdrant_score_map`, which now has real cosines.
|
| 111 |
+
|
| 112 |
+
### A2: Verify `/healthz/reranker` live
|
| 113 |
+
|
| 114 |
+
> ✅ **Already done.** Verified 2026-05-03: `model_loaded: true, n_trees: 141, fallback_active: false`.
|
| 115 |
+
|
| 116 |
+
Just need to add the timestamp to `PHASE6-Reranker-Framing.md`.
|
| 117 |
+
|
| 118 |
+
---
|
| 119 |
+
|
| 120 |
+
## Day 2: B1 — `query_id` Linkage
|
| 121 |
+
|
| 122 |
+
### What it enables
|
| 123 |
+
Per-feed CTR: "out of 30 papers shown in this request, how many got saved?"
|
| 124 |
+
|
| 125 |
+
### Current state verified
|
| 126 |
+
- `interactions` table already has a `query_id TEXT` column ✅ (line 31 in DDL)
|
| 127 |
+
- `db.log_interaction()` already accepts `query_id` ✅ (line 135)
|
| 128 |
+
- `events.py` already accepts and forwards `query_id` via `Form(default="")` ✅ (line 26)
|
| 129 |
+
- **Missing:** `recommendations.py` never generates or passes `query_id`. Search router never generates one either. Templates don't carry it.
|
| 130 |
+
|
| 131 |
+
---
|
| 132 |
+
|
| 133 |
+
#### [MODIFY] [recommendations.py](file:///c:/Users/siddh/ResearchIT-Final/app/routers/recommendations.py)
|
| 134 |
+
|
| 135 |
+
**1. Generate `query_id` at the top of `get_recommendations()` (line 59):**
|
| 136 |
+
|
| 137 |
+
```python
|
| 138 |
+
query_id = str(uuid.uuid4())
|
| 139 |
+
```
|
| 140 |
+
|
| 141 |
+
**2. Thread `query_id` into `paper_tags` in all 3 tiers:**
|
| 142 |
+
|
| 143 |
+
- Tier 1: In `_multi_interest_recommend()` return value, add `"query_id": query_id` to each tag dict (line 455-458)
|
| 144 |
+
- Tier 2: EWMA fallback tags (line 116-120) — add `"query_id": query_id`
|
| 145 |
+
- Tier 3: Qdrant recommend tags (line 131-135) — add `"query_id": query_id`
|
| 146 |
+
- Trending fallback (line 85-87) — add `"query_id": query_id`
|
| 147 |
+
|
| 148 |
+
**3. Embed `query_id` + `position` into paper dicts (line 153-166):**
|
| 149 |
+
|
| 150 |
+
```python
|
| 151 |
+
for idx, aid in enumerate(rec_arxiv_ids):
|
| 152 |
+
...
|
| 153 |
+
papers.append({
|
| 154 |
+
**meta[aid],
|
| 155 |
+
"saved": False,
|
| 156 |
+
"dismissed": False,
|
| 157 |
+
"ranker_version": tags.get("ranker_version", _RANKER_VERSION),
|
| 158 |
+
"candidate_source": tags.get("candidate_source", ""),
|
| 159 |
+
"cluster_id": tags.get("cluster_id", ""),
|
| 160 |
+
"query_id": tags.get("query_id", ""), # NEW
|
| 161 |
+
"position": idx, # NEW
|
| 162 |
+
})
|
| 163 |
+
```
|
| 164 |
+
|
| 165 |
+
> [!IMPORTANT]
|
| 166 |
+
> The `_multi_interest_recommend` signature needs updating to accept `query_id` as a parameter, since it's where the Tier 1 paper_tags are built. Alternatively, we generate `query_id` inside it and return it alongside the tags. I'll use the approach of passing it as a param.
|
| 167 |
+
|
| 168 |
+
---
|
| 169 |
+
|
| 170 |
+
#### [MODIFY] [search.py](file:///c:/Users/siddh/ResearchIT-Final/app/routers/search.py)
|
| 171 |
+
|
| 172 |
+
**Generate `query_id` per search and embed in paper dicts (line 70-77):**
|
| 173 |
+
|
| 174 |
+
```python
|
| 175 |
+
query_id = str(uuid.uuid4()) # generated once per /search request
|
| 176 |
+
|
| 177 |
+
for idx, p in enumerate(papers):
|
| 178 |
+
p["saved"] = p["arxiv_id"] in saved_ids
|
| 179 |
+
p["dismissed"] = p["arxiv_id"] in dismissed_ids
|
| 180 |
+
p["query_id"] = query_id # NEW
|
| 181 |
+
p["position"] = idx # NEW
|
| 182 |
+
```
|
| 183 |
+
|
| 184 |
+
---
|
| 185 |
+
|
| 186 |
+
#### [MODIFY] [action_buttons.html](file:///c:/Users/siddh/ResearchIT-Final/app/templates/partials/action_buttons.html)
|
| 187 |
+
|
| 188 |
+
**Add `query_id` and `position` to ALL three `hx-vals` JSON blobs:**
|
| 189 |
+
|
| 190 |
+
Add to template header:
|
| 191 |
+
```jinja2
|
| 192 |
+
{% set _query_id = paper.query_id | default("") if paper is defined else "" %}
|
| 193 |
+
{% set _position = paper.position | default(0) if paper is defined else 0 %}
|
| 194 |
+
```
|
| 195 |
+
|
| 196 |
+
Add to each `hx-vals`:
|
| 197 |
+
```
|
| 198 |
+
"query_id": "{{ _query_id }}", "position": "{{ _position }}"
|
| 199 |
+
```
|
| 200 |
+
|
| 201 |
+
The save button (line 37) already has `position` β update to use `_position`. The not-interested buttons (line 26, 45) need `query_id` and `position` added.
|
| 202 |
+
|
| 203 |
+
---
|
| 204 |
+
|
| 205 |
+
## Day 3: B2 — Propensity Logging
|
| 206 |
+
|
| 207 |
+
### What it enables
|
| 208 |
+
Counterfactual evaluation (SNIPS estimator) — "what would have happened with ranker B?"
|
| 209 |
+
|
| 210 |
+
---
|
| 211 |
+
|
| 212 |
+
#### [MODIFY] [db.py](file:///c:/Users/siddh/ResearchIT-Final/app/db.py)
|
| 213 |
+
|
| 214 |
+
**1. Migration (after `_MIGRATION_6_3`):**
|
| 215 |
+
```python
|
| 216 |
+
_MIGRATION_6_5 = [
|
| 217 |
+
"ALTER TABLE interactions ADD COLUMN propensity REAL",
|
| 218 |
+
"ALTER TABLE interactions ADD COLUMN policy_id TEXT",
|
| 219 |
+
]
|
| 220 |
+
```
|
| 221 |
+
|
| 222 |
+
**2. Run in `init_db()`.**
|
| 223 |
+
|
| 224 |
+
**3. Extend `log_interaction()` signature (line 129-149):**
|
| 225 |
+
Add `propensity: float | None = None` and `policy_id: str | None = None` kwargs. Extend the INSERT.
|
| 226 |
+
|
| 227 |
+
---
|
| 228 |
+
|
| 229 |
+
#### [MODIFY] [recommendations.py](file:///c:/Users/siddh/ResearchIT-Final/app/routers/recommendations.py)
|
| 230 |
+
|
| 231 |
+
**Compute propensity after `inject_exploration()` (line 443):**
|
| 232 |
+
|
| 233 |
+
```python
|
| 234 |
+
# Exploration papers: uniformly sampled from pool
|
| 235 |
+
explore_pool_size = max(1, len(reranked_ids) - len(mmr_selected))
|
| 236 |
+
explore_propensity = len(exploration_set) / explore_pool_size if explore_pool_size > 0 else 0.0
|
| 237 |
+
|
| 238 |
+
# Exploitation (MMR-selected): deterministic → propensity = 1.0
|
| 239 |
+
for aid in final:
|
| 240 |
+
paper_tags[aid]["propensity"] = (
|
| 241 |
+
explore_propensity if aid in exploration_set else 1.0
|
| 242 |
+
)
|
| 243 |
+
paper_tags[aid]["policy_id"] = _RANKER_VERSION
|
| 244 |
+
```
|
| 245 |
+
|
| 246 |
+
Thread `propensity` and `policy_id` into template context the same way as `query_id`.
|
| 247 |
+
|
| 248 |
+
---
|
| 249 |
+
|
| 250 |
+
#### [MODIFY] [search.py](file:///c:/Users/siddh/ResearchIT-Final/app/routers/search.py)
|
| 251 |
+
|
| 252 |
+
Search is fully deterministic → `propensity = 1.0` for all results.
|
| 253 |
+
|
| 254 |
+
---
|
| 255 |
+
|
| 256 |
+
#### [MODIFY] [action_buttons.html](file:///c:/Users/siddh/ResearchIT-Final/app/templates/partials/action_buttons.html)
|
| 257 |
+
|
| 258 |
+
Add `propensity` and `policy_id` to `hx-vals`.
|
| 259 |
+
|
| 260 |
+
---
|
| 261 |
+
|
| 262 |
+
#### [MODIFY] [events.py](file:///c:/Users/siddh/ResearchIT-Final/app/routers/events.py)
|
| 263 |
+
|
| 264 |
+
Add `propensity: float = Form(default=0.0)` and `policy_id: str = Form(default="")` to both endpoints. Forward to `db.log_interaction()`.
|
| 265 |
+
|
| 266 |
+
---
|
| 267 |
+
|
| 268 |
+
## Day 4: B3 — Cluster Snapshot Versioning
|
| 269 |
+
|
| 270 |
+
### What it enables
|
| 271 |
+
Cluster history, debugging "why did recs shift?", content-addressed key for Phase 8a LLM summary cache.
|
| 272 |
+
|
| 273 |
+
---
|
| 274 |
+
|
| 275 |
+
#### [MODIFY] [db.py](file:///c:/Users/siddh/ResearchIT-Final/app/db.py)
|
| 276 |
+
|
| 277 |
+
**1. Add `cluster_snapshots` DDL to `_SCHEMA`:**
|
| 278 |
+
```sql
|
| 279 |
+
CREATE TABLE IF NOT EXISTS cluster_snapshots (
|
| 280 |
+
user_id TEXT NOT NULL,
|
| 281 |
+
snapshot_id TEXT NOT NULL,
|
| 282 |
+
cluster_idx INTEGER NOT NULL,
|
| 283 |
+
medoid_paper_id TEXT NOT NULL,
|
| 284 |
+
importance REAL NOT NULL,
|
| 285 |
+
paper_ids TEXT NOT NULL,
|
| 286 |
+
medoid_embedding_blob BLOB,
|
| 287 |
+
snapshot_date TEXT NOT NULL DEFAULT (datetime('now')),
|
| 288 |
+
paper_ids_hash TEXT NOT NULL,
|
| 289 |
+
PRIMARY KEY (user_id, snapshot_id, cluster_idx)
|
| 290 |
+
);
|
| 291 |
+
CREATE INDEX IF NOT EXISTS idx_snap_user_date ON cluster_snapshots(user_id, snapshot_date DESC);
|
| 292 |
+
CREATE INDEX IF NOT EXISTS idx_snap_hash ON cluster_snapshots(paper_ids_hash);
|
| 293 |
+
```
|
| 294 |
+
|
| 295 |
+
**2. Add `save_cluster_snapshot()` and `prune_old_snapshots()` functions.**
|
| 296 |
+
|
| 297 |
+
---
|
| 298 |
+
|
| 299 |
+
#### [MODIFY] [recommendations.py](file:///c:/Users/siddh/ResearchIT-Final/app/routers/recommendations.py)
|
| 300 |
+
|
| 301 |
+
After `save_clusters_to_db(user_id, clusters)` (line ~253), call `db.save_cluster_snapshot()`.
|
| 302 |
+
|
| 303 |
+
---
|
| 304 |
+
|
| 305 |
+
#### [MODIFY] [main.py](file:///c:/Users/siddh/ResearchIT-Final/app/main.py)
|
| 306 |
+
|
| 307 |
+
Call `db.prune_old_snapshots(retention_days=30)` in the lifespan handler after `init_db()`.
|
| 308 |
+
|
| 309 |
+
---
|
| 310 |
+
|
| 311 |
+
## Day 5: B4 — Semantic Scholar Author Import
|
| 312 |
+
|
| 313 |
+
### What it enables
|
| 314 |
+
"Paste S2 URL → 20 implicit saves" — replaces manual seed search friction.
|
| 315 |
+
|
| 316 |
+
---
|
| 317 |
+
|
| 318 |
+
#### [NEW] [s2_svc.py](file:///c:/Users/siddh/ResearchIT-Final/app/s2_svc.py)
|
| 319 |
+
|
| 320 |
+
Functions:
|
| 321 |
+
- `parse_author_input(text) → str | None` — accepts S2 URL, raw S2 ID, or ORCID
|
| 322 |
+
- `resolve_orcid(orcid) → str | None` — resolves ORCID via S2 author search
|
| 323 |
+
- `fetch_author_arxiv_papers(author_id, limit=50) → list[str]` — returns arXiv IDs
|
| 324 |
+
|
| 325 |
+
---
|
| 326 |
+
|
| 327 |
+
#### [MODIFY] [config.py](file:///c:/Users/siddh/ResearchIT-Final/app/config.py)
|
| 328 |
+
|
| 329 |
+
Add `S2_API_KEY = os.getenv("S2_API_KEY", "")` β key already in `.env`.
|
| 330 |
+
|
| 331 |
+
---
|
| 332 |
+
|
| 333 |
+
#### [MODIFY] [onboarding.py](file:///c:/Users/siddh/ResearchIT-Final/app/routers/onboarding.py)
|
| 334 |
+
|
| 335 |
+
Add `POST /api/onboarding/import-author` endpoint.
|
| 336 |
+
|
| 337 |
+
---
|
| 338 |
+
|
| 339 |
+
#### [NEW] Template partials for import step
|
| 340 |
+
|
| 341 |
+
- `partials/import_author.html` — the import form step
|
| 342 |
+
- `partials/import_success.html` — success confirmation
|
| 343 |
+
- `partials/import_error.html` — error message
|
| 344 |
+
|
| 345 |
+
---
|
| 346 |
+
|
| 347 |
+
## Verification Plan
|
| 348 |
+
|
| 349 |
+
### Automated Tests
|
| 350 |
+
|
| 351 |
+
After each day:
|
| 352 |
+
|
| 353 |
+
```bash
|
| 354 |
+
python -m pytest tests/ -v --tb=short
|
| 355 |
+
```
|
| 356 |
+
|
| 357 |
+
**New test files:**
|
| 358 |
+
- Day 1: Add `test_qdrant_scores_are_real_cosines` to `tests/test_phase6_feature_wiring.py`
|
| 359 |
+
- Day 2: Create `tests/test_instrumentation.py` — `test_query_id_round_trips`
|
| 360 |
+
- Day 3: Add `test_propensity_sums_correctly` to instrumentation tests
|
| 361 |
+
- Day 4: Add `test_snapshot_appended_on_each_recluster`, `test_prune_respects_retention`
|
| 362 |
+
- Day 5: Add `test_s2_import_saves_papers_with_correct_source_tag`
|
| 363 |
+
|
| 364 |
+
### Manual Verification
|
| 365 |
+
|
| 366 |
+
- Day 1: `curl -s https://siddhm11-researchit.hf.space/healthz/reranker` → confirm model still loaded after code change
|
| 367 |
+
- Day 5: Test author import with real S2 profile URL
|
| 368 |
+
|
| 369 |
+
---
|
| 370 |
+
|
| 371 |
+
## Documentation Updates (after all days)
|
| 372 |
+
|
| 373 |
+
- [ ] CLAUDE.md: Add Rule 3.11 — "Every interaction must carry `query_id`, `propensity`, and `policy_id`"
|
| 374 |
+
- [ ] TASK-TRACKER.md: Add Phase 6.5 section with checklist
|
| 375 |
+
- [ ] README.md: Update test count
|
| 376 |
+
- [ ] PHASE6-Reranker-Framing.md: Add live verification timestamp
|
| 377 |
+
|
| 378 |
+
---
|
| 379 |
+
|
| 380 |
+
## Open Questions
|
| 381 |
+
|
| 382 |
+
> [!IMPORTANT]
|
| 383 |
+
> **Q1:** The framing doc proposes `_RANKER_VERSION` as the `policy_id`. Currently it's `"v4.1_quota_hungarian_suppression"`. Should we also bump this to `"v6.5_lightgbm_real_cosines"` when Day 1 lands? It would make A/B-style log analysis cleaner.
|
| 384 |
+
|
| 385 |
+
> [!IMPORTANT]
|
| 386 |
+
> **Q2:** Day 5 (S2 author import) requires `httpx` as a dependency. It's already used by `turso_svc.py`, so no new install needed — just confirming.
|
| 387 |
+
|
| 388 |
+
> [!NOTE]
|
| 389 |
+
> **Q3:** The framing doc suggests cluster snapshot pruning at startup. For a simple MVP this is fine. Phase 7 can upgrade to APScheduler if needed.
|
|
@@ -0,0 +1,939 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# PHASE 6.5 — Instrumentation Framing
|
| 2 |
+
|
| 3 |
+
> **Status:** π Proposed (not started)
|
| 4 |
+
> **Scope:** Phase 6 hot-fix (Day 1) + Phase 6.5 instrumentation (Days 2–4) + Phase 5.1 cold-start completion (Day 5, parallel)
|
| 5 |
+
> **Prerequisite for:** Phase 7 (Evaluation Framework)
|
| 6 |
+
> **Supersedes:** Open items at the end of `PHASE6-Reranker-Framing.md` (Section E.1.a, E.2 verification, ADR A1/A4 deferrals)
|
| 7 |
+
> **Owner:** Amin
|
| 8 |
+
> **Authoring date:** 2026-05-03
|
| 9 |
+
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
## TL;DR
|
| 13 |
+
|
| 14 |
+
Phase 6 is **substantively complete** but has two open flags. Phase 7 (evaluation framework) cannot be built cleanly on top of the current schema — three pieces of telemetry are missing. This doc bundles three coherent units of work:
|
| 15 |
+
|
| 16 |
+
| Bucket | Identity | Days | Why it's separate |
|
| 17 |
+
| ------------------------- | --------------------------------- | ---- | ------------------------------------------------------------------------------------------------------------------ |
|
| 18 |
+
| **Phase 6 Hot-fix** | Close out Phase 6 cleanly | 1 | Two correctness/verification items left over from PHASE6-Reranker-Framing. Belongs to Phase 6, not later. |
|
| 19 |
+
| **Phase 6.5** | Telemetry foundation | 3 | Mirrors the Phase 4.5 precedent: a small instrumentation phase that exists *because* the next phase needs it. |
|
| 20 |
+
| **Phase 5.1 (side-quest)** | Cold-start completion | 1 | Author-import was the deferred Layer 2 from Phase 5's three-layer onboarding plan. Sits beside, not inside, 6.5. |
|
| 21 |
+
|
| 22 |
+
Total: **5 working days**. After this, Phase 7 starts on a clean substrate where all the prerequisite plumbing is already in production.
|
| 23 |
+
|
| 24 |
+
---
|
| 25 |
+
|
| 26 |
+
## 1. Why this doc exists (the reasoning)
|
| 27 |
+
|
| 28 |
+
The instinct is to fold all five days of work into Phase 7 — it's all "stuff that helps evaluation," after all. That instinct is wrong, and the reason matters.
|
| 29 |
+
|
| 30 |
+
**Phases in this project have always had one identity.** Look at the existing pattern:
|
| 31 |
+
|
| 32 |
+
- Phase 4 = quota fusion
|
| 33 |
+
- Phase 4.5 = instrumentation only (`ranker_version`, `candidate_source`, `cluster_id`)
|
| 34 |
+
- Phase 5 = onboarding
|
| 35 |
+
- Phase 6 = LightGBM reranker integration
|
| 36 |
+
- Phase 7 = **evaluation framework** (per master roadmap: nDCG@10, Recall@50, HR@10, ILS, category entropy, time-split eval, regression CI)
|
| 37 |
+
|
| 38 |
+
Phase 4.5 is the precedent. When instrumentation needed to land between Phase 4 and Phase 5, it didn't get folded into either — it got its own micro-phase precisely because it was load-bearing for everything downstream and had a single identity. The framing doc for Phase 6 (Part H) was also explicit about what Phase 6 is NOT — and "evaluation harness" was carved into Phase 7 deliberately.
|
| 39 |
+
|
| 40 |
+
**What happens if we fold everything into Phase 7?** Phase 7's master-roadmap budget is ~1 week. Adding ~3 days of prerequisite infrastructure either:
|
| 41 |
+
|
| 42 |
+
1. Bloats Phase 7 to 2+ weeks, or
|
| 43 |
+
2. Forces shortcuts on the actual harness work (offline regression, time-split eval, frozen `eval/eval_set_v1.0.parquet`, CI gates on >3% nDCG@10 drops) — which is a meaty deliverable in its own right.
|
| 44 |
+
|
| 45 |
+
**What happens if we leave the Phase 6 closeout for later?** The biggest item in the closeout is the `qdrant_cosine_score` fix — and that's a model-correctness bug. Feature 0 is the reranker's #5 most-important feature by training importance, and right now it's being fed synthetic linear decay (`1.0 - rank * 0.01`) instead of actual cosines. Every day it sits unfixed, the model is performing below its training-distribution capability. This belongs to Phase 6, full stop.
|
| 46 |
+
|
| 47 |
+
**What happens if the cold-start work waits?** B4 (S2 author import) is the single biggest cold-start lift available — replacing "manually save 5 papers" with "paste your S2 URL → 20 saves." It's a Phase 5 completion, not a Phase 7 input. It can run in parallel with Phase 6.5 work because it touches a different code path (onboarding router, no schema changes to `interactions`).
|
| 48 |
+
|
| 49 |
+
**The structural answer:** three identities → three buckets. This doc unifies them under one plan with one timeline.
|
| 50 |
+
|
| 51 |
+
---
|
| 52 |
+
|
| 53 |
+
## 2. Phase 6 Audit — current status
|
| 54 |
+
|
| 55 |
+
Cross-checked against `PHASE6-Reranker-Framing.md` (Parts A–G) and current code. Audit was performed 2026-05-03.
|
| 56 |
+
|
| 57 |
+
### ✅ Phase 6.1 — Simplification Pass: DONE
|
| 58 |
+
|
| 59 |
+
In `app/routers/recommendations.py`:
|
| 60 |
+
|
| 61 |
+
- `suppressed` and `onboarding_categories` loaded **before** the rerank call
|
| 62 |
+
- `qdrant_score_map` built from `per_cluster_results`
|
| 63 |
+
- `user_total_saves` / `user_total_dismissals` computed and passed
|
| 64 |
+
- `is_suppressed_arr` and `onboarding_match_arr` computed per-candidate
|
| 65 |
+
- `rerank_candidates` called with the full Phase 6 kwarg signature
|
| 66 |
+
|
| 67 |
+
### ✅ Phase 6.2 — Per-Candidate Plumbing: DONE
|
| 68 |
+
|
| 69 |
+
- `paper_cluster_map` is built before the merge — first-occurrence wins, exactly per spec
|
| 70 |
+
- `per_candidate_importance` is a `(N,)` array, not a scalar
|
| 71 |
+
- `per_candidate_medoids` is a `(N, 1024)` stack, not broadcast
|
| 72 |
+
- `app/recommend/reranker.py:287–298` slot 24 correctly handles both 1D (broadcast) and 2D (per-candidate) medoid shapes
|
| 73 |
+
- `test_phase6_feature_wiring.py::test_per_candidate_cluster_importance` and `test_per_candidate_medoid_distance` exist
|
| 74 |
+
|
| 75 |
+
### ✅ Phase 6.3 — Deployment Verification: DONE (code), ⚠️ UNVERIFIED (live)
|
| 76 |
+
|
| 77 |
+
- `/healthz/reranker` endpoint exists in `app/routers/health.py`
|
| 78 |
+
- `is_model_loaded()`, `get_loaded_model_path()`, `get_num_trees()` accessors exist in `reranker.py`
|
| 79 |
+
- Per-request feature activation logging at `reranker.py:432–438`
|
| 80 |
+
- Bug B fix: `medoid_embedding_blob BLOB` column added via migration in `db.py:128`
|
| 81 |
+
- Hungarian fallback now prefers live vector → persisted blob → skip with warning
|
| 82 |
+
|
| 83 |
+
### ⚠️ Two flags from Phase 6 (handled in §3 below)
|
| 84 |
+
|
| 85 |
+
1. **`qdrant_scores` are still rank-approximated, not real cosines.** `recommendations.py:316–325` uses synthetic linear decay because the call site is still on `search_by_vector()` (returns `list[str]`) instead of `search_by_vector_with_scores()` (returns `[{"arxiv_id": ..., "score": ...}]`). The scored function already exists in `qdrant_svc.py:265` — the swap is mechanical.
|
| 86 |
+
2. **`/healthz/reranker` not curl-verified live.** The endpoint exists in code. Production status is unknown — could be silently running heuristic fallback if the model file isn't being copied into the Docker image.
|
| 87 |
+
|
| 88 |
+
### ✅ Phase 6.4 — Retraining: correctly deferred
|
| 89 |
+
|
| 90 |
+
Documented in `PHASE6-Reranker-Framing.md` Section F.4, gated on synthetic simulator OR 100 real users with ≥10 saves each.
|
| 91 |
+
|
| 92 |
+
### Verdict
|
| 93 |
+
|
| 94 |
+
Phase 6 is substantively complete. The two flags above are polish, not blockers — but the qdrant-scores bug is currently feeding the model wrong data for one of its top-importance features, so its fix should ship as part of the Phase 6 closeout rather than be deferred.
|
| 95 |
+
|
| 96 |
+
---
|
| 97 |
+
|
| 98 |
+
## 3. Bucket 1 — Phase 6 Hot-fix (Day 1)
|
| 99 |
+
|
| 100 |
+
### 3.1 — A1: Real Qdrant Scores (the lying-feature-0 fix)
|
| 101 |
+
|
| 102 |
+
**The problem.** In `recommendations.py:248`, the per-cluster search calls `qdrant_svc.search_by_vector()` which returns `list[str]` — arXiv IDs only, no scores. Then around line 316, scores are faked by linear decay from rank position:
|
| 103 |
+
|
| 104 |
+
```python
|
| 105 |
+
qdrant_score_map[aid] = max(0.0, 1.0 - rank * 0.01)
|
| 106 |
+
```
|
| 107 |
+
|
| 108 |
+
A paper at rank 0 gets score 1.0, rank 50 gets 0.50, rank 100 gets 0.0. This bears almost no relationship to actual cosine similarity, where a top result might be 0.85 and rank 50 might be 0.78 — a much tighter band. Feature 0 (`qdrant_cosine_score`) is the model's #5 most-important feature by training importance. Feeding it a synthetic linear sequence caps how much the model can help.
|
| 109 |
+
|
| 110 |
+
**The fix.** Switch to `search_by_vector_with_scores()` (already exists at `qdrant_svc.py:265`), and build `qdrant_score_map` from actual cosines as part of the same loop that builds `paper_cluster_map`.
|
| 111 |
+
|
| 112 |
+
**Code change** in `app/routers/recommendations.py`, the `_multi_interest_recommend()` flow around line 245:
|
| 113 |
+
|
| 114 |
+
```python
|
| 115 |
+
# OLD
|
| 116 |
+
search_tasks = [
|
| 117 |
+
qdrant_svc.search_by_vector(
|
| 118 |
+
query_vector=c.medoid_embedding.tolist(),
|
| 119 |
+
limit=quota * _OVERSAMPLE,
|
| 120 |
+
exclude_ids=seen,
|
| 121 |
+
)
|
| 122 |
+
for c, quota in zip(clusters, quotas)
|
| 123 |
+
]
|
| 124 |
+
per_cluster_results = await asyncio.gather(*search_tasks)
|
| 125 |
+
|
| 126 |
+
# Phase 4.5: Build paper → cluster mapping BEFORE merge
|
| 127 |
+
paper_cluster_map: dict[str, int] = {}
|
| 128 |
+
for cluster, result_ids in zip(clusters, per_cluster_results):
|
| 129 |
+
for aid in result_ids:
|
| 130 |
+
if aid not in paper_cluster_map:
|
| 131 |
+
paper_cluster_map[aid] = cluster.cluster_idx
|
| 132 |
+
|
| 133 |
+
# Apply quota merge
|
| 134 |
+
candidate_ids = merge_quota_results(list(per_cluster_results), quotas)
|
| 135 |
+
```
|
| 136 |
+
|
| 137 |
+
becomes:
|
| 138 |
+
|
| 139 |
+
```python
|
| 140 |
+
# NEW — fetch scores alongside IDs
|
| 141 |
+
search_tasks = [
|
| 142 |
+
qdrant_svc.search_by_vector_with_scores(
|
| 143 |
+
query_vector=c.medoid_embedding.tolist(),
|
| 144 |
+
limit=quota * _OVERSAMPLE,
|
| 145 |
+
exclude_ids=seen,
|
| 146 |
+
)
|
| 147 |
+
for c, quota in zip(clusters, quotas)
|
| 148 |
+
]
|
| 149 |
+
per_cluster_scored = await asyncio.gather(*search_tasks)
|
| 150 |
+
|
| 151 |
+
# Build paper → cluster map AND real qdrant_score_map in one pass
|
| 152 |
+
paper_cluster_map: dict[str, int] = {}
|
| 153 |
+
qdrant_score_map: dict[str, float] = {}
|
| 154 |
+
for cluster, scored_results in zip(clusters, per_cluster_scored):
|
| 155 |
+
for hit in scored_results:
|
| 156 |
+
aid = hit["arxiv_id"]
|
| 157 |
+
if aid not in paper_cluster_map:
|
| 158 |
+
paper_cluster_map[aid] = cluster.cluster_idx
|
| 159 |
+
# Keep highest cosine if paper appears in multiple clusters
|
| 160 |
+
if aid not in qdrant_score_map or hit["score"] > qdrant_score_map[aid]:
|
| 161 |
+
qdrant_score_map[aid] = float(hit["score"])
|
| 162 |
+
|
| 163 |
+
# merge_quota_results expects list[list[str]] — extract IDs
|
| 164 |
+
per_cluster_ids = [[hit["arxiv_id"] for hit in scored] for scored in per_cluster_scored]
|
| 165 |
+
candidate_ids = merge_quota_results(per_cluster_ids, quotas)
|
| 166 |
+
```
|
| 167 |
+
|
| 168 |
+
Then **delete** the synthetic-score block (current `recommendations.py:313–325`):
|
| 169 |
+
|
| 170 |
+
```python
|
| 171 |
+
# DELETE — qdrant_score_map is now built from real cosines above
|
| 172 |
+
# qdrant_score_map: dict[str, float] = {}
|
| 173 |
+
# for cluster_ids in per_cluster_results:
|
| 174 |
+
# for rank, aid in enumerate(cluster_ids):
|
| 175 |
+
# if aid not in qdrant_score_map:
|
| 176 |
+
# qdrant_score_map[aid] = max(0.0, 1.0 - rank * 0.01)
|
| 177 |
+
```
|
| 178 |
+
|
| 179 |
+
**Don't forget the short-term supplement search.** The path around line 263 (which pulls extra papers from `state.short_term_centroid` to fill the feed) does the same synthetic-decay trick. The same swap applies, with `paper_cluster_map[aid] = -1` (signalling "not from a long-term cluster") and `qdrant_score_map` populated from real scores.
|
| 180 |
+
|
| 181 |
+
**Test** (add to `tests/test_phase6_feature_wiring.py`):
|
| 182 |
+
|
| 183 |
+
```python
|
| 184 |
+
def test_qdrant_scores_are_real_cosines_not_rank_proxies():
|
| 185 |
+
"""Feature 0 should be actual cosine similarities — not a perfect linear
|
| 186 |
+
sequence from rank 0 → N."""
|
| 187 |
+
# Mock search_by_vector_with_scores to return realistic clustered scores:
|
| 188 |
+
# e.g. [0.91, 0.89, 0.87, 0.86, 0.84, 0.83, ...] not [1.0, 0.99, 0.98, ...]
|
| 189 |
+
fake_hits = [
|
| 190 |
+
{"arxiv_id": f"24{i:02d}.{i:05d}", "score": 0.92 - 0.005 * i + (0.01 if i % 3 == 0 else 0)}
|
| 191 |
+
for i in range(20)
|
| 192 |
+
]
|
| 193 |
+
# ... call _multi_interest_recommend, capture qdrant_score_map
|
| 194 |
+
# ... assert all values in [0.5, 1.0] (realistic cosine band, not 0.0–1.0 sweep)
|
| 195 |
+
# ... assert NOT a perfect linear sequence (variance > 0 in successive diffs)
|
| 196 |
+
diffs = [s2 - s1 for s1, s2 in zip(scores[:-1], scores[1:])]
|
| 197 |
+
assert max(diffs) - min(diffs) > 0.001, "scores look synthetically linear"
|
| 198 |
+
```
|
| 199 |
+
|
| 200 |
+
**Estimated effort:** 2 hours (including the test).
|
| 201 |
+
|
| 202 |
+
---
|
| 203 |
+
|
| 204 |
+
### 3.2 — A2: Verify `/healthz/reranker` Live
|
| 205 |
+
|
| 206 |
+
**Not a code change** — a 5-minute verification command:
|
| 207 |
+
|
| 208 |
+
```bash
|
| 209 |
+
curl -s https://siddhm11-researchit.hf.space/healthz/reranker | python -m json.tool
|
| 210 |
+
```
|
| 211 |
+
|
| 212 |
+
**Three possible outcomes:**
|
| 213 |
+
|
| 214 |
+
| Response | Meaning | Action |
|
| 215 |
+
| --------------------------------------------------------------------- | ---------------------------------------------------- | ----------------------------------------------------------------------------------------------------- |
|
| 216 |
+
| `model_loaded: true, n_trees: 141, fallback_active: false` | ✅ Production is using LightGBM | Tick the box in TASK-TRACKER. Add timestamp to PHASE6-Reranker-Framing.md. |
|
| 217 |
+
| `model_loaded: false, fallback_active: true` | ⚠️ Space is silently using the heuristic | Debug per checklist below. |
|
| 218 |
+
| 404 or 500 | Endpoint isn't deployed yet | Push the latest commit; HF Spaces will rebuild. |
|
| 219 |
+
|
| 220 |
+
**If the model isn't loading, debug in this order:**
|
| 221 |
+
|
| 222 |
+
1. **Is the model file in the Git repo?**
|
| 223 |
+
```bash
|
| 224 |
+
git ls-files | grep reranker_v1.txt
|
| 225 |
+
```
|
| 226 |
+
If empty: check `.gitignore` for any pattern that might catch it (e.g. `*.txt` in a subtree, or a too-broad `models/` rule). The current `.gitignore` looks safe but worth double-checking — the file is `models/reranker-phase6/production_model/reranker_v1.txt`.
|
| 227 |
+
|
| 228 |
+
2. **Is the model file being copied into the Docker image?**
|
| 229 |
+
Check `Dockerfile` for `COPY models/ models/` or `COPY . .`. Check `.dockerignore` for any pattern that excludes `models/` or `*.txt`.
|
| 230 |
+
|
| 231 |
+
3. **Does the path search in `reranker.py:35–44` find it from HF Spaces' working directory?** If HF Spaces runs from `/app` instead of the repo root, the relative paths might miss. Set `RERANKER_MODEL_PATH` explicitly in HF Secrets:
|
| 232 |
+
```
|
| 233 |
+
RERANKER_MODEL_PATH=/app/models/reranker-phase6/production_model/reranker_v1.txt
|
| 234 |
+
```
|
| 235 |
+
|
| 236 |
+
4. **Check the build logs** for the line `[reranker] LightGBM model loaded from <path> (n_trees=141)`. If that line is missing, the loader is silently failing — turn on DEBUG logging in `reranker.py` to see why.
|
| 237 |
+
|
| 238 |
+
**If it's working**, update `PHASE6-Reranker-Framing.md` with a one-liner under Section E:
|
| 239 |
+
|
| 240 |
+
> *Verified live at 2026-MM-DD: `model_loaded=true, n_trees=141, fallback_active=false`.*
|
| 241 |
+
|
| 242 |
+
**Estimated effort:** 30 minutes including any Docker fixes.
|
| 243 |
+
|
| 244 |
+
---
|
| 245 |
+
|
| 246 |
+
## 4. Bucket 2 — Phase 6.5: Instrumentation Foundation (Days 2–4)
|
| 247 |
+
|
| 248 |
+
This is the new phase. Single identity: **telemetry schema and storage foundations that Phase 7 will sit on top of.** Three pieces of work, each a day, each independently shippable, each leaves the app in a working state.
|
| 249 |
+
|
| 250 |
+
### 4.1 — B1: query_id Linkage (Day 2)
|
| 251 |
+
|
| 252 |
+
**Why this matters more than it sounds.** Right now, interaction logs look like this:
|
| 253 |
+
|
| 254 |
+
```
|
| 255 |
+
user_id=u1, paper_id=2401.001, event=save, source=recommendation, candidate_source=cluster_0
|
| 256 |
+
user_id=u1, paper_id=2401.002, event=save, source=recommendation, candidate_source=cluster_1
|
| 257 |
+
```
|
| 258 |
+
|
| 259 |
+
You can count saves but you cannot answer:
|
| 260 |
+
|
| 261 |
+
- *"Out of the 30 papers we showed in this single feed request, how many got saved?"* (CTR per query)
|
| 262 |
+
- *"Did this user save the paper from the same feed they saw it in, or come back 3 days later?"* (intra-session vs return)
|
| 263 |
+
- *"When ranker version changed, did CTR for the same user change?"* (ranker A/B comparison)
|
| 264 |
+
|
| 265 |
+
Without `query_id`, every interaction floats free of the request that generated it. Phase 7 evaluation cannot compute even the most basic feed-level metric.
|
| 266 |
+
|
| 267 |
+
**The fix in 4 steps:**
|
| 268 |
+
|
| 269 |
+
#### Step 1: Generate `query_id` in `recommendations.py`
|
| 270 |
+
|
| 271 |
+
At the top of `get_recommendations()`:
|
| 272 |
+
|
| 273 |
+
```python
|
| 274 |
+
import uuid
|
| 275 |
+
query_id = str(uuid.uuid4())
|
| 276 |
+
```
|
| 277 |
+
|
| 278 |
+
When building `paper_tags` (the per-paper instrumentation dict already used by Phase 4.5):
|
| 279 |
+
|
| 280 |
+
```python
|
| 281 |
+
paper_tags[aid] = {
|
| 282 |
+
"ranker_version": _RANKER_VERSION,
|
| 283 |
+
"candidate_source": source,
|
| 284 |
+
"cluster_id": str(cluster_idx) if cluster_idx is not None and cluster_idx >= 0 else "",
|
| 285 |
+
"query_id": query_id, # NEW
|
| 286 |
+
"position": str(position), # NEW β index in final ranked list (0-based)
|
| 287 |
+
}
|
| 288 |
+
```
|
| 289 |
+
|
| 290 |
+
#### Step 2: Same plumbing in `search.py`
|
| 291 |
+
|
| 292 |
+
Generate one `query_id` per `/search` request, attach to every paper card. Same shape as recommendations — different `source` value (`"search"` not `"recommendation"`) but same fields.
|
| 293 |
+
|
| 294 |
+
#### Step 3: Template plumbing
|
| 295 |
+
|
| 296 |
+
In `app/templates/partials/action_buttons.html`, extend the `hx-vals` JSON:
|
| 297 |
+
|
| 298 |
+
```html
|
| 299 |
+
hx-vals='{
|
| 300 |
+
"source": "{{ _source }}",
|
| 301 |
+
"position": "{{ position | default(0) }}",
|
| 302 |
+
"ranker_version": "{{ _ranker_version }}",
|
| 303 |
+
"candidate_source": "{{ _candidate_source }}",
|
| 304 |
+
"cluster_id": "{{ _cluster_id }}",
|
| 305 |
+
"query_id": "{{ paper.query_id | default('') }}"
|
| 306 |
+
}'
|
| 307 |
+
```
|
| 308 |
+
|
| 309 |
+
(The Jinja templates that currently render paper cards need the per-card `query_id` and `position` available in their context — pass them in via the loop variable when rendering the feed.)
|
| 310 |
+
|
| 311 |
+
#### Step 4: events.py forwards the field
|
| 312 |
+
|
| 313 |
+
`db.log_interaction()` already accepts a `query_id` parameter. Just ensure `events.py` forwards the Form field:
|
| 314 |
+
|
| 315 |
+
```python
|
| 316 |
+
@router.post("/api/events")
|
| 317 |
+
async def log_event(
|
| 318 |
+
paper_id: str = Form(...),
|
| 319 |
+
event_type: str = Form(...),
|
| 320 |
+
source: str = Form(default=""),
|
| 321 |
+
position: int = Form(default=0),
|
| 322 |
+
ranker_version: str = Form(default=""),
|
| 323 |
+
candidate_source: str = Form(default=""),
|
| 324 |
+
cluster_id: str = Form(default=""),
|
| 325 |
+
query_id: str = Form(default=""), # NEW
|
| 326 |
+
user_id: str | None = Cookie(default=None, alias=COOKIE_NAME),
|
| 327 |
+
):
|
| 328 |
+
await db.log_interaction(
|
| 329 |
+
user_id=user_id,
|
| 330 |
+
paper_id=paper_id,
|
| 331 |
+
event_type=event_type,
|
| 332 |
+
source=source,
|
| 333 |
+
position=position,
|
| 334 |
+
ranker_version=ranker_version,
|
| 335 |
+
candidate_source=candidate_source,
|
| 336 |
+
cluster_id=cluster_id or None,
|
| 337 |
+
query_id=query_id or None, # NEW
|
| 338 |
+
)
|
| 339 |
+
```
|
| 340 |
+
|
| 341 |
+
**What this enables in Phase 7.** A single SQL query gives per-feed CTR by ranker version:
|
| 342 |
+
|
| 343 |
+
```sql
|
| 344 |
+
SELECT
|
| 345 |
+
query_id,
|
| 346 |
+
ranker_version,
|
| 347 |
+
COUNT(*) FILTER (WHERE event_type = 'save') * 1.0 / COUNT(DISTINCT paper_id) AS save_rate
|
| 348 |
+
FROM interactions
|
| 349 |
+
WHERE source = 'recommendation'
|
| 350 |
+
GROUP BY query_id, ranker_version;
|
| 351 |
+
```
|
| 352 |
+
|
| 353 |
+
**Test** (add to `tests/test_instrumentation.py`):
|
| 354 |
+
|
| 355 |
+
```python
|
| 356 |
+
async def test_query_id_round_trips_from_request_to_db():
|
| 357 |
+
"""A single /api/recommendations call should generate one query_id;
|
| 358 |
+
every paper card returned should carry it; saving any paper should
|
| 359 |
+
persist that exact query_id in interactions."""
|
| 360 |
+
resp = await client.get("/api/recommendations", cookies={"uid": "test-user"})
|
| 361 |
+
# Parse out query_id values from the rendered cards
|
| 362 |
+
query_ids = re.findall(r'"query_id":\s*"([0-9a-f-]{36})"', resp.text)
|
| 363 |
+
assert len(set(query_ids)) == 1, "all cards should share one query_id"
|
| 364 |
+
qid = query_ids[0]
|
| 365 |
+
|
| 366 |
+
# Save the first paper
|
| 367 |
+
paper_id = re.search(r'data-paper-id="([^"]+)"', resp.text).group(1)
|
| 368 |
+
await client.post("/api/events", data={
|
| 369 |
+
"paper_id": paper_id, "event_type": "save",
|
| 370 |
+
"source": "recommendation", "query_id": qid,
|
| 371 |
+
})
|
| 372 |
+
rows = await db.fetch_all("SELECT query_id FROM interactions WHERE paper_id = ?", paper_id)
|
| 373 |
+
assert rows[0]["query_id"] == qid
|
| 374 |
+
```
|
| 375 |
+
|
| 376 |
+
**Estimated effort:** 3 hours.
|
| 377 |
+
|
| 378 |
+
---
|
| 379 |
+
|
| 380 |
+
### 4.2 — B2: Propensity Logging (Day 3)
|
| 381 |
+
|
| 382 |
+
**Why this is non-negotiable per the project's own framing doc.** ADR A4 in `PHASE6-Reranker-Framing.md` says verbatim:
|
| 383 |
+
|
| 384 |
+
> *Telemetry gaps bite hardest in Phase 5 (IPS impossible without propensities): freeze schema before any logging (A4); include policy_id, propensity, shown_position, ranker_version*
|
| 385 |
+
|
| 386 |
+
You already have `policy_id` in spirit (`ranker_version`) and `shown_position` (`position`). What's missing is `propensity` — the probability that the active policy chose to show this paper to this user in this slot.
|
| 387 |
+
|
| 388 |
+
Without propensity, **counterfactual evaluation is mathematically impossible**. You can never retrospectively answer "what would have happened if we'd used a different ranker?" because you cannot reweight observed clicks correctly. Adding the column to a table with 50K rows is a multi-week migration project; adding it to an empty table is 4 hours.
|
| 389 |
+
|
| 390 |
+
#### Schema migration
|
| 391 |
+
|
| 392 |
+
Add to `app/db.py`:
|
| 393 |
+
|
| 394 |
+
```python
|
| 395 |
+
_MIGRATION_B2 = [
|
| 396 |
+
"ALTER TABLE interactions ADD COLUMN propensity REAL",
|
| 397 |
+
"ALTER TABLE interactions ADD COLUMN policy_id TEXT",
|
| 398 |
+
]
|
| 399 |
+
```
|
| 400 |
+
|
| 401 |
+
(`policy_id` is a synonym for `ranker_version` but more honest about what it represents — the identifier of the *full pipeline configuration* that chose to show this paper, including MMR λ, exploration rate ε, and any feature-flag state. Some systems keep both: `ranker_version` for the model file hash, `policy_id` for the pipeline hash. For now they can be the same value, but the column is there when you need to differentiate.)
|
| 402 |
+
|
| 403 |
+
Run the migration via the existing migration runner pattern in `db.py:128`:
|
| 404 |
+
|
| 405 |
+
```python
|
| 406 |
+
async def _apply_migrations(conn):
|
| 407 |
+
# ... existing migrations ...
|
| 408 |
+
for sql in _MIGRATION_B2:
|
| 409 |
+
try:
|
| 410 |
+
await conn.execute(sql)
|
| 411 |
+
except aiosqlite.OperationalError as e:
|
| 412 |
+
if "duplicate column" not in str(e).lower():
|
| 413 |
+
raise
|
| 414 |
+
await conn.commit()
|
| 415 |
+
```
|
| 416 |
+
|
| 417 |
+
Update `db.log_interaction()`:
|
| 418 |
+
|
| 419 |
+
```python
|
| 420 |
+
async def log_interaction(
|
| 421 |
+
user_id: str,
|
| 422 |
+
paper_id: str,
|
| 423 |
+
event_type: str,
|
| 424 |
+
*,
|
| 425 |
+
source: str = "",
|
| 426 |
+
position: int = 0,
|
| 427 |
+
ranker_version: str | None = None,
|
| 428 |
+
candidate_source: str | None = None,
|
| 429 |
+
cluster_id: str | None = None,
|
| 430 |
+
query_id: str | None = None,
|
| 431 |
+
propensity: float | None = None, # NEW
|
| 432 |
+
policy_id: str | None = None, # NEW
|
| 433 |
+
):
|
| 434 |
+
# ... INSERT statement extended with propensity, policy_id ...
|
| 435 |
+
```
|
| 436 |
+
|
| 437 |
+
#### The propensity computation
|
| 438 |
+
|
| 439 |
+
In `recommendations.py`, after the final feed is built but before tags are returned, compute per-paper propensity. The math depends on which slot the paper occupies:
|
| 440 |
+
|
| 441 |
+
```python
|
| 442 |
+
# Phase 6.5+B2: compute per-paper propensity
|
| 443 |
+
N_FINAL = len(final)
|
| 444 |
+
N_EXPLORE = len(exploration_set) # the ε papers MMR didn't pick
|
| 445 |
+
N_EXPLOIT = N_FINAL - N_EXPLORE
|
| 446 |
+
|
| 447 |
+
# Exploration papers: uniformly sampled from `reranked_ids` not in mmr_selected
|
| 448 |
+
explore_pool_size = max(1, len(reranked_ids) - len(mmr_selected))
|
| 449 |
+
explore_propensity = N_EXPLORE / explore_pool_size if explore_pool_size > 0 else 0.0
|
| 450 |
+
|
| 451 |
+
# Exploitation papers: deterministically selected by MMR → propensity = 1.0
|
| 452 |
+
# (this is the "logging policy = serving policy" case — IPS weight will be 1)
|
| 453 |
+
|
| 454 |
+
for aid in final:
|
| 455 |
+
paper_tags[aid]["propensity"] = (
|
| 456 |
+
explore_propensity if aid in exploration_set else 1.0
|
| 457 |
+
)
|
| 458 |
+
paper_tags[aid]["policy_id"] = _RANKER_VERSION # or compute pipeline hash
|
| 459 |
+
```
|
| 460 |
+
|
| 461 |
+
Plumb through templates (add `propensity` and `policy_id` to `hx-vals` like with `query_id`), and store in `events.py`.
|
| 462 |
+
|
| 463 |
+
**For search**, propensity is `1.0` for every result (search is fully deterministic — no exploration). Set it explicitly so the column is always populated:
|
| 464 |
+
|
| 465 |
+
```python
|
| 466 |
+
# search.py
|
| 467 |
+
paper_tags[aid]["propensity"] = 1.0
|
| 468 |
+
paper_tags[aid]["policy_id"] = _SEARCH_POLICY_ID
|
| 469 |
+
```
|
| 470 |
+
|
| 471 |
+
#### Why this earns its day
|
| 472 |
+
|
| 473 |
+
Phase 7 evaluation will eventually want to test "ranker B vs ranker A" without a full A/B test (you don't have user volume for that). With propensity logging, you can use **SNIPS** (Self-Normalized Inverse Propensity Scoring) on existing logs to estimate "what would CTR have been if we'd used ranker B?" — purely from data ranker A already collected. The estimator is:
|
| 474 |
+
|
| 475 |
+
```
|
| 476 |
+
             Σ_i (r_i × π_B(a_i | x_i) / π_A(a_i | x_i))
|
| 477 |
+
SNIPS(π_B) = ─────────────────────────────────────────────────
|
| 478 |
+
             Σ_i (π_B(a_i | x_i) / π_A(a_i | x_i))
|
| 479 |
+
```
|
| 480 |
+
|
| 481 |
+
where `π_A` is the logging policy (your current ranker, propensity stored at log time) and `π_B` is the candidate policy you want to evaluate. Without `π_A` stored at log time, the importance weights `π_B / π_A` have a missing denominator and the estimator collapses.
|
| 482 |
+
|
| 483 |
+
**Test:**
|
| 484 |
+
|
| 485 |
+
```python
|
| 486 |
+
async def test_propensity_sums_correctly_across_exploration_and_exploitation():
|
| 487 |
+
"""For a feed of N papers with K exploration slots, the sum of propensities
|
| 488 |
+
over ALL candidates in the explore pool should equal K (each paper had K/|pool|
|
| 489 |
+
chance, summed over |pool| papers = K)."""
|
| 490 |
+
# Mock a recommendation flow with N=30, K_explore=2, pool_size=50
|
| 491 |
+
# Capture propensity values
|
| 492 |
+
explore_props = [p["propensity"] for p in tagged if p["aid"] in exploration_set]
|
| 493 |
+
assert all(0 < p <= 1 for p in explore_props)
|
| 494 |
+
# Each exploration paper has propensity = K/pool = 2/50 = 0.04
|
| 495 |
+
assert all(abs(p - 0.04) < 1e-6 for p in explore_props)
|
| 496 |
+
# Exploitation papers all have propensity = 1.0
|
| 497 |
+
exploit_props = [p["propensity"] for p in tagged if p["aid"] not in exploration_set]
|
| 498 |
+
assert all(p == 1.0 for p in exploit_props)
|
| 499 |
+
```
|
| 500 |
+
|
| 501 |
+
**Estimated effort:** 4 hours.
|
| 502 |
+
|
| 503 |
+
---
|
| 504 |
+
|
| 505 |
+
### 4.3 — B3: Cluster Snapshot Versioning (Day 4)
|
| 506 |
+
|
| 507 |
+
**The current problem.** `db.save_user_clusters()` (around `db.py:235`) does:
|
| 508 |
+
|
| 509 |
+
```python
|
| 510 |
+
await conn.execute("DELETE FROM user_clusters WHERE user_id = ?", (user_id,))
|
| 511 |
+
for c in clusters:
|
| 512 |
+
await conn.execute("INSERT INTO user_clusters ...")
|
| 513 |
+
```
|
| 514 |
+
|
| 515 |
+
Every recluster, the previous cluster state is **destroyed**. You cannot answer:
|
| 516 |
+
|
| 517 |
+
- *"What clusters did this user have a week ago?"* — for debugging "why did the recs suddenly shift?"
|
| 518 |
+
- *"When did cluster 2 form?"* — for cluster lifecycle analytics
|
| 519 |
+
- Phase 8a's content-addressed LLM-summary cache key needs `(cluster_stable_id, snapshot_date)` per ADR A1 — and the snapshot_date doesn't exist as a concept yet
|
| 520 |
+
|
| 521 |
+
This implements **ADR A1** from `PHASE6-Reranker-Framing.md`.
|
| 522 |
+
|
| 523 |
+
#### Schema
|
| 524 |
+
|
| 525 |
+
```sql
|
| 526 |
+
CREATE TABLE IF NOT EXISTS cluster_snapshots (
|
| 527 |
+
user_id TEXT NOT NULL,
|
| 528 |
+
snapshot_id TEXT NOT NULL, -- UUID, one per recluster event
|
| 529 |
+
cluster_idx INTEGER NOT NULL, -- stable index after Hungarian
|
| 530 |
+
medoid_paper_id TEXT NOT NULL,
|
| 531 |
+
importance REAL NOT NULL,
|
| 532 |
+
paper_ids TEXT NOT NULL, -- JSON array
|
| 533 |
+
medoid_embedding_blob BLOB,
|
| 534 |
+
snapshot_date TEXT NOT NULL DEFAULT (datetime('now')),
|
| 535 |
+
paper_ids_hash TEXT NOT NULL, -- sha256(sorted(paper_ids))[:16]
|
| 536 |
+
PRIMARY KEY (user_id, snapshot_id, cluster_idx)
|
| 537 |
+
);
|
| 538 |
+
CREATE INDEX IF NOT EXISTS idx_snap_user_date ON cluster_snapshots(user_id, snapshot_date DESC);
|
| 539 |
+
CREATE INDEX IF NOT EXISTS idx_snap_hash ON cluster_snapshots(paper_ids_hash);
|
| 540 |
+
```
|
| 541 |
+
|
| 542 |
+
`paper_ids_hash` is the content-addressing key — Phase 8a will use this to dedupe LLM-summary generation across users. If two different users have a cluster with identical paper sets, they share one cached summary. The 16-character truncation is enough entropy at our scale (low birthday-collision risk for <100M clusters).
|
| 543 |
+
|
| 544 |
+
#### Write side
|
| 545 |
+
|
| 546 |
+
Add a new function in `db.py`:
|
| 547 |
+
|
| 548 |
+
```python
|
| 549 |
+
import json
|
| 550 |
+
import hashlib
|
| 551 |
+
import uuid
|
| 552 |
+
|
| 553 |
+
async def save_cluster_snapshot(user_id: str, clusters: list[dict]) -> str:
|
| 554 |
+
"""Append a new snapshot. Returns the snapshot_id (one per recluster event)."""
|
| 555 |
+
snapshot_id = str(uuid.uuid4())
|
| 556 |
+
async with aiosqlite.connect(DB_PATH) as conn:
|
| 557 |
+
for c in clusters:
|
| 558 |
+
paper_ids = json.loads(c["paper_ids"]) if isinstance(c["paper_ids"], str) else c["paper_ids"]
|
| 559 |
+
paper_ids_hash = hashlib.sha256(
|
| 560 |
+
json.dumps(sorted(paper_ids)).encode()
|
| 561 |
+
).hexdigest()[:16]
|
| 562 |
+
await conn.execute(
|
| 563 |
+
"""INSERT INTO cluster_snapshots
|
| 564 |
+
(user_id, snapshot_id, cluster_idx, medoid_paper_id,
|
| 565 |
+
importance, paper_ids, medoid_embedding_blob, paper_ids_hash)
|
| 566 |
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?)""",
|
| 567 |
+
(user_id, snapshot_id, c["cluster_idx"], c["medoid_paper_id"],
|
| 568 |
+
c["importance"], json.dumps(paper_ids),
|
| 569 |
+
c.get("medoid_embedding_blob"), paper_ids_hash),
|
| 570 |
+
)
|
| 571 |
+
await conn.commit()
|
| 572 |
+
return snapshot_id
|
| 573 |
+
```
|
| 574 |
+
|
| 575 |
+
In `recommendations.py`, **after** `save_clusters_to_db(user_id, clusters)` (the existing call that maintains the "current state" view), add:
|
| 576 |
+
|
| 577 |
+
```python
|
| 578 |
+
snapshot_id = await db.save_cluster_snapshot(user_id, [
|
| 579 |
+
{
|
| 580 |
+
"cluster_idx": c.cluster_idx,
|
| 581 |
+
"medoid_paper_id": c.medoid_paper_id,
|
| 582 |
+
"importance": c.importance,
|
| 583 |
+
"paper_ids": json.dumps(c.paper_ids),
|
| 584 |
+
"medoid_embedding_blob": c.medoid_embedding.astype(np.float32).tobytes(),
|
| 585 |
+
}
|
| 586 |
+
for c in clusters
|
| 587 |
+
])
|
| 588 |
+
```
|
| 589 |
+
|
| 590 |
+
Crucially: keep `save_clusters_to_db` doing exactly what it does today. `cluster_snapshots` is **purely additive history** — current-state queries still hit `user_clusters`, retrospective queries hit `cluster_snapshots`. No existing code path changes behaviour.
|
| 591 |
+
|
| 592 |
+
#### Retention policy
|
| 593 |
+
|
| 594 |
+
A nightly cleanup keeps the last 30 days per user (anything older is unlikely to be useful for debugging and bloats the snapshots table without bound):
|
| 595 |
+
|
| 596 |
+
```python
|
| 597 |
+
async def prune_old_snapshots(retention_days: int = 30):
|
| 598 |
+
async with aiosqlite.connect(DB_PATH) as conn:
|
| 599 |
+
await conn.execute(
|
| 600 |
+
"DELETE FROM cluster_snapshots WHERE snapshot_date < datetime('now', ?)",
|
| 601 |
+
(f"-{retention_days} days",),
|
| 602 |
+
)
|
| 603 |
+
await conn.commit()
|
| 604 |
+
```
|
| 605 |
+
|
| 606 |
+
For now, call it on startup (FastAPI lifespan handler). In Phase 7 you'll add a proper APScheduler cron.
|
| 607 |
+
|
| 608 |
+
**Tests:**
|
| 609 |
+
|
| 610 |
+
```python
|
| 611 |
+
async def test_snapshot_appended_on_each_recluster():
|
| 612 |
+
"""Two reclusters of the same user should produce two distinct snapshot_ids
|
| 613 |
+
and 2N rows in cluster_snapshots (where N = number of clusters)."""
|
| 614 |
+
user_id = "test-user"
|
| 615 |
+
clusters_v1 = [_make_cluster(idx=0, papers=["a", "b"])]
|
| 616 |
+
clusters_v2 = [_make_cluster(idx=0, papers=["a", "b", "c"])]
|
| 617 |
+
sid1 = await db.save_cluster_snapshot(user_id, clusters_v1)
|
| 618 |
+
sid2 = await db.save_cluster_snapshot(user_id, clusters_v2)
|
| 619 |
+
assert sid1 != sid2
|
| 620 |
+
rows = await db.fetch_all(
|
| 621 |
+
"SELECT snapshot_id, paper_ids_hash FROM cluster_snapshots WHERE user_id = ? ORDER BY snapshot_date",
|
| 622 |
+
user_id,
|
| 623 |
+
)
|
| 624 |
+
assert len(rows) == 2
|
| 625 |
+
assert rows[0]["paper_ids_hash"] != rows[1]["paper_ids_hash"] # content-addressed
|
| 626 |
+
|
| 627 |
+
async def test_prune_respects_retention():
|
| 628 |
+
"""Snapshots older than retention_days should be deleted; newer ones kept."""
|
| 629 |
+
# Insert one snapshot dated 45 days ago, one dated 5 days ago
|
| 630 |
+
# Run prune_old_snapshots(retention_days=30)
|
| 631 |
+
# Assert only the recent one remains
|
| 632 |
+
```
|
| 633 |
+
|
| 634 |
+
**Estimated effort:** 6 hours.
|
| 635 |
+
|
| 636 |
+
---
|
| 637 |
+
|
| 638 |
+
## 5. Bucket 3 — Phase 5.1: Cold-Start Completion (Day 5, parallel)
|
| 639 |
+
|
| 640 |
+
This sits **outside Phase 6.5** but ships as part of the same 5-day push. Single identity: **complete the Layer 2 of Phase 5's three-layer onboarding plan that was deferred at the time.** Original Phase 5 plan called for: (Layer 1) category selection, (Layer 2) author-paper import, (Layer 3) seed paper search. Layer 2 was cut for time. This is it.
|
| 641 |
+
|
| 642 |
+
### 5.1 — B4: Semantic Scholar Author Import
|
| 643 |
+
|
| 644 |
+
**The user-visible win.** Before B4: a new user lands on `/onboarding`, picks 3 categories, then has to manually search for and save 5 seed papers — friction that bleeds users at the conversion step. After B4: paste your S2 author URL, the system pulls your authored papers, and you have 20 implicit "saves" instantly. First feed is genuinely personalized within seconds of arrival.
|
| 645 |
+
|
| 646 |
+
This is also the only piece of work in the 5-day push that touches user experience directly. The other four days are all infrastructure. It's worth shipping in the same window so the user-facing improvement masks the otherwise-invisible plumbing changes.
|
| 647 |
+
|
| 648 |
+
#### S2 API endpoint
|
| 649 |
+
|
| 650 |
+
```
|
| 651 |
+
GET https://api.semanticscholar.org/graph/v1/author/{author_id}/papers
|
| 652 |
+
?fields=externalIds,title,year,citationCount
|
| 653 |
+
&limit=100
|
| 654 |
+
```
|
| 655 |
+
|
| 656 |
+
`externalIds.ArXiv` gives you the arXiv ID directly — no DOI translation needed. `S2_API_KEY` env var already exists (it's used in Phase 6 reranker training scripts).
|
| 657 |
+
|
| 658 |
+
#### The flow
|
| 659 |
+
|
| 660 |
+
**1. New onboarding step** (insert between "categories" and "seed papers" in the existing onboarding wizard):
|
| 661 |
+
|
| 662 |
+
```
|
| 663 |
+
Step 2 of 3: Import your work (optional)
|
| 664 |
+
|
| 665 |
+
[ Paste your Semantic Scholar profile URL or ORCID ]
|
| 666 |
+
[ Import ]
|
| 667 |
+
|
| 668 |
+
[ Skip — I'll search for seed papers manually ]
|
| 669 |
+
```
|
| 670 |
+
|
| 671 |
+
**2. New service file** `app/s2_svc.py`:
|
| 672 |
+
|
| 673 |
+
```python
|
| 674 |
+
"""Semantic Scholar API client for author paper import."""
|
| 675 |
+
import re
|
| 676 |
+
import httpx
|
| 677 |
+
from app import config
|
| 678 |
+
|
| 679 |
+
S2_BASE = "https://api.semanticscholar.org/graph/v1"
|
| 680 |
+
|
| 681 |
+
|
| 682 |
+
def parse_author_input(text: str) -> str | None:
|
| 683 |
+
"""Accept S2 URL, raw S2 ID, or ORCID. Return S2 author ID or None."""
|
| 684 |
+
text = text.strip()
|
| 685 |
+
# S2 URL: https://www.semanticscholar.org/author/Name/12345678
|
| 686 |
+
m = re.search(r"semanticscholar\.org/author/[^/]+/(\d+)", text)
|
| 687 |
+
if m:
|
| 688 |
+
return m.group(1)
|
| 689 |
+
# Raw S2 ID
|
| 690 |
+
if text.isdigit():
|
| 691 |
+
return text
|
| 692 |
+
# ORCID: 0000-0002-1825-0097
|
| 693 |
+
if re.match(r"^\d{4}-\d{4}-\d{4}-\d{3}[\dX]$", text):
|
| 694 |
+
# Resolve ORCID → S2 ID via S2's author search
|
| 695 |
+
return None # caller should call resolve_orcid()
|
| 696 |
+
return None
|
| 697 |
+
|
| 698 |
+
|
| 699 |
+
async def resolve_orcid(orcid: str) -> str | None:
|
| 700 |
+
"""Resolve ORCID β S2 author ID via S2's author search."""
|
| 701 |
+
headers = {"x-api-key": config.S2_API_KEY} if config.S2_API_KEY else {}
|
| 702 |
+
async with httpx.AsyncClient(timeout=10) as client:
|
| 703 |
+
resp = await client.get(
|
| 704 |
+
f"{S2_BASE}/author/search",
|
| 705 |
+
params={"query": f"ORCID:{orcid}", "limit": 1, "fields": "authorId"},
|
| 706 |
+
headers=headers,
|
| 707 |
+
)
|
| 708 |
+
resp.raise_for_status()
|
| 709 |
+
data = resp.json().get("data", [])
|
| 710 |
+
return data[0]["authorId"] if data else None
|
| 711 |
+
|
| 712 |
+
|
| 713 |
+
async def fetch_author_arxiv_papers(author_id: str, limit: int = 50) -> list[str]:
|
| 714 |
+
"""Return arxiv_ids of papers authored by this S2 author, most-recent first."""
|
| 715 |
+
headers = {"x-api-key": config.S2_API_KEY} if config.S2_API_KEY else {}
|
| 716 |
+
async with httpx.AsyncClient(timeout=15) as client:
|
| 717 |
+
resp = await client.get(
|
| 718 |
+
f"{S2_BASE}/author/{author_id}/papers",
|
| 719 |
+
params={"fields": "externalIds,year", "limit": limit},
|
| 720 |
+
headers=headers,
|
| 721 |
+
)
|
| 722 |
+
resp.raise_for_status()
|
| 723 |
+
data = resp.json()
|
| 724 |
+
arxiv_ids = []
|
| 725 |
+
# Sort by year descending so we keep most-recent papers if we hit limit
|
| 726 |
+
papers = sorted(
|
| 727 |
+
data.get("data", []),
|
| 728 |
+
key=lambda p: p.get("year") or 0,
|
| 729 |
+
reverse=True,
|
| 730 |
+
)
|
| 731 |
+
for paper in papers:
|
| 732 |
+
ext = paper.get("externalIds") or {}
|
| 733 |
+
if arxiv_id := ext.get("ArXiv"):
|
| 734 |
+
arxiv_ids.append(str(arxiv_id)) # CLAUDE.md rule: arxiv_ids always strings
|
| 735 |
+
return arxiv_ids
|
| 736 |
+
```
|
| 737 |
+
|
| 738 |
+
**3. New router endpoint** in `app/routers/onboarding.py`:
|
| 739 |
+
|
| 740 |
+
```python
|
| 741 |
+
@router.post("/api/onboarding/import-author", response_class=HTMLResponse)
|
| 742 |
+
async def import_author(
|
| 743 |
+
request: Request,
|
| 744 |
+
author_input: str = Form(...),
|
| 745 |
+
user_id: str | None = Cookie(default=None, alias=COOKIE_NAME),
|
| 746 |
+
):
|
| 747 |
+
user_id = user_id or str(uuid.uuid4())
|
| 748 |
+
|
| 749 |
+
# Parse: accept S2 URL, S2 ID, or ORCID
|
| 750 |
+
s2_author_id = s2_svc.parse_author_input(author_input)
|
| 751 |
+
if not s2_author_id:
|
| 752 |
+
# Try ORCID resolution
|
| 753 |
+
if re.match(r"^\d{4}-\d{4}-\d{4}-\d{3}[\dX]$", author_input.strip()):
|
| 754 |
+
s2_author_id = await s2_svc.resolve_orcid(author_input.strip())
|
| 755 |
+
if not s2_author_id:
|
| 756 |
+
return templates.TemplateResponse(
|
| 757 |
+
request, "partials/import_error.html",
|
| 758 |
+
{"error": "Could not parse input. Try a Semantic Scholar URL or ORCID."},
|
| 759 |
+
status_code=400,
|
| 760 |
+
)
|
| 761 |
+
|
| 762 |
+
# Fetch from S2 with timeout + graceful fallback
|
| 763 |
+
try:
|
| 764 |
+
arxiv_ids = await s2_svc.fetch_author_arxiv_papers(s2_author_id, limit=50)
|
| 765 |
+
except httpx.HTTPError as e:
|
| 766 |
+
log.warning("s2 author fetch failed: %s", e)
|
| 767 |
+
return templates.TemplateResponse(
|
| 768 |
+
request, "partials/import_error.html",
|
| 769 |
+
{"error": "Semantic Scholar is temporarily unavailable. Try seed search instead."},
|
| 770 |
+
status_code=503,
|
| 771 |
+
)
|
| 772 |
+
|
| 773 |
+
if not arxiv_ids:
|
| 774 |
+
return templates.TemplateResponse(
|
| 775 |
+
request, "partials/import_error.html",
|
| 776 |
+
{"error": "No arXiv papers found for this author. Try seed search instead."},
|
| 777 |
+
)
|
| 778 |
+
|
| 779 |
+
# Save each as a seed (triggers EWMA, clustering on next request)
|
| 780 |
+
saved_count = 0
|
| 781 |
+
for aid in arxiv_ids:
|
| 782 |
+
await db.log_interaction(
|
| 783 |
+
user_id=user_id,
|
| 784 |
+
paper_id=aid,
|
| 785 |
+
event_type="save",
|
| 786 |
+
source="onboarding_author_import",
|
| 787 |
+
)
|
| 788 |
+
us.record_positive(user_id, aid)
|
| 789 |
+
# Background: fetch vector + update EWMA (don't block the response)
|
| 790 |
+
asyncio.create_task(_update_profile_on_save(user_id, aid))
|
| 791 |
+
saved_count += 1
|
| 792 |
+
|
| 793 |
+
response = templates.TemplateResponse(
|
| 794 |
+
request, "partials/import_success.html",
|
| 795 |
+
{"saved_count": saved_count, "next_step": "seed_search"},
|
| 796 |
+
)
|
| 797 |
+
response.set_cookie(COOKIE_NAME, user_id, max_age=COOKIE_MAX_AGE)
|
| 798 |
+
return response
|
| 799 |
+
```
|
| 800 |
+
|
| 801 |
+
**4. Tag the imports specially** — `source="onboarding_author_import"` distinguishes these from normal saves and from `source="onboarding_seed_search"`. Phase 7 evaluation can then ask: *"Do users who used author-import have higher week-1 retention than users who used only seed search?"*
|
| 802 |
+
|
| 803 |
+
#### Edge cases
|
| 804 |
+
|
| 805 |
+
| Case | Solution |
|
| 806 |
+
| ------------------------------------------------------------ | ------------------------------------------------------------------------------------- |
|
| 807 |
+
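| Author has 200 papers                                        | Cap at 50 (plenty for clustering). Note: `limit=50` is applied by the S2 API *before* the local year sort, so to guarantee the 50 most-recent papers, fetch a larger page, sort by year, then truncate |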
|
| 808 |
+
| Author has 0 arXiv papers (e.g. pure CS-conference profile)  | Show "No arXiv papers found — try seed search instead"                                |
|
| 809 |
+
| User pastes ORCID instead of S2 URL | Resolved via S2's author search by ORCID |
|
| 810 |
+
| User pastes a paper URL by mistake                           | `parse_author_input` returns None → friendly error                                    |
|
| 811 |
+
| S2 API rate limit hit                                        | Graceful 503 → fall back to manual seed search                                        |
|
| 812 |
+
| User imports, then dislikes everything                       | Negative EWMA self-corrects within 5–10 dismissals                                    |
|
| 813 |
+
| User has S2 ID but multiple disambiguated profiles           | Out of scope — they pick the right one when copying their URL                         |
|
| 814 |
+
|
| 815 |
+
**Test:**
|
| 816 |
+
|
| 817 |
+
```python
|
| 818 |
+
async def test_s2_import_saves_papers_with_correct_source_tag():
|
| 819 |
+
# Mock fetch_author_arxiv_papers to return ["2401.001", "2401.002"]
|
| 820 |
+
# POST /api/onboarding/import-author with a fake S2 URL
|
| 821 |
+
rows = await db.fetch_all(
|
| 822 |
+
"SELECT paper_id, source FROM interactions WHERE user_id = ?", user_id,
|
| 823 |
+
)
|
| 824 |
+
assert {r["paper_id"] for r in rows} == {"2401.001", "2401.002"}
|
| 825 |
+
assert all(r["source"] == "onboarding_author_import" for r in rows)
|
| 826 |
+
```
|
| 827 |
+
|
| 828 |
+
**Estimated effort:** 5 hours.
|
| 829 |
+
|
| 830 |
+
---
|
| 831 |
+
|
| 832 |
+
## 6. What Phase 7 inherits
|
| 833 |
+
|
| 834 |
+
After these 5 days, Phase 7 starts on a substrate where every prerequisite is already in production:
|
| 835 |
+
|
| 836 |
+
| Capability | Before this push | After this push |
|
| 837 |
+
| --------------------------------------- | ----------------------------------------- | ------------------------------------------------ |
|
| 838 |
+
| Feature 0 in LightGBM                   | ❌ rank-proxy lie                          | ✅ actual cosine                                  |
|
| 839 |
+
| Production model verified live          | ❌ unverified                              | ✅ green checkmark with timestamp                 |
|
| 840 |
+
| Per-feed CTR measurable                 | ❌ no `query_id`                           | ✅ one SQL query away                             |
|
| 841 |
+
| Counterfactual eval (SNIPS) possible    | ❌ no propensity                           | ✅ schema ready, propensities flowing             |
|
| 842 |
+
| Cluster history queryable               | ❌ destroyed on each recluster             | ✅ 30 days kept, content-addressed                |
|
| 843 |
+
| Cold-start onboarding                   | ❌ manual 5-paper search only              | ✅ paste S2 URL → 20 implicit saves               |
|
| 844 |
+
|
| 845 |
+
Phase 7's evaluation framework now has a real substrate. Without these, Phase 7 would have to spend its first week building this infrastructure anyway β better to do it deliberately as a pre-Phase-7 push than under deadline pressure.
|
| 846 |
+
|
| 847 |
+
---
|
| 848 |
+
|
| 849 |
+
## 7. Acceptance criteria
|
| 850 |
+
|
| 851 |
+
### Bucket 1 — Phase 6 Hot-fix done when:
|
| 852 |
+
|
| 853 |
+
- [ ] `qdrant_score_map` is populated from `search_by_vector_with_scores()` in both the per-cluster path and the short-term supplement path
|
| 854 |
+
- [ ] Synthetic-decay block (current `recommendations.py:313–325`) is deleted
|
| 855 |
+
- [ ] `test_qdrant_scores_are_real_cosines_not_rank_proxies` passes
|
| 856 |
+
- [ ] `curl https://siddhm11-researchit.hf.space/healthz/reranker` returns `model_loaded: true, n_trees: 141, fallback_active: false`
|
| 857 |
+
- [ ] PHASE6-Reranker-Framing.md updated with verification timestamp
|
| 858 |
+
|
| 859 |
+
### Bucket 2 — Phase 6.5 done when:
|
| 860 |
+
|
| 861 |
+
- [ ] `query_id` is generated per request in `recommendations.py` and `search.py` and round-trips through templates → events → DB
|
| 862 |
+
- [ ] `interactions` table has `propensity REAL` and `policy_id TEXT` columns
|
| 863 |
+
- [ ] Every interaction logged from a recommendation/search request has non-null `propensity` and `policy_id`
|
| 864 |
+
- [ ] `cluster_snapshots` table exists with the schema in §4.3
|
| 865 |
+
- [ ] Every recluster appends a new snapshot (verified by `test_snapshot_appended_on_each_recluster`)
|
| 866 |
+
- [ ] `prune_old_snapshots(retention_days=30)` is registered in the FastAPI lifespan handler
|
| 867 |
+
- [ ] All new tests pass; total test count in `README.md` updated
|
| 868 |
+
|
| 869 |
+
### Bucket 3 — Phase 5.1 done when:
|
| 870 |
+
|
| 871 |
+
- [ ] `app/s2_svc.py` exists and `fetch_author_arxiv_papers` returns arxiv IDs (verified against a real S2 author profile)
|
| 872 |
+
- [ ] `/api/onboarding/import-author` accepts S2 URL, S2 ID, and ORCID input forms
|
| 873 |
+
- [ ] Imported papers are saved with `source="onboarding_author_import"`
|
| 874 |
+
- [ ] Background EWMA update fires for each imported paper
|
| 875 |
+
- [ ] All 6 edge cases in §5.1 are handled with graceful UX
|
| 876 |
+
|
| 877 |
+
---
|
| 878 |
+
|
| 879 |
+
## 8. Sequencing & timeline
|
| 880 |
+
|
| 881 |
+
### Recommended order
|
| 882 |
+
|
| 883 |
+
```
|
| 884 |
+
Day 1 (~3h) Bucket 1: A1 (real Qdrant scores) + A2 (curl /healthz)
|
| 885 |
+
Day 2 (~3h) Bucket 2.B1: query_id linkage
|
| 886 |
+
Day 3 (~4h) Bucket 2.B2: propensity logging
|
| 887 |
+
Day 4 (~6h) Bucket 2.B3: cluster snapshot versioning
|
| 888 |
+
Day 5 (~5h) Bucket 3.B4: S2 author import
|
| 889 |
+
```
|
| 890 |
+
|
| 891 |
+
Each day leaves the app in a working state. No big-bang refactors. No day depends on a later day's work.
|
| 892 |
+
|
| 893 |
+
### Parallelization options
|
| 894 |
+
|
| 895 |
+
If you have stretches where you want to context-switch:
|
| 896 |
+
|
| 897 |
+
- **Day 5 (B4) can run anytime** — it's onboarding code, doesn't touch the recommendation pipeline or schema. Could ship before Day 1 if you want a user-visible win first.
|
| 898 |
+
- **Day 1 should land before Days 2–4** — once `query_id` and `propensity` start flowing, you want feature 0 to already be real cosines so your first logged interactions are clean training data for any future retrain.
|
| 899 |
+
- **Days 2–4 should ship as a block** — the three pieces compound. Shipping B1 without B2 means logs have feed identity but no eval lever; shipping B2 without B1 means propensities can't be grouped by feed; shipping B3 without either means snapshots exist but you can't correlate them to actions.
|
| 900 |
+
|
| 901 |
+
### What this defers (intentionally)
|
| 902 |
+
|
| 903 |
+
| Item | Why deferred |
|
| 904 |
+
| ----------------------------------------------------- | --------------------------------------------------------------------------------------------- |
|
| 905 |
+
| Track C: Full ORCID/Scholar import with disambiguation | B4 captures ~80% of the value. Full version waits until there's user-data evidence it's needed. |
|
| 906 |
+
| Track D: Cluster summary cards (Phase 8a preview) | Needs Phase 7 evaluation infrastructure to measure whether it actually helps users. |
|
| 907 |
+
| Phase 6.4 reranker retraining                         | Already gated on synthetic simulator OR 100 real users with ≥10 saves each. Unchanged.        |
|
| 908 |
+
|
| 909 |
+
---
|
| 910 |
+
|
| 911 |
+
## 9. Documentation updates needed
|
| 912 |
+
|
| 913 |
+
After this push lands:
|
| 914 |
+
|
| 915 |
+
- [ ] Add line to `CLAUDE.md` non-negotiable rules: *"Rule 9: Every interaction logged from a recommendation/search request must carry `query_id`, `propensity`, and `policy_id`. These are load-bearing for Phase 7 evaluation."*
|
| 916 |
+
- [ ] Update `PHASE6-Reranker-Framing.md` Section E with the live verification timestamp
|
| 917 |
+
- [ ] Update `TASK-TRACKER.md`:
|
| 918 |
+
- Tick `[x] [reranker] LightGBM model loaded (verified live YYYY-MM-DD)`
|
| 919 |
+
- Tick `[x] [reranker] qdrant_cosine_score uses real cosines`
|
| 920 |
+
- Add new section `Phase 6.5 — Instrumentation Foundation` with checklist from §7
|
| 921 |
+
- [ ] Update `README.md` test count
|
| 922 |
+
- [ ] Update `docs/walkthroughs/04-Next-Steps-and-Phase-Plan.md`: insert Phase 6.5 between Phase 6 and Phase 7 in the master roadmap; note Phase 5.1 as a parallel side-quest
|
| 923 |
+
- [ ] Mark ADR A1 (cluster snapshot versioning) and ADR A4 (telemetry schema) as **Decided + Implemented** in the Phase 6 framing doc's ADR table
|
| 924 |
+
|
| 925 |
+
---
|
| 926 |
+
|
| 927 |
+
## 10. Out of scope (explicit)
|
| 928 |
+
|
| 929 |
+
To keep this doc focused, the following are **not** part of this push:
|
| 930 |
+
|
| 931 |
+
- Building the actual evaluation harness (offline regression, time-split eval, frozen `eval/eval_set_v1.0.parquet`, CI gates) — that's Phase 7 itself.
|
| 932 |
+
- LLM cluster summaries (Phase 8a) — depends on `paper_ids_hash` from B3, but the LLM call path itself is Phase 8.
|
| 933 |
+
- Reranker retraining (Phase 6.4) — gated on user-volume thresholds, unchanged.
|
| 934 |
+
- Google Scholar import — no public API, would need scraping. Defer until S2 import shows real adoption.
|
| 935 |
+
- Per-paper relevance dial in author import (not all of someone's authored papers represent current interest) — out of scope; let the EWMA negative path handle it organically.
|
| 936 |
+
|
| 937 |
+
---
|
| 938 |
+
|
| 939 |
+
*End of framing doc.*
|
|
@@ -231,20 +231,29 @@ def test_quota_pipeline_preserves_minority_cluster(client, monkeypatch):
|
|
| 231 |
combined = {**saved_vectors, **candidate_vectors}
|
| 232 |
return {aid: combined[aid] for aid in ids if aid in combined}
|
| 233 |
|
| 234 |
-
#
|
| 235 |
-
# the query is closer to
|
| 236 |
-
async def
|
| 237 |
qv = np.array(query_vector, dtype=np.float32)
|
| 238 |
qv /= np.linalg.norm(qv)
|
| 239 |
if float(qv @ nlp_center) > float(qv @ rl_center):
|
| 240 |
pool = nlp_candidates
|
|
|
|
| 241 |
else:
|
| 242 |
pool = rl_candidates
|
|
|
|
| 243 |
exclude = exclude_ids or set()
|
| 244 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 245 |
|
| 246 |
monkeypatch.setattr(qs, "get_paper_vectors", fake_get_paper_vectors)
|
| 247 |
-
monkeypatch.setattr(qs, "
|
| 248 |
|
| 249 |
# Skip EWMA short-term lookup — returns None
|
| 250 |
async def fake_load_profile(uid, kind):
|
|
|
|
| 231 |
combined = {**saved_vectors, **candidate_vectors}
|
| 232 |
return {aid: combined[aid] for aid in ids if aid in combined}
|
| 233 |
|
| 234 |
+
# search_by_vector_with_scores returns candidates with cosine scores,
|
| 235 |
+
# aligned with whichever centre the query is closer to
|
| 236 |
+
async def fake_search_by_vector_with_scores(query_vector, limit, exclude_ids=None):
|
| 237 |
qv = np.array(query_vector, dtype=np.float32)
|
| 238 |
qv /= np.linalg.norm(qv)
|
| 239 |
if float(qv @ nlp_center) > float(qv @ rl_center):
|
| 240 |
pool = nlp_candidates
|
| 241 |
+
center = nlp_center
|
| 242 |
else:
|
| 243 |
pool = rl_candidates
|
| 244 |
+
center = rl_center
|
| 245 |
exclude = exclude_ids or set()
|
| 246 |
+
results = []
|
| 247 |
+
for p in pool:
|
| 248 |
+
if p not in exclude:
|
| 249 |
+
# Compute realistic cosine score
|
| 250 |
+
pv = np.array(candidate_vectors[p], dtype=np.float32)
|
| 251 |
+
score = float(qv @ pv / (np.linalg.norm(qv) * np.linalg.norm(pv) + 1e-10))
|
| 252 |
+
results.append({"arxiv_id": p, "score": score})
|
| 253 |
+
return results[:limit]
|
| 254 |
|
| 255 |
monkeypatch.setattr(qs, "get_paper_vectors", fake_get_paper_vectors)
|
| 256 |
+
monkeypatch.setattr(qs, "search_by_vector_with_scores", fake_search_by_vector_with_scores)
|
| 257 |
|
| 258 |
# Skip EWMA short-term lookup — returns None
|
| 259 |
async def fake_load_profile(uid, kind):
|