siddhm11 committed on
Commit
02df9f5
·
1 Parent(s): 4bb3d95

Phase 6.5 Day 3: Propensity logging (B2)

Browse files

db.py:
- _MIGRATION_6_5: add propensity REAL + policy_id TEXT columns to interactions
- init_db(): runs the new migration on startup
- log_interaction(): extended with propensity + policy_id params

recommendations.py:
- Tier 1: compute explore_propensity = n_actual_explore / explore_pool_size
  MMR-selected papers (deterministic) get propensity=1.0
  Exploration papers (randomly sampled) get propensity = explore_propensity
- Tiers 2/3/trending: deterministic, all get propensity=1.0
- Rendering loop: embed propensity + policy_id in each paper dict

search.py:
- Add propensity=1.0 + policy_id='search_v1' to all search results

events.py:
- Add propensity + policy_id Form params to both save and not-interested
- Forward to db.log_interaction()

action_buttons.html:
- Add propensity + policy_id to all hx-vals JSON blobs

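This enables Phase 7 SNIPS counterfactual evaluation of a candidate policy pi_B:
SNIPS(pi_B) = sum_i(r_i * pi_B(i) / pi_A(i)) / sum_i(pi_B(i) / pi_A(i))
where r_i is the logged reward for interaction i, pi_B(i) is the probability the
candidate policy would have shown that paper, and pi_A(i) is the stored
propensity from the logging policy.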

Tests: 203 passed, 0 failures

app/db.py CHANGED
@@ -104,6 +104,12 @@ _MIGRATION_6_3 = [
104
  "ALTER TABLE user_clusters ADD COLUMN medoid_embedding_blob BLOB",
105
  ]
106
 
 
 
 
 
 
 
107
 
108
  async def init_db() -> None:
109
  """Create tables if they don't exist. Called once at startup."""
@@ -121,6 +127,12 @@ async def init_db() -> None:
121
  await db.execute(stmt)
122
  except Exception:
123
  pass # Column already exists — safe to ignore
 
 
 
 
 
 
124
  await db.commit()
125
 
126
 
@@ -136,15 +148,19 @@ async def log_interaction(
136
  ranker_version: str | None = None,
137
  candidate_source: str | None = None,
138
  cluster_id: int | None = None,
 
 
139
  ) -> None:
140
  async with aiosqlite.connect(DB_PATH) as db:
141
  await db.execute(
142
  """INSERT INTO interactions
143
  (user_id, paper_id, event_type, source, position, query_id,
144
- ranker_version, candidate_source, cluster_id)
145
- VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)""",
 
146
  (user_id, paper_id, event_type, source, position, query_id,
147
- ranker_version, candidate_source, cluster_id),
 
148
  )
149
  await db.commit()
150
 
 
104
  "ALTER TABLE user_clusters ADD COLUMN medoid_embedding_blob BLOB",
105
  ]
106
 
107
+ # ── Phase 6.5 B2: Propensity + policy_id for counterfactual evaluation ────────
108
+ _MIGRATION_6_5 = [
109
+ "ALTER TABLE interactions ADD COLUMN propensity REAL",
110
+ "ALTER TABLE interactions ADD COLUMN policy_id TEXT",
111
+ ]
112
+
113
 
114
  async def init_db() -> None:
115
  """Create tables if they don't exist. Called once at startup."""
 
127
  await db.execute(stmt)
128
  except Exception:
129
  pass # Column already exists — safe to ignore
130
+ # Phase 6.5 B2: add propensity + policy_id for SNIPS evaluation
131
+ for stmt in _MIGRATION_6_5:
132
+ try:
133
+ await db.execute(stmt)
134
+ except Exception:
135
+ pass # Column already exists — safe to ignore
136
  await db.commit()
137
 
138
 
 
148
  ranker_version: str | None = None,
149
  candidate_source: str | None = None,
150
  cluster_id: int | None = None,
151
+ propensity: float | None = None,
152
+ policy_id: str | None = None,
153
  ) -> None:
154
  async with aiosqlite.connect(DB_PATH) as db:
155
  await db.execute(
156
  """INSERT INTO interactions
157
  (user_id, paper_id, event_type, source, position, query_id,
158
+ ranker_version, candidate_source, cluster_id,
159
+ propensity, policy_id)
160
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
161
  (user_id, paper_id, event_type, source, position, query_id,
162
+ ranker_version, candidate_source, cluster_id,
163
+ propensity, policy_id),
164
  )
165
  await db.commit()
166
 
app/routers/events.py CHANGED
@@ -27,6 +27,8 @@ async def save_paper(
27
  ranker_version: str = Form(default=""),
28
  candidate_source: str = Form(default=""),
29
  cluster_id: str = Form(default=""),
 
 
30
  user_id: str | None = Cookie(default=None, alias=COOKIE_NAME),
31
  ):
32
  user_id = user_id or str(uuid.uuid4())
@@ -41,6 +43,8 @@ async def save_paper(
41
  ranker_version=ranker_version or None,
42
  candidate_source=candidate_source or None,
43
  cluster_id=int(cluster_id) if cluster_id else None,
 
 
44
  )
45
 
46
  us.record_positive(user_id, paper_id)
@@ -66,6 +70,8 @@ async def not_interested(
66
  ranker_version: str = Form(default=""),
67
  candidate_source: str = Form(default=""),
68
  cluster_id: str = Form(default=""),
 
 
69
  user_id: str | None = Cookie(default=None, alias=COOKIE_NAME),
70
  ):
71
  user_id = user_id or str(uuid.uuid4())
@@ -80,6 +86,8 @@ async def not_interested(
80
  ranker_version=ranker_version or None,
81
  candidate_source=candidate_source or None,
82
  cluster_id=int(cluster_id) if cluster_id else None,
 
 
83
  )
84
 
85
  us.record_negative(user_id, paper_id)
 
27
  ranker_version: str = Form(default=""),
28
  candidate_source: str = Form(default=""),
29
  cluster_id: str = Form(default=""),
30
+ propensity: float = Form(default=0.0),
31
+ policy_id: str = Form(default=""),
32
  user_id: str | None = Cookie(default=None, alias=COOKIE_NAME),
33
  ):
34
  user_id = user_id or str(uuid.uuid4())
 
43
  ranker_version=ranker_version or None,
44
  candidate_source=candidate_source or None,
45
  cluster_id=int(cluster_id) if cluster_id else None,
46
+ propensity=propensity if propensity > 0 else None,
47
+ policy_id=policy_id or None,
48
  )
49
 
50
  us.record_positive(user_id, paper_id)
 
70
  ranker_version: str = Form(default=""),
71
  candidate_source: str = Form(default=""),
72
  cluster_id: str = Form(default=""),
73
+ propensity: float = Form(default=0.0),
74
+ policy_id: str = Form(default=""),
75
  user_id: str | None = Cookie(default=None, alias=COOKIE_NAME),
76
  ):
77
  user_id = user_id or str(uuid.uuid4())
 
86
  ranker_version=ranker_version or None,
87
  candidate_source=candidate_source or None,
88
  cluster_id=int(cluster_id) if cluster_id else None,
89
+ propensity=propensity if propensity > 0 else None,
90
+ policy_id=policy_id or None,
91
  )
92
 
93
  us.record_negative(user_id, paper_id)
app/routers/recommendations.py CHANGED
@@ -90,6 +90,8 @@ async def get_recommendations(
90
  paper["cluster_id"] = ""
91
  paper["query_id"] = query_id
92
  paper["position"] = idx
 
 
93
  papers.append(paper)
94
 
95
  r = templates.TemplateResponse(
@@ -123,6 +125,8 @@ async def get_recommendations(
123
  "candidate_source": "ewma_longterm",
124
  "cluster_id": "",
125
  "query_id": query_id,
 
 
126
  }
127
 
128
  # ── Tier 3: Qdrant Recommend API (≥1 save fallback) ───────────────────
@@ -139,6 +143,8 @@ async def get_recommendations(
139
  "candidate_source": "qdrant_recommend",
140
  "cluster_id": "",
141
  "query_id": query_id,
 
 
142
  }
143
 
144
  if not rec_arxiv_ids:
@@ -173,6 +179,9 @@ async def get_recommendations(
173
  # Phase 6.5 B1: query_id + position for per-feed CTR
174
  "query_id": tags.get("query_id", query_id),
175
  "position": idx,
 
 
 
176
  })
177
 
178
  resp = templates.TemplateResponse(
@@ -452,8 +461,17 @@ async def _multi_interest_recommend(
452
  )
453
  final = final[:limit + 2]
454
 
455
- # Phase 4.5: Build per-paper instrumentation tags
456
  exploration_set = set(final) - set(mmr_selected)
 
 
 
 
 
 
 
 
 
457
  paper_tags: dict[str, dict] = {}
458
  for aid in final:
459
  cluster_idx = paper_cluster_map.get(aid)
@@ -470,6 +488,8 @@ async def _multi_interest_recommend(
470
  "candidate_source": source,
471
  "cluster_id": str(cluster_idx) if cluster_idx is not None and cluster_idx >= 0 else "",
472
  "query_id": query_id,
 
 
473
  }
474
 
475
  return final, paper_tags
 
90
  paper["cluster_id"] = ""
91
  paper["query_id"] = query_id
92
  paper["position"] = idx
93
+ paper["propensity"] = 1.0 # deterministic
94
+ paper["policy_id"] = _RANKER_VERSION
95
  papers.append(paper)
96
 
97
  r = templates.TemplateResponse(
 
125
  "candidate_source": "ewma_longterm",
126
  "cluster_id": "",
127
  "query_id": query_id,
128
+ "propensity": 1.0,
129
+ "policy_id": _RANKER_VERSION,
130
  }
131
 
132
  # ── Tier 3: Qdrant Recommend API (≥1 save fallback) ───────────────────
 
143
  "candidate_source": "qdrant_recommend",
144
  "cluster_id": "",
145
  "query_id": query_id,
146
+ "propensity": 1.0,
147
+ "policy_id": _RANKER_VERSION,
148
  }
149
 
150
  if not rec_arxiv_ids:
 
179
  # Phase 6.5 B1: query_id + position for per-feed CTR
180
  "query_id": tags.get("query_id", query_id),
181
  "position": idx,
182
+ # Phase 6.5 B2: propensity + policy_id for counterfactual eval
183
+ "propensity": tags.get("propensity", 1.0),
184
+ "policy_id": tags.get("policy_id", _RANKER_VERSION),
185
  })
186
 
187
  resp = templates.TemplateResponse(
 
461
  )
462
  final = final[:limit + 2]
463
 
464
+ # Phase 4.5 + 6.5: Build per-paper instrumentation tags
465
  exploration_set = set(final) - set(mmr_selected)
466
+
467
+ # Phase 6.5 B2: Compute propensity for counterfactual evaluation
468
+ # MMR-selected papers are deterministic → propensity = 1.0
469
+ # Exploration papers are randomly sampled → propensity = n_actual_explore / explore_pool_size
470
+ mmr_set = set(mmr_selected)
471
+ explore_pool_size = max(1, len(reranked_ids) - len(mmr_set))
472
+ n_actual_explore = len(exploration_set)
473
+ explore_propensity = n_actual_explore / explore_pool_size if explore_pool_size > 0 else 0.0
474
+
475
  paper_tags: dict[str, dict] = {}
476
  for aid in final:
477
  cluster_idx = paper_cluster_map.get(aid)
 
488
  "candidate_source": source,
489
  "cluster_id": str(cluster_idx) if cluster_idx is not None and cluster_idx >= 0 else "",
490
  "query_id": query_id,
491
+ "propensity": explore_propensity if aid in exploration_set else 1.0,
492
+ "policy_id": _RANKER_VERSION,
493
  }
494
 
495
  return final, paper_tags
app/routers/search.py CHANGED
@@ -79,6 +79,8 @@ async def search(
79
  p["dismissed"] = p["arxiv_id"] in dismissed_ids
80
  p["query_id"] = query_id
81
  p["position"] = idx
 
 
82
 
83
  if request.headers.get("HX-Request"):
84
  resp = templates.TemplateResponse(
 
79
  p["dismissed"] = p["arxiv_id"] in dismissed_ids
80
  p["query_id"] = query_id
81
  p["position"] = idx
82
+ p["propensity"] = 1.0 # search is deterministic
83
+ p["policy_id"] = "search_v1"
84
 
85
  if request.headers.get("HX-Request"):
86
  resp = templates.TemplateResponse(
app/templates/partials/action_buttons.html CHANGED
@@ -3,7 +3,8 @@
3
  Expects: paper_id (or paper.arxiv_id), saved (bool), dismissed (bool)
4
  Optional: source ("search" | "recommendation" | "saved"), position (int)
5
  Phase 4.5: ranker_version, candidate_source, cluster_id (set by recommendations.py)
6
- Phase 6.5: query_id (per-request UUID for feed-level CTR)
 
7
  These are returned directly by the /api/papers/{id}/save endpoint
8
  so they also work as a standalone partial.
9
  #}
@@ -15,6 +16,8 @@
15
  {% set _cluster_id = paper.cluster_id | default("") if paper is defined else "" %}
16
  {% set _query_id = paper.query_id | default("") if paper is defined else "" %}
17
  {% set _position = paper.position | default(0) if paper is defined else 0 %}
 
 
18
 
19
  {% if is_saved %}
20
  <!-- Already saved — show saved state, allow unsave via not-interested -->
@@ -26,7 +29,7 @@
26
  hx-post="/api/papers/{{ pid }}/not-interested"
27
  hx-target="#paper-{{ pid }}"
28
  hx-swap="outerHTML swap:200ms"
29
- hx-vals='{"source": "{{ _source }}", "position": "{{ _position }}", "ranker_version": "{{ _ranker_version }}", "candidate_source": "{{ _candidate_source }}", "cluster_id": "{{ _cluster_id }}", "query_id": "{{ _query_id }}"}'>
30
  Remove
31
  </button>
32
  </div>
@@ -37,7 +40,7 @@
37
  hx-post="/api/papers/{{ pid }}/save"
38
  hx-target="[id='actions-{{ pid }}']"
39
  hx-swap="innerHTML"
40
- hx-vals='{"source": "{{ _source }}", "position": "{{ _position }}", "ranker_version": "{{ _ranker_version }}", "candidate_source": "{{ _candidate_source }}", "cluster_id": "{{ _cluster_id }}", "query_id": "{{ _query_id }}"}'>
41
  ⭐ Save
42
  </button>
43
  <!-- Not interested (removes the whole card) -->
@@ -45,7 +48,7 @@
45
  hx-post="/api/papers/{{ pid }}/not-interested"
46
  hx-target="#paper-{{ pid }}"
47
  hx-swap="outerHTML swap:200ms"
48
- hx-vals='{"source": "{{ _source }}", "position": "{{ _position }}", "ranker_version": "{{ _ranker_version }}", "candidate_source": "{{ _candidate_source }}", "cluster_id": "{{ _cluster_id }}", "query_id": "{{ _query_id }}"}'>
49
  ✕ Not interested
50
  </button>
51
  </div>
 
3
  Expects: paper_id (or paper.arxiv_id), saved (bool), dismissed (bool)
4
  Optional: source ("search" | "recommendation" | "saved"), position (int)
5
  Phase 4.5: ranker_version, candidate_source, cluster_id (set by recommendations.py)
6
+ Phase 6.5: query_id (per-request UUID for feed-level CTR),
7
+ propensity (probability this paper was shown), policy_id
8
  These are returned directly by the /api/papers/{id}/save endpoint
9
  so they also work as a standalone partial.
10
  #}
 
16
  {% set _cluster_id = paper.cluster_id | default("") if paper is defined else "" %}
17
  {% set _query_id = paper.query_id | default("") if paper is defined else "" %}
18
  {% set _position = paper.position | default(0) if paper is defined else 0 %}
19
+ {% set _propensity = paper.propensity | default(0) if paper is defined else 0 %}
20
+ {% set _policy_id = paper.policy_id | default("") if paper is defined else "" %}
21
 
22
  {% if is_saved %}
23
  <!-- Already saved — show saved state, allow unsave via not-interested -->
 
29
  hx-post="/api/papers/{{ pid }}/not-interested"
30
  hx-target="#paper-{{ pid }}"
31
  hx-swap="outerHTML swap:200ms"
32
+ hx-vals='{"source": "{{ _source }}", "position": "{{ _position }}", "ranker_version": "{{ _ranker_version }}", "candidate_source": "{{ _candidate_source }}", "cluster_id": "{{ _cluster_id }}", "query_id": "{{ _query_id }}", "propensity": "{{ _propensity }}", "policy_id": "{{ _policy_id }}"}'>
33
  Remove
34
  </button>
35
  </div>
 
40
  hx-post="/api/papers/{{ pid }}/save"
41
  hx-target="[id='actions-{{ pid }}']"
42
  hx-swap="innerHTML"
43
+ hx-vals='{"source": "{{ _source }}", "position": "{{ _position }}", "ranker_version": "{{ _ranker_version }}", "candidate_source": "{{ _candidate_source }}", "cluster_id": "{{ _cluster_id }}", "query_id": "{{ _query_id }}", "propensity": "{{ _propensity }}", "policy_id": "{{ _policy_id }}"}'>
44
  ⭐ Save
45
  </button>
46
  <!-- Not interested (removes the whole card) -->
 
48
  hx-post="/api/papers/{{ pid }}/not-interested"
49
  hx-target="#paper-{{ pid }}"
50
  hx-swap="outerHTML swap:200ms"
51
+ hx-vals='{"source": "{{ _source }}", "position": "{{ _position }}", "ranker_version": "{{ _ranker_version }}", "candidate_source": "{{ _candidate_source }}", "cluster_id": "{{ _cluster_id }}", "query_id": "{{ _query_id }}", "propensity": "{{ _propensity }}", "policy_id": "{{ _policy_id }}"}'>
52
  ✕ Not interested
53
  </button>
54
  </div>