TheAarvee05 committed on
Commit
7edbc18
·
verified ·
1 Parent(s): 7b994e3

Upload meta_ads_env/grader.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. meta_ads_env/grader.py +365 -0
meta_ads_env/grader.py ADDED
@@ -0,0 +1,365 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ grader.py — Programmatic agent graders for all three tasks.
3
+
4
+ Each grader receives a completed EnvState and returns a TaskResult
5
+ with a 0.0–1.0 score, pass/fail verdict, and breakdown.
6
+
7
+ Task criteria:
8
+ EASY — attribution window changed to 7d_click (or wider). Score by gap closure.
9
+ MEDIUM — CAPI + AEM enabled. Score by signal quality achieved.
10
+ HARD — all 7 required issues resolved; score by weighted composite.
11
+ """
12
+
13
+ from __future__ import annotations
14
+ from typing import Dict, List
15
+ from pydantic import BaseModel
16
+
17
+ from meta_ads_env.models import EnvState
18
+ from meta_ads_env.reward import penalise_trajectory
19
+ from meta_ads_env.simulator import _attribution_gap, compute_pixel_quality
20
+
21
+
22
# Minimum calibrated score for a task to count as "passed".
# NOTE(review): _calibrate_score never returns less than 0.70, so every graded
# task currently clears this threshold — confirm that is intended.
PASS_THRESHOLD = 0.6
23
+
24
+
25
def _calibrate_score(raw: float, difficulty: str, near_optimal: bool) -> float:
    """Map a raw composite score onto the fixed band for ``difficulty``.

    ``raw`` is clamped to [0, 1] and scaled linearly into the band:

    * ``"easy"``            -> [0.85, 0.90]
    * ``"medium"``          -> [0.75, 0.85]
    * any other difficulty  -> [0.70, 0.80]

    ``near_optimal`` adds a 0.005 bonus before the band limits are applied.
    The result is rounded to four decimal places.
    """
    # (band floor, width applied to the clamped raw score, band ceiling)
    bands = {
        "easy": (0.85, 0.05, 0.90),
        "medium": (0.75, 0.10, 0.85),
    }
    floor, width, ceiling = bands.get(difficulty, (0.70, 0.10, 0.80))

    fraction = min(max(raw, 0.0), 1.0)
    score = floor + (width * fraction)
    if near_optimal:
        score += 0.005
    score = min(max(score, floor), ceiling)

    # Final clamp kept from the original as a guard on the [0, 1] contract.
    return round(min(max(score, 0.0), 1.0), 4)
43
+
44
+
45
def _trajectory_metrics(state: EnvState, initial_gap: float, initial_signal: float, initial_true_roas: float) -> Dict[str, float]:
    """Summarise a finished episode as metrics shared by all graders.

    Args:
        state: Final environment state.
        initial_gap: Attribution gap at episode start.
        initial_signal: Tracking reliability at episode start.
        initial_true_roas: True ROAS at episode start.

    Returns:
        A dict with ``gap_reduction``, ``signal_recovery`` and
        ``roas_improvement`` (each clamped to [0, 1]), ``efficiency``,
        ``action_efficiency``, ``redundancy_penalty`` and
        ``issues_resolved_count``. All values except the count are rounded to
        four decimals.
    """

    def _unit(value: float) -> float:
        # Clamp to [0, 1] and round for reporting.
        return round(min(max(value, 0.0), 1.0), 4)

    campaign = state.campaign

    # Fraction of the starting attribution gap that was closed.
    gap_now = _attribution_gap(campaign)
    if initial_gap > 0:
        gap_reduction = max(initial_gap - gap_now, 0) / initial_gap
    else:
        gap_reduction = 1.0

    # Fraction of the missing signal (distance to 1.0) that was recovered.
    signal_now = state.tracking_reliability
    if initial_signal < 1.0:
        signal_recovery = max(signal_now - initial_signal, 0) / max(1.0 - initial_signal, 0.01)
    else:
        signal_recovery = 1.0

    # Relative gain in true ROAS over the starting value.
    if initial_true_roas > 0:
        roas_improvement = max(campaign.true_roas - initial_true_roas, 0) / max(initial_true_roas, 0.01)
    else:
        roas_improvement = 0.0

    step_budget = max(state.max_steps, 1)
    efficiency = max(0.0, 1.0 - (state.step_count / step_budget))
    # Only steps beyond the optimal-steps hint reduce action efficiency.
    extra_steps = max(state.step_count - state.optimal_steps_hint, 0)
    action_efficiency = 1.0 - min(extra_steps / step_budget, 1.0)
    # The trajectory penalty is negated and floored at zero to give a magnitude.
    redundancy_penalty = max(-penalise_trajectory(state.history), 0.0)

    return {
        "gap_reduction": _unit(gap_reduction),
        "signal_recovery": _unit(signal_recovery),
        "roas_improvement": _unit(roas_improvement),
        "efficiency": round(efficiency, 4),
        "action_efficiency": round(action_efficiency, 4),
        "redundancy_penalty": round(redundancy_penalty, 4),
        "issues_resolved_count": float(len(set(state.issues_resolved))),
    }
77
+
78
+
79
# Result record returned by every grader in this module.
class TaskResult(BaseModel):
    # Identity of the graded task, copied from the environment state.
    task_id: str
    difficulty: str

    # Calibrated score (0.0 – 1.0) and whether it reached PASS_THRESHOLD.
    score: float
    passed: bool

    # Named sub-scores and human-readable notes explaining the verdict.
    breakdown: Dict[str, float]
    feedback: List[str]

    # Episode bookkeeping copied from the environment state.
    steps_used: int
    cumulative_reward: float
88
+
89
+
90
+ # ─────────────────────────────────────────────────────────────────────────────
91
+ # EASY grader
92
+ # ─────────────────────────────────────────────────────────────────────────────
93
+
94
def grade_easy(state: EnvState, initial_gap: float = 0.62) -> TaskResult:
    """Grade the easy task: widen the attribution window.

    The raw score weights the window check (0.50), attribution-gap reduction
    (0.30), signal recovery (0.05) and action efficiency (0.15), minus 0.10
    times the redundancy penalty. It is then calibrated into the "easy" band.

    Args:
        state: Final environment state of the episode.
        initial_gap: Attribution gap at episode start.

    Returns:
        A ``TaskResult`` with the calibrated score, breakdown and feedback.
    """
    campaign = state.campaign
    notes: List[str] = []

    # Primary criterion: attribution window is 7d_click or wider.
    accepted_windows = {"7d_click", "7d_click_1d_view", "28d_click"}
    window_ok = campaign.attribution_window in accepted_windows
    window_score = 1.0 if window_ok else 0.0
    notes.append(
        f"βœ… Attribution window correctly set to '{campaign.attribution_window}'"
        if window_ok
        else f"❌ Attribution window still '{campaign.attribution_window}' β€” should be 7d_click or wider"
    )

    # Use the first recorded signal-quality sample as baseline when available.
    signal_history = state.signal_quality_history
    baseline_signal = signal_history[0] if signal_history else state.tracking_reliability
    metrics = _trajectory_metrics(
        state,
        initial_gap=initial_gap,
        initial_signal=baseline_signal,
        initial_true_roas=state.campaign.true_roas,
    )

    gap_closed = metrics["gap_reduction"]
    notes.append(
        f"βœ… Attribution gap reduced by {gap_closed:.0%}"
        if gap_closed >= 0.50
        else f"⚠️ Attribution gap only reduced by {gap_closed:.0%}"
    )

    # Efficiency is reported in the breakdown but is not part of the score.
    efficiency = metrics["efficiency"]
    notes.append(f"ℹ️ Completed in {state.step_count}/{state.max_steps} steps")

    weighted = (
        (window_score * 0.50)
        + (gap_closed * 0.30)
        + (metrics["signal_recovery"] * 0.05)
        + (metrics["action_efficiency"] * 0.15)
        - (metrics["redundancy_penalty"] * 0.10)
    )
    raw_score = round(max(weighted, 0.0), 4)

    near_optimal = window_ok and gap_closed >= 0.75 and metrics["redundancy_penalty"] <= 0.08
    score = _calibrate_score(raw_score, "easy", near_optimal)

    breakdown = {
        "window_correct": window_score,
        "gap_closed": round(gap_closed, 4),
        "efficiency": round(efficiency, 4),
        "signal_recovery": metrics["signal_recovery"],
        "action_efficiency": metrics["action_efficiency"],
        "redundant_action_penalty": metrics["redundancy_penalty"],
        "issues_resolved_count": metrics["issues_resolved_count"],
    }
    return TaskResult(
        task_id=state.task_id,
        difficulty=state.difficulty,
        score=score,
        passed=score >= PASS_THRESHOLD,
        breakdown=breakdown,
        feedback=notes,
        steps_used=state.step_count,
        cumulative_reward=state.cumulative_reward,
    )
155
+
156
+
157
+ # ─────────────────────────────────────────────────────────────────────────────
158
+ # MEDIUM grader
159
+ # ─────────────────────────────────────────────────────────────────────────────
160
+
161
def grade_medium(
    state: EnvState,
    initial_signal: float = 0.325,
    initial_true_roas: float | None = None,
) -> TaskResult:
    """Grade the medium task: recover pixel signal via CAPI and AEM.

    The raw score weights Conversions API (0.40), AEM (0.25), the fraction of
    recoverable signal achieved (0.25), action efficiency (0.10) and ROAS
    improvement (0.08), minus 0.08 times the redundancy penalty. It is then
    calibrated into the "medium" band.

    Args:
        state: Final environment state of the episode.
        initial_signal: Tracking reliability at episode start.
        initial_true_roas: True ROAS at episode start. When omitted, the
            final campaign ROAS is used as the baseline (the previous
            behaviour), which makes ``roas_improvement`` always 0.0; pass the
            real starting value to let the ROAS term contribute.

    Returns:
        A ``TaskResult`` with the calibrated score, breakdown and feedback.
    """
    c = state.campaign
    feedback: List[str] = []

    # Primary: CAPI enabled (biggest lever)
    capi_score = 1.0 if c.conversions_api_enabled else 0.0
    if c.conversions_api_enabled:
        feedback.append("βœ… Conversions API enabled")
    else:
        feedback.append("❌ Conversions API NOT enabled β€” biggest signal recovery lever missed")

    # Secondary: AEM enabled
    aem_score = 1.0 if c.aem_enabled else 0.0
    if c.aem_enabled:
        feedback.append("βœ… Aggregated Event Measurement enabled")
    else:
        feedback.append("⚠️ AEM not enabled β€” modelled conversions unavailable")

    # Fall back to the final ROAS only when no starting value was supplied.
    baseline_roas = c.true_roas if initial_true_roas is None else initial_true_roas

    metrics = _trajectory_metrics(
        state,
        initial_gap=state.attribution_gap_history[0] if state.attribution_gap_history else _attribution_gap(c),
        initial_signal=initial_signal,
        initial_true_roas=baseline_roas,
    )

    # Signal quality achieved, as a fraction of what was recoverable.
    achieved_signal = state.tracking_reliability
    optimal_signal = compute_pixel_quality(c.ios_traffic_pct, True, True, True)
    signal_fraction = (achieved_signal - initial_signal) / max(optimal_signal - initial_signal, 0.01)
    signal_fraction = round(min(max(signal_fraction, 0), 1), 4)
    feedback.append(
        f"ℹ️ Signal quality: {initial_signal:.0%} β†’ {achieved_signal:.0%} "
        f"(optimal: {optimal_signal:.0%})"
    )

    efficiency = metrics["efficiency"]

    # Positive weights sum to 1.08; _calibrate_score clamps the raw score to
    # [0, 1] before mapping it into the band.
    weighted = (
        capi_score * 0.40
        + aem_score * 0.25
        + signal_fraction * 0.25
        + metrics["action_efficiency"] * 0.10
        + metrics["roas_improvement"] * 0.08
        - metrics["redundancy_penalty"] * 0.08
    )
    raw_score = round(max(weighted, 0.0), 4)
    near_optimal = (capi_score == 1.0) and (aem_score == 1.0) and (signal_fraction >= 0.85)
    score = _calibrate_score(raw_score, "medium", near_optimal)

    return TaskResult(
        task_id=state.task_id,
        difficulty=state.difficulty,
        score=score,
        passed=score >= PASS_THRESHOLD,
        breakdown={
            "capi_enabled": capi_score,
            "aem_enabled": aem_score,
            "signal_recovery": signal_fraction,
            "efficiency": round(efficiency, 4),
            "roas_improvement": metrics["roas_improvement"],
            "action_efficiency": metrics["action_efficiency"],
            "redundant_action_penalty": metrics["redundancy_penalty"],
            "issues_resolved_count": metrics["issues_resolved_count"],
        },
        feedback=feedback,
        steps_used=state.step_count,
        cumulative_reward=state.cumulative_reward,
    )
232
+
233
+
234
+ # ─────────────────────────────────────────────────────────────────────────────
235
+ # HARD grader
236
+ # ─────────────────────────────────────────────────────────────────────────────
237
+
238
def grade_hard(
    state: EnvState,
    initial_gap: float = 0.785,
    initial_signal: float = 0.280,
    initial_true_roas: float = 1.61,
) -> TaskResult:
    """Grade the hard task: full attribution audit.

    The raw score weights the fraction of required issues resolved (0.40),
    attribution-gap reduction (0.20), signal recovery (0.15), ROAS gain
    (0.15) and action efficiency (0.10), minus 0.10 times the redundancy
    penalty and minus a penalty for missing critical actions (paused adsets
    0.15, tracking investigated 0.07, modeled reporting 0.08). It is floored
    at zero and then calibrated into the "hard" band.

    Args:
        state: Final environment state of the episode.
        initial_gap: Attribution gap at episode start.
        initial_signal: Tracking reliability at episode start.
        initial_true_roas: True ROAS at episode start.

    Returns:
        A ``TaskResult`` with the calibrated score, per-issue checks,
        breakdown and feedback.
    """
    c = state.campaign
    feedback: List[str] = []
    issues_required = {
        "attribution_window",
        "conversions_api",
        "aem",
        "modeled_reporting",
        "tracking_investigated",
        "budget_allocation",
        "paused_bad_adsets",
    }

    reported = set(state.issues_resolved)
    # The budget check below accepts either spelling, so count both toward
    # the resolved fraction as well to keep the two views consistent.
    if "budget_reallocation" in reported:
        reported.add("budget_allocation")
    resolved = reported & issues_required

    checks: Dict[str, float] = {}

    def _mark(ok: bool) -> str:
        return "βœ…" if ok else "❌"

    # 1. Attribution window
    w_ok = c.attribution_window in {"7d_click", "7d_click_1d_view", "28d_click"}
    checks["attribution_window"] = 1.0 if w_ok else 0.0
    feedback.append(_mark(w_ok) + f" Attribution window: {c.attribution_window}")

    # 2. Conversions API
    checks["conversions_api"] = 1.0 if c.conversions_api_enabled else 0.0
    feedback.append(_mark(c.conversions_api_enabled) + " Conversions API")

    # 3. AEM
    checks["aem"] = 1.0 if c.aem_enabled else 0.0
    feedback.append(_mark(c.aem_enabled) + " AEM")

    # 4. At least one adset paused
    paused_any = any(a.is_paused for a in c.adsets)
    checks["paused_bad_adsets"] = 1.0 if paused_any else 0.0
    feedback.append(_mark(paused_any) + " Paused under-performing adsets")

    # 5. Tracking investigated
    checks["tracking_investigated"] = 1.0 if state.tracking_investigated else 0.0
    feedback.append(_mark(state.tracking_investigated) + " Tracking investigated")

    # 6. Modeled reporting
    modeled = c.attribution_reporting_mode == "modeled"
    checks["modeled_reporting"] = 1.0 if modeled else 0.0
    feedback.append(_mark(modeled) + " Modeled reporting enabled")

    # 7. Budget reallocation
    budget_reallocated = "budget_allocation" in reported
    checks["budget_allocation"] = 1.0 if budget_reallocated else 0.0
    feedback.append(_mark(budget_reallocated) + " Budget reallocated to top performers")

    metrics = _trajectory_metrics(
        state,
        initial_gap=initial_gap,
        initial_signal=initial_signal,
        initial_true_roas=initial_true_roas,
    )

    gap_closed = metrics["gap_reduction"]
    sig_recovery = metrics["signal_recovery"]
    roas_gain = metrics["roas_improvement"]

    feedback.append(
        f"ℹ️ Gap closed: {gap_closed:.0%} | Signal: {initial_signal:.0%}β†’{state.tracking_reliability:.0%} | "
        f"True ROAS: {initial_true_roas:.2f}β†’{c.true_roas:.2f}"
    )

    issues_fraction = len(resolved) / len(issues_required)
    efficiency = metrics["efficiency"]

    critical_missing_penalty = (
        (1.0 - checks["paused_bad_adsets"]) * 0.15
        + (1.0 - checks["tracking_investigated"]) * 0.07
        + (1.0 - checks["modeled_reporting"]) * 0.08
    )

    # The critical-missing penalty must be part of the sum. Passed to max()
    # as its own argument it could never lower the score, because max() would
    # simply pick the larger un-penalised composite.
    composite = (
        issues_fraction * 0.40
        + gap_closed * 0.20
        + sig_recovery * 0.15
        + roas_gain * 0.15
        + metrics["action_efficiency"] * 0.10
        - metrics["redundancy_penalty"] * 0.10
        - critical_missing_penalty
    )
    raw_score = round(max(composite, 0.0), 4)
    near_optimal = (issues_fraction >= 0.90) and (metrics["redundancy_penalty"] <= 0.08)
    score = _calibrate_score(raw_score, "hard", near_optimal)

    return TaskResult(
        task_id=state.task_id,
        difficulty=state.difficulty,
        score=score,
        passed=score >= PASS_THRESHOLD,
        breakdown={
            **{f"issue_{k}": v for k, v in checks.items()},
            "issues_fraction": round(issues_fraction, 4),
            "gap_closed": round(gap_closed, 4),
            "signal_recovery": round(sig_recovery, 4),
            "roas_gain": round(roas_gain, 4),
            "efficiency": round(efficiency, 4),
            "action_efficiency": metrics["action_efficiency"],
            "redundant_action_penalty": metrics["redundancy_penalty"],
            "critical_missing_penalty": round(critical_missing_penalty, 4),
            "issues_resolved_count": metrics["issues_resolved_count"],
        },
        feedback=feedback,
        steps_used=state.step_count,
        cumulative_reward=state.cumulative_reward,
    )
350
+
351
+
352
+ # ─── Dispatcher ──────────────────────────────────────────────────────────────
353
+
354
# Maps each task id to the grader function that scores it.
GRADERS = {
    "easy_attribution_window": grade_easy,
    "medium_pixel_recovery": grade_medium,
    "hard_full_attribution_audit": grade_hard,
}
359
+
360
+
361
def grade(state: EnvState, **kwargs) -> TaskResult:
    """Grade ``state`` with the grader registered for ``state.task_id``.

    Extra keyword arguments are forwarded unchanged to the selected grader.

    Raises:
        ValueError: If no grader is registered for the task id.
    """
    task_id = state.task_id
    grader_fn = GRADERS.get(task_id)
    if grader_fn is not None:
        return grader_fn(state, **kwargs)
    raise ValueError(f"No grader for task '{state.task_id}'")