TheAarvee05 committed on
Commit
bd7ae85
·
verified ·
1 Parent(s): 12829af

Upload baseline/run_baseline.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. baseline/run_baseline.py +138 -0
baseline/run_baseline.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ run_baseline.py — Reproducible baseline evaluation across all 3 tasks.
3
+
4
+ Usage:
5
+ OPENAI_API_KEY=sk-... python baseline/run_baseline.py
6
+
7
+ Produces a score table in console output.
8
+ """
9
+
10
+ from __future__ import annotations
11
+ import sys
12
+ import os
13
+
14
+ # Allow running from project root
15
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
16
+
17
+ from meta_ads_env import MetaAdsAttributionEnv
18
+ from baseline.baseline_agent import BaselineAgent
19
+
20
+
21
# Task identifiers evaluated by the baseline run, ordered easy -> hard.
# Each id must be registered with MetaAdsAttributionEnv.
TASKS = [
    "easy_attribution_window",
    "medium_pixel_recovery",
    "hard_full_attribution_audit",
]
26
+
27
+
28
+ def _format_context_for_console(raw_context: str) -> str:
29
+ """Hide verbose adset breakdown from console while keeping step/issue lines."""
30
+ marker = "\n\nAdset Performance Breakdown:"
31
+ if marker not in raw_context:
32
+ return raw_context
33
+ head, tail = raw_context.split(marker, 1)
34
+ step_marker = "\nStep "
35
+ if step_marker in tail:
36
+ tail = tail[tail.index(step_marker):]
37
+ else:
38
+ tail = ""
39
+ return head + tail
40
+
41
+
42
def run_task(task_id: str, agent: BaselineAgent, verbose: bool = True) -> dict:
    """Run a single episode of *task_id* with *agent* and return its results.

    Args:
        task_id: Registered environment task identifier (one of TASKS).
        agent: Agent exposing ``act(context) -> action``.
        verbose: When True, echo per-step actions, rewards and delay stats.

    Returns:
        A plain dict with task id, difficulty, score, pass flag, steps used,
        cumulative reward, per-criterion breakdown and grader feedback —
        suitable for aggregation into the summary table in main().
    """
    env = MetaAdsAttributionEnv(task_id=task_id)
    obs = env.reset()

    if verbose:
        print(f"\n{'='*60}")
        print(f"TASK: {task_id.upper()}")
        print(f"{'='*60}")
        print(_format_context_for_console(obs.context))
        print()

    step = 0

    while not obs.done:
        action = agent.act(obs.context)

        if verbose:
            print(f" Step {step+1}: {action.action_type} params={action.parameters}")
            print(f" Reasoning: {action.reasoning}")

        obs, reward, done, info = env.step(action)
        # (A local total-reward accumulator was removed: it was never read;
        # the grader's result.cumulative_reward is the reported value.)

        if verbose:
            print(f" Reward: {reward.total:.4f} ({reward.explanation})")
            print(f" Effects: {info['effects']}")
            print(
                " Delay Stats: "
                f"pending={obs.pending_delayed_conversions} "
                f"released_step={obs.delayed_conversion_release_events} "
                f"cumulative={obs.cumulative_delayed_conversions} "
                f"tracked={obs.tracked_conversions_accumulated} "
                f"modeled={obs.modeled_conversions_accumulated}"
            )

        step += 1
        # `done` from step() should agree with obs.done; break defensively
        # in case the returned observation lags the terminal flag.
        if done:
            break

    result = env.grade_episode()

    if verbose:
        # No placeholders here, so a plain string (not an f-string) suffices.
        print("\n── Episode Summary ──────────────────────────────")
        print(f" Score: {result.score:.4f} ({'PASS ✅' if result.passed else 'FAIL ❌'})")
        # NOTE(review): reaches into env._state (private attribute); prefer a
        # public max_steps accessor on the env if one exists — TODO confirm.
        print(f" Steps: {result.steps_used}/{env._state.max_steps}")
        print(f" Cumulative reward: {result.cumulative_reward:.4f}")
        print(" Breakdown:")
        for k, v in result.breakdown.items():
            print(f" {k}: {v}")

    return {
        "task_id": result.task_id,
        "difficulty": result.difficulty,
        "score": result.score,
        "passed": result.passed,
        "steps_used": result.steps_used,
        "cumulative_reward": result.cumulative_reward,
        "breakdown": result.breakdown,
        "feedback": result.feedback,
    }
103
+
104
+
105
def main():
    """Evaluate the baseline agent on every task and print a score table.

    The model name comes from the MODEL_NAME environment variable
    (default: Qwen/Qwen2.5-72B-Instruct). Exits with status 1 when the
    agent cannot be constructed (e.g. missing API credentials).
    """
    model = os.environ.get("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct")
    print("Meta Ads Attribution OpenEnv — Baseline Evaluation")
    print(f"Model: {model} | Tasks: 3\n")

    try:
        agent = BaselineAgent(model=model)
    except EnvironmentError as err:
        print(f"ERROR: {err}")
        sys.exit(1)

    # run_task prints its own per-episode log; we only collect summaries.
    results = [run_task(tid, agent, verbose=True) for tid in TASKS]

    # Summary table
    rule = "-" * 60
    print(f"\n{'='*60}")
    print("BASELINE RESULTS SUMMARY")
    print(f"{'='*60}")
    print(f"{'Task':<35} {'Score':>7} {'Pass':>6} {'Steps':>6}")
    print(rule)
    for entry in results:
        mark = "✅" if entry["passed"] else "❌"
        print(f"{entry['task_id']:<35} {entry['score']:>7.4f} {mark:>6} {entry['steps_used']:>6}")

    mean_score = sum(entry["score"] for entry in results) / len(results)
    print(rule)
    print(f"{'AVERAGE':<35} {mean_score:>7.4f}")
    print()


if __name__ == "__main__":
    main()