| """ |
| evals/baselines/full_transcript.py |
| |
| Upper-bound baseline: Session 2 receives the FULL Session 1 transcript |
| (all observations, code written, test results). |
| |
| Expected S2 pass rate: ~75-85% |
| This is the theoretical ceiling — as if sessions were never separated. |
| The trained agent should approach (but not match) this score. |
| """ |
|
|
| from server.env import CrossSessionContinuityEnv, Action |
|
|
|
|
| def _build_full_transcript(session1_log: list) -> str: |
| """Serialize S1 trajectory to a (very long) handoff string.""" |
| lines = ["[FULL SESSION 1 TRANSCRIPT]"] |
| for i, entry in enumerate(session1_log): |
| lines.append(f"\n--- Step {i+1} ---") |
| lines.append(f"Action: {entry.get('action', {})}") |
| lines.append(f"Output: {str(entry.get('output', ''))[:300]}") |
| return "\n".join(lines) |
|
|
|
|
def run_full_transcript_baseline(difficulty: str = "medium", n_episodes: int = 20, seed: int = 0):
    """
    Upper-bound baseline: agent gets full Session 1 context.

    For each episode:
      1. A fresh ``CrossSessionContinuityEnv`` is created and reset with
         ``seed + ep``.
      2. Session 1 is driven by issuing ``submit`` actions until the step
         result reports ``done`` or ``env.step_limit`` steps have been taken.
         Every step result is recorded.
      3. The environment is switched to Session 2 by hand. The handoff is a
         fixed structured summary followed by the serialized Session 1
         transcript, and the task is passed through
         ``env.session_mgr.transition``.
      4. ``env.task.test_code`` is run against ``env.task.files`` in the
         sandbox, and the fraction of passing tests is recorded.

    Args:
        difficulty: Forwarded to ``CrossSessionContinuityEnv``.
        n_episodes: Number of episodes to run. ``0`` yields an empty result
            with a mean of ``0.0``.
        seed: Seeds the global ``random`` module and is the base for the
            per-episode reset seed.

    Returns:
        A dict with ``"pass_rates"`` (one float per episode), ``"mean"``
        (rounded to 4 decimals) and ``"label"``.
    """
    import random

    random.seed(seed)
    results = []

    for ep in range(n_episodes):
        env = CrossSessionContinuityEnv(difficulty=difficulty)
        env.reset(seed=seed + ep)

        # Session 1: submit immediately and log what the environment returns.
        s1_log = []
        for _ in range(env.step_limit):
            result = env.step(Action(tool="submit"))
            s1_log.append({"action": "submit", "output": result})
            if result.get("done"):
                break

        # Structured summary kept from the original handoff format.
        oracle_handoff = (
            "TASK: Complete the coding task.\n"
            "COMPLETED:\n- Starter code loaded\n"
            "REMAINING:\n- Full implementation needed\n"
            "KEY FUNCTIONS:\n- See starter_code in transcript\n"
            "EDGE CASES:\n- Empty input, max size, type coercions\n"
            "NEXT STEPS:\n1. Implement core logic\n2. Handle edge cases\n3. Run tests\n"
        )

        # Force the transition to Session 2. The Session 1 transcript is
        # attached so the handoff really carries the full context this
        # baseline is named for (previously the log was built and discarded).
        env.session = 2
        env.handoff = oracle_handoff + "\n" + _build_full_transcript(s1_log)
        env.handoff_parsed = True
        env.task = env.session_mgr.transition(env.task)

        visible = env.sandbox.run_tests(env.task.files, env.task.test_code)
        # max(..., 1) avoids dividing by zero when no tests were collected.
        pass_rate = visible.passed / max(visible.total, 1)
        results.append(pass_rate)

    # Guard against n_episodes == 0, which used to raise ZeroDivisionError.
    mean = sum(results) / len(results) if results else 0.0
    return {"pass_rates": results, "mean": round(mean, 4), "label": "Full Transcript (UB)"}
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run the baseline with its default settings and
    # report the mean pass rate as a percentage with one decimal place.
    summary = run_full_transcript_baseline()
    mean_pass_rate = summary["mean"]
    print(f"Full Transcript (Upper Bound) — Mean Pass Rate: {mean_pass_rate:.1%}")
|
|