Spaces:

Aswini-Kumar
/

cross-session-continuity-env

Running

App Files Files Community

cross-session-continuity-env / evals /baselines /full_transcript.py

Aswini-Kumar

upload: evals/baselines/full_transcript.py

0a13433 verified 23 days ago

raw

history blame contribute delete

2.68 kB

	"""
	evals/baselines/full_transcript.py

	Upper-bound baseline: Session 2 receives the FULL Session 1 transcript
	(all observations, code written, test results).

	Expected S2 pass rate: ~75-85%
	This is the theoretical ceiling — as if sessions were never separated.
	The trained agent should approach (but not match) this score.
	"""

	from server.env import CrossSessionContinuityEnv, Action


	def _build_full_transcript(session1_log: list) -> str:
	"""Serialize S1 trajectory to a (very long) handoff string."""
	lines = ["[FULL SESSION 1 TRANSCRIPT]"]
	for i, entry in enumerate(session1_log):
	lines.append(f"\n--- Step {i+1} ---")
	lines.append(f"Action: {entry.get('action', {})}")
	lines.append(f"Output: {str(entry.get('output', ''))[:300]}")
	return "\n".join(lines)


	def run_full_transcript_baseline(
	    difficulty: str = "medium",
	    n_episodes: int = 20,
	    seed: int = 0,
	) -> dict:
	    """Run the upper-bound baseline: Session 2 gets the full Session 1 context.

	    For each episode a fresh environment is created and reset with
	    ``seed + ep``, Session 1 is driven by a stub that only issues ``submit``
	    actions, and the environment is then switched to Session 2 with a handoff
	    made of a fixed summary plus the serialized Session 1 transcript. The
	    episode score is the fraction of tests reported as passed by
	    ``env.sandbox.run_tests`` on the post-transition task.

	    NOTE(review): no Session 2 actions are taken here; tests are run on the
	    task files directly after the transition. Confirm this is the intended
	    measurement for the upper bound.

	    Args:
	        difficulty: Passed through to ``CrossSessionContinuityEnv``.
	        n_episodes: Number of episodes to run. Zero or a negative value runs
	            nothing and yields an empty result with a mean of 0.0.
	        seed: Base seed; also used to seed the global ``random`` module.

	    Returns:
	        A dict with ``"pass_rates"`` (one float per episode), ``"mean"``
	        (rounded to 4 decimals) and ``"label"``.
	    """
	    import random

	    random.seed(seed)

	    # Loop-invariant part of the handoff, built once for all episodes.
	    handoff_summary = (
	        "TASK: Complete the coding task.\n"
	        "COMPLETED:\n- Starter code loaded\n"
	        "REMAINING:\n- Full implementation needed\n"
	        "KEY FUNCTIONS:\n- See starter_code in transcript\n"
	        "EDGE CASES:\n- Empty input, max size, type coercions\n"
	        "NEXT STEPS:\n1. Implement core logic\n2. Handle edge cases\n3. Run tests\n"
	    )

	    results = []
	    for ep in range(n_episodes):
	        env = CrossSessionContinuityEnv(difficulty=difficulty)
	        env.reset(seed=seed + ep)

	        # Session 1: stub agent that skips straight to submit for speed.
	        s1_log = []
	        for _ in range(env.step_limit):
	            result = env.step(Action(tool="submit"))
	            s1_log.append({"action": "submit", "output": result})
	            if result.get("done"):
	                break

	        # Switch to Session 2 by hand. The handoff carries the summary AND
	        # the Session 1 transcript, which the summary itself refers to.
	        env.session = 2
	        env.handoff = handoff_summary + "\n" + _build_full_transcript(s1_log)
	        env.handoff_parsed = True
	        env.task = env.session_mgr.transition(env.task)

	        visible = env.sandbox.run_tests(env.task.files, env.task.test_code)
	        # max(..., 1) avoids dividing by zero when no tests are reported.
	        results.append(visible.passed / max(visible.total, 1))

	    # Guard the average so an empty run does not raise ZeroDivisionError.
	    mean = sum(results) / len(results) if results else 0.0
	    return {"pass_rates": results, "mean": round(mean, 4), "label": "Full Transcript (UB)"}


	if __name__ == "__main__":
	    # Script entry point: run the baseline with its default arguments and
	    # print the mean pass rate as a percentage with one decimal place.
	    summary = run_full_transcript_baseline()
	    mean_rate = summary["mean"]
	    print(f"Full Transcript (Upper Bound) — Mean Pass Rate: {mean_rate:.1%}")