from __future__ import annotations import os import sys import uuid import json import re import requests from typing import Optional, List from openai import OpenAI # ── config ──────────────────────────────────────────────────────────────────── API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1") MODEL_NAME = os.getenv("MODEL_NAME", "meta-llama/Llama-3.3-70B-Instruct") HF_TOKEN = os.getenv("HF_TOKEN") ENV_BASE_URL = os.getenv("ENV_BASE_URL", "https://akkiisfrommars-jericho.hf.space") BENCHMARK = "jericho" MAX_STEPS = 20 TASKS = ["easy", "medium", "hard"] if not HF_TOKEN: print("ERROR: HF_TOKEN environment variable is not set.") sys.exit(1) client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN) # ── logging (required stdout format) ───────────────────────────────────────── def log_start(task: str, env: str, model: str): print(f"[START] task={task} env={env} model={model}", flush=True) MAX_REWARD = 14.0 # max possible reward in one step (hard task: 10 tests * 1.0 + 2.0 bonus + 2.0 buffer) def normalize_reward(r: float) -> float: """Normalize reward to strictly (0, 1).""" normalized = (r + MAX_REWARD) / (2 * MAX_REWARD) # shift to positive range return round(max(0.0001, min(normalized, 0.9999)), 4) def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]): error_val = error if error else "null" norm = normalize_reward(reward) print(f"[STEP] step={step} action={action} reward={norm:.4f} done={str(done).lower()} error={error_val}", flush=True) def log_end(success: bool, steps: int, score: float, rewards: List[float]): rewards_str = ",".join(f"{normalize_reward(r):.4f}" for r in rewards) print(f"[END] success={str(success).lower()} steps={steps} score={score:.4f} rewards={rewards_str}", flush=True) # ── environment helpers ─────────────────────────────────────────────────────── def env_reset(session_id: str, task_id: str) -> dict: resp = requests.post(f"{ENV_BASE_URL}/env/reset", json={ "session_id": session_id, "task_id": task_id }) resp.raise_for_status() return resp.json()["state"] def env_step(session_id: str, action: dict): resp = requests.post(f"{ENV_BASE_URL}/env/step", json={ "session_id": session_id, "action": action }) resp.raise_for_status() data = resp.json() reward = data["reward"] if isinstance(reward, dict): reward = reward["value"] return data["state"], float(reward), data["done"] def env_grade(task_id: str, code: str) -> dict: resp = requests.post(f"{ENV_BASE_URL}/grader/", json={ "task_id": task_id, "code": code }) resp.raise_for_status() return resp.json() def get_task_info(task_id: str) -> dict: resp = requests.get(f"{ENV_BASE_URL}/tasks/{task_id}") resp.raise_for_status() return resp.json() # ── LLM helpers ─────────────────────────────────────────────────────────────── SYSTEM_PROMPT = """You are an expert Python debugger. You will be given buggy Python code and test failure output. Your job is to fix ONE function at a time. When you decide which function to fix, respond in this exact JSON format: { "function_name": "the_function_to_fix", "fixed_code": "def the_function_to_fix(...):\\n # complete corrected function body here" } Rules: - Output ONLY valid JSON. No explanation, no markdown, no code fences. - The fixed_code must be a complete function definition starting with def. - Fix only ONE function per response. - Choose the function most likely causing current test failures. - If all tests pass, output: {"done": true} """ def ask_llm(code: str, test_output: str, functions: List[str], tests_passed: int, tests_total: int) -> Optional[dict]: user_message = f"""Current code: {code} Test results: {tests_passed}/{tests_total} passing Test output: {test_output[-3000:] if len(test_output) > 3000 else test_output} Available functions to fix: {functions} Which single function should be fixed, and what is the corrected version?""" try: response = client.chat.completions.create( model=MODEL_NAME, messages=[ {"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": user_message} ], max_tokens=1024, temperature=0.2, ) raw = response.choices[0].message.content.strip() raw = re.sub(r"^```(?:json)?\s*", "", raw) raw = re.sub(r"\s*```$", "", raw) return json.loads(raw) except json.JSONDecodeError: return None except Exception as e: return None # ── agent loop ──────────────────────────────────────────────────────────────── def run_task(task_id: str) -> dict: session_id = f"{task_id}-{uuid.uuid4().hex[:8]}" task_info = get_task_info(task_id) functions = task_info.get("functions", []) rewards = [] steps_taken = 0 score = 0.0 success = False error = None log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME) try: state = env_reset(session_id, task_id) # initial test run state, reward, done = env_step(session_id, {"type": "run_tests"}) rewards.append(reward) steps_taken += 1 log_step(step=steps_taken, action="run_tests", reward=reward, done=done, error=None) while not done and steps_taken < MAX_STEPS: if state["tests_passed"] == state["tests_total"]: break llm_response = ask_llm( code = state["code"], test_output = state["last_test_output"], functions = functions, tests_passed = state["tests_passed"], tests_total = state["tests_total"], ) if llm_response is None or llm_response.get("done"): state, reward, done = env_step(session_id, {"type": "run_tests"}) rewards.append(reward) steps_taken += 1 log_step(step=steps_taken, action="run_tests", reward=reward, done=done, error="llm_parse_error") continue fn_name = llm_response.get("function_name") fn_code = llm_response.get("fixed_code") if not fn_name or not fn_code: state, reward, done = env_step(session_id, {"type": "run_tests"}) rewards.append(reward) steps_taken += 1 log_step(step=steps_taken, action="run_tests", reward=reward, done=done, error="missing_fields") continue # edit action_str = f"edit_function({fn_name})" state, reward, done = env_step(session_id, { "type": "edit_function", "function_name": fn_name, "new_code": fn_code, }) rewards.append(reward) steps_taken += 1 log_step(step=steps_taken, action=action_str, reward=reward, done=done, error=None) # run tests after edit if not done: state, reward, done = env_step(session_id, {"type": "run_tests"}) rewards.append(reward) steps_taken += 1 log_step(step=steps_taken, action="run_tests", reward=reward, done=done, error=None) grade = env_grade(task_id, state["code"]) raw_score = grade["score"] score = max(0.0001, min(raw_score, 0.9999)) success = raw_score >= 0.9999 except Exception as e: error = str(e) log_end(success=success, steps=steps_taken, score=score, rewards=rewards) return { "task_id": task_id, "score": score, "steps": steps_taken, "success": success, "rewards": rewards, } # ── main ────────────────────────────────────────────────────────────────────── def main(): results = [] for task_id in TASKS: try: result = run_task(task_id) results.append(result) except Exception as e: results.append({"task_id": task_id, "score": 0.0, "error": str(e)}) avg = sum(r.get("score", 0) for r in results) / len(results) with open("baseline_results.json", "w") as f: json.dump({"model": MODEL_NAME, "tasks": results, "average": round(avg, 4)}, f, indent=2) if __name__ == "__main__": main()