Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| import os | |
| import sys | |
| import uuid | |
| import json | |
| import re | |
| import requests | |
| from typing import Optional, List | |
| from openai import OpenAI | |
# ── config ────────────────────────────────────────────────────────────────────
# Endpoints and model selection; each can be overridden through the environment.
API_BASE_URL = os.environ.get("API_BASE_URL", "https://router.huggingface.co/v1")
MODEL_NAME = os.environ.get("MODEL_NAME", "meta-llama/Llama-3.3-70B-Instruct")
ENV_BASE_URL = os.environ.get("ENV_BASE_URL", "https://akkiisfrommars-jericho.hf.space")

# Token handed to the OpenAI-compatible client as its API key (no default).
HF_TOKEN = os.environ.get("HF_TOKEN")

# Fixed run parameters.
BENCHMARK = "jericho"  # value reported as env= in the [START] record
MAX_STEPS = 20  # step budget checked by the agent loop in run_task
TASKS = ["easy", "medium", "hard"]  # task ids that main() iterates over

# Refuse to start without a token: the client below cannot be built sensibly.
if not HF_TOKEN:
    print("ERROR: HF_TOKEN environment variable is not set.")
    sys.exit(1)

client = OpenAI(api_key=HF_TOKEN, base_url=API_BASE_URL)
# ── logging (required stdout format) ──────────────────────────────────────────
def log_start(task: str, env: str, model: str):
    """Print the ``[START]`` record for a task run to stdout, flushing immediately.

    Args:
        task: Task identifier placed in the ``task=`` field.
        env: Benchmark/environment name placed in the ``env=`` field.
        model: Model name placed in the ``model=`` field.
    """
    record = f"[START] task={task} env={env} model={model}"
    print(record, flush=True)
# Max possible reward in one step (hard task: 10 tests * 1.0 + 2.0 bonus + 2.0 buffer).
MAX_REWARD = 14.0


def normalize_reward(r: float) -> float:
    """Map a raw reward onto the open interval (0, 1), rounded to 4 decimals.

    The reward is shifted by ``MAX_REWARD`` and divided by ``2 * MAX_REWARD``
    so that ``-MAX_REWARD`` maps to 0 and ``+MAX_REWARD`` maps to 1. The result
    is then clamped to ``[0.0001, 0.9999]`` so it never reaches either endpoint.

    Args:
        r: Raw reward value.

    Returns:
        The normalized, clamped and rounded reward.
    """
    span = 2 * MAX_REWARD
    scaled = (r + MAX_REWARD) / span  # shift to positive range
    # Cap from above first, then from below (same order as min-then-max).
    capped = 0.9999 if 0.9999 < scaled else scaled
    bounded = capped if capped > 0.0001 else 0.0001
    return round(bounded, 4)
def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]):
    """Print one ``[STEP]`` record to stdout, flushing immediately.

    Args:
        step: Step counter value for the ``step=`` field.
        action: Action label for the ``action=`` field.
        reward: Raw reward; it is passed through ``normalize_reward`` and
            printed with four decimals.
        done: Episode-finished flag, printed as lowercase ``true``/``false``.
        error: Error label, or a falsy value to print ``null``.
    """
    shown_error = error or "null"
    fields = [
        f"step={step}",
        f"action={action}",
        f"reward={normalize_reward(reward):.4f}",
        f"done={str(done).lower()}",
        f"error={shown_error}",
    ]
    print("[STEP] " + " ".join(fields), flush=True)
def log_end(success: bool, steps: int, score: float, rewards: List[float]):
    """Print the ``[END]`` record for a task run to stdout, flushing immediately.

    Args:
        success: Overall outcome, printed as lowercase ``true``/``false``.
        steps: Number of steps taken.
        score: Final score, printed with four decimals.
        rewards: Raw per-step rewards; each is passed through
            ``normalize_reward`` and the results are joined with commas.
    """
    formatted = [f"{normalize_reward(value):.4f}" for value in rewards]
    summary = (
        f"[END] success={str(success).lower()} steps={steps} "
        f"score={score:.4f} rewards={','.join(formatted)}"
    )
    print(summary, flush=True)
# ── environment helpers ───────────────────────────────────────────────────────
def env_reset(session_id: str, task_id: str, timeout: float = 120.0) -> dict:
    """POST to the environment's ``/env/reset`` endpoint and return its state.

    Args:
        session_id: Session identifier sent in the request body.
        task_id: Task identifier sent in the request body.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so an unresponsive
            environment would hang the whole run.

    Returns:
        The ``"state"`` entry of the JSON response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
        KeyError: If the response JSON has no ``"state"`` entry.
    """
    resp = requests.post(
        f"{ENV_BASE_URL}/env/reset",
        json={"session_id": session_id, "task_id": task_id},
        timeout=timeout,
    )
    resp.raise_for_status()
    return resp.json()["state"]
def env_step(session_id: str, action: dict, timeout: float = 120.0):
    """POST an action to the environment's ``/env/step`` endpoint.

    Args:
        session_id: Session identifier sent in the request body.
        action: Action payload sent in the request body.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so an unresponsive
            environment would hang the whole run.

    Returns:
        A ``(state, reward, done)`` tuple taken from the JSON response, with
        the reward converted to ``float``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
        KeyError: If an expected entry is missing from the response JSON.
    """
    resp = requests.post(
        f"{ENV_BASE_URL}/env/step",
        json={"session_id": session_id, "action": action},
        timeout=timeout,
    )
    resp.raise_for_status()
    data = resp.json()
    reward = data["reward"]
    # The reward is accepted either as a bare number or wrapped as {"value": ...}.
    if isinstance(reward, dict):
        reward = reward["value"]
    return data["state"], float(reward), data["done"]
def env_grade(task_id: str, code: str, timeout: float = 120.0) -> dict:
    """POST code to the environment's ``/grader/`` endpoint and return the reply.

    Args:
        task_id: Task identifier sent in the request body.
        code: Source code sent in the request body.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so an unresponsive
            grader would hang the whole run.

    Returns:
        The decoded JSON response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    resp = requests.post(
        f"{ENV_BASE_URL}/grader/",
        json={"task_id": task_id, "code": code},
        timeout=timeout,
    )
    resp.raise_for_status()
    return resp.json()
def get_task_info(task_id: str, timeout: float = 120.0) -> dict:
    """GET ``/tasks/<task_id>`` from the environment and return the decoded JSON.

    Args:
        task_id: Task identifier appended to the ``/tasks/`` path.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so an unresponsive
            environment would hang the whole run.

    Returns:
        The decoded JSON response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    resp = requests.get(f"{ENV_BASE_URL}/tasks/{task_id}", timeout=timeout)
    resp.raise_for_status()
    return resp.json()
# ── LLM helpers ───────────────────────────────────────────────────────────────
# System message sent by ask_llm with every chat-completion request. It asks the
# model to reply with a single JSON object: either a function_name/fixed_code
# pair describing one function fix, or {"done": true}.
SYSTEM_PROMPT = """You are an expert Python debugger. You will be given buggy Python code and test failure output.
Your job is to fix ONE function at a time. When you decide which function to fix, respond in this exact JSON format:
{
"function_name": "the_function_to_fix",
"fixed_code": "def the_function_to_fix(...):\\n # complete corrected function body here"
}
Rules:
- Output ONLY valid JSON. No explanation, no markdown, no code fences.
- The fixed_code must be a complete function definition starting with def.
- Fix only ONE function per response.
- Choose the function most likely causing current test failures.
- If all tests pass, output: {"done": true}
"""
def ask_llm(code: str, test_output: str, functions: List[str], tests_passed: int, tests_total: int) -> Optional[dict]:
    """Ask the model which function to fix and return its parsed JSON reply.

    Args:
        code: Current source code, embedded verbatim in the prompt.
        test_output: Latest test output; only the last 3000 characters are
            sent. ``None`` is treated as empty.
        functions: Names of the functions offered to the model as candidates.
        tests_passed: Number of passing tests, shown in the prompt.
        tests_total: Total number of tests, shown in the prompt.

    Returns:
        The reply decoded as a ``dict``, or ``None`` when the request fails,
        the reply is not valid JSON, or the JSON is not an object.
    """
    # Slicing already handles short strings; `or ""` tolerates a missing output.
    recent_output = (test_output or "")[-3000:]
    user_message = f"""Current code:
{code}
Test results: {tests_passed}/{tests_total} passing
Test output:
{recent_output}
Available functions to fix: {functions}
Which single function should be fixed, and what is the corrected version?"""
    try:
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_message},
            ],
            max_tokens=1024,
            temperature=0.2,
        )
        # The message content can be None; treat that as an empty reply.
        raw = (response.choices[0].message.content or "").strip()
        # Models sometimes wrap the JSON in a code fence despite the prompt.
        raw = re.sub(r"^```(?:json)?\s*", "", raw)
        raw = re.sub(r"\s*```$", "", raw)
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        return None
    except Exception as exc:
        # Stay best-effort (the caller treats None as "no usable answer") but
        # report the failure on stderr so stdout keeps its required format.
        print(f"ask_llm: request failed: {exc}", file=sys.stderr, flush=True)
        return None
    # Valid JSON that is not an object (list, string, number) is unusable by
    # callers that expect to call .get() on the result.
    return parsed if isinstance(parsed, dict) else None
# ── agent loop ────────────────────────────────────────────────────────────────
def run_task(task_id: str) -> dict:
    """Run one agent episode for ``task_id`` and return a result summary.

    The episode resets the environment, runs the tests once, then repeatedly
    asks the model for a single-function fix, applies it with an
    ``edit_function`` action and re-runs the tests. It stops when the
    environment reports done, all tests pass, or ``MAX_STEPS`` steps have been
    taken. The final code is then sent to the grader.

    Any exception raised along the way ends the episode early; it is reported
    on stderr and in the result instead of being silently dropped. The
    ``[START]`` and ``[END]`` records are always emitted.

    Args:
        task_id: Identifier of the task to run.

    Returns:
        A dict with ``task_id``, ``score``, ``steps``, ``success`` and
        ``rewards``, plus ``error`` when the episode ended with an exception.
    """
    session_id = f"{task_id}-{uuid.uuid4().hex[:8]}"
    rewards: List[float] = []
    steps_taken = 0
    score = 0.0
    success = False
    error = None
    run_tests = {"type": "run_tests"}

    def take_step(action: dict, label: str, step_error: Optional[str] = None):
        """Send one action, record its reward, log it, and return (state, done)."""
        nonlocal steps_taken
        new_state, reward, finished = env_step(session_id, action)
        rewards.append(reward)
        steps_taken += 1
        log_step(step=steps_taken, action=label, reward=reward, done=finished, error=step_error)
        return new_state, finished

    log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME)
    try:
        # Fetched inside the try so a failure still produces an [END] record.
        functions = get_task_info(task_id).get("functions", [])
        env_reset(session_id, task_id)

        # initial test run
        state, done = take_step(run_tests, "run_tests")

        while not done and steps_taken < MAX_STEPS:
            if state["tests_passed"] == state["tests_total"]:
                break

            llm_response = ask_llm(
                code=state["code"],
                test_output=state["last_test_output"],
                functions=functions,
                tests_passed=state["tests_passed"],
                tests_total=state["tests_total"],
            )

            # No usable reply: spend the step on a test run and ask again.
            if not isinstance(llm_response, dict):
                state, done = take_step(run_tests, "run_tests", "llm_parse_error")
                continue

            # The model claims everything passes; that is not a parse error,
            # so re-run the tests without an error label to verify.
            if llm_response.get("done"):
                state, done = take_step(run_tests, "run_tests")
                continue

            fn_name = llm_response.get("function_name")
            fn_code = llm_response.get("fixed_code")
            if not fn_name or not fn_code:
                state, done = take_step(run_tests, "run_tests", "missing_fields")
                continue

            # edit
            state, done = take_step(
                {"type": "edit_function", "function_name": fn_name, "new_code": fn_code},
                f"edit_function({fn_name})",
            )

            # run tests after edit, unless that would exceed the step budget
            if not done and steps_taken < MAX_STEPS:
                state, done = take_step(run_tests, "run_tests")

        grade = env_grade(task_id, state["code"])
        raw_score = grade["score"]
        score = max(0.0001, min(raw_score, 0.9999))
        success = raw_score >= 0.9999
    except Exception as exc:
        error = str(exc)
        # stderr keeps the required stdout record format intact.
        print(f"run_task({task_id}): {error}", file=sys.stderr, flush=True)

    log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
    result = {
        "task_id": task_id,
        "score": score,
        "steps": steps_taken,
        "success": success,
        "rewards": rewards,
    }
    if error is not None:
        result["error"] = error
    return result
# ── main ──────────────────────────────────────────────────────────────────────
def main():
    """Run every task in ``TASKS`` and write a summary to ``baseline_results.json``.

    A task whose run raises is recorded with a score of 0.0 and the error
    text. The summary holds the model name, the per-task results and the mean
    score rounded to four decimals.
    """
    outcomes = []
    for name in TASKS:
        try:
            outcomes.append(run_task(name))
        except Exception as exc:
            outcomes.append({"task_id": name, "score": 0.0, "error": str(exc)})

    total = sum(entry.get("score", 0) for entry in outcomes)
    mean_score = total / len(outcomes)
    summary = {"model": MODEL_NAME, "tasks": outcomes, "average": round(mean_score, 4)}

    with open("baseline_results.json", "w") as handle:
        json.dump(summary, handle, indent=2)


if __name__ == "__main__":
    main()