jericho / inference.py
akkiisfrommars's picture
Update inference.py
73bc026 verified
from __future__ import annotations
import os
import sys
import uuid
import json
import re
import requests
from typing import Optional, List
from openai import OpenAI
# ── config ────────────────────────────────────────────────────────────────────
# Endpoints, model name and credentials can all be overridden via the environment.
API_BASE_URL = os.environ.get("API_BASE_URL", "https://router.huggingface.co/v1")
MODEL_NAME = os.environ.get("MODEL_NAME", "meta-llama/Llama-3.3-70B-Instruct")
HF_TOKEN = os.environ.get("HF_TOKEN")
ENV_BASE_URL = os.environ.get("ENV_BASE_URL", "https://akkiisfrommars-jericho.hf.space")

# Fixed run parameters: benchmark label used in logs, per-task step budget,
# and the task ids that main() iterates over.
BENCHMARK = "jericho"
MAX_STEPS = 20
TASKS = ["easy", "medium", "hard"]

# The token is passed to the OpenAI client as its API key, so abort early
# when it is missing or empty.
if HF_TOKEN is None or HF_TOKEN == "":
    print("ERROR: HF_TOKEN environment variable is not set.")
    sys.exit(1)

# Shared chat-completions client used by ask_llm().
client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
# ── logging (required stdout format) ─────────────────────────────────────────
def log_start(task: str, env: str, model: str):
    """Print the ``[START]`` line for a task run to stdout (flushed immediately).

    Args:
        task: Task identifier shown as ``task=...``.
        env: Environment/benchmark name shown as ``env=...``.
        model: Model name shown as ``model=...``.
    """
    fields = (f"task={task}", f"env={env}", f"model={model}")
    print("[START] " + " ".join(fields), flush=True)
# Max possible reward in one step
# (hard task: 10 tests * 1.0 + 2.0 bonus + 2.0 buffer).
MAX_REWARD = 14.0


def normalize_reward(r: float) -> float:
    """Map a raw step reward into the open interval (0, 1).

    The reward is shifted by ``MAX_REWARD`` and divided by ``2 * MAX_REWARD``
    (so ``-MAX_REWARD`` maps to 0 and ``+MAX_REWARD`` maps to 1), then clamped
    to ``[0.0001, 0.9999]`` and rounded to four decimals.

    Args:
        r: Raw reward as reported by the environment.

    Returns:
        The normalized, clamped reward.
    """
    floor, ceiling = 0.0001, 0.9999
    scaled = (r + MAX_REWARD) / (2 * MAX_REWARD)
    # Apply the upper bound first, then the lower one (same order as min/max).
    capped = ceiling if ceiling < scaled else scaled
    bounded = capped if capped > floor else floor
    return round(bounded, 4)
def log_step(step: int, action: str, reward: float, done: bool, error: Optional[str]):
    """Print one ``[STEP]`` line to stdout (flushed immediately).

    Args:
        step: Step counter value to report.
        action: Human-readable description of the action taken.
        reward: Raw reward; it is passed through ``normalize_reward`` before
            printing with four decimals.
        done: Episode-finished flag, printed lowercased (``true``/``false``).
        error: Error label for the step; a falsy value is printed as ``null``.
    """
    shown_reward = normalize_reward(reward)
    shown_error = error or "null"
    parts = [
        f"step={step}",
        f"action={action}",
        f"reward={shown_reward:.4f}",
        f"done={str(done).lower()}",
        f"error={shown_error}",
    ]
    print("[STEP] " + " ".join(parts), flush=True)
def log_end(success: bool, steps: int, score: float, rewards: List[float]):
    """Print the closing ``[END]`` line for a task run to stdout (flushed).

    Args:
        success: Whether the task counted as solved; printed lowercased.
        steps: Number of environment steps taken.
        score: Final score, printed with four decimals.
        rewards: Raw per-step rewards; each one is normalized with
            ``normalize_reward`` and the results are joined with commas.
    """
    formatted = [format(normalize_reward(value), ".4f") for value in rewards]
    rewards_field = ",".join(formatted)
    line = (
        f"[END] success={str(success).lower()} steps={steps} "
        f"score={score:.4f} rewards={rewards_field}"
    )
    print(line, flush=True)
# ── environment helpers ───────────────────────────────────────────────────────
def env_reset(session_id: str, task_id: str, timeout: float = 120.0) -> dict:
    """Reset the environment session for a task and return its state.

    Args:
        session_id: Session identifier sent to the environment server.
        task_id: Identifier of the task to load.
        timeout: Seconds to wait for the server. ``requests`` applies no
            timeout by default, so without one a stalled server would block
            the whole run indefinitely.

    Returns:
        The ``"state"`` entry of the server's JSON response.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
        KeyError: If the response JSON has no ``"state"`` entry.
    """
    payload = {"session_id": session_id, "task_id": task_id}
    resp = requests.post(f"{ENV_BASE_URL}/env/reset", json=payload, timeout=timeout)
    resp.raise_for_status()
    return resp.json()["state"]
def env_step(session_id: str, action: dict, timeout: float = 120.0) -> tuple[dict, float, bool]:
    """Send one action to the environment and return the outcome.

    Args:
        session_id: Session identifier sent to the environment server.
        action: Action payload, e.g. ``{"type": "run_tests"}``.
        timeout: Seconds to wait for the server. ``requests`` applies no
            timeout by default, so without one a stalled server would block
            the whole run indefinitely.

    Returns:
        A ``(state, reward, done)`` tuple taken from the response JSON, with
        the reward converted to ``float``.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
        KeyError: If an expected field is missing from the response JSON.
    """
    payload = {"session_id": session_id, "action": action}
    resp = requests.post(f"{ENV_BASE_URL}/env/step", json=payload, timeout=timeout)
    resp.raise_for_status()
    data = resp.json()

    reward = data["reward"]
    # The reward may arrive either as a bare number or wrapped as {"value": ...}.
    if isinstance(reward, dict):
        reward = reward["value"]
    return data["state"], float(reward), data["done"]
def env_grade(task_id: str, code: str, timeout: float = 120.0) -> dict:
    """Submit code to the grader endpoint and return its JSON verdict.

    Args:
        task_id: Identifier of the task the code belongs to.
        code: Source code to grade.
        timeout: Seconds to wait for the server. ``requests`` applies no
            timeout by default, so without one a stalled server would block
            the whole run indefinitely.

    Returns:
        The grader's decoded JSON response.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
    """
    payload = {"task_id": task_id, "code": code}
    resp = requests.post(f"{ENV_BASE_URL}/grader/", json=payload, timeout=timeout)
    resp.raise_for_status()
    return resp.json()
def get_task_info(task_id: str, timeout: float = 120.0) -> dict:
    """Fetch the description of a task from the environment server.

    Args:
        task_id: Identifier of the task to look up.
        timeout: Seconds to wait for the server. ``requests`` applies no
            timeout by default, so without one a stalled server would block
            the whole run indefinitely.

    Returns:
        The decoded JSON response for ``/tasks/{task_id}``.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
    """
    resp = requests.get(f"{ENV_BASE_URL}/tasks/{task_id}", timeout=timeout)
    resp.raise_for_status()
    return resp.json()
# ── LLM helpers ───────────────────────────────────────────────────────────────
# System message sent by ask_llm() with every request. It tells the model to
# reply with a single JSON object naming one function and its full replacement
# source, or {"done": true} when nothing is left to fix. The doubled backslash
# in the example keeps a literal "\n" escape in the prompt text, so the sample
# stays valid JSON rather than containing a raw line break.
SYSTEM_PROMPT = """You are an expert Python debugger. You will be given buggy Python code and test failure output.
Your job is to fix ONE function at a time. When you decide which function to fix, respond in this exact JSON format:
{
"function_name": "the_function_to_fix",
"fixed_code": "def the_function_to_fix(...):\\n # complete corrected function body here"
}
Rules:
- Output ONLY valid JSON. No explanation, no markdown, no code fences.
- The fixed_code must be a complete function definition starting with def.
- Fix only ONE function per response.
- Choose the function most likely causing current test failures.
- If all tests pass, output: {"done": true}
"""
def ask_llm(code: str, test_output: str, functions: List[str], tests_passed: int, tests_total: int) -> Optional[dict]:
    """Ask the model which function to fix and return its parsed JSON reply.

    Args:
        code: Current source code under repair.
        test_output: Output of the latest test run; only the last 3000
            characters are sent. ``None`` is treated as empty.
        functions: Names of the functions the model may choose to fix.
        tests_passed: Number of currently passing tests.
        tests_total: Total number of tests.

    Returns:
        The decoded JSON object (a ``dict``), or ``None`` when the request
        fails, the reply is empty, the reply is not valid JSON, or the JSON is
        not an object. Failures are reported on stderr so the stdout log
        format is left untouched.
    """
    # Slicing already copes with strings shorter than the limit.
    recent_output = (test_output or "")[-3000:]
    user_message = f"""Current code:
{code}
Test results: {tests_passed}/{tests_total} passing
Test output:
{recent_output}
Available functions to fix: {functions}
Which single function should be fixed, and what is the corrected version?"""

    try:
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_message},
            ],
            max_tokens=1024,
            temperature=0.2,
        )
        content = response.choices[0].message.content
    except Exception as exc:
        # Best effort: any API/transport problem means "no answer this turn".
        print(f"[ask_llm] request failed: {exc}", file=sys.stderr, flush=True)
        return None

    if not content:
        print("[ask_llm] empty reply from model", file=sys.stderr, flush=True)
        return None

    # Strip a surrounding markdown code fence in case the model added one
    # despite the instructions.
    raw = content.strip()
    raw = re.sub(r"^```(?:json)?\s*", "", raw)
    raw = re.sub(r"\s*```$", "", raw)

    try:
        parsed = json.loads(raw)
    except ValueError as exc:  # json.JSONDecodeError is a ValueError
        print(f"[ask_llm] reply is not valid JSON: {exc}", file=sys.stderr, flush=True)
        return None

    # Callers use dict methods on the result; reject lists, strings, numbers.
    if not isinstance(parsed, dict):
        print("[ask_llm] reply is not a JSON object", file=sys.stderr, flush=True)
        return None
    return parsed
# ── agent loop ────────────────────────────────────────────────────────────────
def run_task(task_id: str) -> dict:
    """Run one repair episode for ``task_id`` and return a summary.

    Flow: log ``[START]``, fetch the task's function list, reset the
    environment, run the tests once, then repeatedly ask the model for a fix,
    apply it with an ``edit_function`` action and re-run the tests. The loop
    ends when the environment reports done, all tests pass, or ``MAX_STEPS``
    steps have been taken. The final code is then graded. ``[END]`` is always
    logged, including when an exception interrupts the episode.

    Args:
        task_id: Identifier of the task to run.

    Returns:
        A dict with ``task_id``, ``score``, ``steps``, ``success`` and
        ``rewards`` (raw per-step rewards). When the episode was cut short by
        an exception, an additional ``error`` key holds its message.
    """
    session_id = f"{task_id}-{uuid.uuid4().hex[:8]}"
    rewards: List[float] = []
    steps_taken = 0
    score = 0.0
    success = False
    error = None
    run_tests_action = {"type": "run_tests"}

    def take_step(action: dict, label: str, step_error: Optional[str] = None):
        """Send one action, record its reward and emit the ``[STEP]`` line."""
        nonlocal steps_taken
        new_state, reward, finished = env_step(session_id, action)
        rewards.append(reward)
        steps_taken += 1
        log_step(step=steps_taken, action=label, reward=reward, done=finished, error=step_error)
        return new_state, finished

    log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME)
    try:
        # Fetched inside the try so that a failure here still yields a
        # matching [END] line for the [START] above.
        functions = get_task_info(task_id).get("functions", [])
        env_reset(session_id, task_id)

        # initial test run
        state, done = take_step(run_tests_action, "run_tests")

        while not done and steps_taken < MAX_STEPS:
            if state["tests_passed"] == state["tests_total"]:
                break

            llm_response = ask_llm(
                code=state["code"],
                test_output=state["last_test_output"],
                functions=functions,
                tests_passed=state["tests_passed"],
                tests_total=state["tests_total"],
            )

            # Classify unusable replies; each costs one plain test run.
            fn_name = fn_code = None
            if not isinstance(llm_response, dict):
                problem = "llm_parse_error"
            elif llm_response.get("done"):
                # The model claims completion although tests still fail.
                problem = "llm_declared_done"
            else:
                fn_name = llm_response.get("function_name")
                fn_code = llm_response.get("fixed_code")
                problem = None if fn_name and fn_code else "missing_fields"

            if problem is not None:
                state, done = take_step(run_tests_action, "run_tests", problem)
                continue

            # edit
            state, done = take_step(
                {
                    "type": "edit_function",
                    "function_name": fn_name,
                    "new_code": fn_code,
                },
                f"edit_function({fn_name})",
            )

            # run tests after edit, unless the edit used up the step budget
            if not done and steps_taken < MAX_STEPS:
                state, done = take_step(run_tests_action, "run_tests")

        grade = env_grade(task_id, state["code"])
        raw_score = grade["score"]
        score = max(0.0001, min(raw_score, 0.9999))
        success = raw_score >= 0.9999
    except Exception as e:
        error = str(e)
        # Report on stderr so the stdout log format stays intact.
        print(f"[run_task] task={task_id} failed: {error}", file=sys.stderr, flush=True)

    log_end(success=success, steps=steps_taken, score=score, rewards=rewards)

    result = {
        "task_id": task_id,
        "score": score,
        "steps": steps_taken,
        "success": success,
        "rewards": rewards,
    }
    if error is not None:
        result["error"] = error
    return result
# ── main ──────────────────────────────────────────────────────────────────────
def main():
    """Run every task in ``TASKS`` and write ``baseline_results.json``.

    A task whose run raises is recorded with score 0.0 and the error message
    instead of aborting the remaining tasks. The output file holds the model
    name, the per-task results and their average score rounded to four
    decimals (0.0 when there are no results).
    """
    results = []
    for task_id in TASKS:
        try:
            results.append(run_task(task_id))
        except Exception as e:
            # Keep going with the other tasks, but leave a trace on stderr.
            print(f"[main] task={task_id} failed: {e}", file=sys.stderr, flush=True)
            results.append({"task_id": task_id, "score": 0.0, "error": str(e)})

    # Guard against an empty task list to avoid dividing by zero.
    avg = sum(r.get("score", 0) for r in results) / len(results) if results else 0.0

    with open("baseline_results.json", "w", encoding="utf-8") as f:
        json.dump({"model": MODEL_NAME, "tasks": results, "average": round(avg, 4)}, f, indent=2)


if __name__ == "__main__":
    main()