""" inference.py — Rust Coder OpenEnv Baseline Agent Architecture ──────────── • Runs 3 tasks (easy / medium / hard) as independent episodes. • Each task produces its own [START]…[STEP]…[END] log block. • A fresh WebSocket env connection is opened per task to avoid HF-Space WebSocket timeouts during long LLM + compilation waits. • Scores are clamped to (0.01, 0.99) — strictly inside (0, 1). • If HF_TOKEN is missing, minimal fallback blocks are emitted so the platform always receives 3 parseable task records. Required env vars ───────────────── API_BASE_URL — LLM router URL (default: HF router) MODEL_NAME — model identifier (default: Qwen 72B) HF_TOKEN — HuggingFace / API key ENV_URL — environment URL (default: http://localhost:8000) """ import os import asyncio import logging from typing import List, Optional from openai import OpenAI from dotenv import load_dotenv load_dotenv() # ── Configuration ───────────────────────────────────────────────────────────── API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1" MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct" HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("API_KEY") ENV_URL = os.getenv("ENV_URL") or "http://localhost:8000" SUCCESS_SCORE_THRESHOLD = 0.5 TEMPERATURE = 0.1 MAX_TOKENS = 1500 # Exactly 3 tasks: easy / medium / hard (maps to problems.json indices) EVAL_TASKS = [ {"task_id": "task_1", "start_index": 0, "difficulty": "easy"}, {"task_id": "task_3", "start_index": 2, "difficulty": "medium"}, {"task_id": "task_6", "start_index": 5, "difficulty": "hard"}, ] # ── Logging ─────────────────────────────────────────────────────────────────── _LOG_LEVEL = (os.getenv("LOG_LEVEL") or "INFO").upper() logging.basicConfig( level=getattr(logging, _LOG_LEVEL, logging.INFO), format="%(asctime)s %(levelname)s %(name)s - %(message)s", ) logger = logging.getLogger("rust_coder.inference") from client import RustCoderEnv from models import RustCoderAction # ── Strict stdout log helpers 
def log_start(task: str, env: str, model: str) -> None:
    """Print the ``[START]`` record that opens one task's log block."""
    print(f"[START] task={task} env={env} model={model}", flush=True)


def _one_line(text: str, limit: int = 200) -> str:
    """Escape CR/LF so *text* stays on a single log line, then cap its length."""
    return text.replace("\r", "\\r").replace("\n", "\\n")[:limit]


def log_step(
    step: int,
    action: str,
    reward: float,
    done: bool,
    error: Optional[str] = None,
) -> None:
    """
    Print one ``[STEP]`` record.

    Args:
        step: Step number within the task.
        action: Submitted action text; newlines are escaped and the value is
            truncated to 200 characters.
        reward: Reward for the step, printed with two decimals.
        done: Whether the episode ended with this step.
        error: Error description, or ``None`` (printed as ``null``). Escaped
            and truncated the same way as ``action``.
    """
    action_str = _one_line(action or "")
    # Carriage returns are escaped here as well, so an error message can never
    # break the single-line record.
    err_field = "null" if error is None else _one_line(str(error))
    print(
        f"[STEP] step={step} action={action_str} reward={reward:.2f} "
        f"done={str(bool(done)).lower()} error={err_field}",
        flush=True,
    )


def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
    """Print the ``[END]`` record that closes one task's log block."""
    rewards_str = ",".join(f"{r:.2f}" for r in rewards)
    print(
        f"[END] success={str(success).lower()} steps={steps} "
        f"score={score:.3f} rewards={rewards_str}",
        flush=True,
    )


# ── Score clamping ────────────────────────────────────────────────────────────

def clamp_score(raw: float) -> float:
    """
    Clamp a raw reward into [0.01, 0.99] — never exactly 0.0 or 1.0.

    Floor   0.01: even compilation failures yield a non-zero score.
    Ceiling 0.99: prevents a theoretically-perfect submission from returning 1.0.

    A NaN reward is mapped to the floor.

    Args:
        raw: Raw reward; anything accepted by ``float()``.

    Returns:
        The clamped score, rounded to three decimals.
    """
    import math  # local import: keeps this block independent of the file header

    value = float(raw)
    # Comparisons with NaN are always False, so min(0.99, nan) would keep 0.99
    # and silently turn an invalid reward into a near-perfect score.
    if math.isnan(value):
        return 0.01
    return round(max(0.01, min(0.99, value)), 3)


# ── LLM call ─────────────────────────────────────────────────────────────────

def _strip_markdown_fences(text: str) -> str:
    """Return the contents of the first fenced code block in *text*, if any.

    A ```` ```rust ```` fence is preferred over a generic one. An unterminated
    fence yields everything after the opening marker. Text without fences is
    returned as-is (stripped).
    """
    if "```rust" in text:
        body = text.split("```rust", 1)[1].split("```", 1)[0]
    elif "```" in text:
        body = text.split("```", 1)[1].split("```", 1)[0]
        # A generic fence may carry an info string such as "rs" on its opening
        # line; drop that line so the tag does not end up in the Rust source.
        tag, newline, rest = body.partition("\n")
        if newline and tag.strip().isalnum():
            body = rest
    else:
        body = text
    return body.strip()


async def get_model_code(prompt: str, client: OpenAI) -> str:
    """
    Ask the model for a complete Rust solution; strip markdown if needed.

    Args:
        prompt: User prompt describing the task.
        client: Synchronous OpenAI-compatible client.

    Returns:
        The Rust source text, ``"// empty response"`` when the model returned
        nothing usable, or ``"// LLM error: …"`` when the request failed.
        Request failures are logged and never raised.
    """

    def _request() -> str:
        completion = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {
                    "role": "system",
                    "content": (
                        "You are a senior Rust systems engineer. "
                        "Return ONLY the complete, corrected Rust source file. "
                        "No markdown fences. No commentary."
                    ),
                },
                {"role": "user", "content": prompt},
            ],
            temperature=TEMPERATURE,
            max_tokens=MAX_TOKENS,
        )
        return completion.choices[0].message.content or ""

    try:
        # The client is synchronous: calling it directly inside this coroutine
        # would block the event loop for the whole request. Run it in the
        # default thread pool so the loop stays responsive.
        loop = asyncio.get_running_loop()
        raw = await loop.run_in_executor(None, _request)
        return _strip_markdown_fences(raw.strip()) or "// empty response"
    except Exception as exc:
        logger.exception("LLM call failed")
        return f"// LLM error: {exc}"


# ── Single-task episode ───────────────────────────────────────────────────────

async def run_task(task_info: dict, client: Optional[OpenAI]) -> None:
    """
    Run one task as a fully independent episode with its own env connection.

    Always emits exactly one [START]…[STEP]…[END] block, including when the
    environment cannot be created, reset or stepped: the failure is logged and
    reported as a single step at the floor score.

    Args:
        task_info: Mapping with at least ``"task_id"`` and ``"start_index"``.
        client: LLM client, or ``None`` to submit stub code instead.
    """
    task_id = task_info["task_id"]
    start_index = task_info["start_index"]

    log_start(task=task_id, env="RustCoder-v1", model=MODEL_NAME)

    rewards: List[float] = []
    steps_taken = 0
    score = 0.01
    success = False
    env = None

    try:
        # Fresh connection per task. Created inside the try so a constructor
        # failure still produces the [STEP]/[END] records.
        env = RustCoderEnv(base_url=ENV_URL)

        # ── Reset to the target task ──────────────────────────────────
        reset_result = await env.reset(start_index=start_index)
        obs = reset_result.observation

        # ── Build prompt ──────────────────────────────────────────────
        prompt = obs.problem_description or ""
        header = getattr(obs, "header_section", "")
        if header:
            prompt += (
                "\n\nHeader section (must be included verbatim):"
                f"\n```rust\n{header}\n```"
            )

        # ── Get LLM code or skip if no token ─────────────────────────
        if client is not None:
            code = await get_model_code(prompt, client)
        else:
            code = "// no HF_TOKEN — using stub"

        steps_taken = 1

        # ── Evaluate in environment ───────────────────────────────────
        step_result = await env.step(RustCoderAction(code=code))

        # Explicit None check — 0.0 is falsy but valid
        raw_reward = float(step_result.reward if step_result.reward is not None else 0.0)
        score = clamp_score(raw_reward)
        rewards.append(score)
        success = score >= SUCCESS_SCORE_THRESHOLD

        log_step(step=1, action=code, reward=score, done=True, error=None)

    except Exception as exc:
        logger.exception("Task %s failed", task_id)
        # Report the failure as exactly one step at the floor score so the
        # [STEP] number, the [END] step count and the rewards list agree, and
        # no earlier success flag survives.
        success = False
        score = 0.01
        rewards = [0.01]
        steps_taken = 1
        log_step(
            step=steps_taken,
            action="error",
            reward=0.01,
            done=True,
            error=str(exc),
        )

    finally:
        if env is not None:
            try:
                await env.close()
            except Exception:
                # Closing is best-effort: record the problem, keep going.
                logger.warning(
                    "Closing env for task %s failed", task_id, exc_info=True
                )
        log_end(success=success, steps=steps_taken, score=score, rewards=rewards)


# ── Main ──────────────────────────────────────────────────────────────────────

async def main() -> None:
    """Run every task in ``EVAL_TASKS`` sequentially, one episode per task."""
    # Build the LLM client if credentials are available
    client: Optional[OpenAI] = None
    if HF_TOKEN:
        client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
    else:
        logger.warning(
            "HF_TOKEN / API_KEY not set — LLM calls disabled. "
            "Stub code will be submitted; scores will be at floor (0.01)."
        )

    for task in EVAL_TASKS:
        await run_task(task, client)


if __name__ == "__main__":
    asyncio.run(main())