# NOTE(review): this file was captured from a running Hugging Face Space page;
# page chrome ("Spaces: Running") and table formatting were stripped on restore.
| """ | |
| inference.py β Rust Coder OpenEnv Baseline Agent | |
| Architecture | |
| ββββββββββββ | |
| β’ Runs 3 tasks (easy / medium / hard) as independent episodes. | |
| β’ Each task produces its own [START]β¦[STEP]β¦[END] log block. | |
| β’ A fresh WebSocket env connection is opened per task to avoid | |
| HF-Space WebSocket timeouts during long LLM + compilation waits. | |
| β’ Scores are clamped to (0.01, 0.99) β strictly inside (0, 1). | |
| β’ If HF_TOKEN is missing, minimal fallback blocks are emitted so | |
| the platform always receives 3 parseable task records. | |
| Required env vars | |
| βββββββββββββββββ | |
| API_BASE_URL β LLM router URL (default: HF router) | |
| MODEL_NAME β model identifier (default: Qwen 72B) | |
| HF_TOKEN β HuggingFace / API key | |
| ENV_URL β environment URL (default: http://localhost:8000) | |
| """ | |
import asyncio
import logging
import os
from typing import List, Optional

from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()
# ── Configuration ──────────────────────────────────────────────────────────────
# Every setting is env-var driven with a sensible default; `or` (rather than a
# getenv default) also replaces empty-string values.
API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"  # LLM router endpoint
MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"  # model identifier sent to the router
HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("API_KEY")  # credential; API_KEY accepted as alias
ENV_URL = os.getenv("ENV_URL") or "http://localhost:8000"  # RustCoder environment endpoint
# A clamped score at or above this threshold marks the episode as a success.
SUCCESS_SCORE_THRESHOLD = 0.5
TEMPERATURE = 0.1  # low temperature: near-deterministic code generation
MAX_TOKENS = 1500  # cap on completion length per LLM call

# Exactly 3 tasks: easy / medium / hard (maps to problems.json indices)
EVAL_TASKS = [
    {"task_id": "task_1", "start_index": 0, "difficulty": "easy"},
    {"task_id": "task_3", "start_index": 2, "difficulty": "medium"},
    {"task_id": "task_6", "start_index": 5, "difficulty": "hard"},
]

# ── Logging ────────────────────────────────────────────────────────────────────
# LOG_LEVEL selects verbosity; an unrecognised name falls back to INFO.
_LOG_LEVEL = (os.getenv("LOG_LEVEL") or "INFO").upper()
logging.basicConfig(
    level=getattr(logging, _LOG_LEVEL, logging.INFO),
    format="%(asctime)s %(levelname)s %(name)s - %(message)s",
)
logger = logging.getLogger("rust_coder.inference")

# Project-local imports (kept below the logging setup, preserving the
# original layout — presumably so their import-time logging uses this
# config; TODO confirm).
from client import RustCoderEnv
from models import RustCoderAction
# ── Strict stdout log helpers ─────────────────────────────────────────────────
def log_start(task: str, env: str, model: str) -> None:
    """Print the [START] marker line that opens one task's log block."""
    record = f"[START] task={task} env={env} model={model}"
    print(record, flush=True)
def log_step(
    step: int,
    action: str,
    reward: float,
    done: bool,
    error: Optional[str] = None,
) -> None:
    """Print one [STEP] marker line.

    Both the action and error fields are flattened onto a single line
    (CR/LF escaped) and truncated to 200 characters so the record stays
    machine-parseable; a missing error is rendered as the literal "null".
    """
    action_str = (action or "").replace("\r", "\\r").replace("\n", "\\n")[:200]
    if error is None:
        err_field = "null"
    else:
        # Bug fix: escape "\r" as well as "\n" — the error field previously
        # let raw carriage returns through, breaking single-line parsing.
        err_field = str(error).replace("\r", "\\r").replace("\n", "\\n")[:200]
    print(
        f"[STEP] step={step} action={action_str} reward={reward:.2f} "
        f"done={str(bool(done)).lower()} error={err_field}",
        flush=True,
    )
def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
    """Print the [END] marker line that closes one task's log block."""
    reward_fields = [f"{r:.2f}" for r in rewards]
    record = (
        f"[END] success={str(success).lower()} steps={steps} "
        f"score={score:.3f} rewards={','.join(reward_fields)}"
    )
    print(record, flush=True)
# ── Score clamping ────────────────────────────────────────────────────────────
def clamp_score(raw: float) -> float:
    """
    Clamp *raw* to the open interval (0, 1) — never exactly 0.0 or 1.0.

    Floor 0.01: even compilation failures yield a non-zero score.
    Ceiling 0.99: prevents a theoretically-perfect submission from
    returning 1.0. The clamped value is rounded to 3 decimal places.
    """
    bounded = min(0.99, max(0.01, float(raw)))
    return round(bounded, 3)
# ── LLM call ──────────────────────────────────────────────────────────────────
async def get_model_code(prompt: str, client: OpenAI) -> str:
    """Ask the model for a complete Rust solution; strip markdown if needed.

    Args:
        prompt: problem description (plus optional header section).
        client: configured OpenAI-compatible client.

    Returns:
        The model's source text with any ``` fences stripped, the stub
        "// empty response" on a blank reply, or an "// LLM error: ..."
        comment on failure — always a submittable string, never raises.
    """
    try:
        # Bug fix: the OpenAI client is synchronous, and the original called
        # it inline, blocking the asyncio event loop for the whole HTTP
        # round-trip. Run it in a worker thread instead.
        completion = await asyncio.to_thread(
            client.chat.completions.create,
            model=MODEL_NAME,
            messages=[
                {
                    "role": "system",
                    "content": (
                        "You are a senior Rust systems engineer. "
                        "Return ONLY the complete, corrected Rust source file. "
                        "No markdown fences. No commentary."
                    ),
                },
                {"role": "user", "content": prompt},
            ],
            temperature=TEMPERATURE,
            max_tokens=MAX_TOKENS,
        )
        text = (completion.choices[0].message.content or "").strip()
        # Strip markdown fences in case the model ignored the instruction.
        if "```rust" in text:
            text = text.split("```rust")[1].split("```")[0]
        elif "```" in text:
            text = text.split("```")[1].split("```")[0]
        text = text.strip()
        return text or "// empty response"
    except Exception as exc:
        logger.exception("LLM call failed")
        # Return a Rust comment so the episode still submits something.
        return f"// LLM error: {exc}"
# ── Single-task episode ───────────────────────────────────────────────────────
async def run_task(task_info: dict, client: Optional[OpenAI]) -> None:
    """
    Run one task as a fully independent episode with its own env connection.

    Opens a fresh WebSocket connection so a slow LLM call on a previous
    task cannot cause a connection timeout here.
    Always emits exactly one [START]…[STEP]…[END] block.

    Args:
        task_info: one EVAL_TASKS entry ("task_id" and "start_index" keys).
        client: configured LLM client, or None to submit stub code.
    """
    task_id = task_info["task_id"]
    start_index = task_info["start_index"]
    log_start(task=task_id, env="RustCoder-v1", model=MODEL_NAME)
    # Defaults reported by log_end if anything below fails early.
    rewards: List[float] = []
    steps_taken = 0
    score = 0.01  # floor score — never report exactly 0.0
    success = False
    # Fresh connection per task — avoids WebSocket timeout across tasks
    env = RustCoderEnv(base_url=ENV_URL)
    try:
        # ── Reset to the target task ──────────────────────────────────
        reset_result = await env.reset(start_index=start_index)
        obs = reset_result.observation
        # ── Build prompt ──────────────────────────────────────────────
        prompt = obs.problem_description or ""
        # getattr guard: header_section may be absent on the observation
        # — presumably older env versions omit it; TODO confirm.
        header = getattr(obs, "header_section", "")
        if header:
            prompt += (
                "\n\nHeader section (must be included verbatim):"
                f"\n```rust\n{header}\n```"
            )
        # ── Get LLM code or skip if no token ─────────────────────────
        if client is not None:
            code = await get_model_code(prompt, client)
        else:
            code = "// no HF_TOKEN — using stub"
        steps_taken = 1
        # ── Evaluate in environment ───────────────────────────────────
        step_result = await env.step(RustCoderAction(code=code))
        # Explicit None check — 0.0 is falsy but valid
        raw_reward = float(step_result.reward if step_result.reward is not None else 0.0)
        score = clamp_score(raw_reward)
        rewards.append(score)
        success = score >= SUCCESS_SCORE_THRESHOLD
        log_step(step=1, action=code, reward=score, done=True, error=None)
    except Exception as exc:
        logger.exception("Task %s failed", task_id)
        # Report the floor score so the platform still receives a
        # complete, parseable record for this task.
        score = 0.01
        rewards = [0.01]
        log_step(
            step=steps_taken + 1,
            action="error",
            reward=0.01,
            done=True,
            error=str(exc),
        )
    finally:
        # Best-effort close — cleanup must never mask the episode outcome.
        try:
            await env.close()
        except Exception:
            pass
        log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
# ── Main ──────────────────────────────────────────────────────────────────────
async def main() -> None:
    """Entry point: run every EVAL_TASKS episode with one shared LLM client."""
    # The client is built once and reused; None means "no credentials".
    client: Optional[OpenAI] = (
        OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN) if HF_TOKEN else None
    )
    if client is None:
        logger.warning(
            "HF_TOKEN / API_KEY not set — LLM calls disabled. "
            "Stub code will be submitted; scores will be at floor (0.01)."
        )
    for task in EVAL_TASKS:
        await run_task(task, client)


if __name__ == "__main__":
    asyncio.run(main())