# rust_coder / inference.py
# Provenance (Hugging Face Hub page residue): uploaded by Parthiban007 via
# "Upload folder using huggingface_hub", revision efe528e (verified).
"""
inference.py β€” Rust Coder OpenEnv Baseline Agent
Architecture
────────────
β€’ Runs 3 tasks (easy / medium / hard) as independent episodes.
β€’ Each task produces its own [START]…[STEP]…[END] log block.
β€’ A fresh WebSocket env connection is opened per task to avoid
HF-Space WebSocket timeouts during long LLM + compilation waits.
β€’ Scores are clamped to (0.01, 0.99) β€” strictly inside (0, 1).
β€’ If HF_TOKEN is missing, minimal fallback blocks are emitted so
the platform always receives 3 parseable task records.
Required env vars
─────────────────
API_BASE_URL β€” LLM router URL (default: HF router)
MODEL_NAME β€” model identifier (default: Qwen 72B)
HF_TOKEN β€” HuggingFace / API key
ENV_URL β€” environment URL (default: http://localhost:8000)
"""
import os
import asyncio
import logging
from typing import List, Optional

from openai import OpenAI
from dotenv import load_dotenv

# Populate os.environ from a local .env file (no-op when the file is absent).
load_dotenv()
# ── Configuration ─────────────────────────────────────────────────────────────
# `or` (rather than a getenv default) also applies the fallback when the
# variable is set but empty.
API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
# Either HF_TOKEN or API_KEY is accepted as the credential.
HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("API_KEY")
ENV_URL = os.getenv("ENV_URL") or "http://localhost:8000"
# A clamped episode score at or above this threshold counts as success.
SUCCESS_SCORE_THRESHOLD = 0.5
# Low temperature: deterministic, compilable code beats creative code here.
TEMPERATURE = 0.1
# Upper bound on generated Rust source length, in tokens.
MAX_TOKENS = 1500
# Exactly 3 tasks: easy / medium / hard (maps to problems.json indices)
EVAL_TASKS = [
    {"task_id": "task_1", "start_index": 0, "difficulty": "easy"},
    {"task_id": "task_3", "start_index": 2, "difficulty": "medium"},
    {"task_id": "task_6", "start_index": 5, "difficulty": "hard"},
]
# ── Logging ───────────────────────────────────────────────────────────────────
# LOG_LEVEL is a standard level name ("DEBUG", "INFO", ...); unrecognized
# values fall back to INFO via the getattr default below.
_LOG_LEVEL = (os.getenv("LOG_LEVEL") or "INFO").upper()
logging.basicConfig(
    level=getattr(logging, _LOG_LEVEL, logging.INFO),
    format="%(asctime)s %(levelname)s %(name)s - %(message)s",
)
logger = logging.getLogger("rust_coder.inference")

# Project-local imports: environment client and action model.
# NOTE(review): placed after basicConfig, presumably so any import-time log
# output from these modules uses the format above -- confirm before moving.
from client import RustCoderEnv
from models import RustCoderAction
# ── Strict stdout log helpers ─────────────────────────────────────────────────
def log_start(task: str, env: str, model: str) -> None:
    """Emit the strict [START] marker that opens one task's log block."""
    line = "[START] task={} env={} model={}".format(task, env, model)
    print(line, flush=True)
def log_step(
    step: int,
    action: str,
    reward: float,
    done: bool,
    error: Optional[str] = None,
) -> None:
    """Emit one strict single-line [STEP] record.

    Both the ``action`` and ``error`` fields are truncated to 200 characters
    and have CR/LF escaped so the record can never span multiple lines.

    Fix: the error field previously escaped only "\\n", not "\\r" (unlike
    the action field), so a carriage return in an error message could split
    the record and break downstream log parsing.
    """

    def _sanitize(text: str) -> str:
        # One-line-safe, bounded representation of free-form text.
        return text.replace("\r", "\\r").replace("\n", "\\n")[:200]

    action_str = _sanitize(action or "")
    err_field = "null" if error is None else _sanitize(str(error))
    print(
        f"[STEP] step={step} action={action_str} reward={reward:.2f} "
        f"done={str(bool(done)).lower()} error={err_field}",
        flush=True,
    )
def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
    """Emit the strict [END] marker that closes one task's log block."""
    joined_rewards = ",".join("{:.2f}".format(r) for r in rewards)
    record = (
        f"[END] success={str(success).lower()} steps={steps} "
        f"score={score:.3f} rewards={joined_rewards}"
    )
    print(record, flush=True)
# ── Score clamping ────────────────────────────────────────────────────────────
def clamp_score(raw: float) -> float:
    """
    Force ``raw`` into [0.01, 0.99] and round to three decimal places.

    The 0.01 floor guarantees even compilation failures score above zero;
    the 0.99 ceiling keeps a theoretically-perfect submission from ever
    reporting exactly 1.0 -- results stay strictly inside (0, 1).
    """
    value = min(0.99, float(raw))
    value = max(0.01, value)
    return round(value, 3)
# ── LLM call ─────────────────────────────────────────────────────────────────
async def get_model_code(prompt: str, client: OpenAI) -> str:
    """Ask the model for a complete Rust solution; strip markdown if needed.

    Parameters
    ----------
    prompt : str
        Full task prompt (problem description plus any required header).
    client : OpenAI
        Configured OpenAI-compatible chat client.

    Returns
    -------
    str
        Extracted Rust source with any markdown fences removed, or a
        ``// ...`` placeholder comment on an empty response / API failure,
        so the caller always has something to submit.
    """
    try:
        # Fix: the OpenAI SDK call is synchronous and was previously invoked
        # directly inside this coroutine, blocking the event loop for the
        # whole LLM round-trip -- exactly the stall the per-task WebSocket
        # reconnect strategy tries to guard against. Run it in a worker
        # thread so the loop stays responsive.
        completion = await asyncio.to_thread(
            client.chat.completions.create,
            model=MODEL_NAME,
            messages=[
                {
                    "role": "system",
                    "content": (
                        "You are a senior Rust systems engineer. "
                        "Return ONLY the complete, corrected Rust source file. "
                        "No markdown fences. No commentary."
                    ),
                },
                {"role": "user", "content": prompt},
            ],
            temperature=TEMPERATURE,
            max_tokens=MAX_TOKENS,
        )
        text = (completion.choices[0].message.content or "").strip()
        # Models sometimes ignore the "no fences" instruction; unwrap a
        # ```rust ... ``` (or bare ``` ... ```) fence if one is present.
        if "```rust" in text:
            text = text.split("```rust")[1].split("```")[0]
        elif "```" in text:
            text = text.split("```")[1].split("```")[0]
        text = text.strip()
        return text or "// empty response"
    except Exception as exc:
        # Never raise: log, then return a comment so the episode proceeds
        # and gets scored (at the floor) instead of aborting.
        logger.exception("LLM call failed")
        return f"// LLM error: {exc}"
# ── Single-task episode ───────────────────────────────────────────────────────
async def run_task(task_info: dict, client: Optional[OpenAI]) -> None:
    """
    Execute one task as a fully isolated episode on its own connection.

    A dedicated WebSocket connection is created per task so a slow LLM call
    during an earlier task can never time out this one. Exactly one
    [START]...[STEP]...[END] log block is emitted no matter what happens.
    """
    task_id = task_info["task_id"]
    start_index = task_info["start_index"]
    log_start(task=task_id, env="RustCoder-v1", model=MODEL_NAME)

    episode_rewards: List[float] = []
    steps_taken = 0
    score = 0.01
    success = False

    # One connection per episode -- sidesteps cross-task WebSocket timeouts.
    env = RustCoderEnv(base_url=ENV_URL)
    try:
        # Position the environment on the requested problem.
        reset_result = await env.reset(start_index=start_index)
        obs = reset_result.observation

        # Assemble the prompt: problem text plus an optional header section.
        prompt = obs.problem_description or ""
        header = getattr(obs, "header_section", "")
        if header:
            prompt += (
                "\n\nHeader section (must be included verbatim):"
                f"\n```rust\n{header}\n```"
            )

        # Obtain code from the LLM, or a stub when credentials are missing.
        if client is None:
            code = "// no HF_TOKEN β€” using stub"
        else:
            code = await get_model_code(prompt, client)
        steps_taken = 1

        # Submit the code for compilation / evaluation.
        step_result = await env.step(RustCoderAction(code=code))
        # reward may legitimately be 0.0 (falsy), so test for None explicitly.
        raw_reward = 0.0 if step_result.reward is None else float(step_result.reward)
        score = clamp_score(raw_reward)
        episode_rewards.append(score)
        success = score >= SUCCESS_SCORE_THRESHOLD
        log_step(step=1, action=code, reward=score, done=True, error=None)
    except Exception as exc:
        logger.exception("Task %s failed", task_id)
        score = 0.01
        episode_rewards = [0.01]
        log_step(
            step=steps_taken + 1,
            action="error",
            reward=0.01,
            done=True,
            error=str(exc),
        )
    finally:
        try:
            await env.close()
        except Exception:
            pass
        log_end(success=success, steps=steps_taken, score=score, rewards=episode_rewards)
# ── Main ──────────────────────────────────────────────────────────────────────
async def main() -> None:
    """Build the optional LLM client, then run every eval task in order."""
    if HF_TOKEN:
        llm_client: Optional[OpenAI] = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
    else:
        # No credentials: episodes still run, submitting stub code.
        llm_client = None
        logger.warning(
            "HF_TOKEN / API_KEY not set β€” LLM calls disabled. "
            "Stub code will be submitted; scores will be at floor (0.01)."
        )
    for eval_task in EVAL_TASKS:
        await run_task(eval_task, llm_client)
if __name__ == "__main__":
asyncio.run(main())