"""
inference.py — Rust Coder OpenEnv Baseline Agent

Architecture
────────────
• Runs 3 tasks (easy / medium / hard) as independent episodes.
• Each task produces its own [START]…[STEP]…[END] log block.
• A fresh WebSocket env connection is opened per task to avoid
  HF-Space WebSocket timeouts during long LLM + compilation waits.
• Scores are clamped to (0.01, 0.99) — strictly inside (0, 1).
• If HF_TOKEN is missing, minimal fallback blocks are emitted so
  the platform always receives 3 parseable task records.

Required env vars
─────────────────
API_BASE_URL — LLM router URL (default: HF router)
MODEL_NAME   — model identifier (default: Qwen 72B)
HF_TOKEN     — HuggingFace / API key
ENV_URL      — environment URL (default: http://localhost:8000)
"""
# Standard-library imports.
import os
import asyncio
import logging
from typing import List, Optional
# Third-party imports.
from openai import OpenAI
from dotenv import load_dotenv
# Load variables from a local .env file (no-op when the file is absent).
load_dotenv()
# -- Configuration -------------------------------------------------------------
# "or" fallbacks (instead of getenv defaults) also replace empty-string values.
API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"
MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"
HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("API_KEY")  # either var accepted
ENV_URL = os.getenv("ENV_URL") or "http://localhost:8000"
# A clamped episode score at or above this threshold counts as success.
SUCCESS_SCORE_THRESHOLD = 0.5
TEMPERATURE = 0.1  # low temperature for more deterministic code generation
MAX_TOKENS = 1500  # cap on LLM completion length
# Exactly 3 tasks: easy / medium / hard (maps to problems.json indices)
EVAL_TASKS = [
    {"task_id": "task_1", "start_index": 0, "difficulty": "easy"},
    {"task_id": "task_3", "start_index": 2, "difficulty": "medium"},
    {"task_id": "task_6", "start_index": 5, "difficulty": "hard"},
]
# -- Logging -------------------------------------------------------------------
_LOG_LEVEL = (os.getenv("LOG_LEVEL") or "INFO").upper()
logging.basicConfig(
    # Fall back to INFO when LOG_LEVEL names an unknown level.
    level=getattr(logging, _LOG_LEVEL, logging.INFO),
    format="%(asctime)s %(levelname)s %(name)s - %(message)s",
)
logger = logging.getLogger("rust_coder.inference")
# Project-local imports (environment client and action model).
from client import RustCoderEnv
from models import RustCoderAction
# -- Strict stdout log helpers -------------------------------------------------
def log_start(task: str, env: str, model: str) -> None:
    """Emit the [START] marker line that opens a task episode."""
    fields = f"task={task} env={env} model={model}"
    print("[START] " + fields, flush=True)
def log_step(
    step: int,
    action: str,
    reward: float,
    done: bool,
    error: Optional[str] = None,
) -> None:
    """Emit one [STEP] marker line.

    The action and error fields are truncated to 200 chars and have CR/LF
    escaped so each record stays on a single parseable line.

    Args:
        step: 1-based step index within the episode.
        action: code (or marker string) submitted at this step.
        reward: clamped reward for this step.
        done: whether the episode ended at this step.
        error: error message, or None (logged as the literal "null").
    """
    action_str = (action or "").replace("\r", "\\r").replace("\n", "\\n")[:200]
    if error is None:
        err_field = "null"
    else:
        # Escape \r as well as \n (consistent with the action field) so a
        # carriage return in an error message cannot break the log format.
        err_field = str(error).replace("\r", "\\r").replace("\n", "\\n")[:200]
    print(
        f"[STEP] step={step} action={action_str} reward={reward:.2f} "
        f"done={str(bool(done)).lower()} error={err_field}",
        flush=True,
    )
def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
    """Emit the [END] marker line that closes a task episode."""
    formatted = [f"{value:.2f}" for value in rewards]
    summary = (
        f"[END] success={str(success).lower()} steps={steps} "
        f"score={score:.3f} rewards={','.join(formatted)}"
    )
    print(summary, flush=True)
# -- Score clamping ------------------------------------------------------------
def clamp_score(raw: float) -> float:
    """
    Clamp *raw* to the open interval (0, 1), rounded to 3 decimals.

    Floor 0.01: even compilation failures yield a non-zero score.
    Ceiling 0.99: a theoretically-perfect submission never returns 1.0.
    """
    value = float(raw)
    if value < 0.01:
        value = 0.01
    elif value > 0.99:
        value = 0.99
    return round(value, 3)
# -- LLM call ------------------------------------------------------------------
async def get_model_code(prompt: str, client: OpenAI) -> str:
    """Ask the model for a complete Rust solution; strip markdown if needed.

    Args:
        prompt: full task prompt (problem description + optional header).
        client: configured OpenAI-compatible client.

    Returns:
        The extracted Rust source text, or a "// ..." comment string on an
        empty response or any API failure — callers always get submittable code.
    """
    try:
        # The OpenAI SDK call is synchronous; run it in a worker thread so the
        # asyncio event loop (and the per-task env WebSocket) is not blocked
        # while the model generates.
        completion = await asyncio.to_thread(
            client.chat.completions.create,
            model=MODEL_NAME,
            messages=[
                {
                    "role": "system",
                    "content": (
                        "You are a senior Rust systems engineer. "
                        "Return ONLY the complete, corrected Rust source file. "
                        "No markdown fences. No commentary."
                    ),
                },
                {"role": "user", "content": prompt},
            ],
            temperature=TEMPERATURE,
            max_tokens=MAX_TOKENS,
        )
        text = (completion.choices[0].message.content or "").strip()
        # Models sometimes wrap the code in fences despite the system prompt;
        # keep only the first fenced block when one is present.
        if "```rust" in text:
            text = text.split("```rust")[1].split("```")[0]
        elif "```" in text:
            text = text.split("```")[1].split("```")[0]
        text = text.strip()
        return text or "// empty response"
    except Exception as exc:
        # Never raise: return a comment so the episode can still submit code.
        logger.exception("LLM call failed")
        return f"// LLM error: {exc}"
# -- Single-task episode -------------------------------------------------------
async def run_task(task_info: dict, client: Optional[OpenAI]) -> None:
    """
    Run one task as a fully independent episode with its own env connection.

    Opens a fresh WebSocket connection so a slow LLM call on a previous
    task cannot cause a connection timeout here.
    Always emits exactly one [START]...[STEP]...[END] block, even on failure.

    Args:
        task_info: one EVAL_TASKS entry ("task_id", "start_index", "difficulty").
        client: configured LLM client, or None to submit stub code.
    """
    task_id = task_info["task_id"]
    start_index = task_info["start_index"]
    log_start(task=task_id, env="RustCoder-v1", model=MODEL_NAME)
    rewards: List[float] = []
    steps_taken = 0
    # Defaults reported when the episode fails before a score is computed.
    score = 0.01
    success = False
    # Fresh connection per task -- avoids WebSocket timeout across tasks
    env = RustCoderEnv(base_url=ENV_URL)
    try:
        # -- Reset to the target task ---------------------------------
        reset_result = await env.reset(start_index=start_index)
        obs = reset_result.observation
        # -- Build prompt ---------------------------------------------
        prompt = obs.problem_description or ""
        # header_section may be absent on the observation; default to "".
        header = getattr(obs, "header_section", "")
        if header:
            prompt += (
                "\n\nHeader section (must be included verbatim):"
                f"\n```rust\n{header}\n```"
            )
        # -- Get LLM code or skip if no token -------------------------
        if client is not None:
            code = await get_model_code(prompt, client)
        else:
            code = "// no HF_TOKEN β using stub"
        steps_taken = 1
        # -- Evaluate in environment ----------------------------------
        step_result = await env.step(RustCoderAction(code=code))
        # Explicit None check -- 0.0 is falsy but valid
        raw_reward = float(step_result.reward if step_result.reward is not None else 0.0)
        score = clamp_score(raw_reward)
        rewards.append(score)
        success = score >= SUCCESS_SCORE_THRESHOLD
        log_step(step=1, action=code, reward=score, done=True, error=None)
    except Exception as exc:
        # Any failure (reset, LLM, step) still yields a parseable record
        # with the floor score.
        logger.exception("Task %s failed", task_id)
        score = 0.01
        rewards = [0.01]
        log_step(
            step=steps_taken + 1,
            action="error",
            reward=0.01,
            done=True,
            error=str(exc),
        )
    finally:
        # Best-effort close; never let cleanup mask the episode outcome.
        try:
            await env.close()
        except Exception:
            pass
    log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
# -- Main ----------------------------------------------------------------------
async def main() -> None:
    """Build the LLM client (when credentials exist) and run every task."""
    llm_client: Optional[OpenAI] = None
    if HF_TOKEN:
        llm_client = OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN)
    else:
        logger.warning(
            "HF_TOKEN / API_KEY not set β LLM calls disabled. "
            "Stub code will be submitted; scores will be at floor (0.01)."
        )
    # Tasks run strictly sequentially, each as an isolated episode.
    for task_info in EVAL_TASKS:
        await run_task(task_info, llm_client)
# Script entry point: run the three evaluation episodes once and exit.
if __name__ == "__main__":
    asyncio.run(main())