# NOTE(review): this file was captured from a running Hugging Face Space page;
# page chrome ("Spaces: Running") and table formatting were stripped on restore.
| """ | |
| inference.py β Rust Coder OpenEnv Baseline Agent | |
| Architecture | |
| ββββββββββββ | |
| β’ Runs 3 tasks (easy / medium / hard) as independent episodes. | |
| β’ Each task produces its own [START]β¦[STEP]β¦[END] log block. | |
| β’ A fresh WebSocket env connection is opened per task to avoid | |
| HF-Space WebSocket timeouts during long LLM + compilation waits. | |
| β’ Scores are clamped to (0.01, 0.99) β strictly inside (0, 1). | |
| β’ If HF_TOKEN is missing, minimal fallback blocks are emitted so | |
| the platform always receives 3 parseable task records. | |
| Required env vars | |
| βββββββββββββββββ | |
| API_BASE_URL β LLM router URL (default: HF router) | |
| MODEL_NAME β model identifier (default: Qwen 72B) | |
| HF_TOKEN β HuggingFace / API key | |
| ENV_URL β environment URL (default: http://localhost:8000) | |
| """ | |
import asyncio
import logging
import os
from typing import List, Optional

from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()
# ── Configuration ──────────────────────────────────────────────────────────────
# Every setting is env-var driven with a sensible default; `or` (rather than a
# getenv default) also replaces empty-string values.
API_BASE_URL = os.getenv("API_BASE_URL") or "https://router.huggingface.co/v1"  # LLM router endpoint
MODEL_NAME = os.getenv("MODEL_NAME") or "Qwen/Qwen2.5-72B-Instruct"  # model identifier sent to the router
HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("API_KEY")  # credential; API_KEY accepted as alias
ENV_URL = os.getenv("ENV_URL") or "http://localhost:8000"  # RustCoder environment endpoint
# A clamped score at or above this threshold marks the episode as a success.
SUCCESS_SCORE_THRESHOLD = 0.5
TEMPERATURE = 0.1  # low temperature: near-deterministic code generation
MAX_TOKENS = 1500  # cap on completion length per LLM call

# Exactly 3 tasks: easy / medium / hard (maps to problems.json indices)
EVAL_TASKS = [
    {"task_id": "task_1", "start_index": 0, "difficulty": "easy"},
    {"task_id": "task_3", "start_index": 2, "difficulty": "medium"},
    {"task_id": "task_6", "start_index": 5, "difficulty": "hard"},
]

# ── Logging ────────────────────────────────────────────────────────────────────
# LOG_LEVEL selects verbosity; an unrecognised name falls back to INFO.
_LOG_LEVEL = (os.getenv("LOG_LEVEL") or "INFO").upper()
logging.basicConfig(
    level=getattr(logging, _LOG_LEVEL, logging.INFO),
    format="%(asctime)s %(levelname)s %(name)s - %(message)s",
)
logger = logging.getLogger("rust_coder.inference")

# Project-local imports (kept below the logging setup, preserving the
# original layout — presumably so their import-time logging uses this
# config; TODO confirm).
from client import RustCoderEnv
from models import RustCoderAction
# ── Strict stdout log helpers ─────────────────────────────────────────────────
def log_start(task: str, env: str, model: str) -> None:
    """Print the [START] marker line that opens one task's log block."""
    record = f"[START] task={task} env={env} model={model}"
    print(record, flush=True)
def log_step(
    step: int,
    action: str,
    reward: float,
    done: bool,
    error: Optional[str] = None,
) -> None:
    """Print one [STEP] marker line.

    Both the action and error fields are flattened onto a single line
    (CR/LF escaped) and truncated to 200 characters so the record stays
    machine-parseable; a missing error is rendered as the literal "null".
    """
    action_str = (action or "").replace("\r", "\\r").replace("\n", "\\n")[:200]
    if error is None:
        err_field = "null"
    else:
        # Bug fix: escape "\r" as well as "\n" — the error field previously
        # let raw carriage returns through, breaking single-line parsing.
        err_field = str(error).replace("\r", "\\r").replace("\n", "\\n")[:200]
    print(
        f"[STEP] step={step} action={action_str} reward={reward:.2f} "
        f"done={str(bool(done)).lower()} error={err_field}",
        flush=True,
    )
def log_end(success: bool, steps: int, score: float, rewards: List[float]) -> None:
    """Print the [END] marker line that closes one task's log block."""
    reward_fields = [f"{r:.2f}" for r in rewards]
    record = (
        f"[END] success={str(success).lower()} steps={steps} "
        f"score={score:.3f} rewards={','.join(reward_fields)}"
    )
    print(record, flush=True)
# ── Score clamping ────────────────────────────────────────────────────────────
def clamp_score(raw: float) -> float:
    """
    Clamp *raw* to the open interval (0, 1) — never exactly 0.0 or 1.0.

    Floor 0.01: even compilation failures yield a non-zero score.
    Ceiling 0.99: prevents a theoretically-perfect submission from
    returning 1.0. The clamped value is rounded to 3 decimal places.
    """
    bounded = min(0.99, max(0.01, float(raw)))
    return round(bounded, 3)
# ── LLM call ──────────────────────────────────────────────────────────────────
async def get_model_code(prompt: str, client: OpenAI) -> str:
    """Ask the model for a complete Rust solution; strip markdown if needed.

    Args:
        prompt: problem description (plus optional header section).
        client: configured OpenAI-compatible client.

    Returns:
        The model's source text with any ``` fences stripped, the stub
        "// empty response" on a blank reply, or an "// LLM error: ..."
        comment on failure — always a submittable string, never raises.
    """
    try:
        # Bug fix: the OpenAI client is synchronous, and the original called
        # it inline, blocking the asyncio event loop for the whole HTTP
        # round-trip. Run it in a worker thread instead.
        completion = await asyncio.to_thread(
            client.chat.completions.create,
            model=MODEL_NAME,
            messages=[
                {
                    "role": "system",
                    "content": (
                        "You are a senior Rust systems engineer. "
                        "Return ONLY the complete, corrected Rust source file. "
                        "No markdown fences. No commentary."
                    ),
                },
                {"role": "user", "content": prompt},
            ],
            temperature=TEMPERATURE,
            max_tokens=MAX_TOKENS,
        )
        text = (completion.choices[0].message.content or "").strip()
        # Strip markdown fences in case the model ignored the instruction.
        if "```rust" in text:
            text = text.split("```rust")[1].split("```")[0]
        elif "```" in text:
            text = text.split("```")[1].split("```")[0]
        text = text.strip()
        return text or "// empty response"
    except Exception as exc:
        logger.exception("LLM call failed")
        # Return a Rust comment so the episode still submits something.
        return f"// LLM error: {exc}"
# ── Single-task episode ───────────────────────────────────────────────────────
async def run_task(task_info: dict, client: Optional[OpenAI]) -> None:
    """
    Run one task as a fully independent episode with its own env connection.

    Opens a fresh WebSocket connection so a slow LLM call on a previous
    task cannot cause a connection timeout here.
    Always emits exactly one [START]…[STEP]…[END] block.

    Args:
        task_info: one EVAL_TASKS entry ("task_id" and "start_index" keys).
        client: configured LLM client, or None to submit stub code.
    """
    task_id = task_info["task_id"]
    start_index = task_info["start_index"]
    log_start(task=task_id, env="RustCoder-v1", model=MODEL_NAME)
    # Defaults reported by log_end if anything below fails early.
    rewards: List[float] = []
    steps_taken = 0
    score = 0.01  # floor score — never report exactly 0.0
    success = False
    # Fresh connection per task — avoids WebSocket timeout across tasks
    env = RustCoderEnv(base_url=ENV_URL)
    try:
        # ── Reset to the target task ──────────────────────────────────
        reset_result = await env.reset(start_index=start_index)
        obs = reset_result.observation
        # ── Build prompt ──────────────────────────────────────────────
        prompt = obs.problem_description or ""
        # getattr guard: header_section may be absent on the observation
        # — presumably older env versions omit it; TODO confirm.
        header = getattr(obs, "header_section", "")
        if header:
            prompt += (
                "\n\nHeader section (must be included verbatim):"
                f"\n```rust\n{header}\n```"
            )
        # ── Get LLM code or skip if no token ─────────────────────────
        if client is not None:
            code = await get_model_code(prompt, client)
        else:
            code = "// no HF_TOKEN — using stub"
        steps_taken = 1
        # ── Evaluate in environment ───────────────────────────────────
        step_result = await env.step(RustCoderAction(code=code))
        # Explicit None check — 0.0 is falsy but valid
        raw_reward = float(step_result.reward if step_result.reward is not None else 0.0)
        score = clamp_score(raw_reward)
        rewards.append(score)
        success = score >= SUCCESS_SCORE_THRESHOLD
        log_step(step=1, action=code, reward=score, done=True, error=None)
    except Exception as exc:
        logger.exception("Task %s failed", task_id)
        # Report the floor score so the platform still receives a
        # complete, parseable record for this task.
        score = 0.01
        rewards = [0.01]
        log_step(
            step=steps_taken + 1,
            action="error",
            reward=0.01,
            done=True,
            error=str(exc),
        )
    finally:
        # Best-effort close — cleanup must never mask the episode outcome.
        try:
            await env.close()
        except Exception:
            pass
        log_end(success=success, steps=steps_taken, score=score, rewards=rewards)
# ── Main ──────────────────────────────────────────────────────────────────────
async def main() -> None:
    """Entry point: run every EVAL_TASKS episode with one shared LLM client."""
    # The client is built once and reused; None means "no credentials".
    client: Optional[OpenAI] = (
        OpenAI(base_url=API_BASE_URL, api_key=HF_TOKEN) if HF_TOKEN else None
    )
    if client is None:
        logger.warning(
            "HF_TOKEN / API_KEY not set — LLM calls disabled. "
            "Stub code will be submitted; scores will be at floor (0.01)."
        )
    for task in EVAL_TASKS:
        await run_task(task, client)


if __name__ == "__main__":
    asyncio.run(main())