""" Inference Script — VeriRL Verilog Hardware Design Environment =================================== MANDATORY - Before submitting, ensure the following variables are defined in your environment configuration: API_BASE_URL The API endpoint for the LLM. MODEL_NAME The model identifier to use for inference. HF_TOKEN Your Hugging Face / API key. ENV_BASE_URL The base URL of the running VeriRL environment server. - Defaults: API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1") MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct") ENV_BASE_URL = os.getenv("ENV_BASE_URL", "http://localhost:8000") - The inference script must be named `inference.py` and placed in the root directory of the project - Participants must use OpenAI Client for all LLM calls using above variables STDOUT FORMAT - The script must emit exactly three line types to stdout, in this order: [START] task= env= model= [STEP] step= action= reward=<0.00> done= error= [END] success= steps= score= rewards= Rules: - One [START] line at episode begin. - One [STEP] line per step, immediately after env.step() returns. - One [END] line after env.close(), always emitted (even on exception). - reward and rewards are formatted to 2 decimal places. - done and success are lowercase booleans: true or false. - error is the raw last_action_error string, or null if none. - All fields on a single line with no newlines within a line. - Each task's score must be strictly in (0, 1). 
Example: [START] task=mac_unit env=verirl model=Qwen2.5-72B-Instruct [STEP] step=1 action=write_file(312chars) reward=0.02 done=false error=null [STEP] step=2 action=run_compile reward=0.07 done=false error=null [STEP] step=3 action=submit reward=0.05 done=true error=null [END] success=true steps=3 score=0.47 rewards=0.02,0.07,0.05 """ import asyncio import json import os import sys import textwrap import time from typing import List, Optional from dotenv import load_dotenv from openai import OpenAI from verirl_env import VerirlAction, verirl_env load_dotenv() # --- Configuration --- API_KEY = os.getenv("OPENAI_API_KEY") or os.getenv("HF_TOKEN") or os.getenv("API_KEY") API_BASE_URL = os.getenv("API_BASE_URL", "https://router.huggingface.co/v1") MODEL_NAME = os.getenv("MODEL_NAME", "Qwen/Qwen2.5-72B-Instruct") ENV_BASE_URL = os.getenv("ENV_BASE_URL", "http://localhost:8000") BENCHMARK = "verirl" # Per-task wall-clock budgets (seconds) and success threshold TASK_BUDGETS: dict[str, int] = { # easy "mac_unit": 4 * 60, "relu_clip": 3 * 60, "barrel_shifter": 3 * 60, # medium "axi_fifo": 6 * 60, "register_file": 5 * 60, "ring_buffer": 6 * 60, "dot_product": 5 * 60, "fir_filter": 6 * 60, # hard "systolic_array": 8 * 60, "fp16_adder": 10 * 60, } SUCCESS_SCORE_THRESHOLD = 0.5 # final_score in [0, 1] SYSTEM_PROMPT = textwrap.dedent( """ You are an expert RTL hardware designer. Implement the given Verilog specification correctly. REQUIRED WORKFLOW — follow this sequence every episode: 1. write_file — write a complete, synthesizable Verilog module. For multi-module designs use separate files: {"action_type": "write_file", "filename": "pe.v", "verilog_src": "..."} {"action_type": "write_file", "filename": "top.v", "verilog_src": "..."} 2. run_compile — check all files for syntax errors; fix with write_file if needed 3. run_sim — run the testbench; read every PASS/FAIL line; fix failures with write_file 4. 
(optional) run_formal — check formal properties if available; fix any counterexamples 5. (optional) run_synth — check area against reference cell count 6. submit — only after attempting compile and sim NEVER submit without first running run_compile and run_sim. Available actions — respond with exactly one JSON object, no markdown: {"action_type": "write_file", "filename": "design.v", "verilog_src": "", "message": "..."} {"action_type": "run_compile", "message": "checking syntax"} {"action_type": "run_sim", "message": "running testbench"} {"action_type": "run_synth", "message": "checking area"} {"action_type": "run_formal", "message": "checking formal properties"} {"action_type": "list_files", "message": "show written files"} {"action_type": "submit", "message": "final submission"} Design rules: - No `initial` blocks in the design module (testbench only) - Use always @(posedge clk) for sequential logic - Fully combinational modules: use assign or always @(*) - Pay close attention to pipeline depth, pipeline registers, and timing requirements - For tasks requiring multiple modules: use separate write_file calls with different filenames """ ).strip() # --------------------------------------------------------------------------- # Logging helpers # --------------------------------------------------------------------------- def log_start(task: str, model: str) -> None: print(f"[START] task={task} env={BENCHMARK} model={model}", flush=True) def log_step( step: int, action: str, reward: float, done: bool, error: Optional[str] ) -> None: print( f"[STEP] step={step} action={action} reward={reward:.2f} " f"done={str(done).lower()} error={error or 'null'}", flush=True, ) def log_end(success: bool, steps: int, rewards: List[float], final_score: float) -> None: rewards_str = ",".join(f"{r:.2f}" for r in rewards) print( f"[END] success={str(success).lower()} steps={steps} score={final_score:.2f} rewards={rewards_str}", flush=True, ) # 
# ---------------------------------------------------------------------------
# Agent helpers
# ---------------------------------------------------------------------------

# Bounds applied by safe_score(); the validator requires scores strictly
# inside (0, 1), so both ends are pulled in by one hundredth.
_SCORE_FLOOR = 0.01
_SCORE_CEIL = 0.99

# Upper bound on steps per episode; episodes normally end via the done flag.
_MAX_STEPS = 99


def safe_score(raw) -> float:
    """Clamp any score/reward to the open interval (0, 1) required by the validator.

    Args:
        raw: Value to clamp. Anything ``float()`` accepts is clamped to
            [0.01, 0.99] and rounded to two decimals.

    Returns:
        The clamped value. ``None``, NaN and values that cannot be converted
        to ``float`` all map to the 0.01 floor instead of raising or leaking
        NaN into the output.
    """
    try:
        value = float(raw)
    except (TypeError, ValueError):
        return _SCORE_FLOOR
    if value != value:  # NaN compares unequal to itself and defeats min()/max()
        return _SCORE_FLOOR
    return round(min(max(value, _SCORE_FLOOR), _SCORE_CEIL), 2)


def sanitize_error(error_str: Optional[str], max_len: int = 80) -> Optional[str]:
    """Turn an error message into a short single-line string.

    Args:
        error_str: Raw error text; may be empty or ``None``.
        max_len: Maximum length of the returned string.

    Returns:
        The message with every whitespace run (including newlines) collapsed
        to one space and truncated to ``max_len`` characters, or ``None`` when
        nothing but whitespace remains.
    """
    if not error_str:
        return None
    sanitized = " ".join(error_str.split())
    return sanitized[:max_len] or None


def format_observation(obs) -> str:
    """Format a VerirlObservation as a readable context block for the LLM.

    Args:
        obs: Observation exposing ``task_spec``, ``tool_stdout``,
            ``tool_stderr``, ``compile_ok``, ``tests_passed``, ``tests_total``,
            ``turn_number`` and ``turns_remaining``. ``current_files`` and the
            ``formal_properties_*`` attributes are optional.

    Returns:
        The non-empty sections joined by blank lines, ending with a one-line
        status summary.
    """
    parts = []
    if obs.task_spec:
        parts.append(f"TASK SPECIFICATION:\n{obs.task_spec}")
    if obs.tool_stdout:
        parts.append(f"TOOL OUTPUT:\n{obs.tool_stdout}")
    if obs.tool_stderr:
        parts.append(f"ERRORS:\n{obs.tool_stderr}")

    # Multi-file project summary
    current_files = getattr(obs, "current_files", None)
    if current_files:
        file_summary = ", ".join(
            f"{name}({len(src)}chars)" for name, src in sorted(current_files.items())
        )
        parts.append(f"Files on disk: {file_summary}")

    # Formal verification status (only shown when the observation reports it)
    formal_proven = getattr(obs, "formal_properties_proven", None)
    formal_total = getattr(obs, "formal_properties_total", None)
    formal_str = (
        f" | formal={formal_proven}/{formal_total}" if formal_proven is not None else ""
    )

    parts.append(
        f"Status: compile={'OK' if obs.compile_ok else 'FAIL'} | "
        f"tests={obs.tests_passed}/{obs.tests_total}"
        f"{formal_str} | "
        f"turn={obs.turn_number} | remaining={obs.turns_remaining}"
    )
    return "\n\n".join(parts)


def parse_action(response_text: str) -> "tuple[VerirlAction, Optional[str]]":
    """Extract a JSON action from the LLM response, handling markdown fences.

    Args:
        response_text: Raw assistant message.

    Returns:
        ``(action, parse_error)``. ``parse_error`` is ``None`` on success.
        When no JSON object can be decoded or the action cannot be built, the
        action is a ``submit`` with message ``"parse error"`` and
        ``parse_error`` describes the failure.
    """
    text = response_text.strip()
    if "```json" in text:
        text = text.split("```json")[1].split("```")[0].strip()
    elif "```" in text:
        text = text.split("```")[1].split("```")[0].strip()

    start = text.find("{")
    if start < 0:
        return (
            VerirlAction(action_type="submit", message="parse error"),
            "parse_error: no JSON found",
        )

    try:
        # raw_decode tolerates trailing text after the first JSON object.
        data, _ = json.JSONDecoder().raw_decode(text, start)
        valid_fields = VerirlAction.model_fields
        # Drop keys the action model does not declare.
        known = {key: value for key, value in data.items() if key in valid_fields}
        return VerirlAction(**known), None
    except Exception as exc:
        err = sanitize_error(f"parse_error: {str(exc)}", max_len=60)
        return VerirlAction(action_type="submit", message="parse error"), err


def action_label(action: "VerirlAction") -> str:
    """Compact one-token label for [STEP] logging."""
    if action.action_type == "write_file" and action.verilog_src:
        return f"write_file({len(action.verilog_src)}chars)"
    return action.action_type


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _fallback_action(obs) -> "VerirlAction":
    """Fallback when the LLM call fails — submit whatever code we have."""
    msg = "llm error, submitting best code" if obs.current_verilog else "llm error"
    return VerirlAction(action_type="submit", message=msg)


def _warn_if_outside(task_id: str, label: str, raw, low: float, high: float) -> None:
    """Report on stderr when a raw environment value lies outside [low, high].

    The check runs on the value as delivered by the environment, before
    safe_score() clamps it. Nothing is written to stdout.
    """
    if raw is None:
        return
    try:
        value = float(raw)
    except (TypeError, ValueError):
        value = float("nan")  # NaN fails the range test below
    if not (low <= value <= high):
        print(
            f"[WARNING] Task {task_id}: {label} {raw} outside [{low}, {high}]",
            file=sys.stderr,
            flush=True,
        )


async def _next_action(llm: "OpenAI", messages: list, obs) -> "tuple[VerirlAction, str, Optional[str]]":
    """Ask the LLM for the next action.

    Returns:
        ``(action, assistant_text, error)``. If the call raises or returns no
        text, the action is the submit fallback and ``assistant_text`` is
        ``""``; ``error`` carries the sanitized exception text or the parse
        error, if any.
    """
    try:
        # The OpenAI client call is synchronous; run it in a worker thread so
        # the event loop is not blocked while waiting for the completion.
        response = await asyncio.to_thread(
            llm.chat.completions.create,
            model=MODEL_NAME,
            messages=messages,
        )
        assistant_text = response.choices[0].message.content or ""
    except Exception as exc:
        return _fallback_action(obs), "", sanitize_error(str(exc))

    if not assistant_text:
        # Empty completion: nothing to parse, so submit what we have.
        return _fallback_action(obs), "", None

    action, parse_err = parse_action(assistant_text)
    return action, assistant_text, parse_err


# ---------------------------------------------------------------------------
# Episode runner
# ---------------------------------------------------------------------------

async def run_task(task_id: str, llm: "OpenAI") -> float:
    """Run one complete episode for the given task.

    Emits one [START] line, one [STEP] line per environment step and one
    [END] line. The [END] line is printed after the environment is closed,
    also when the episode fails.

    Args:
        task_id: Key of TASK_BUDGETS naming the task to run.
        llm: OpenAI client used for the chat completions.

    Returns:
        The final score clamped by safe_score(). Failures inside the episode
        (unknown task id, environment or network errors) are reported on
        stderr and yield the 0.01 floor instead of propagating, so one broken
        task does not abort the remaining ones.
    """
    start_time = time.time()
    final_score = 0.0  # 0.0 doubles as the "no score obtained yet" sentinel
    rewards: List[float] = []
    steps_taken = 0
    env = None
    obs = None

    log_start(task=task_id, model=MODEL_NAME)

    try:
        budget = TASK_BUDGETS.get(task_id)
        if budget is None:
            raise ValueError(
                f"Unknown task_id '{task_id}'. Valid: {list(TASK_BUDGETS.keys())}"
            )

        env = verirl_env(base_url=ENV_BASE_URL)
        result = await env.reset(task_id=task_id)
        obs = result.observation

        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": format_observation(obs)},
        ]

        for step in range(1, _MAX_STEPS + 1):
            if time.time() - start_time > budget:
                # Out of wall-clock budget: force a submit and stop.
                action = VerirlAction(
                    action_type="submit", message="time budget exceeded"
                )
                result = await env.step(action)
                _warn_if_outside(task_id, "reward", result.reward, -1.0, 1.0)
                reward = safe_score(result.reward)
                rewards.append(reward)
                steps_taken = step
                log_step(step, action_label(action), reward, True, None)
                raw_final = result.observation.final_score
                _warn_if_outside(task_id, "final_score", raw_final, 0.0, 1.0)
                final_score = safe_score(raw_final)
                break

            # If the LLM call fails or returns nothing, the fallback action
            # submits the current design.
            action, assistant_text, error = await _next_action(llm, messages, obs)

            try:
                result = await env.step(action)
            except Exception as exc:
                # Log the failed step before leaving the loop.
                error = sanitize_error(str(exc))
                rewards.append(_SCORE_FLOOR)
                steps_taken = step
                log_step(step, action_label(action), _SCORE_FLOOR, True, error)
                break

            obs = result.observation
            # Validate the raw reward; the clamped value is always in range.
            _warn_if_outside(task_id, "reward", result.reward, -1.0, 1.0)
            reward = safe_score(result.reward)
            done = result.done

            rewards.append(reward)
            steps_taken = step
            log_step(step, action_label(action), reward, done, error)

            messages.append({"role": "assistant", "content": assistant_text})
            messages.append({"role": "user", "content": format_observation(obs)})

            if done:
                _warn_if_outside(task_id, "final_score", obs.final_score, 0.0, 1.0)
                final_score = safe_score(obs.final_score)
                break
    except Exception as exc:
        print(
            f"[ERROR] Task {task_id}: {type(exc).__name__}: {exc}",
            file=sys.stderr,
            flush=True,
        )
    finally:
        # Safety net: if the loop exited without a submit (e.g. connection
        # drop), attempt a final submit so the score is not lost.
        if final_score == 0.0 and obs is not None and obs.current_verilog is not None:
            try:
                result = await env.step(
                    VerirlAction(action_type="submit", message="safety submit")
                )
                final_score = safe_score(result.observation.final_score)
                reward = safe_score(result.reward)
                # Record the reward so `steps` and `rewards` stay in sync.
                rewards.append(reward)
                steps_taken += 1
                log_step(steps_taken, "submit", reward, True, None)
            except Exception as exc:
                print(
                    f"[WARNING] Task {task_id}: safety submit failed: "
                    f"{type(exc).__name__}: {exc}",
                    file=sys.stderr,
                    flush=True,
                )

        # Ensure final_score is never exactly 0.0 (validator requires strict (0, 1))
        if final_score == 0.0:
            final_score = _SCORE_FLOOR
        success = final_score >= SUCCESS_SCORE_THRESHOLD

        # Ensure rewards list is never empty — [END] requires at least one reward value
        if not rewards:
            rewards = [_SCORE_FLOOR]

        if env is not None:
            try:
                await env.close()
            except Exception as exc:
                print(
                    f"[WARNING] Task {task_id}: env.close() failed: "
                    f"{type(exc).__name__}: {exc}",
                    file=sys.stderr,
                    flush=True,
                )

        log_end(success=success, steps=steps_taken, rewards=rewards, final_score=final_score)

    return final_score


# ---------------------------------------------------------------------------
# Task Enumeration & Validation
# ---------------------------------------------------------------------------

async def validate_environment(base_url: str) -> List[str]:
    """Probe the environment and return the task ids that respond.

    Each task listed in TASK_BUDGETS is reset and immediately submitted. A
    task counts as valid when both calls succeed and the observation exposes
    a ``final_score``. Runs silently to avoid interfering with the stdout
    format.

    Args:
        base_url: Environment server URL.

    Returns:
        The responding task ids in TASK_BUDGETS order; an empty list when the
        environment cannot be reached at all.
    """
    task_ids: List[str] = []
    try:
        env = verirl_env(base_url=base_url)
    except Exception:
        return task_ids

    try:
        for task_id in TASK_BUDGETS:
            try:
                await env.reset(task_id=task_id)
                # Submit without writing any code; only the shape of the
                # reply matters here, not the score itself.
                result = await env.step(VerirlAction(action_type="submit"))
                safe_score(result.observation.final_score)
            except Exception:
                # Deliberately silent: a failing probe just drops the task.
                continue
            task_ids.append(task_id)
    finally:
        try:
            await env.close()
        except Exception:
            pass  # best-effort cleanup; validation must stay silent
    return task_ids


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

async def main() -> None:
    """Run every task in sequence.

    The task list comes from validate_environment(); if that returns nothing
    (e.g. the probe failed due to network trouble) all tasks in TASK_BUDGETS
    are run.
    """
    task_ids = await validate_environment(ENV_BASE_URL) or list(TASK_BUDGETS)

    llm = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)

    for task_id in task_ids:
        await run_task(task_id, llm)

    # No summary output — spec requires ONLY [START]/[STEP]/[END] lines to stdout


if __name__ == "__main__":
    asyncio.run(main())