""" inference.py — SpectraQual OpenEnv Baseline Inference Script Runs an LLM agent against all 3 SpectraQual tasks and emits structured logs. Environment variables (set before running): API_BASE_URL The LLM API endpoint (default: https://openrouter.ai/api/v1) MODEL_NAME Model identifier (default: meta-llama/llama-3.3-70b-instruct) HF_TOKEN Your Hugging Face / API key (required in production) Usage: export HF_TOKEN="hf_xxx..." python inference.py Output format: [START] task= env=SpectraQual model= [STEP] step= action= reward= done= error= [END] success= steps= score= rewards=[...] """ from __future__ import annotations import json import os import sys import time from typing import List, Optional # ── Path setup so we can import from src/ ────────────────────────────────── ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) SRC_DIR = os.path.join(ROOT_DIR, "src") sys.path.insert(0, SRC_DIR) from openai import OpenAI from env import SpectraQualEnv from models import PCBAction, StepResult from config import ( ACTIONS, VALID_ACTIONS, MAX_STEPS_PER_TASK, SUCCESS_SCORE_THRESHOLD, TEMPERATURE, MAX_TOKENS, TASKS, ) from tasks import TASK_DESCRIPTIONS, run_task, grade # ── Environment variables ────────────────────────────────────────────────── API_BASE_URL = os.getenv("API_BASE_URL", "https://openrouter.ai/api/v1") MODEL_NAME = os.getenv("MODEL_NAME", "meta-llama/llama-3.3-70b-instruct") HF_TOKEN = os.getenv("HF_TOKEN") API_KEY = HF_TOKEN or os.getenv("OPENAI_API_KEY", "no-key-set") # Optional: if you use from_docker_image() style containerized env LOCAL_IMAGE_NAME = os.getenv("LOCAL_IMAGE_NAME") BENCHMARK = "SpectraQual" TASK_IDS = ["task_easy", "task_medium", "task_hard"] # ── System prompt for the LLM ────────────────────────────────────────────── SYSTEM_PROMPT = """You are a PCB quality-control triage agent. You will receive information about a printed circuit board (PCB) including its defect type, component cost, criticality score, and available factory soldering slots. You must choose exactly ONE action from the allowed list. Respond with ONLY the action name — no explanation, no extra text, no punctuation. Action meanings: - PASS → Board has no defect; clear it. - SCRAP → Board is too damaged or high-risk; discard it. - ROUTE_COMPONENT_REPLACEMENT → Board has a missing component; route to repair. - ROUTE_SOLDERING → Board has a solder bridge; send to soldering station. - ROUTE_DIAGNOSTICS → Board has an ambiguous fault; send for investigation. - WAIT → No soldering slot available; hold the board. Rules: - For defect_type=none, you MUST respond PASS. - For defect_type=missing_component, choose ROUTE_COMPONENT_REPLACEMENT or SCRAP. - For defect_type=solder_bridge, choose ROUTE_SOLDERING, WAIT, or SCRAP. - For defect_type=short_circuit, choose SCRAP or ROUTE_DIAGNOSTICS. - If slots_free=0 and action=ROUTE_SOLDERING would apply, prefer WAIT instead. Respond with only one word. Example: ROUTE_SOLDERING""" # ── Prompt builder ───────────────────────────────────────────────────────── def build_user_prompt( obs, step: int, last_reward: float, history: List[str], ) -> str: history_txt = "\n".join(history[-5:]) if history else "None" anomaly_txt = f"⚠️ ANOMALY DETECTED (score={obs.anomaly_score:.2f})" if obs.is_anomaly else "Normal" return f"""=== PCB TRIAGE — Step {step} === Board ID: {obs.board_id} Defect Type: {obs.defect_type} Component Cost: ₹{obs.component_cost:.2f} Criticality: {obs.criticality:.2f} Slots Free: {obs.slots_free} / {len(obs.slots_state)} Slot State: {obs.slots_state} Anomaly: {anomaly_txt} Valid Actions: {", ".join(obs.valid_actions)} Last Reward: {last_reward:.4f} Cumulative: {obs.cumulative_reward:.4f} Accuracy: {obs.rolling_accuracy:.2%} Recent History: {history_txt} Choose exactly one action from: {", ".join(obs.valid_actions)}""" # ── Structured log helpers ───────────────────────────────────────────────── def log_start(task: str, env: str, model: str) -> None: print( f"[START] task={task} env={env} model={model}", flush=True, ) def log_step( step: int, action: str, reward: float, done: bool, error: Optional[str], ) -> None: error_val = "null" if error is None else f'"{error}"' print( f"[STEP] step={step} action={action} reward={reward:.4f} done={done} error={error_val}", flush=True, ) def log_end( success: bool, steps: int, score: float, rewards: List[float], ) -> None: rewards_str = json.dumps([round(r, 4) for r in rewards]) print( f"[END] success={success} steps={steps} score={score:.4f} rewards={rewards_str}", flush=True, ) # ── LLM call ────────────────────────────────────────────────────────────── def get_llm_action( client: OpenAI, obs, step: int, last_reward: float, history: List[str], ) -> str: """Ask the LLM for a triage action. Falls back to SCRAP on any error.""" prompt = build_user_prompt(obs, step, last_reward, history) try: completion = client.chat.completions.create( model=MODEL_NAME, messages=[ {"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": prompt}, ], temperature=TEMPERATURE, max_tokens=MAX_TOKENS, stream=False, ) raw = (completion.choices[0].message.content or "").strip().upper() # Validate: pick first word that matches a known action for candidate in raw.split(): candidate = candidate.strip(".,;:!?\"'") if candidate in ACTIONS: return candidate # Fallback: try to find partial match for action in ACTIONS: if action in raw: return action print(f"[DEBUG] Unexpected model output: {raw!r}", flush=True) return "SCRAP" except Exception as exc: print(f"[DEBUG] LLM request failed: {exc}", flush=True) return "SCRAP" # ── Single task runner ───────────────────────────────────────────────────── def run_task_inference(client: OpenAI, task_id: str) -> tuple[bool, int, float, List[float]]: """ Run the LLM agent against one task. Returns (success, steps_taken, score, rewards_list). """ cfg = TASKS[task_id] max_steps = min(cfg["n_boards"] + 5, MAX_STEPS_PER_TASK) total_reward_cap = cfg["n_boards"] * 1.0 # max possible (1.0 per step) env = SpectraQualEnv(task_id=task_id) history: List[str] = [] rewards: List[float] = [] action_log: List[str] = [] steps_taken = 0 score = 0.0 success = False log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME) try: result = env.reset() obs = result.observation last_reward = 0.0 for step in range(1, max_steps + 1): if result.done: break # Get action from LLM action_str = get_llm_action(client, obs, step, last_reward, history) action_log.append(action_str) error = None try: result = env.step(PCBAction(action=action_str)) except Exception as e: error = str(e) result = env.step(PCBAction(action="SCRAP")) obs = result.observation reward = result.reward done = result.done last_reward = reward rewards.append(reward) steps_taken = step log_step(step=step, action=action_str, reward=reward, done=done, error=error) history.append( f"Step {step}: {action_str!r} → reward={reward:.4f}" ) if done: break # Score = average normalized reward across all steps score = sum(rewards) / max(len(rewards), 1) score = min(max(score, 0.0), 1.0) success = score >= SUCCESS_SCORE_THRESHOLD except Exception as exc: print(f"[DEBUG] Task runner error: {exc}", flush=True) finally: log_end(success=success, steps=steps_taken, score=score, rewards=rewards) return success, steps_taken, score, rewards # ── Main ────────────────────────────────────────────────────────────────── def main() -> None: print(f"[DEBUG] API_BASE_URL = {API_BASE_URL}", flush=True) print(f"[DEBUG] MODEL_NAME = {MODEL_NAME}", flush=True) print(f"[DEBUG] HF_TOKEN = {'SET' if HF_TOKEN else 'NOT SET (using OPENAI_API_KEY fallback)'}", flush=True) print("", flush=True) client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY) all_scores: List[float] = [] for task_id in TASK_IDS: print(f"\n{'='*60}", flush=True) print(f"[DEBUG] Starting {task_id} | {TASK_DESCRIPTIONS[task_id][:80]}...", flush=True) print(f"{'='*60}\n", flush=True) success, steps, score, rewards = run_task_inference(client, task_id) all_scores.append(score) print(f"\n[DEBUG] {task_id} complete — score={score:.4f} success={success}\n", flush=True) time.sleep(1) # brief pause between tasks overall = sum(all_scores) / len(all_scores) if all_scores else 0.0 print(f"\n{'='*60}", flush=True) print(f"[SUMMARY] Overall score={overall:.4f}", flush=True) print(f"[SUMMARY] Per-task: { {tid: round(s, 4) for tid, s in zip(TASK_IDS, all_scores)} }", flush=True) print(f"{'='*60}\n", flush=True) if __name__ == "__main__": main()