Spaces:

taarunforge
/

spectraqual

Sleeping

App Files Files Community

spectraqual / inference.py

taarunforge

Deploy SpectraQual OpenEnv environment

dfbb493 about 2 months ago

raw

history blame contribute delete

10.8 kB

	"""
	inference.py — SpectraQual OpenEnv Baseline Inference Script

	Runs an LLM agent against all 3 SpectraQual tasks and emits structured logs.

	Environment variables (set before running):
	API_BASE_URL The LLM API endpoint (default: https://openrouter.ai/api/v1)
	MODEL_NAME Model identifier (default: meta-llama/llama-3.3-70b-instruct)
	HF_TOKEN Your Hugging Face / API key (required in production)

	Usage:
	export HF_TOKEN="hf_xxx..."
	python inference.py

	Output format:
	[START] task=<id> env=SpectraQual model=<model>
	[STEP] step=<n> action=<A> reward=<r> done=<bool> error=<null\|msg>
	[END] success=<bool> steps=<n> score=<f> rewards=[...]
	"""

	from __future__ import annotations
	import json
	import os
	import sys
	import time
	from typing import List, Optional

	# ── Path setup so we can import from src/ ──────────────────────────────────
	ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
	SRC_DIR = os.path.join(ROOT_DIR, "src")
	sys.path.insert(0, SRC_DIR)

	from openai import OpenAI
	from env import SpectraQualEnv
	from models import PCBAction, StepResult
	from config import (
	ACTIONS,
	VALID_ACTIONS,
	MAX_STEPS_PER_TASK,
	SUCCESS_SCORE_THRESHOLD,
	TEMPERATURE,
	MAX_TOKENS,
	TASKS,
	)
	from tasks import TASK_DESCRIPTIONS, run_task, grade

	# ── Environment variables ──────────────────────────────────────────────────
	API_BASE_URL = os.getenv("API_BASE_URL", "https://openrouter.ai/api/v1")
	MODEL_NAME = os.getenv("MODEL_NAME", "meta-llama/llama-3.3-70b-instruct")
	HF_TOKEN = os.getenv("HF_TOKEN")
	API_KEY = HF_TOKEN or os.getenv("OPENAI_API_KEY", "no-key-set")

	# Optional: if you use from_docker_image() style containerized env
	LOCAL_IMAGE_NAME = os.getenv("LOCAL_IMAGE_NAME")

	BENCHMARK = "SpectraQual"
	TASK_IDS = ["task_easy", "task_medium", "task_hard"]

	# ── System prompt for the LLM ──────────────────────────────────────────────
	SYSTEM_PROMPT = """You are a PCB quality-control triage agent.
	You will receive information about a printed circuit board (PCB) including its defect type,
	component cost, criticality score, and available factory soldering slots.

	You must choose exactly ONE action from the allowed list.
	Respond with ONLY the action name — no explanation, no extra text, no punctuation.

	Action meanings:
	- PASS → Board has no defect; clear it.
	- SCRAP → Board is too damaged or high-risk; discard it.
	- ROUTE_COMPONENT_REPLACEMENT → Board has a missing component; route to repair.
	- ROUTE_SOLDERING → Board has a solder bridge; send to soldering station.
	- ROUTE_DIAGNOSTICS → Board has an ambiguous fault; send for investigation.
	- WAIT → No soldering slot available; hold the board.

	Rules:
	- For defect_type=none, you MUST respond PASS.
	- For defect_type=missing_component, choose ROUTE_COMPONENT_REPLACEMENT or SCRAP.
	- For defect_type=solder_bridge, choose ROUTE_SOLDERING, WAIT, or SCRAP.
	- For defect_type=short_circuit, choose SCRAP or ROUTE_DIAGNOSTICS.
	- If slots_free=0 and action=ROUTE_SOLDERING would apply, prefer WAIT instead.

	Respond with only one word. Example: ROUTE_SOLDERING"""


	# ── Prompt builder ─────────────────────────────────────────────────────────
	def build_user_prompt(
	obs,
	step: int,
	last_reward: float,
	history: List[str],
	) -> str:
	history_txt = "\n".join(history[-5:]) if history else "None"
	anomaly_txt = f"⚠️ ANOMALY DETECTED (score={obs.anomaly_score:.2f})" if obs.is_anomaly else "Normal"
	return f"""=== PCB TRIAGE — Step {step} ===
	Board ID: {obs.board_id}
	Defect Type: {obs.defect_type}
	Component Cost: ₹{obs.component_cost:.2f}
	Criticality: {obs.criticality:.2f}
	Slots Free: {obs.slots_free} / {len(obs.slots_state)}
	Slot State: {obs.slots_state}
	Anomaly: {anomaly_txt}

	Valid Actions: {", ".join(obs.valid_actions)}

	Last Reward: {last_reward:.4f}
	Cumulative: {obs.cumulative_reward:.4f}
	Accuracy: {obs.rolling_accuracy:.2%}

	Recent History:
	{history_txt}

	Choose exactly one action from: {", ".join(obs.valid_actions)}"""


	# ── Structured log helpers ─────────────────────────────────────────────────
	def log_start(task: str, env: str, model: str) -> None:
	print(
	f"[START] task={task} env={env} model={model}",
	flush=True,
	)


	def log_step(
	step: int,
	action: str,
	reward: float,
	done: bool,
	error: Optional[str],
	) -> None:
	error_val = "null" if error is None else f'"{error}"'
	print(
	f"[STEP] step={step} action={action} reward={reward:.4f} done={done} error={error_val}",
	flush=True,
	)


	def log_end(
	success: bool,
	steps: int,
	score: float,
	rewards: List[float],
	) -> None:
	rewards_str = json.dumps([round(r, 4) for r in rewards])
	print(
	f"[END] success={success} steps={steps} score={score:.4f} rewards={rewards_str}",
	flush=True,
	)


	# ── LLM call ──────────────────────────────────────────────────────────────
	def get_llm_action(
	client: OpenAI,
	obs,
	step: int,
	last_reward: float,
	history: List[str],
	) -> str:
	"""Ask the LLM for a triage action. Falls back to SCRAP on any error."""
	prompt = build_user_prompt(obs, step, last_reward, history)
	try:
	completion = client.chat.completions.create(
	model=MODEL_NAME,
	messages=[
	{"role": "system", "content": SYSTEM_PROMPT},
	{"role": "user", "content": prompt},
	],
	temperature=TEMPERATURE,
	max_tokens=MAX_TOKENS,
	stream=False,
	)
	raw = (completion.choices[0].message.content or "").strip().upper()

	# Validate: pick first word that matches a known action
	for candidate in raw.split():
	candidate = candidate.strip(".,;:!?\"'")
	if candidate in ACTIONS:
	return candidate

	# Fallback: try to find partial match
	for action in ACTIONS:
	if action in raw:
	return action

	print(f"[DEBUG] Unexpected model output: {raw!r}", flush=True)
	return "SCRAP"

	except Exception as exc:
	print(f"[DEBUG] LLM request failed: {exc}", flush=True)
	return "SCRAP"


	# ── Single task runner ─────────────────────────────────────────────────────
	def run_task_inference(client: OpenAI, task_id: str) -> tuple[bool, int, float, List[float]]:
	"""
	Run the LLM agent against one task.
	Returns (success, steps_taken, score, rewards_list).
	"""
	cfg = TASKS[task_id]
	max_steps = min(cfg["n_boards"] + 5, MAX_STEPS_PER_TASK)
	total_reward_cap = cfg["n_boards"] * 1.0 # max possible (1.0 per step)

	env = SpectraQualEnv(task_id=task_id)
	history: List[str] = []
	rewards: List[float] = []
	action_log: List[str] = []
	steps_taken = 0
	score = 0.0
	success = False

	log_start(task=task_id, env=BENCHMARK, model=MODEL_NAME)

	try:
	result = env.reset()
	obs = result.observation
	last_reward = 0.0

	for step in range(1, max_steps + 1):
	if result.done:
	break

	# Get action from LLM
	action_str = get_llm_action(client, obs, step, last_reward, history)
	action_log.append(action_str)

	error = None
	try:
	result = env.step(PCBAction(action=action_str))
	except Exception as e:
	error = str(e)
	result = env.step(PCBAction(action="SCRAP"))

	obs = result.observation
	reward = result.reward
	done = result.done
	last_reward = reward

	rewards.append(reward)
	steps_taken = step

	log_step(step=step, action=action_str, reward=reward, done=done, error=error)

	history.append(
	f"Step {step}: {action_str!r} → reward={reward:.4f}"
	)

	if done:
	break

	# Score = average normalized reward across all steps
	score = sum(rewards) / max(len(rewards), 1)
	score = min(max(score, 0.0), 1.0)
	success = score >= SUCCESS_SCORE_THRESHOLD

	except Exception as exc:
	print(f"[DEBUG] Task runner error: {exc}", flush=True)

	finally:
	log_end(success=success, steps=steps_taken, score=score, rewards=rewards)

	return success, steps_taken, score, rewards


	# ── Main ──────────────────────────────────────────────────────────────────
	def main() -> None:
	print(f"[DEBUG] API_BASE_URL = {API_BASE_URL}", flush=True)
	print(f"[DEBUG] MODEL_NAME = {MODEL_NAME}", flush=True)
	print(f"[DEBUG] HF_TOKEN = {'SET' if HF_TOKEN else 'NOT SET (using OPENAI_API_KEY fallback)'}", flush=True)
	print("", flush=True)

	client = OpenAI(base_url=API_BASE_URL, api_key=API_KEY)

	all_scores: List[float] = []

	for task_id in TASK_IDS:
	print(f"\n{'='*60}", flush=True)
	print(f"[DEBUG] Starting {task_id} \| {TASK_DESCRIPTIONS[task_id][:80]}...", flush=True)
	print(f"{'='*60}\n", flush=True)

	success, steps, score, rewards = run_task_inference(client, task_id)
	all_scores.append(score)

	print(f"\n[DEBUG] {task_id} complete — score={score:.4f} success={success}\n", flush=True)
	time.sleep(1) # brief pause between tasks

	overall = sum(all_scores) / len(all_scores) if all_scores else 0.0
	print(f"\n{'='*60}", flush=True)
	print(f"[SUMMARY] Overall score={overall:.4f}", flush=True)
	print(f"[SUMMARY] Per-task: { {tid: round(s, 4) for tid, s in zip(TASK_IDS, all_scores)} }", flush=True)
	print(f"{'='*60}\n", flush=True)


	if __name__ == "__main__":
	main()