"""
tasks.py — SpectraQual Task Definitions and Programmatic Graders

Each task runs the environment with a fixed seed and scores the agent 0.0–1.0.
Graders are deterministic and reproducible.
"""
from __future__ import annotations

import sys
import os
from typing import List

# Make the sibling modules (config, models, env) importable when this file is
# executed directly; this must run before the project-local imports below.
sys.path.insert(0, os.path.dirname(__file__))

from config import (
    TASKS,
    MEDIUM_ECONOMIC_TARGET,
    HARD_ANOMALY_RATE_TARGET,
    SUCCESS_SCORE_THRESHOLD,
)
from models import TaskResult
from env import SpectraQualEnv
from models import PCBAction

# An anomalous board counts as "flagged" when the step's anomaly bonus reaches
# this value.
_ANOMALY_BONUS_FLAG_THRESHOLD = 0.8


# ---------------------------
# TASK RUNNER
# ---------------------------

def run_task(task_id: str, actions: List[str]) -> TaskResult:
    """
    Run a task with a pre-determined list of actions.

    Used by graders to replay an agent's trajectory deterministically.
    Replay stops early once the environment reports it is done, so surplus
    actions are ignored.

    Args:
        task_id: one of "task_easy", "task_medium", "task_hard"
        actions: list of action strings, one per step

    Returns:
        TaskResult with all episode metrics filled in.

    Raises:
        KeyError: if task_id is not a key of TASKS.
    """
    cfg = TASKS[task_id]
    env = SpectraQualEnv(task_id=task_id)
    env.reset()

    rewards: List[float] = []
    correct = 0
    total = 0
    bottlenecks = 0
    anomaly_total = 0
    anomaly_flagged = 0
    cum_raw = 0.0

    for action_str in actions:
        if env._done:
            break

        # Deliberate best-effort: any failure to build or apply the submitted
        # action falls back to SCRAP so the replay can continue.
        try:
            result = env.step(PCBAction(action=action_str))
        except Exception:
            result = env.step(PCBAction(action="SCRAP"))

        rewards.append(result.reward)
        total += 1

        is_anomaly = result.info.get("is_anomaly")
        if is_anomaly:
            anomaly_total += 1

        components = result.reward_components
        if components:
            cum_raw += components.total_raw
            # Only inspect the bonus when components exist, so a step without
            # reward components cannot raise AttributeError here.
            if is_anomaly and components.anomaly_bonus >= _ANOMALY_BONUS_FLAG_THRESHOLD:
                anomaly_flagged += 1

        # Correctness is judged against the action string that was submitted,
        # even when the SCRAP fallback above was the action actually applied.
        if env._is_correct(result.info.get("defect", ""), action_str):
            correct += 1

        bottlenecks = env._bottleneck_cnt

    max_possible_raw = cfg["n_boards"] * 1.0  # max normalized = 1.0 per step

    return TaskResult(
        task_id=task_id,
        total_steps=total,
        rewards=rewards,
        correct_decisions=correct,
        total_decisions=total,
        bottleneck_count=bottlenecks,
        anomaly_total=anomaly_total,
        anomaly_flagged=anomaly_flagged,
        cumulative_raw_reward=cum_raw,
        max_possible_raw=max_possible_raw,
    )


# ---------------------------
# GRADER: TASK EASY
# ---------------------------

def grade_easy(result: TaskResult) -> float:
    """
    Task Easy Grader.

    Objective: Correctly classify all defect types. No slot pressure.

    Scoring: 0.70 * accuracy + 0.30 * avg_reward, clamped to 0.0–1.0
      - accuracy:   correct_decisions / total_decisions
      - avg_reward: mean of the per-step rewards (0.0 if there are none)

    Returns 0.0 when no decisions were made. The result is rounded to
    4 decimal places.
    """
    if result.total_decisions == 0:
        return 0.0

    accuracy = result.correct_decisions / result.total_decisions

    # Blend accuracy with average reward for robustness
    avg_reward = sum(result.rewards) / len(result.rewards) if result.rewards else 0.0

    # Weight: 70% accuracy, 30% reward quality
    score = 0.70 * accuracy + 0.30 * avg_reward
    return round(min(max(score, 0.0), 1.0), 4)


# ---------------------------
# GRADER: TASK MEDIUM
# ---------------------------

def grade_medium(result: TaskResult) -> float:
    """
    Task Medium Grader.

    Objective: Triage 15 boards with 1 slot (queue pressure).

    Scoring: 0.6 * economic_efficiency + 0.4 * bottleneck_avoidance
      - economic_efficiency:  avg reward / MEDIUM_ECONOMIC_TARGET, capped at 1.0
      - bottleneck_avoidance: 1.0 if no bottlenecks, scales down to 0 at 5

    Returns 0.0 when there are no rewards. The final score is clamped to
    0.0–1.0 and rounded to 4 decimal places.
    """
    if not result.rewards:
        return 0.0

    avg_reward = sum(result.rewards) / len(result.rewards)

    # Economic efficiency: how close to target (MEDIUM_ECONOMIC_TARGET = 0.50)
    economic_score = min(avg_reward / MEDIUM_ECONOMIC_TARGET, 1.0)

    # Bottleneck avoidance: 0 bottleneck = 1.0, ≥5 = 0.0
    max_tolerable_bottlenecks = 5
    bottleneck_score = max(0.0, 1.0 - result.bottleneck_count / max_tolerable_bottlenecks)

    score = 0.60 * economic_score + 0.40 * bottleneck_score
    return round(min(max(score, 0.0), 1.0), 4)


# ---------------------------
# GRADER: TASK HARD
# ---------------------------

def grade_hard(result: TaskResult) -> float:
    """
    Task Hard Grader.

    Objective: 20 boards, mixed anomalies, tight slots.

    Scoring: 0.5 * anomaly_score + 0.3 * economic_score + 0.2 * throughput_score
      - anomaly_score:    (anomaly_flagged / anomaly_total) / HARD_ANOMALY_RATE_TARGET,
                          capped at 1.0; an episode with no anomalies scores 1.0
      - economic_score:   avg normalized reward
      - throughput_score: total_decisions / n_boards of "task_hard", capped at 1.0

    Returns 0.0 when there are no rewards. The final score is clamped to
    0.0–1.0 and rounded to 4 decimal places.
    """
    if not result.rewards:
        return 0.0

    cfg = TASKS["task_hard"]
    avg_reward = sum(result.rewards) / len(result.rewards)

    # Anomaly score: did the agent handle anomalous boards correctly?
    if result.anomaly_total > 0:
        raw_anomaly = result.anomaly_flagged / result.anomaly_total
    else:
        raw_anomaly = 1.0  # no anomalies → not penalized

    # Scale anomaly score: meeting HARD_ANOMALY_RATE_TARGET = 1.0
    anomaly_score = min(raw_anomaly / HARD_ANOMALY_RATE_TARGET, 1.0)

    # Economic score
    economic_score = avg_reward

    # Throughput: penalize excessive WAIT actions
    # NOTE(review): this counts every replayed step; confirm against env that
    # WAIT steps really reduce total_decisions relative to n_boards.
    throughput_score = min(result.total_decisions / cfg["n_boards"], 1.0)

    score = (
        0.50 * anomaly_score
        + 0.30 * economic_score
        + 0.20 * throughput_score
    )
    return round(min(max(score, 0.0), 1.0), 4)


# ---------------------------
# GRADER DISPATCH
# ---------------------------

GRADERS = {
    "task_easy": grade_easy,
    "task_medium": grade_medium,
    "task_hard": grade_hard,
}


def grade(task_id: str, result: TaskResult) -> float:
    """
    Dispatch to the correct grader for the given task_id.

    Raises:
        ValueError: if task_id has no entry in GRADERS.
    """
    if task_id not in GRADERS:
        raise ValueError(f"No grader for task_id='{task_id}'")
    return GRADERS[task_id](result)


# ---------------------------
# TASK DESCRIPTIONS (for README / inference prompt)
# ---------------------------

TASK_DESCRIPTIONS = {
    "task_easy": (
        "Triage 10 PCBs with no factory slot pressure. "
        "Focus: identify the correct action for each defect type. "
        "Grader: accuracy-weighted reward (70% accuracy + 30% reward quality). "
        "Expected frontier model score: ≥0.85."
    ),
    "task_medium": (
        "Triage 15 PCBs with only 1 active soldering slot. "
        "Focus: manage queue pressure while maintaining economic performance. "
        "Grader: 60% economic efficiency + 40% bottleneck avoidance. "
        "Expected frontier model score: ≥0.65."
    ),
    "task_hard": (
        "Triage 20 PCBs with 25% anomaly rate and tight slot constraints. "
        "Focus: handle extreme-cost/criticality boards safely AND maintain throughput. "
        "Grader: 50% anomaly handling + 30% economic score + 20% throughput. "
        "Expected frontier model score: ≥0.50."
    ),
}


# ---------------------------
# CLI TEST UTILITY
# ---------------------------

def _run_sanity_check() -> None:
    """Quick sanity check: run all 3 tasks with a rule-based agent."""
    # Imported here because decide_action is only needed by this CLI check.
    from env import decide_action

    print("\n=== SpectraQual Task Grader Sanity Check ===\n")

    for tid in ["task_easy", "task_medium", "task_hard"]:
        # First pass: let the rule-based agent play the episode and record
        # the actions it chose.
        env = SpectraQualEnv(task_id=tid)
        result_obj = env.reset()
        actions = []

        while not result_obj.done:
            obs = result_obj.observation
            pcb = {
                "defect_type": obs.defect_type,
                "component_cost": obs.component_cost,
                "criticality": obs.criticality,
            }
            action_str = decide_action(pcb)
            actions.append(action_str)
            result_obj = env.step(PCBAction(action=action_str))

        # Second pass: replay the recorded actions through the grader path.
        task_result = run_task(tid, actions)
        score = grade(tid, task_result)
        print(f"[{tid}] Score: {score:.4f} | Correct: {task_result.correct_decisions}/{task_result.total_decisions} | Bottlenecks: {task_result.bottleneck_count}")

    print("\n=== Done ===")


if __name__ == "__main__":
    _run_sanity_check()