Spaces:

taarunforge
/

spectraqual

Sleeping

File size: 8,406 Bytes

dfbb493

"""
tasks.py — SpectraQual Task Definitions and Programmatic Graders
Each task runs the environment with a fixed seed and scores the agent 0.0–1.0.
Graders are deterministic and reproducible.
"""

from __future__ import annotations
import sys
import os
from typing import List

sys.path.insert(0, os.path.dirname(__file__))

from config import (
    TASKS,
    MEDIUM_ECONOMIC_TARGET,
    HARD_ANOMALY_RATE_TARGET,
    SUCCESS_SCORE_THRESHOLD,
)
from models import TaskResult
from env import SpectraQualEnv
from models import PCBAction


# ---------------------------
# TASK RUNNER
# ---------------------------
def run_task(task_id: str, actions: List[str]) -> TaskResult:
    """
    Replay a pre-determined list of actions against a task and collect metrics.

    Used by graders to replay an agent's trajectory.

    Args:
        task_id: key into TASKS, e.g. "task_easy", "task_medium", "task_hard".
        actions: list of action strings, one per step. Replay stops early
                 once the environment reports it is done; surplus actions
                 are ignored.

    Returns:
        TaskResult with all episode metrics filled in.

    Raises:
        Whatever ``TASKS[task_id]`` raises for an unknown task_id
        (presumably KeyError if TASKS is a plain dict — confirm in config).
    """
    cfg = TASKS[task_id]
    env = SpectraQualEnv(task_id=task_id)
    env.reset()

    rewards: List[float] = []
    correct = 0
    total = 0
    bottlenecks = 0
    anomaly_total = 0
    anomaly_flagged = 0
    cum_raw = 0.0

    for action_str in actions:
        if env._done:
            break

        # Best-effort replay: if the supplied action cannot be built or
        # stepped, fall back to SCRAP so the episode keeps advancing.
        try:
            result = env.step(PCBAction(action=action_str))
        except Exception:
            result = env.step(PCBAction(action="SCRAP"))

        rewards.append(result.reward)
        total += 1

        is_anomaly = result.info.get("is_anomaly")
        if is_anomaly:
            anomaly_total += 1

        components = result.reward_components
        if components:
            cum_raw += components.total_raw
            # An anomalous board counts as "flagged" when the step earned an
            # anomaly bonus of at least 0.8.
            if is_anomaly and components.anomaly_bonus >= 0.8:
                anomaly_flagged += 1

        # Correctness is judged against the action string the caller
        # supplied, not against the SCRAP fallback that may have been run.
        if env._is_correct(result.info.get("defect", ""), action_str):
            correct += 1

        # Mirror the environment's running bottleneck counter.
        bottlenecks = env._bottleneck_cnt

    max_possible_raw = cfg["n_boards"] * 1.0  # max normalized = 1.0 per step

    return TaskResult(
        task_id=task_id,
        total_steps=total,
        rewards=rewards,
        correct_decisions=correct,
        total_decisions=total,
        bottleneck_count=bottlenecks,
        anomaly_total=anomaly_total,
        anomaly_flagged=anomaly_flagged,
        cumulative_raw_reward=cum_raw,
        max_possible_raw=max_possible_raw,
    )


# ---------------------------
# GRADER: TASK EASY
# ---------------------------
def grade_easy(result: TaskResult) -> float:
    """
    Task Easy Grader.
    Objective: Correctly classify all defect types. No slot pressure.
    Scoring: 0.70 * accuracy + 0.30 * mean step reward, where
    accuracy = correct_decisions / total_decisions.

    The blended score is clamped to 0.0–1.0 and rounded to 4 decimals.
    - No decisions at all        → 0.0
    - Empty reward list          → mean reward treated as 0.0
    - 100% correct, rewards 1.0  → 1.0
    """
    decisions = result.total_decisions
    if decisions == 0:
        return 0.0

    hit_rate = result.correct_decisions / decisions

    # Mean per-step reward; an empty reward list contributes nothing.
    step_rewards = result.rewards
    if step_rewards:
        mean_reward = sum(step_rewards) / len(step_rewards)
    else:
        mean_reward = 0.0

    # Weight: 70% accuracy, 30% reward quality
    blended = 0.70 * hit_rate + 0.30 * mean_reward

    # Clamp into [0.0, 1.0] before rounding.
    floored = max(blended, 0.0)
    return round(min(floored, 1.0), 4)


# ---------------------------
# GRADER: TASK MEDIUM
# ---------------------------
def grade_medium(result: TaskResult) -> float:
    """
    Task Medium Grader.
    Objective: Triage boards under queue pressure.
    Scoring: 0.6 * economic_efficiency + 0.4 * bottleneck_avoidance,
    clamped to 0.0–1.0 and rounded to 4 decimals.

    - economic_efficiency: mean step reward divided by
      MEDIUM_ECONOMIC_TARGET, capped at 1.0
    - bottleneck_avoidance: 1.0 with no bottlenecks, falling linearly
      to 0.0 at 5 or more

    Returns 0.0 when the result carries no rewards.
    """
    step_rewards = result.rewards
    if not step_rewards:
        return 0.0

    mean_reward = sum(step_rewards) / len(step_rewards)

    # Economic efficiency: ratio of the mean reward to the configured target
    # (target value lives in config — reportedly 0.50, confirm there).
    economic = min(mean_reward / MEDIUM_ECONOMIC_TARGET, 1.0)

    # Bottleneck avoidance: 0 bottlenecks = 1.0, >= 5 = 0.0
    bottleneck_cap = 5
    avoidance = max(0.0, 1.0 - result.bottleneck_count / bottleneck_cap)

    weighted = 0.60 * economic + 0.40 * avoidance
    return round(min(max(weighted, 0.0), 1.0), 4)


# ---------------------------
# GRADER: TASK HARD
# ---------------------------
def grade_hard(result: TaskResult) -> float:
    """
    Task Hard Grader.
    Objective: mixed anomalies under tight slots.
    Scoring: 0.5 * anomaly_score + 0.3 * economic_score + 0.2 * throughput_score,
    clamped to 0.0–1.0 and rounded to 4 decimals.

    - anomaly_score:    (anomaly_flagged / anomaly_total) divided by
                        HARD_ANOMALY_RATE_TARGET, capped at 1.0; an episode
                        with no anomalies is treated as a flagged ratio of 1.0
    - economic_score:   mean step reward
    - throughput_score: total_decisions / n_boards of "task_hard", capped at 1.0

    Returns 0.0 when the result carries no rewards.
    """
    step_rewards = result.rewards
    if not step_rewards:
        return 0.0

    hard_cfg = TASKS["task_hard"]
    mean_reward = sum(step_rewards) / len(step_rewards)

    # Share of anomalous boards the agent handled; an episode without any
    # anomalies is not penalized.
    flagged_ratio = (
        result.anomaly_flagged / result.anomaly_total
        if result.anomaly_total > 0
        else 1.0
    )

    # Reaching HARD_ANOMALY_RATE_TARGET already earns the full anomaly score.
    anomaly_score = min(flagged_ratio / HARD_ANOMALY_RATE_TARGET, 1.0)

    # The economic component is the mean reward itself.
    economic_score = mean_reward

    # Throughput: fraction of the task's boards that received a decision.
    throughput_score = min(result.total_decisions / hard_cfg["n_boards"], 1.0)

    combined = (
        0.50 * anomaly_score +
        0.30 * economic_score +
        0.20 * throughput_score
    )
    return round(min(max(combined, 0.0), 1.0), 4)


# ---------------------------
# GRADER DISPATCH
# ---------------------------
# Lookup table from task_id to the grader function that scores its TaskResult.
GRADERS = dict(
    task_easy=grade_easy,
    task_medium=grade_medium,
    task_hard=grade_hard,
)


def grade(task_id: str, result: TaskResult) -> float:
    """
    Score ``result`` with the grader registered for ``task_id``.

    Raises:
        ValueError: if GRADERS has no entry for ``task_id``.
    """
    grader = GRADERS.get(task_id)
    if grader is None:
        raise ValueError(f"No grader for task_id='{task_id}'")
    return grader(result)


# ---------------------------
# TASK DESCRIPTIONS (for README / inference prompt)
# ---------------------------
# One human-readable summary per task_id; each value is a single string made
# of four sentences (setup, focus, grader weighting, expected score) joined
# by single spaces.
TASK_DESCRIPTIONS = {
    "task_easy": " ".join((
        "Triage 10 PCBs with no factory slot pressure.",
        "Focus: identify the correct action for each defect type.",
        "Grader: accuracy-weighted reward (70% accuracy + 30% reward quality).",
        "Expected frontier model score: ≥0.85.",
    )),
    "task_medium": " ".join((
        "Triage 15 PCBs with only 1 active soldering slot.",
        "Focus: manage queue pressure while maintaining economic performance.",
        "Grader: 60% economic efficiency + 40% bottleneck avoidance.",
        "Expected frontier model score: ≥0.65.",
    )),
    "task_hard": " ".join((
        "Triage 20 PCBs with 25% anomaly rate and tight slot constraints.",
        "Focus: handle extreme-cost/criticality boards safely AND maintain throughput.",
        "Grader: 50% anomaly handling + 30% economic score + 20% throughput.",
        "Expected frontier model score: ≥0.50.",
    )),
}


# ---------------------------
# CLI TEST UTILITY
# ---------------------------
if __name__ == "__main__":
    # Quick sanity check: run all 3 tasks with a rule-based agent.
    # For each task, a live episode is played with decide_action to record a
    # trajectory, which is then replayed through run_task and scored by grade.
    from env import SpectraQualEnv, decide_action
    from models import PCBAction

    print("\n=== SpectraQual Task Grader Sanity Check ===\n")

    for tid in ("task_easy", "task_medium", "task_hard"):
        live_env = SpectraQualEnv(task_id=tid)
        step_out = live_env.reset()
        trajectory = []

        # Play the episode to completion, recording every chosen action.
        while not step_out.done:
            view = step_out.observation
            chosen = decide_action(
                {
                    "defect_type": view.defect_type,
                    "component_cost": view.component_cost,
                    "criticality": view.criticality,
                }
            )
            trajectory.append(chosen)
            step_out = live_env.step(PCBAction(action=chosen))

        # Replay the recorded trajectory through the grader pipeline.
        task_result = run_task(tid, trajectory)
        score = grade(tid, task_result)
        print(f"[{tid}] Score: {score:.4f} | Correct: {task_result.correct_decisions}/{task_result.total_decisions} | Bottlenecks: {task_result.bottleneck_count}")

    print("\n=== Done ===")