# spectraqual/src/tasks.py
"""
tasks.py — SpectraQual Task Definitions and Programmatic Graders
Each task runs the environment with a fixed seed and scores the agent 0.0–1.0.
Graders are deterministic and reproducible.
"""
from __future__ import annotations
import sys
import os
from typing import List
sys.path.insert(0, os.path.dirname(__file__))
from config import (
    TASKS,
    MEDIUM_ECONOMIC_TARGET,
    HARD_ANOMALY_RATE_TARGET,
    SUCCESS_SCORE_THRESHOLD,
)
from models import TaskResult, PCBAction
from env import SpectraQualEnv
# ---------------------------
# TASK RUNNER
# ---------------------------
def run_task(task_id: str, actions: List[str]) -> TaskResult:
"""
Run a task with a pre-determined list of actions.
Used by graders to replay an agent's trajectory deterministically.
Args:
task_id: one of "task_easy", "task_medium", "task_hard"
actions: list of action strings, one per step
Returns:
TaskResult with all episode metrics filled in.
"""
cfg = TASKS[task_id]
env = SpectraQualEnv(task_id=task_id)
env.reset()
rewards: List[float] = []
correct = 0
total = 0
bottlenecks = 0
anomaly_total = 0
anomaly_flagged = 0
cum_raw = 0.0
    for action_str in actions:
if env._done:
break
        # Fall back to SCRAP when the env rejects the action string.
        executed = action_str
        try:
            result = env.step(PCBAction(action=action_str))
        except Exception:
            executed = "SCRAP"
            result = env.step(PCBAction(action="SCRAP"))
rewards.append(result.reward)
total += 1
if result.info.get("is_anomaly"):
anomaly_total += 1
if result.reward_components:
cum_raw += result.reward_components.total_raw
if result.info.get("is_anomaly") and result.reward_components.anomaly_bonus >= 0.8:
anomaly_flagged += 1
        if env._is_correct(result.info.get("defect", ""), executed):
            correct += 1
bottlenecks = env._bottleneck_cnt
max_possible_raw = cfg["n_boards"] * 1.0 # max normalized = 1.0 per step
return TaskResult(
task_id=task_id,
total_steps=total,
rewards=rewards,
correct_decisions=correct,
total_decisions=total,
bottleneck_count=bottlenecks,
anomaly_total=anomaly_total,
anomaly_flagged=anomaly_flagged,
cumulative_raw_reward=cum_raw,
max_possible_raw=max_possible_raw,
)
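# Usage sketch (illustrative trajectory; a real agent would supply the
# actions, and grade() is defined further down in this module):
#
#     result = run_task("task_easy", ["SCRAP"] * 10)
#     score = grade("task_easy", result)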
# ---------------------------
# GRADER: TASK EASY
# ---------------------------
def grade_easy(result: TaskResult) -> float:
"""
Task Easy Grader.
Objective: Correctly classify all defect types. No slot pressure.
Scoring: correct_decisions / total_decisions → 0.0–1.0
Also gives partial credit for near-correct results:
- 100% correct = 1.0
- 80% correct = 0.8
- 0% correct = 0.0
"""
if result.total_decisions == 0:
return 0.0
accuracy = result.correct_decisions / result.total_decisions
# Blend accuracy with average reward for robustness
avg_reward = sum(result.rewards) / len(result.rewards) if result.rewards else 0.0
# Weight: 70% accuracy, 30% reward quality
score = 0.70 * accuracy + 0.30 * avg_reward
return round(min(max(score, 0.0), 1.0), 4)
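# Worked example (illustrative numbers): 9/10 correct gives accuracy 0.9;
# with an average normalized reward of 0.6 the blended score is
# 0.70 * 0.9 + 0.30 * 0.6 = 0.81.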
# ---------------------------
# GRADER: TASK MEDIUM
# ---------------------------
def grade_medium(result: TaskResult) -> float:
"""
Task Medium Grader.
Objective: Triage 15 boards with 1 slot (queue pressure).
Scoring: 0.6 * economic_efficiency + 0.4 * bottleneck_avoidance
- economic_efficiency: avg normalized reward vs target
    - bottleneck_avoidance: 1.0 with no bottlenecks, falling linearly to 0.0 at 5+
"""
if not result.rewards:
return 0.0
avg_reward = sum(result.rewards) / len(result.rewards)
# Economic efficiency: how close to target (MEDIUM_ECONOMIC_TARGET = 0.50)
economic_score = min(avg_reward / MEDIUM_ECONOMIC_TARGET, 1.0)
# Bottleneck avoidance: 0 bottleneck = 1.0, ≥5 = 0.0
max_tolerable_bottlenecks = 5
bottleneck_score = max(0.0, 1.0 - result.bottleneck_count / max_tolerable_bottlenecks)
score = 0.60 * economic_score + 0.40 * bottleneck_score
return round(min(max(score, 0.0), 1.0), 4)
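# Worked example (illustrative numbers): avg_reward = 0.40 against the 0.50
# target gives economic_score = 0.80; one bottleneck gives bottleneck_score
# = 1.0 - 1/5 = 0.80, so score = 0.60 * 0.80 + 0.40 * 0.80 = 0.80.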
# ---------------------------
# GRADER: TASK HARD
# ---------------------------
def grade_hard(result: TaskResult) -> float:
"""
Task Hard Grader.
Objective: 20 boards, mixed anomalies, tight slots.
Scoring: 0.5 * anomaly_score + 0.3 * economic_score + 0.2 * throughput_score
- anomaly_score: anomaly_flagged / max(anomaly_total, 1), target ≥ 0.5
- economic_score: avg normalized reward
    - throughput_score: total_decisions / n_boards (penalizes WAIT spam)
"""
if not result.rewards:
return 0.0
cfg = TASKS["task_hard"]
avg_reward = sum(result.rewards) / len(result.rewards)
# Anomaly score: did the agent handle anomalous boards correctly?
if result.anomaly_total > 0:
raw_anomaly = result.anomaly_flagged / result.anomaly_total
else:
raw_anomaly = 1.0 # no anomalies → not penalized
# Scale anomaly score: meeting HARD_ANOMALY_RATE_TARGET = 1.0
anomaly_score = min(raw_anomaly / HARD_ANOMALY_RATE_TARGET, 1.0)
# Economic score
economic_score = avg_reward
# Throughput: penalize excessive WAIT actions
throughput_score = min(result.total_decisions / cfg["n_boards"], 1.0)
score = (
0.50 * anomaly_score +
0.30 * economic_score +
0.20 * throughput_score
)
return round(min(max(score, 0.0), 1.0), 4)
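# Worked example (illustrative numbers, taking HARD_ANOMALY_RATE_TARGET = 0.5
# from the docstring above): flagging 2 of 4 anomalies gives raw_anomaly = 0.5
# and anomaly_score = 1.0; with avg_reward = 0.4 and all 20 boards processed,
# score = 0.50 * 1.0 + 0.30 * 0.4 + 0.20 * 1.0 = 0.82.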
# ---------------------------
# GRADER DISPATCH
# ---------------------------
GRADERS = {
"task_easy": grade_easy,
"task_medium": grade_medium,
"task_hard": grade_hard,
}
def grade(task_id: str, result: TaskResult) -> float:
"""Dispatch to the correct grader for the given task_id."""
if task_id not in GRADERS:
raise ValueError(f"No grader for task_id='{task_id}'")
return GRADERS[task_id](result)
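# A minimal pass/fail sketch, assuming SUCCESS_SCORE_THRESHOLD (imported from
# config above) marks the success boundary; adjust if config defines its
# semantics differently.
def is_success(task_id: str, result: TaskResult) -> bool:
    """Return True when the graded score clears SUCCESS_SCORE_THRESHOLD."""
    return grade(task_id, result) >= SUCCESS_SCORE_THRESHOLD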
# ---------------------------
# TASK DESCRIPTIONS (for README / inference prompt)
# ---------------------------
TASK_DESCRIPTIONS = {
"task_easy": (
"Triage 10 PCBs with no factory slot pressure. "
"Focus: identify the correct action for each defect type. "
"Grader: accuracy-weighted reward (70% accuracy + 30% reward quality). "
"Expected frontier model score: ≥0.85."
),
"task_medium": (
"Triage 15 PCBs with only 1 active soldering slot. "
"Focus: manage queue pressure while maintaining economic performance. "
"Grader: 60% economic efficiency + 40% bottleneck avoidance. "
"Expected frontier model score: ≥0.65."
),
"task_hard": (
"Triage 20 PCBs with 25% anomaly rate and tight slot constraints. "
"Focus: handle extreme-cost/criticality boards safely AND maintain throughput. "
"Grader: 50% anomaly handling + 30% economic score + 20% throughput. "
"Expected frontier model score: ≥0.50."
),
}
# ---------------------------
# CLI TEST UTILITY
# ---------------------------
if __name__ == "__main__":
"""Quick sanity check: run all 3 tasks with a rule-based agent."""
    from env import decide_action  # rule-based baseline policy
print("\n=== SpectraQual Task Grader Sanity Check ===\n")
for tid in ["task_easy", "task_medium", "task_hard"]:
env = SpectraQualEnv(task_id=tid)
result_obj = env.reset()
actions = []
while not result_obj.done:
obs = result_obj.observation
pcb = {
"defect_type": obs.defect_type,
"component_cost": obs.component_cost,
"criticality": obs.criticality,
}
action_str = decide_action(pcb)
actions.append(action_str)
result_obj = env.step(PCBAction(action=action_str))
task_result = run_task(tid, actions)
score = grade(tid, task_result)
print(f"[{tid}] Score: {score:.4f} | Correct: {task_result.correct_decisions}/{task_result.total_decisions} | Bottlenecks: {task_result.bottleneck_count}")
print("\n=== Done ===")