from __future__ import annotations

import math
from dataclasses import dataclass, asdict
from typing import Any


@dataclass(frozen=True)
class GraderResult:
    """Immutable record of the grading outcome for a single task.

    The reward fields and ``score`` are always present. The order counters
    and the two derived rates are optional and default to ``None`` when the
    caller did not supply (or could not derive) them.
    """

    task_id: str
    raw_reward: float
    baseline_reward: float
    target_reward: float
    # Normalized score as produced by ``normalized_score``.
    score: float
    completed_orders: int | None = None
    on_time_orders: int | None = None
    late_orders: int | None = None
    expired_orders: int | None = None
    rejected_orders: int | None = None
    invalid_actions: int | None = None
    # completed / (completed + expired + rejected); see ``grade_task``.
    service_rate: float | None = None
    # on_time / completed; see ``grade_task``.
    on_time_rate: float | None = None

    def to_dict(self) -> dict[str, Any]:
        """Return the result as a plain dictionary keyed by field name."""
        return asdict(self)


def clamp(value: float, low: float = 0.0, high: float = 1.0) -> float:
    """Restrict *value* to the closed interval ``[low, high]``.

    NaN is not detected here: every comparison against NaN is false, so a
    NaN *value* is silently replaced by a bound instead of being propagated.
    Callers that may receive NaN should check for it first.
    """
    return max(low, min(high, value))


def normalized_score(raw_reward: float, baseline_reward: float, target_reward: float) -> float:
    """Map *raw_reward* linearly onto ``[0, 1]`` between baseline and target.

    Returns:
        ``0.0`` at or below the baseline, ``1.0`` at or above the target and
        the linear interpolation in between. When the target does not exceed
        the baseline the range is degenerate and the score is a pass/fail
        threshold on the target. If any argument is NaN the score is ``0.0``.
    """
    # Without this guard a NaN ratio reaches ``clamp``, whose ``min``/``max``
    # comparisons all evaluate false and yield 1.0 -- a perfect score for an
    # undefined reward.
    if any(math.isnan(v) for v in (raw_reward, baseline_reward, target_reward)):
        return 0.0
    if target_reward <= baseline_reward:
        return 1.0 if raw_reward >= target_reward else 0.0
    return clamp((raw_reward - baseline_reward) / (target_reward - baseline_reward))


def grade_task(
    task_id: str,
    raw_reward: float,
    baseline_reward: float,
    target_reward: float,
    *,
    completed_orders: int | None = None,
    on_time_orders: int | None = None,
    late_orders: int | None = None,
    expired_orders: int | None = None,
    rejected_orders: int | None = None,
    invalid_actions: int | None = None,
) -> GraderResult:
    """Build a ``GraderResult`` for one task from its reward and order counts.

    Args:
        task_id: Identifier stored on the result.
        raw_reward: Reward obtained on the task.
        baseline_reward: Reward that maps to a score of 0.
        target_reward: Reward that maps to a score of 1.
        completed_orders, on_time_orders, expired_orders, rejected_orders:
            Optional counters used to derive the rates; a missing (``None``)
            counter is treated as 0 in the rate arithmetic.
        late_orders, invalid_actions: Optional counters stored unchanged;
            they do not take part in any computation here.

    Returns:
        A ``GraderResult`` whose ``service_rate`` is ``None`` when no order
        was completed, expired or rejected, and whose ``on_time_rate`` is
        ``None`` when ``completed_orders`` is missing or zero.
    """
    # Convert once so the score is computed from exactly the values stored.
    raw = float(raw_reward)
    baseline = float(baseline_reward)
    target = float(target_reward)

    completed = completed_orders or 0
    total_resolved = completed + (expired_orders or 0) + (rejected_orders or 0)
    service_rate = completed / total_resolved if total_resolved else None
    on_time_rate = (on_time_orders or 0) / completed if completed else None

    return GraderResult(
        task_id=task_id,
        raw_reward=raw,
        baseline_reward=baseline,
        target_reward=target,
        score=normalized_score(raw, baseline, target),
        completed_orders=completed_orders,
        on_time_orders=on_time_orders,
        late_orders=late_orders,
        expired_orders=expired_orders,
        rejected_orders=rejected_orders,
        invalid_actions=invalid_actions,
        service_rate=service_rate,
        on_time_rate=on_time_rate,
    )


def grade_trajectory(
    task_id: str,
    trajectory_reward: float,
    baseline_reward: float,
    target_reward: float,
    stats: dict[str, int] | None = None,
) -> GraderResult:
    """Grade a trajectory, reading the order counters from a *stats* mapping.

    Thin wrapper around ``grade_task``: counters absent from *stats* (or all
    of them, when *stats* is ``None`` or empty) are passed on as ``None``.
    """
    stats = stats or {}
    return grade_task(
        task_id=task_id,
        raw_reward=trajectory_reward,
        baseline_reward=baseline_reward,
        target_reward=target_reward,
        completed_orders=stats.get("completed_orders"),
        on_time_orders=stats.get("on_time_orders"),
        late_orders=stats.get("late_orders"),
        expired_orders=stats.get("expired_orders"),
        rejected_orders=stats.get("rejected_orders"),
        invalid_actions=stats.get("invalid_actions"),
    )


def weighted_mean(results: list[GraderResult], weights: dict[str, float] | None = None) -> float:
    """Return the weighted average of ``score`` over *results*.

    Args:
        results: Graded tasks to aggregate.
        weights: Per-``task_id`` weights; a task without an entry gets weight
            1.0. ``None`` means every task is weighted equally.

    Returns:
        The weighted mean, or ``0.0`` when *results* is empty or the weights
        sum to zero.
    """
    if not results:
        return 0.0
    weights = weights or {}
    weighted_total = 0.0
    weight_total = 0.0
    for result in results:
        weight = float(weights.get(result.task_id, 1.0))
        weighted_total += result.score * weight
        weight_total += weight
    return 0.0 if weight_total == 0 else weighted_total / weight_total


def summarize_results(results: list[GraderResult], weights: dict[str, float] | None = None) -> dict[str, Any]:
    """Summarize graded tasks as a dictionary.

    Returns:
        A dict with ``"tasks"`` (each result converted via ``to_dict``) and
        ``"overall_score"`` (the ``weighted_mean`` of the results).
    """
    weights = weights or {}
    return {
        "tasks": [result.to_dict() for result in results],
        "overall_score": weighted_mean(results, weights),
    }


__all__ = [
    "GraderResult",
    "clamp",
    "grade_task",
    "grade_trajectory",
    "normalized_score",
    "summarize_results",
    "weighted_mean",
]