| """Evaluation suite for the bio-experiment planning environment. | |
| Separates metrics into four families: | |
| - online RL metrics (collected during training rollouts) | |
| - offline benchmark metrics (computed on a fixed held-out set) | |
| - expert review metrics (for human-in-the-loop evaluation) | |
| - simulator fidelity metrics (how well the simulator matches reality) | |
| """ | |
| from __future__ import annotations | |
| from dataclasses import dataclass, field | |
| from typing import Any, Dict, List, Optional | |
| import numpy as np | |
| from .trajectory import Trajectory, TrajectoryDataset | |
@dataclass
class MetricResult:
    """A single named metric with an optional details payload."""

    name: str
    value: float
    details: Dict[str, Any] = field(default_factory=dict)

class EvaluationSuite:
    """Computes and aggregates evaluation metrics over trajectory datasets."""

    # ── online RL metrics ───────────────────────────────────────────────
    @staticmethod
    def online_metrics(trajectories: List[Trajectory]) -> List[MetricResult]:
        """Summary statistics over a batch of training rollouts."""
        if not trajectories:
            return []
        rewards = [t.total_reward for t in trajectories]
        lengths = [len(t.steps) for t in trajectories]
        successes = [t.success for t in trajectories]
        return [
            MetricResult("mean_return", float(np.mean(rewards))),
            MetricResult("median_return", float(np.median(rewards))),
            MetricResult("std_return", float(np.std(rewards))),
            MetricResult("mean_episode_length", float(np.mean(lengths))),
            MetricResult("success_rate", float(np.mean(successes))),
        ]

    # ── offline benchmark metrics ───────────────────────────────────────
    @staticmethod
    def benchmark_metrics(dataset: TrajectoryDataset) -> List[MetricResult]:
        """Structural-quality metrics computed on a fixed held-out set."""
        results: List[MetricResult] = []
        if len(dataset) == 0:
            return results
        results.append(MetricResult(
            "pipeline_validity_rate",
            EvaluationSuite._pipeline_validity_rate(dataset),
        ))
        results.append(MetricResult(
            "ordering_score",
            EvaluationSuite._ordering_score(dataset),
        ))
        results.append(MetricResult(
            "action_diversity",
            EvaluationSuite._action_diversity(dataset),
        ))
        results.append(MetricResult(
            "mean_conclusion_confidence",
            EvaluationSuite._mean_conclusion_confidence(dataset),
        ))
        return results

    # ── expert review metrics (stubs) ───────────────────────────────────
    @staticmethod
    def expert_review_metrics(
        trajectories: List[Trajectory],
        expert_scores: Optional[Dict[str, float]] = None,
    ) -> List[MetricResult]:
        """Placeholder for human expert review scores.

        In practice, each trajectory would be scored by a domain expert
        on axes such as scientific validity, creativity, and efficiency.
        """
        if not expert_scores:
            return [MetricResult("expert_review", 0.0, {"note": "no scores provided"})]
        avg = float(np.mean(list(expert_scores.values())))
        return [MetricResult("expert_review_mean", avg, expert_scores)]

    # ── simulator fidelity metrics (stubs) ──────────────────────────────
    @staticmethod
    def simulator_fidelity_metrics(
        simulated: TrajectoryDataset,
        real: Optional[TrajectoryDataset] = None,
    ) -> List[MetricResult]:
        """Compare simulated trajectories against real experimental data.

        When ``real`` is provided, computes distributional distances
        between simulated and real output statistics.
        """
        if real is None or len(real) == 0:
            return [MetricResult("fidelity", 0.0, {"note": "no real data"})]
        sim_rewards = [t.total_reward for t in simulated.trajectories]
        real_rewards = [t.total_reward for t in real.trajectories]
        reward_gap = abs(float(np.mean(sim_rewards)) - float(np.mean(real_rewards)))
        return [MetricResult("reward_distribution_gap", reward_gap)]

    # ── internal helpers ────────────────────────────────────────────────
    @staticmethod
    def _pipeline_validity_rate(ds: TrajectoryDataset) -> float:
        """Fraction of trajectories with no rule violations at any step."""
        valid = 0
        for t in ds.trajectories:
            violations = sum(
                1 for s in t.steps
                # Truthiness covers both None and the empty list.
                if s.observation.get("rule_violations")
            )
            if violations == 0:
                valid += 1
        return valid / max(len(ds), 1)

    @staticmethod
    def _ordering_score(ds: TrajectoryDataset) -> float:
        """Mean of the per-trajectory 'ordering' reward component."""
        scores: List[float] = []
        for t in ds.trajectories:
            breakdown_scores = [
                s.reward_breakdown["ordering"]
                for s in t.steps
                if "ordering" in s.reward_breakdown
            ]
            if breakdown_scores:
                scores.append(float(np.mean(breakdown_scores)))
        return float(np.mean(scores)) if scores else 0.0

    @staticmethod
    def _action_diversity(ds: TrajectoryDataset) -> float:
        """Fraction of the action space exercised at least once."""
        # Imported locally (and relatively, matching the other imports in
        # this package) to avoid a circular import at module load time.
        from .models import ActionType

        all_types: set = set()
        for t in ds.trajectories:
            for s in t.steps:
                at = s.action.get("action_type")
                if at:
                    all_types.add(at)
        return len(all_types) / max(len(ActionType), 1)

    @staticmethod
    def _mean_conclusion_confidence(ds: TrajectoryDataset) -> float:
        """Mean confidence across all conclusions emitted in the dataset."""
        confs: List[float] = []
        for t in ds.trajectories:
            for s in t.steps:
                for c in s.observation.get("conclusions", []):
                    if isinstance(c, dict) and "confidence" in c:
                        confs.append(float(c["confidence"]))
        return float(np.mean(confs)) if confs else 0.0
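
# Minimal usage sketch, assuming trajectories come from the training loop
# (``Trajectory``/``TrajectoryDataset`` construction is not shown here):
#
#     online = EvaluationSuite.online_metrics(rollouts)
#     offline = EvaluationSuite.benchmark_metrics(heldout_dataset)
#     for m in online + offline:
#         print(f"{m.name}: {m.value:.3f}")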