| | """Evaluation suite for the bio-experiment planning environment.
|
| |
|
| | Separates metrics into four families:
|
| | - online RL metrics (collected during training rollouts)
|
| | - offline benchmark metrics (computed on a fixed held-out set)
|
| | - expert review metrics (for human-in-the-loop evaluation)
|
| | - simulator fidelity metrics (how well the simulator matches reality)
|
| | """
|
| |
|
| | from __future__ import annotations
|
| |
|
| | from dataclasses import dataclass, field
|
| | from typing import Any, Dict, List, Optional
|
| |
|
| | import numpy as np
|
| |
|
| | from .trajectory import Trajectory, TrajectoryDataset
|
| |


@dataclass
class MetricResult:
    name: str
    value: float
    details: Dict[str, Any] = field(default_factory=dict)
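
    # Construction sketch (illustrative values only, not from a real run):
    #   MetricResult("success_rate", 0.92, details={"num_episodes": 50})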


class EvaluationSuite:
    """Computes and aggregates evaluation metrics over trajectory datasets."""

    @staticmethod
    def online_metrics(trajectories: List[Trajectory]) -> List[MetricResult]:
        if not trajectories:
            return []

        rewards = [t.total_reward for t in trajectories]
        lengths = [len(t.steps) for t in trajectories]
        successes = [t.success for t in trajectories]

        return [
            MetricResult("mean_return", float(np.mean(rewards))),
            MetricResult("median_return", float(np.median(rewards))),
            MetricResult("std_return", float(np.std(rewards))),
            MetricResult("mean_episode_length", float(np.mean(lengths))),
            MetricResult("success_rate", float(np.mean(successes))),
        ]
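
    # Consumption sketch (assumes a wandb-style logger that accepts flat
    # dicts; `logger` and `rollout_trajectories` are hypothetical names,
    # not part of this module):
    #   metrics = EvaluationSuite.online_metrics(rollout_trajectories)
    #   logger.log({m.name: m.value for m in metrics})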

    @staticmethod
    def benchmark_metrics(dataset: TrajectoryDataset) -> List[MetricResult]:
        results: List[MetricResult] = []
        if len(dataset) == 0:
            return results

        results.append(MetricResult(
            "pipeline_validity_rate",
            EvaluationSuite._pipeline_validity_rate(dataset),
        ))
        results.append(MetricResult(
            "ordering_score",
            EvaluationSuite._ordering_score(dataset),
        ))
        results.append(MetricResult(
            "action_diversity",
            EvaluationSuite._action_diversity(dataset),
        ))
        results.append(MetricResult(
            "mean_conclusion_confidence",
            EvaluationSuite._mean_conclusion_confidence(dataset),
        ))
        return results

    @staticmethod
    def expert_review_metrics(
        trajectories: List[Trajectory],
        expert_scores: Optional[Dict[str, float]] = None,
    ) -> List[MetricResult]:
        """Placeholder for human expert review scores.

        In practice, each trajectory would be scored by a domain expert
        on axes such as scientific validity, creativity, and efficiency.
        """
        if not expert_scores:
            return [MetricResult("expert_review", 0.0, {"note": "no scores provided"})]
        avg = float(np.mean(list(expert_scores.values())))
        return [MetricResult("expert_review_mean", avg, expert_scores)]
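
    # Usage sketch (keying expert_scores by trajectory id is an assumption;
    # adapt to however trajectories are identified upstream):
    #   scores = {"traj-001": 4.5, "traj-002": 3.0}
    #   EvaluationSuite.expert_review_metrics(trajs, expert_scores=scores)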

    @staticmethod
    def simulator_fidelity_metrics(
        simulated: TrajectoryDataset,
        real: Optional[TrajectoryDataset] = None,
    ) -> List[MetricResult]:
        """Compare simulated trajectories against real experimental data.

        When ``real`` is provided, computes the absolute gap between the
        mean simulated and mean real returns, a first-order proxy for the
        distance between the two return distributions.
        """
        if real is None or len(real) == 0:
            return [MetricResult("fidelity", 0.0, {"note": "no real data"})]

        sim_rewards = [t.total_reward for t in simulated.trajectories]
        real_rewards = [t.total_reward for t in real.trajectories]

        reward_gap = abs(float(np.mean(sim_rewards)) - float(np.mean(real_rewards)))
        return [MetricResult("reward_distribution_gap", reward_gap)]
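
    # A fuller distributional comparison could replace the mean-gap proxy
    # above. Minimal sketch, assuming SciPy is available (an assumed
    # dependency, deliberately imported locally); not wired into the
    # public method:
    @staticmethod
    def _wasserstein_reward_gap(
        sim_rewards: List[float],
        real_rewards: List[float],
    ) -> float:
        """1-Wasserstein distance between simulated and real return samples."""
        from scipy.stats import wasserstein_distance
        return float(wasserstein_distance(sim_rewards, real_rewards))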

    @staticmethod
    def _pipeline_validity_rate(ds: TrajectoryDataset) -> float:
        valid = 0
        for t in ds.trajectories:
            # A step counts as a violation iff it carries a non-empty
            # "rule_violations" list; both None and [] mean "clean".
            violations = sum(
                1 for s in t.steps
                if s.observation.get("rule_violations")
            )
            if violations == 0:
                valid += 1
        return valid / max(len(ds), 1)

    @staticmethod
    def _ordering_score(ds: TrajectoryDataset) -> float:
        scores: List[float] = []
        for t in ds.trajectories:
            breakdown_scores = []
            for s in t.steps:
                bd = s.reward_breakdown
                if "ordering" in bd:
                    breakdown_scores.append(bd["ordering"])
            if breakdown_scores:
                scores.append(float(np.mean(breakdown_scores)))
        return float(np.mean(scores)) if scores else 0.0

    @staticmethod
    def _action_diversity(ds: TrajectoryDataset) -> float:
        all_types: set = set()
        for t in ds.trajectories:
            for s in t.steps:
                at = s.action.get("action_type")
                if at:
                    all_types.add(at)
        # Package-relative import, matching the style of the other imports
        # in this module; kept local to avoid a circular import at load time.
        from .models import ActionType
        return len(all_types) / max(len(ActionType), 1)

    @staticmethod
    def _mean_conclusion_confidence(ds: TrajectoryDataset) -> float:
        confs: List[float] = []
        for t in ds.trajectories:
            for s in t.steps:
                conclusions = s.observation.get("conclusions", [])
                for c in conclusions:
                    if isinstance(c, dict) and "confidence" in c:
                        confs.append(c["confidence"])
        return float(np.mean(confs)) if confs else 0.0
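

# Minimal end-to-end sketch (assumes `train_trajs: List[Trajectory]` and a
# held-out `bench_ds: TrajectoryDataset` already exist; how those objects
# are built depends on the .trajectory module and is not shown here):
#
#   online = EvaluationSuite.online_metrics(train_trajs)
#   offline = EvaluationSuite.benchmark_metrics(bench_ds)
#   for m in online + offline:
#       print(f"{m.name}: {m.value:.3f}")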
|