"""Evaluation suite for the bio-experiment planning environment.



Separates metrics into four families:

  - online RL metrics      (collected during training rollouts)

  - offline benchmark metrics (computed on a fixed held-out set)

  - expert review metrics  (for human-in-the-loop evaluation)

  - simulator fidelity metrics (how well the simulator matches reality)

"""

from __future__ import annotations

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional

import numpy as np

from .trajectory import Trajectory, TrajectoryDataset


@dataclass
class MetricResult:
    name: str
    value: float
    details: Dict[str, Any] = field(default_factory=dict)


class EvaluationSuite:
    """Computes and aggregates evaluation metrics over trajectory datasets."""

    # ── online RL metrics ───────────────────────────────────────────────

    @staticmethod
    def online_metrics(trajectories: List[Trajectory]) -> List[MetricResult]:
        if not trajectories:
            return []

        rewards = [t.total_reward for t in trajectories]
        lengths = [len(t.steps) for t in trajectories]
        successes = [t.success for t in trajectories]

        return [
            MetricResult("mean_return", float(np.mean(rewards))),
            MetricResult("median_return", float(np.median(rewards))),
            MetricResult("std_return", float(np.std(rewards))),
            MetricResult("mean_episode_length", float(np.mean(lengths))),
            MetricResult("success_rate", float(np.mean(successes))),
        ]

    # ── offline benchmark metrics ───────────────────────────────────────

    @staticmethod
    def benchmark_metrics(dataset: TrajectoryDataset) -> List[MetricResult]:
        results: List[MetricResult] = []
        if len(dataset) == 0:
            return results

        results.append(MetricResult(
            "pipeline_validity_rate",
            EvaluationSuite._pipeline_validity_rate(dataset),
        ))
        results.append(MetricResult(
            "ordering_score",
            EvaluationSuite._ordering_score(dataset),
        ))
        results.append(MetricResult(
            "action_diversity",
            EvaluationSuite._action_diversity(dataset),
        ))
        results.append(MetricResult(
            "mean_conclusion_confidence",
            EvaluationSuite._mean_conclusion_confidence(dataset),
        ))
        return results

    # ── expert review metrics (stubs) ───────────────────────────────────

    @staticmethod
    def expert_review_metrics(
        trajectories: List[Trajectory],
        expert_scores: Optional[Dict[str, float]] = None,
    ) -> List[MetricResult]:
        """Placeholder for human expert review scores.

        In practice, each trajectory would be scored by a domain expert
        on axes such as scientific validity, creativity, and efficiency.
        """
        if not expert_scores:
            return [MetricResult("expert_review", 0.0, {"note": "no scores provided"})]
        avg = float(np.mean(list(expert_scores.values())))
        return [MetricResult("expert_review_mean", avg, expert_scores)]

    # ── simulator fidelity metrics (stubs) ──────────────────────────────

    @staticmethod
    def simulator_fidelity_metrics(
        simulated: TrajectoryDataset,
        real: Optional[TrajectoryDataset] = None,
    ) -> List[MetricResult]:
        """Compare simulated trajectories against real experimental data.

        When ``real`` is provided, compares the return distributions of the
        two datasets; currently this is reported as the absolute gap between
        the mean simulated and mean real returns.
        """
        if real is None or len(real) == 0:
            return [MetricResult("fidelity", 0.0, {"note": "no real data"})]

        sim_rewards = [t.total_reward for t in simulated.trajectories]
        real_rewards = [t.total_reward for t in real.trajectories]

        reward_gap = abs(float(np.mean(sim_rewards)) - float(np.mean(real_rewards)))
        return [MetricResult("reward_distribution_gap", reward_gap)]
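
    # Hedged sketch (not part of the original module): one way to go beyond the
    # mean-return gap above is a two-sample Kolmogorov-Smirnov statistic over
    # returns. The helper name and placement are assumptions; it is written in
    # plain NumPy so no extra dependency is introduced.
    @staticmethod
    def _return_ks_statistic(
        sim_rewards: List[float], real_rewards: List[float]
    ) -> float:
        """Largest gap between the empirical CDFs of the two return samples."""
        if not sim_rewards or not real_rewards:
            return 0.0
        sim = np.sort(np.asarray(sim_rewards, dtype=float))
        real = np.sort(np.asarray(real_rewards, dtype=float))
        # Evaluate both empirical CDFs on the pooled sample points, take max gap.
        grid = np.concatenate([sim, real])
        cdf_sim = np.searchsorted(sim, grid, side="right") / len(sim)
        cdf_real = np.searchsorted(real, grid, side="right") / len(real)
        return float(np.max(np.abs(cdf_sim - cdf_real)))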

    # ── internal helpers ────────────────────────────────────────────────

    @staticmethod
    def _pipeline_validity_rate(ds: TrajectoryDataset) -> float:
        valid = 0
        for t in ds.trajectories:
            violations = sum(
                1 for s in t.steps
                if s.observation.get("rule_violations")
            )
            if violations == 0:
                valid += 1
        return valid / max(len(ds), 1)

    @staticmethod
    def _ordering_score(ds: TrajectoryDataset) -> float:
        scores: List[float] = []
        for t in ds.trajectories:
            breakdown_scores = []
            for s in t.steps:
                bd = s.reward_breakdown
                if "ordering" in bd:
                    breakdown_scores.append(bd["ordering"])
            if breakdown_scores:
                scores.append(float(np.mean(breakdown_scores)))
        return float(np.mean(scores)) if scores else 0.0

    @staticmethod
    def _action_diversity(ds: TrajectoryDataset) -> float:
        all_types: set = set()
        for t in ds.trajectories:
            for s in t.steps:
                at = s.action.get("action_type")
                if at:
                    all_types.add(at)
        from .models import ActionType
        return len(all_types) / max(len(ActionType), 1)

    @staticmethod
    def _mean_conclusion_confidence(ds: TrajectoryDataset) -> float:
        confs: List[float] = []
        for t in ds.trajectories:
            for s in t.steps:
                conclusions = s.observation.get("conclusions", [])
                for c in conclusions:
                    if isinstance(c, dict) and "confidence" in c:
                        confs.append(c["confidence"])
        return float(np.mean(confs)) if confs else 0.0
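

# ── usage sketch (illustrative only) ─────────────────────────────────────
# A minimal demonstration of how the metric families are meant to be called,
# assuming the module is run as part of its package (python -m ...). The
# stand-in trajectory objects are built with SimpleNamespace purely so the
# example is self-contained; real code would use Trajectory instances from
# actual rollouts, and the attribute values shown here are made up.
if __name__ == "__main__":
    from types import SimpleNamespace

    def _fake_trajectory(reward: float, success: bool) -> Any:
        step = SimpleNamespace(
            observation={"rule_violations": [], "conclusions": [{"confidence": 0.8}]},
            action={"action_type": "run_assay"},
            reward_breakdown={"ordering": 1.0},
        )
        return SimpleNamespace(total_reward=reward, steps=[step], success=success)

    rollouts = [_fake_trajectory(1.0, True), _fake_trajectory(0.4, False)]

    for metric in EvaluationSuite.online_metrics(rollouts):  # type: ignore[arg-type]
        print(f"{metric.name}: {metric.value:.3f}")

    scores = {"traj-0": 4.0, "traj-1": 3.5}  # hypothetical expert ratings
    for metric in EvaluationSuite.expert_review_metrics(rollouts, scores):  # type: ignore[arg-type]
        print(f"{metric.name}: {metric.value:.3f}")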