"""Opinionated evaluation profiles for EvacOS2 specialist and orchestrator runs.""" from __future__ import annotations import argparse import sys from dataclasses import dataclass from pathlib import Path from typing import Sequence ROOT = Path(__file__).resolve().parents[1] if str(ROOT) not in sys.path: sys.path.insert(0, str(ROOT)) from evaluation.demo_bundle import DemoBundleResult, build_demo_bundle from training.checkpoint import load_checkpoint DEFAULT_TIERS = "easy" DEFAULT_SEEDS = "42,123,456,789,1024" DEFAULT_MAX_ROUNDS = 50 @dataclass(frozen=True) class EvalProfile: """Named eval surface with safe defaults for one model lane.""" name: str description: str families: tuple[str, ...] output_dir: Path config_path: Path PROFILES: dict[str, EvalProfile] = { "3b-fire": EvalProfile( name="3b-fire", description="Evaluate the 3B fire floor specialist.", families=("fire",), output_dir=Path("outputs/evals/3b-fire-specialist"), config_path=Path("training/config.remote-unsloth-3b-fire-floor-specialist.yaml"), ), "3b-flood": EvalProfile( name="3b-flood", description="Evaluate the 3B flood floor specialist.", families=("flood",), output_dir=Path("outputs/evals/3b-flood-specialist"), config_path=Path("training/config.remote-unsloth-3b-flood-floor-specialist.yaml"), ), "3b-gas": EvalProfile( name="3b-gas", description="Evaluate the 3B gas floor specialist.", families=("gas",), output_dir=Path("outputs/evals/3b-gas-specialist"), config_path=Path("training/config.remote-unsloth-3b-gas-floor-specialist.yaml"), ), "7b-orchestrator": EvalProfile( name="7b-orchestrator", description="Evaluate the shared 7B orchestrator over routed frozen 3B specialists.", families=("fire", "flood", "gas"), output_dir=Path("outputs/evals/7b-orchestrator-routed-specialists"), config_path=Path("training/config.remote-unsloth-7b-orchestrator-frozen-specialists.example.yaml"), ), } def _split_csv(raw: str) -> list[str]: return [item.strip() for item in raw.split(",") if item.strip()] def _load_latest_checkpoint_snapshot(config_path: Path) -> dict: try: import yaml except ModuleNotFoundError as exc: # pragma: no cover - depends on runner image raise ImportError( "--use-latest-checkpoint-normalizer requires PyYAML to read the config." ) from exc data = yaml.safe_load(config_path.read_text(encoding="utf-8")) or {} checkpoint_root = Path( str(data.get("checkpoint", {}).get("root_dir", "outputs/checkpoints")) ) bundle = load_checkpoint(checkpoint_root) if bundle is None: raise FileNotFoundError( f"No checkpoint with normalizer snapshot found under {checkpoint_root}" ) return bundle.normalizer_snapshot def build_parser(profile: EvalProfile) -> argparse.ArgumentParser: parser = argparse.ArgumentParser(description=profile.description) parser.add_argument( "--trained-checkpoint", type=Path, default=None, help=( "Path to the trained checkpoint directory or its lora_adapter root. " "Use --skip-trained for a baseline-only smoke bundle." ), ) parser.add_argument( "--output-dir", type=Path, default=profile.output_dir, help="Where to write JSON, CSV, scorecard, and plots.", ) parser.add_argument( "--config", type=Path, default=profile.config_path, help="Training config used if checkpoint metadata is unavailable.", ) parser.add_argument( "--tiers", default=DEFAULT_TIERS, help="Comma-separated evaluation tiers. Current proof lane supports easy only.", ) parser.add_argument( "--seeds", default=DEFAULT_SEEDS, help="Comma-separated held-out eval seeds.", ) parser.add_argument( "--families", default=",".join(profile.families), help="Comma-separated disaster families. Defaults to the profile lane.", ) parser.add_argument("--rationale-mode", default="linear_capped") parser.add_argument( "--max-rounds", type=int, default=DEFAULT_MAX_ROUNDS, help=( "Bounded rounds per eval episode. Keep this small for smoke/gate " "checks so trained eval cannot silently run for hours." ), ) parser.add_argument("--training-metrics-path", type=Path, default=None) parser.add_argument("--skip-trained", action="store_true") parser.add_argument( "--use-latest-checkpoint-normalizer", action="store_true", help="Seed trained eval normalization from config.checkpoint.root_dir/latest.", ) parser.add_argument( "--baseline-policy", choices=("stub", "base_model"), default="stub", help=( "Baseline reference. Use base_model for judge-facing no-LoRA " "model-vs-trained-LoRA comparisons." ), ) return parser def run_profile(profile_name: str, argv: Sequence[str] | None = None) -> DemoBundleResult: profile = PROFILES[profile_name] parser = build_parser(profile) args = parser.parse_args(argv) trained_normalizer_snapshot = None if args.use_latest_checkpoint_normalizer: trained_normalizer_snapshot = _load_latest_checkpoint_snapshot(args.config) result = build_demo_bundle( trained_checkpoint=args.trained_checkpoint, tiers=tuple(_split_csv(args.tiers)), seeds=tuple(int(item) for item in _split_csv(args.seeds)), disaster_families=tuple(_split_csv(args.families)), max_rounds=args.max_rounds, rationale_mode=args.rationale_mode, output_dir=args.output_dir, skip_trained=args.skip_trained, training_metrics_path=args.training_metrics_path, trained_normalizer_snapshot=trained_normalizer_snapshot, config_path=args.config, baseline_policy=args.baseline_policy, ) _print_result(profile, result) return result def _print_result(profile: EvalProfile, result: DemoBundleResult) -> None: print(f"Eval profile: {profile.name}") print(f"Output dir: {result.output_dir}") print(f"Scorecard: {result.scorecard_md}") print(f"Summary: {result.summary_md}") print(f"Comparison CSV: {result.comparison_csv}") if result.trained_json is not None: print(f"Trained fixed suite: {result.trained_json}") else: print("Trained fixed suite: skipped") for plot_path in result.plot_paths: print(f"Plot: {plot_path}")