evacos2-openenv / scripts /eval_profiles.py
shashankN777's picture
Add held-out base-vs-trained specialist evidence
9a7187a verified
"""Opinionated evaluation profiles for EvacOS2 specialist and orchestrator runs."""
from __future__ import annotations
import argparse
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Sequence
# ROOT is the directory two levels above this file (the parent of the folder
# containing this script). It is prepended to sys.path so the project-local
# imports that follow (``evaluation`` and ``training``) resolve when this file
# is executed directly as a script.
ROOT = Path(__file__).resolve().parents[1]
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))
from evaluation.demo_bundle import DemoBundleResult, build_demo_bundle
from training.checkpoint import load_checkpoint
# Defaults shared by every profile's argument parser (see build_parser).
# Tiers and seeds are kept as comma-separated strings because they are used
# directly as argparse defaults and split later in run_profile.
DEFAULT_TIERS = "easy"
DEFAULT_SEEDS = "42,123,456,789,1024"
# Default value for --max-rounds (rounds per eval episode).
DEFAULT_MAX_ROUNDS = 50
@dataclass(frozen=True)
class EvalProfile:
    """Immutable set of defaults describing one named evaluation lane.

    ``build_parser`` reads these fields to seed the command-line defaults,
    so each of them (other than ``name``) can still be overridden per run.
    """

    # Identifier of the profile; printed in the run summary.
    name: str
    # Human-readable text used as the argument parser's description.
    description: str
    # Disaster families evaluated by default (default for --families).
    families: tuple[str, ...]
    # Default location for the generated artifacts (default for --output-dir).
    output_dir: Path
    # Default training config file (default for --config).
    config_path: Path
# Registry of the available eval profiles, keyed by each profile's own name.
# Insertion order is: the three 3B floor specialists (fire, flood, gas), then
# the 7B orchestrator that covers all three families.
PROFILES: dict[str, EvalProfile] = {
    profile.name: profile
    for profile in (
        EvalProfile(
            name="3b-fire",
            description="Evaluate the 3B fire floor specialist.",
            families=("fire",),
            output_dir=Path("outputs/evals/3b-fire-specialist"),
            config_path=Path(
                "training/config.remote-unsloth-3b-fire-floor-specialist.yaml"
            ),
        ),
        EvalProfile(
            name="3b-flood",
            description="Evaluate the 3B flood floor specialist.",
            families=("flood",),
            output_dir=Path("outputs/evals/3b-flood-specialist"),
            config_path=Path(
                "training/config.remote-unsloth-3b-flood-floor-specialist.yaml"
            ),
        ),
        EvalProfile(
            name="3b-gas",
            description="Evaluate the 3B gas floor specialist.",
            families=("gas",),
            output_dir=Path("outputs/evals/3b-gas-specialist"),
            config_path=Path(
                "training/config.remote-unsloth-3b-gas-floor-specialist.yaml"
            ),
        ),
        EvalProfile(
            name="7b-orchestrator",
            description=(
                "Evaluate the shared 7B orchestrator over routed frozen 3B specialists."
            ),
            families=("fire", "flood", "gas"),
            output_dir=Path("outputs/evals/7b-orchestrator-routed-specialists"),
            config_path=Path(
                "training/config.remote-unsloth-7b-orchestrator-frozen-specialists.example.yaml"
            ),
        ),
    )
}
def _split_csv(raw: str) -> list[str]:
    """Split a comma-separated string into trimmed, non-empty tokens.

    Whitespace around each token is removed, and tokens that are empty after
    trimming (for example from ``"a,,b"`` or a trailing comma) are dropped.
    """
    tokens: list[str] = []
    for piece in raw.split(","):
        trimmed = piece.strip()
        if trimmed:
            tokens.append(trimmed)
    return tokens
def _load_latest_checkpoint_snapshot(config_path: Path) -> dict:
    """Return the normalizer snapshot of the checkpoint named by a training config.

    The YAML file at ``config_path`` is read and its ``checkpoint.root_dir``
    entry is used as the checkpoint root. When the file is empty, or the
    ``checkpoint`` section or its ``root_dir`` is missing, null, or empty,
    the root falls back to ``outputs/checkpoints``.

    Args:
        config_path: Path to the training config (YAML).

    Returns:
        The ``normalizer_snapshot`` attribute of the bundle returned by
        ``load_checkpoint`` for the resolved root.

    Raises:
        ImportError: If PyYAML is not installed.
        FileNotFoundError: If ``load_checkpoint`` returns ``None`` for the
            resolved root.
    """
    # PyYAML is imported lazily so the module stays importable on runner
    # images that do not ship it; it is only needed for this option.
    try:
        import yaml
    except ModuleNotFoundError as exc:  # pragma: no cover - depends on runner image
        raise ImportError(
            "--use-latest-checkpoint-normalizer requires PyYAML to read the config."
        ) from exc

    data = yaml.safe_load(config_path.read_text(encoding="utf-8")) or {}
    # A bare ``checkpoint:`` key (or an explicit null) parses to None, so
    # ``dict.get`` with a default is not enough; coalesce None to the fallback
    # at both levels instead of crashing or building the literal path "None".
    checkpoint_section = data.get("checkpoint") or {}
    root_dir = checkpoint_section.get("root_dir") or "outputs/checkpoints"
    checkpoint_root = Path(str(root_dir))

    bundle = load_checkpoint(checkpoint_root)
    if bundle is None:
        raise FileNotFoundError(
            f"No checkpoint with normalizer snapshot found under {checkpoint_root}"
        )
    return bundle.normalizer_snapshot
def build_parser(profile: EvalProfile) -> argparse.ArgumentParser:
    """Create the command-line parser for one eval profile.

    The profile supplies the parser description and the defaults for
    ``--output-dir``, ``--config`` and ``--families``; the remaining defaults
    come from the module-level ``DEFAULT_*`` constants or are fixed here.

    Args:
        profile: Profile whose description and defaults seed the parser.

    Returns:
        A parser with every eval option registered.
    """
    # Each entry is (flag, keyword arguments for ``add_argument``). The order
    # here is the order in which options are registered on the parser.
    option_specs = (
        (
            "--trained-checkpoint",
            {
                "type": Path,
                "default": None,
                "help": (
                    "Path to the trained checkpoint directory or its lora_adapter root. "
                    "Use --skip-trained for a baseline-only smoke bundle."
                ),
            },
        ),
        (
            "--output-dir",
            {
                "type": Path,
                "default": profile.output_dir,
                "help": "Where to write JSON, CSV, scorecard, and plots.",
            },
        ),
        (
            "--config",
            {
                "type": Path,
                "default": profile.config_path,
                "help": "Training config used if checkpoint metadata is unavailable.",
            },
        ),
        (
            "--tiers",
            {
                "default": DEFAULT_TIERS,
                "help": (
                    "Comma-separated evaluation tiers. "
                    "Current proof lane supports easy only."
                ),
            },
        ),
        (
            "--seeds",
            {
                "default": DEFAULT_SEEDS,
                "help": "Comma-separated held-out eval seeds.",
            },
        ),
        (
            "--families",
            {
                "default": ",".join(profile.families),
                "help": "Comma-separated disaster families. Defaults to the profile lane.",
            },
        ),
        ("--rationale-mode", {"default": "linear_capped"}),
        (
            "--max-rounds",
            {
                "type": int,
                "default": DEFAULT_MAX_ROUNDS,
                "help": (
                    "Bounded rounds per eval episode. Keep this small for smoke/gate "
                    "checks so trained eval cannot silently run for hours."
                ),
            },
        ),
        ("--training-metrics-path", {"type": Path, "default": None}),
        ("--skip-trained", {"action": "store_true"}),
        (
            "--use-latest-checkpoint-normalizer",
            {
                "action": "store_true",
                "help": (
                    "Seed trained eval normalization from "
                    "config.checkpoint.root_dir/latest."
                ),
            },
        ),
        (
            "--baseline-policy",
            {
                "choices": ("stub", "base_model"),
                "default": "stub",
                "help": (
                    "Baseline reference. Use base_model for judge-facing no-LoRA "
                    "model-vs-trained-LoRA comparisons."
                ),
            },
        ),
    )

    parser = argparse.ArgumentParser(description=profile.description)
    for flag, options in option_specs:
        parser.add_argument(flag, **options)
    return parser
def run_profile(profile_name: str, argv: Sequence[str] | None = None) -> DemoBundleResult:
    """Parse CLI arguments for a named profile, build its eval bundle, and report it.

    Args:
        profile_name: Key of the profile in ``PROFILES``.
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        The result object returned by ``build_demo_bundle``.

    Raises:
        KeyError: If ``profile_name`` is not a registered profile.
        SystemExit: Raised by argparse for invalid arguments, including a
            ``--seeds`` value that is not a comma-separated list of integers.
    """
    try:
        profile = PROFILES[profile_name]
    except KeyError:
        # Same exception type as a plain lookup, but with the valid choices
        # spelled out so a typo in the profile name is easy to diagnose.
        known = ", ".join(PROFILES)
        raise KeyError(
            f"Unknown eval profile {profile_name!r}; expected one of: {known}"
        ) from None

    parser = build_parser(profile)
    args = parser.parse_args(argv)

    # Validate seeds before any checkpoint loading so a malformed value fails
    # fast with a normal argparse usage error instead of a ValueError traceback.
    try:
        seeds = tuple(int(item) for item in _split_csv(args.seeds))
    except ValueError:
        parser.error(
            f"--seeds must be a comma-separated list of integers, got {args.seeds!r}"
        )

    trained_normalizer_snapshot = None
    if args.use_latest_checkpoint_normalizer:
        trained_normalizer_snapshot = _load_latest_checkpoint_snapshot(args.config)

    result = build_demo_bundle(
        trained_checkpoint=args.trained_checkpoint,
        tiers=tuple(_split_csv(args.tiers)),
        seeds=seeds,
        disaster_families=tuple(_split_csv(args.families)),
        max_rounds=args.max_rounds,
        rationale_mode=args.rationale_mode,
        output_dir=args.output_dir,
        skip_trained=args.skip_trained,
        training_metrics_path=args.training_metrics_path,
        trained_normalizer_snapshot=trained_normalizer_snapshot,
        config_path=args.config,
        baseline_policy=args.baseline_policy,
    )
    _print_result(profile, result)
    return result
def _print_result(profile: EvalProfile, result: DemoBundleResult) -> None:
    """Print a short summary of where an eval run wrote its artifacts.

    One ``label: value`` line is written to stdout for the profile name and
    each artifact path, followed by one ``Plot:`` line per entry in
    ``result.plot_paths``. When ``result.trained_json`` is ``None`` the
    trained suite is reported as ``skipped``.
    """
    trained_suite = "skipped" if result.trained_json is None else result.trained_json
    summary_rows = (
        ("Eval profile", profile.name),
        ("Output dir", result.output_dir),
        ("Scorecard", result.scorecard_md),
        ("Summary", result.summary_md),
        ("Comparison CSV", result.comparison_csv),
        ("Trained fixed suite", trained_suite),
    )
    for label, value in summary_rows:
        print(f"{label}: {value}")
    for plot_path in result.plot_paths:
        print(f"Plot: {plot_path}")