Spaces:

shashankN777
/

evacos2-openenv

Sleeping

App Files Files Community

evacos2-openenv / scripts /eval_profiles.py

shashankN777

Add held-out base-vs-trained specialist evidence

9a7187a verified about 1 month ago

raw

history blame contribute delete

6.77 kB

	"""Opinionated evaluation profiles for EvacOS2 specialist and orchestrator runs."""

	from __future__ import annotations

	import argparse
	import sys
	from dataclasses import dataclass
	from pathlib import Path
	from typing import Sequence

	# Directory one level above the folder that contains this file
	# (parents[0] is the containing folder, parents[1] is its parent).
	ROOT = Path(__file__).resolve().parents[1]
	# Prepend ROOT to sys.path once, before the project imports that follow;
	# presumably so `evaluation` and `training` resolve when this file is run
	# directly as a script -- confirm against the repository layout.
	if str(ROOT) not in sys.path:
	    sys.path.insert(0, str(ROOT))

	from evaluation.demo_bundle import DemoBundleResult, build_demo_bundle
	from training.checkpoint import load_checkpoint


	# Defaults for the --tiers, --seeds, and --max-rounds options defined in
	# build_parser. Tiers and seeds stay as comma-separated strings because they
	# are used directly as argparse defaults and only split in run_profile.
	DEFAULT_TIERS = "easy"
	DEFAULT_SEEDS = "42,123,456,789,1024"
	DEFAULT_MAX_ROUNDS = 50


	@dataclass(frozen=True)
	class EvalProfile:
	    """Named eval surface with safe defaults for one model lane.

	    Instances are immutable (``frozen=True``). The fields provide the
	    per-profile defaults that ``build_parser`` wires into the command line.
	    """

	    # Profile identifier; echoed in the printed run summary.
	    name: str
	    # Text used as the argparse parser description.
	    description: str
	    # Disaster families; joined with commas to form the --families default.
	    families: tuple[str, ...]
	    # Default value of --output-dir.
	    output_dir: Path
	    # Default value of --config.
	    config_path: Path


	# Registry of the available eval lanes. Each entry is keyed by the profile's
	# own ``name`` so the lookup key and the profile can never drift apart.
	PROFILES: dict[str, EvalProfile] = {
	    profile.name: profile
	    for profile in (
	        EvalProfile(
	            name="3b-fire",
	            description="Evaluate the 3B fire floor specialist.",
	            families=("fire",),
	            output_dir=Path("outputs/evals/3b-fire-specialist"),
	            config_path=Path(
	                "training/config.remote-unsloth-3b-fire-floor-specialist.yaml"
	            ),
	        ),
	        EvalProfile(
	            name="3b-flood",
	            description="Evaluate the 3B flood floor specialist.",
	            families=("flood",),
	            output_dir=Path("outputs/evals/3b-flood-specialist"),
	            config_path=Path(
	                "training/config.remote-unsloth-3b-flood-floor-specialist.yaml"
	            ),
	        ),
	        EvalProfile(
	            name="3b-gas",
	            description="Evaluate the 3B gas floor specialist.",
	            families=("gas",),
	            output_dir=Path("outputs/evals/3b-gas-specialist"),
	            config_path=Path(
	                "training/config.remote-unsloth-3b-gas-floor-specialist.yaml"
	            ),
	        ),
	        EvalProfile(
	            name="7b-orchestrator",
	            description="Evaluate the shared 7B orchestrator over routed frozen 3B specialists.",
	            families=("fire", "flood", "gas"),
	            output_dir=Path("outputs/evals/7b-orchestrator-routed-specialists"),
	            config_path=Path(
	                "training/config.remote-unsloth-7b-orchestrator-frozen-specialists.example.yaml"
	            ),
	        ),
	    )
	}


	def _split_csv(raw: str) -> list[str]:
	    """Split a comma-separated string into trimmed, non-empty items.

	    Whitespace around each item is removed and items that end up empty
	    (for example from doubled or trailing commas) are dropped.
	    """
	    trimmed = (piece.strip() for piece in raw.split(","))
	    return [piece for piece in trimmed if piece]


	def _load_latest_checkpoint_snapshot(config_path: Path) -> dict:
	    """Return the normalizer snapshot of the checkpoint named by a training config.

	    The YAML file at ``config_path`` is parsed and ``checkpoint.root_dir`` is
	    read from it. ``outputs/checkpoints`` is used instead when the file is
	    empty, when the ``checkpoint`` section is missing or has no value, or when
	    ``root_dir`` is missing or has no value. The resulting directory is passed
	    to ``load_checkpoint``.

	    Args:
	        config_path: Path to the YAML training config.

	    Returns:
	        The ``normalizer_snapshot`` attribute of the loaded checkpoint bundle.

	    Raises:
	        ImportError: PyYAML is not installed.
	        FileNotFoundError: ``load_checkpoint`` returned ``None`` for the
	            checkpoint directory. Reading a missing config file also surfaces
	            as an ``OSError`` from ``Path.read_text``.
	    """
	    try:
	        import yaml
	    except ModuleNotFoundError as exc:  # pragma: no cover - depends on runner image
	        raise ImportError(
	            "--use-latest-checkpoint-normalizer requires PyYAML to read the config."
	        ) from exc

	    data = yaml.safe_load(config_path.read_text(encoding="utf-8")) or {}
	    # A YAML key without a value (e.g. a bare ``checkpoint:``) loads as None,
	    # so a ``dict.get`` default is not enough: fall back explicitly with ``or``
	    # to avoid AttributeError on None and a bogus ``Path("None")``.
	    checkpoint_section = data.get("checkpoint") or {}
	    root_dir = checkpoint_section.get("root_dir") or "outputs/checkpoints"
	    checkpoint_root = Path(str(root_dir))

	    bundle = load_checkpoint(checkpoint_root)
	    if bundle is None:
	        raise FileNotFoundError(
	            f"No checkpoint with normalizer snapshot found under {checkpoint_root}"
	        )
	    return bundle.normalizer_snapshot


	def build_parser(profile: EvalProfile) -> argparse.ArgumentParser:
	    """Create the command-line parser for one eval profile.

	    The profile supplies the parser description and the defaults for
	    ``--output-dir``, ``--config``, and ``--families``; the remaining defaults
	    come from the module-level ``DEFAULT_*`` constants.

	    Args:
	        profile: Profile whose description and defaults seed the parser.

	    Returns:
	        A configured ``argparse.ArgumentParser``.
	    """
	    parser = argparse.ArgumentParser(description=profile.description)
	    add = parser.add_argument

	    # Longer help texts are assembled up front to keep the option list compact.
	    checkpoint_help = (
	        "Path to the trained checkpoint directory or its lora_adapter root. "
	        "Use --skip-trained for a baseline-only smoke bundle."
	    )
	    max_rounds_help = (
	        "Bounded rounds per eval episode. Keep this small for smoke/gate "
	        "checks so trained eval cannot silently run for hours."
	    )
	    baseline_help = (
	        "Baseline reference. Use base_model for judge-facing no-LoRA "
	        "model-vs-trained-LoRA comparisons."
	    )
	    default_families = ",".join(profile.families)

	    # Paths: where the trained model lives, where results go, which config to use.
	    add("--trained-checkpoint", type=Path, default=None, help=checkpoint_help)
	    add(
	        "--output-dir",
	        type=Path,
	        default=profile.output_dir,
	        help="Where to write JSON, CSV, scorecard, and plots.",
	    )
	    add(
	        "--config",
	        type=Path,
	        default=profile.config_path,
	        help="Training config used if checkpoint metadata is unavailable.",
	    )

	    # Evaluation surface: comma-separated selections that the caller splits later.
	    add(
	        "--tiers",
	        default=DEFAULT_TIERS,
	        help="Comma-separated evaluation tiers. Current proof lane supports easy only.",
	    )
	    add(
	        "--seeds",
	        default=DEFAULT_SEEDS,
	        help="Comma-separated held-out eval seeds.",
	    )
	    add(
	        "--families",
	        default=default_families,
	        help="Comma-separated disaster families. Defaults to the profile lane.",
	    )

	    # Run controls.
	    add("--rationale-mode", default="linear_capped")
	    add("--max-rounds", type=int, default=DEFAULT_MAX_ROUNDS, help=max_rounds_help)
	    add("--training-metrics-path", type=Path, default=None)
	    add("--skip-trained", action="store_true")
	    add(
	        "--use-latest-checkpoint-normalizer",
	        action="store_true",
	        help="Seed trained eval normalization from config.checkpoint.root_dir/latest.",
	    )
	    add(
	        "--baseline-policy",
	        choices=("stub", "base_model"),
	        default="stub",
	        help=baseline_help,
	    )
	    return parser


	def run_profile(profile_name: str, argv: Sequence[str] | None = None) -> DemoBundleResult:
	    """Parse arguments for a named profile, build its eval bundle, and print a summary.

	    Args:
	        profile_name: Key into ``PROFILES`` selecting the eval lane.
	        argv: Arguments to parse. ``None`` lets argparse fall back to
	            ``sys.argv[1:]``.

	    Returns:
	        The ``DemoBundleResult`` returned by ``build_demo_bundle``.

	    Raises:
	        KeyError: ``profile_name`` is not a registered profile.
	        ValueError: ``--seeds`` contains an entry that is not an integer.
	    """
	    try:
	        profile = PROFILES[profile_name]
	    except KeyError:
	        # Same exception type as a plain lookup, but say what would have worked.
	        known = ", ".join(sorted(PROFILES))
	        raise KeyError(
	            f"Unknown eval profile {profile_name!r}; expected one of: {known}"
	        ) from None

	    parser = build_parser(profile)
	    args = parser.parse_args(argv)

	    # Only read the checkpoint's normalizer snapshot when explicitly requested.
	    trained_normalizer_snapshot = None
	    if args.use_latest_checkpoint_normalizer:
	        trained_normalizer_snapshot = _load_latest_checkpoint_snapshot(args.config)

	    result = build_demo_bundle(
	        trained_checkpoint=args.trained_checkpoint,
	        tiers=tuple(_split_csv(args.tiers)),
	        seeds=tuple(int(item) for item in _split_csv(args.seeds)),
	        disaster_families=tuple(_split_csv(args.families)),
	        max_rounds=args.max_rounds,
	        rationale_mode=args.rationale_mode,
	        output_dir=args.output_dir,
	        skip_trained=args.skip_trained,
	        training_metrics_path=args.training_metrics_path,
	        trained_normalizer_snapshot=trained_normalizer_snapshot,
	        config_path=args.config,
	        baseline_policy=args.baseline_policy,
	    )
	    _print_result(profile, result)
	    return result


	def _print_result(profile: EvalProfile, result: DemoBundleResult) -> None:
	    """Print a line-per-artifact summary of an eval run to stdout.

	    The trained-suite line reads ``skipped`` when ``result.trained_json`` is
	    ``None``; one ``Plot:`` line is emitted per entry in ``result.plot_paths``.
	    """
	    if result.trained_json is None:
	        trained_line = "Trained fixed suite: skipped"
	    else:
	        trained_line = f"Trained fixed suite: {result.trained_json}"

	    report = [
	        f"Eval profile: {profile.name}",
	        f"Output dir: {result.output_dir}",
	        f"Scorecard: {result.scorecard_md}",
	        f"Summary: {result.summary_md}",
	        f"Comparison CSV: {result.comparison_csv}",
	        trained_line,
	    ]
	    report.extend(f"Plot: {plot_path}" for plot_path in result.plot_paths)

	    for line in report:
	        print(line)