"""Evaluate a fine-tuned LoRA adapter against the RouterCore eval set.

Loads the base model plus a LoRA adapter, routes every eval example through
the same validate/policy pipeline as the baseline eval, and writes a metrics
summary alongside per-example results.
"""

from __future__ import annotations

import argparse
import json
import re
from pathlib import Path
from typing import Any

from pydantic import ValidationError

from eval.metrics import compute_metrics
from eval.run_eval import EVAL_PATH, _actual_from_flow, _pass_fail_notes, load_jsonl
from routercore.model_router import extract_first_json_object
from routercore.models import RouterOutput
from routercore.policy import evaluate_policy
from routercore.validator import validate_route
from training.format_dataset import build_inference_prompt
from training.train_lora import OptionalTrainingDependencyError

PROJECT_ROOT = Path(__file__).resolve().parents[1]
RESULTS_DIR = PROJECT_ROOT / "eval" / "results"


def load_lora_dependencies():
    """Import the optional LoRA eval dependencies, raising a friendly error if they are missing."""
    try:
        import torch
        from peft import PeftModel
        from transformers import AutoModelForCausalLM, AutoTokenizer
    except ImportError as exc:
        raise OptionalTrainingDependencyError(
            "Optional LoRA eval dependencies are unavailable. Install transformers, peft, and torch."
        ) from exc
    return {
        "torch": torch,
        "PeftModel": PeftModel,
        "AutoModelForCausalLM": AutoModelForCausalLM,
        "AutoTokenizer": AutoTokenizer,
    }


def _safe_adapter_name(adapter: Path) -> str:
    """Build a filesystem-safe adapter name for use in the results filename."""
    name = adapter.name or str(adapter)
    safe = re.sub(r"[^A-Za-z0-9._-]+", "_", name).strip("_")
    return safe or "adapter"


class LoraRouter:
    """Router backed by a base causal LM with a LoRA adapter applied."""

    def __init__(
        self,
        *,
        base_model: str,
        adapter: Path,
        device: str = "auto",
        max_new_tokens: int = 512,
    ) -> None:
        deps = load_lora_dependencies()
        self.torch = deps["torch"]
        PeftModel = deps["PeftModel"]
        AutoModelForCausalLM = deps["AutoModelForCausalLM"]
        AutoTokenizer = deps["AutoTokenizer"]
        self.max_new_tokens = max_new_tokens
        self.device = self._resolve_device(device)
        # Prefer the tokenizer saved with the adapter; fall back to the base model's tokenizer.
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(adapter if adapter.exists() else base_model)
        except Exception:
            self.tokenizer = AutoTokenizer.from_pretrained(base_model)
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
        base = AutoModelForCausalLM.from_pretrained(base_model)
        self.model = PeftModel.from_pretrained(base, adapter)
        self.model.to(self.device)
        self.model.eval()

    def route(self, request_text: str) -> RouterOutput:
        """Generate a routing decision for one request, falling back on parse/validation failures."""
        prompt = build_inference_prompt(request_text)
        encoded = self.tokenizer(prompt, return_tensors="pt").to(self.device)
        with self.torch.no_grad():
            output_ids = self.model.generate(
                **encoded,
                max_new_tokens=self.max_new_tokens,
                do_sample=False,  # greedy decoding keeps eval runs deterministic
                pad_token_id=self.tokenizer.eos_token_id,
            )
        # Decode only the newly generated tokens, not the echoed prompt.
        input_length = encoded["input_ids"].shape[-1]
        raw_text = self.tokenizer.decode(output_ids[0][input_length:], skip_special_tokens=True)
        parsed = extract_first_json_object(raw_text)
        if parsed is None:
            return self._fallback("model_output_parse_failed")
        try:
            return RouterOutput.model_validate(parsed)
        except (ValidationError, ValueError, TypeError):
            return self._fallback("model_output_parse_failed")

    def _resolve_device(self, device: str) -> str:
        if device == "auto":
            return "cuda" if self.torch.cuda.is_available() else "cpu"
        if device == "cuda" and not self.torch.cuda.is_available():
            return "cpu"
        return device

    @staticmethod
    def _fallback(reason: str) -> RouterOutput:
        return RouterOutput(
            status="fallback",
            workflow=None,
            confidence=0.0,
            parameters={},
            missing_fields=[],
            candidate_workflows=[],
            failure_reasons=[reason],
            clarifying_question=None,
        )


def run_lora_eval(
    *,
    base_model: str,
    adapter: Path,
    limit: int | None = None,
    device: str = "auto",
) -> dict[str, Any]:
    """Run the eval set through the LoRA router and return summary metrics plus per-example details."""
    router = LoraRouter(base_model=base_model, adapter=adapter, device=device)
    examples = load_jsonl(EVAL_PATH)
    if limit is not None:
        examples = examples[:limit]
    per_example_results: list[dict[str, Any]] = []
    metric_rows: list[dict[str, Any]] = []
    for item in examples:
        # Run each example through the same route -> validate -> policy flow as the baseline eval.
        router_output = router.route(item["input"])
        validation_result = validate_route(router_output)
        policy_decision = evaluate_policy(
            router_output,
            validation_result,
            original_request=item["input"],
        )
        actual = _actual_from_flow(router_output, validation_result, policy_decision)
        notes = _pass_fail_notes(item, actual)
        metric_rows.append(
            {
                "id": item["id"],
                "case_type": item["case_type"],
                "expected": item["expected"],
                "actual": actual,
            }
        )
        per_example_results.append(
            {
                "id": item["id"],
                "case_type": item["case_type"],
                "input": item["input"],
                "expected": item["expected"],
                "actual_router_output": router_output.model_dump(mode="json"),
                "validation_result": validation_result.model_dump(mode="json"),
                "policy_decision": policy_decision.model_dump(mode="json"),
                "actual": actual,
                "pass_fail_notes": notes,
            }
        )
    return {
        "base_model": base_model,
        "adapter": str(adapter),
        "limit": limit,
        "summary_metrics": compute_metrics(metric_rows),
        "per_example_results": per_example_results,
    }


def _print_metrics_table(adapter: Path, metrics: dict[str, float]) -> None:
    print(f"LoRA Evaluation: {adapter}")
    print("=" * (17 + len(str(adapter))))
    for name, value in metrics.items():
        print(f"{name:40} {value:6.2%}")


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Evaluate a RouterCore LoRA adapter.")
    parser.add_argument("--base-model", required=True)
    parser.add_argument("--adapter", type=Path, required=True)
    parser.add_argument("--limit", type=int, default=None)
    parser.add_argument("--device", choices=["auto", "cpu", "cuda"], default="auto")
    return parser.parse_args()


def main() -> None:
    args = parse_args()
    try:
        output = run_lora_eval(
            base_model=args.base_model,
            adapter=args.adapter,
            limit=args.limit,
            device=args.device,
        )
    except OptionalTrainingDependencyError as exc:
        print(str(exc))
        print("Skipping LoRA evaluation. Run `pip install transformers peft torch` to enable it.")
        return
    RESULTS_DIR.mkdir(parents=True, exist_ok=True)
    output_path = RESULTS_DIR / f"lora_eval_{_safe_adapter_name(args.adapter)}.json"
    output_path.write_text(json.dumps(output, indent=2), encoding="utf-8")
    _print_metrics_table(args.adapter, output["summary_metrics"])
    print(f"\nWrote detailed results to {output_path}")


if __name__ == "__main__":
    main()
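
# Illustrative invocation only; the script path, model id, and adapter directory below are
# assumptions for the example, not values taken from this file. The CLI flags match parse_args().
#
#   python path/to/this_script.py \
#       --base-model <hf-model-id> \
#       --adapter path/to/lora-adapter \
#       --limit 50 \
#       --device auto
#
# Summary metrics print to stdout; detailed results are written to
# eval/results/lora_eval_<adapter-name>.json under the project root.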