""" Evaluate the fine-tuned skill-classification model. Loads the trained model, runs 50 held-out prompts, and reports per-example PASS/FAIL plus overall accuracy. Uses transformers + PEFT so evaluation works on Mac/CPU without Unsloth. Run: python -m src.evaluate python -m src.evaluate --model-path ./trained_model/adapter """ from __future__ import annotations import argparse import json from pathlib import Path import torch from peft import PeftModel from transformers import AutoModelForCausalLM, AutoTokenizer from src.paths import DATA_DIR, TRAINED_MODEL_DIR from src.skill_utils import extract_skill from src.classifier_prompt import build_classifier_messages BASE_MODEL = "Qwen/Qwen2.5-3B-Instruct" MAX_SEQ_LENGTH = 2048 DEFAULT_MODEL_PATH = TRAINED_MODEL_DIR / "adapter" FALLBACK_MODEL_PATH = TRAINED_MODEL_DIR / "merged" EVAL_PROMPTS_PATH = DATA_DIR / "eval_prompts.json" def load_eval_prompts() -> list[dict[str, str]]: with EVAL_PROMPTS_PATH.open(encoding="utf-8") as handle: prompts = json.load(handle) if len(prompts) != 50: raise ValueError(f"Expected 50 eval prompts in {EVAL_PROMPTS_PATH}, got {len(prompts)}") return prompts def pick_device() -> str: if torch.cuda.is_available(): return "cuda" if torch.backends.mps.is_available(): return "mps" return "cpu" def is_adapter_path(model_path: Path) -> bool: return (model_path / "adapter_config.json").exists() def is_complete_merged_model(model_path: Path) -> bool: config_path = model_path / "config.json" if not config_path.exists() or config_path.stat().st_size == 0: return False index_path = model_path / "model.safetensors.index.json" if index_path.exists(): index = json.loads(index_path.read_text(encoding="utf-8")) shard_names = set(index.get("weight_map", {}).values()) return all( (model_path / shard).exists() and (model_path / shard).stat().st_size > 0 for shard in shard_names ) single_shard = model_path / "model.safetensors" return single_shard.exists() and single_shard.stat().st_size > 0 def resolve_model_path(path: str) -> Path: model_path = Path(path) if model_path.exists(): if is_adapter_path(model_path): return model_path if is_complete_merged_model(model_path): return model_path print(f"Warning: {model_path} looks incomplete; trying adapter fallback.") adapter_path = DEFAULT_MODEL_PATH if adapter_path.exists() and is_adapter_path(adapter_path): print(f"Using LoRA adapter at {adapter_path}") return adapter_path merged_path = FALLBACK_MODEL_PATH if merged_path.exists() and is_complete_merged_model(merged_path): print(f"Using merged model at {merged_path}") return merged_path raise FileNotFoundError( "No usable trained model found. Expected a complete merged model or " f"LoRA adapter at {DEFAULT_MODEL_PATH}." ) def load_model(model_path: Path, device: str): dtype = torch.float16 if device in {"cuda", "mps"} else torch.float32 if is_adapter_path(model_path): print(f"Loading base model: {BASE_MODEL}") tokenizer = AutoTokenizer.from_pretrained(model_path) base_model = AutoModelForCausalLM.from_pretrained( BASE_MODEL, dtype=dtype, low_cpu_mem_usage=True, ) model = PeftModel.from_pretrained(base_model, str(model_path)) else: print("Loading merged model weights") tokenizer = AutoTokenizer.from_pretrained(model_path) model = AutoModelForCausalLM.from_pretrained( model_path, dtype=dtype, low_cpu_mem_usage=True, ) model.to(device) model.eval() return model, tokenizer def generate_skill(model, tokenizer, prompt: str, device: str) -> str: messages = build_classifier_messages(prompt) inputs = tokenizer.apply_chat_template( messages, tokenize=True, add_generation_prompt=True, return_tensors="pt", ).to(device) with torch.inference_mode(): outputs = model.generate( input_ids=inputs, max_new_tokens=64, use_cache=True, do_sample=False, ) generated = outputs[0][inputs.shape[1] :] return tokenizer.decode(generated, skip_special_tokens=True).strip() def evaluate(model, tokenizer, test_prompts: list[dict[str, str]], device: str) -> float: correct = 0 for index, case in enumerate(test_prompts, start=1): prompt = case["prompt"] expected = case["expected"] raw_output = generate_skill(model, tokenizer, prompt, device) predicted = extract_skill(raw_output) passed = predicted == expected correct += int(passed) print(f"--- [{index}/{len(test_prompts)}] ---") print(f"Prompt: {prompt}") print(f"Expected: {expected}") print(f"Predicted: {predicted if predicted is not None else raw_output}") print(f"Result: {'PASS' if passed else 'FAIL'}") print() accuracy = correct / len(test_prompts) print(f"Accuracy: {correct}/{len(test_prompts)} ({accuracy:.1%})") return accuracy def main() -> None: parser = argparse.ArgumentParser(description="Evaluate the skill model.") parser.add_argument( "--model-path", default=str(DEFAULT_MODEL_PATH), help=f"Path to LoRA adapter or merged model (default: {DEFAULT_MODEL_PATH})", ) args = parser.parse_args() test_prompts = load_eval_prompts() device = pick_device() model_path = resolve_model_path(args.model_path) print(f"Device: {device}") print(f"Loading model from {model_path.resolve()}") model, tokenizer = load_model(model_path, device) print(f"Running evaluation on {len(test_prompts)} prompts...\n") evaluate(model, tokenizer, test_prompts, device) if __name__ == "__main__": main()