""" Evaluate a trained LoRA adapter on ALL_TASKS (same prompt format as train.py). Usage (Colab): cd /content/cicd-rl-agent !python eval_lora.py --adapter-path ./cicd_rl_agent_final Optional: --base-model must match what you fine-tuned. Debug a few tasks (raw vs canonical reference): !python eval_lora.py --adapter-path ./cicd_rl_agent_final --inspect easy_003,medium_001 """ import argparse import os import re import sys try: import yaml except Exception: yaml = None sys.path.insert(0, os.path.dirname(__file__)) import unsloth # noqa: F401 # before trl/transformers from cicd_debug_env.tasks import ALL_TASKS # Must match train.py SYSTEM_PROMPT = ( "You are an expert DevOps engineer. " "You receive a broken CI/CD pipeline YAML and error details. " "Output ONLY the corrected YAML — no explanation, no markdown fences." ) DEFAULT_BASE = "unsloth/Qwen2.5-0.5B-Instruct" def build_prompt(task: dict) -> str: return ( f"### Error\n{task.get('error_message', '')}\n\n" f"### Broken Pipeline\n{task['pipeline_yaml']}\n\n" f"### Fixed Pipeline (YAML only):\n" ) def strip_code_fences(text: str) -> str: t = text.strip() if "```" in t: t = re.sub(r"^```[a-zA-Z0-9]*\s*\n", "", t) t = re.sub(r"\n```\s*$", "", t) return t.strip() def canonical_yaml(text: str) -> str: stripped = strip_code_fences(text) if not stripped: return "" lines = [ln.rstrip() for ln in stripped.splitlines() if ln.strip()] stripped = "\n".join(lines).strip() if yaml is None: return stripped try: parsed = yaml.safe_load(stripped) return yaml.safe_dump(parsed, sort_keys=True).strip() except Exception: return stripped def partial_match_score(completion: str, correct: str, broken: str, canonical_compare: bool) -> str: c = completion.strip() if canonical_compare: c_cmp = canonical_yaml(c) correct_cmp = canonical_yaml(correct) broken_cmp = canonical_yaml(broken) else: c_cmp = c correct_cmp = correct.strip() broken_cmp = broken.strip() if c_cmp == correct_cmp: return "exact" if any( line.strip() in c for line in correct.splitlines() if len(line.strip()) > 8 ): return "partial" if c_cmp == broken_cmp: return "unchanged" return "wrong" def main(): p = argparse.ArgumentParser(description="Evaluate LoRA on CI/CD YAML fix tasks") p.add_argument("--adapter-path", default="./cicd_rl_agent_final", type=str) p.add_argument("--base-model", default=DEFAULT_BASE, type=str) p.add_argument("--max-new-tokens", type=int, default=128) p.add_argument( "--canonical-compare", action=argparse.BooleanOptionalAction, default=True, help="Compare predicted YAML vs correct_yaml using canonicalized YAML tree", ) p.add_argument( "--inspect", type=str, default="", help="Comma-separated task ids (e.g. easy_001,medium_002) to print raw model output vs reference", ) args = p.parse_args() inspect_ids = {s.strip() for s in args.inspect.split(",") if s.strip()} if not os.path.isdir(args.adapter_path): print(f"Adapter path not found: {args.adapter_path}") sys.exit(1) import torch from peft import PeftModel from unsloth import FastLanguageModel print(f"Loading base: {args.base_model}") model, tokenizer = FastLanguageModel.from_pretrained( model_name=args.base_model, max_seq_length=1024, dtype=None, load_in_4bit=True, ) print(f"Loading LoRA: {args.adapter_path}") model = PeftModel.from_pretrained(model, args.adapter_path) FastLanguageModel.for_inference(model) dev = next(model.parameters()).device def generate(task: dict) -> str: user = build_prompt(task) messages = [ {"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": user}, ] text = tokenizer.apply_chat_template( messages, tokenize=False, add_generation_prompt=True ) inputs = tokenizer(text, return_tensors="pt").to(dev) with torch.inference_mode(): out = model.generate( **inputs, max_new_tokens=args.max_new_tokens, do_sample=False ) return tokenizer.decode( out[0][inputs["input_ids"].shape[1] :], skip_special_tokens=True ) by_diff: dict = {"easy": [], "medium": [], "hard": []} for task in ALL_TASKS: d = task["difficulty"] if d not in by_diff: continue raw = generate(task) comp = strip_code_fences(raw) correct = task.get("correct_yaml", "") label = partial_match_score(comp, correct, task["pipeline_yaml"], args.canonical_compare) tid = task["id"] if inspect_ids and tid in inspect_ids: pred_c = canonical_yaml(comp) if args.canonical_compare else comp.strip() gold_c = canonical_yaml(correct) if args.canonical_compare else correct.strip() print(f"\n=== INSPECT {tid} (label={label}) ===\n") print("--- raw model output ---") print(raw) print("--- after strip_code_fences ---") print(comp) print("--- canonical pred ---") print(pred_c) print("--- canonical reference (correct_yaml) ---") print(gold_c) print("--- match ---") print("exact canonical match:", pred_c == gold_c) by_diff[d].append( { "id": tid, "label": label, } ) print("\n--- LoRA eval (GRPO / train.py prompt style) ---") for diff in ("easy", "medium", "hard"): rows = by_diff[diff] if not rows: continue n = len(rows) exact = sum(1 for r in rows if r["label"] == "exact") partial = sum(1 for r in rows if r["label"] == "partial") print( f"{diff.capitalize()}: n={n} exact={exact/n:.0%} partial+exact={(exact+partial)/n:.0%} " f"(per-task: {[r['id']+':'+r['label'] for r in rows]})" ) if __name__ == "__main__": main()