| """ |
| Evaluate a trained LoRA adapter on ALL_TASKS (same prompt format as train.py). |
| Usage (Colab): |
| cd /content/cicd-rl-agent |
| !python eval_lora.py --adapter-path ./cicd_rl_agent_final |
| |
| Optional: --base-model must match what you fine-tuned. |
| |
| Debug a few tasks (raw vs canonical reference): |
| !python eval_lora.py --adapter-path ./cicd_rl_agent_final --inspect easy_003,medium_001 |
| """ |
|
|
| import argparse |
| import os |
| import re |
| import sys |
| try: |
| import yaml |
| except Exception: |
| yaml = None |
|
|
| sys.path.insert(0, os.path.dirname(__file__)) |
|
|
| import unsloth |
|
|
| from cicd_debug_env.tasks import ALL_TASKS |
|
|
| |
# System-role message placed first in the chat passed to the tokenizer's chat
# template for every evaluated task. The module docstring states the prompt
# format is meant to be the same as in train.py.
SYSTEM_PROMPT = (
    "You are an expert DevOps engineer. "
    "You receive a broken CI/CD pipeline YAML and error details. "
    "Output ONLY the corrected YAML — no explanation, no markdown fences."
)


# Default value of the --base-model CLI option; per the module docstring it
# must match the model the adapter was fine-tuned from.
DEFAULT_BASE = "unsloth/Qwen2.5-0.5B-Instruct"
|
|
|
|
def build_prompt(task: dict) -> str:
    """Render the user-turn prompt for one task.

    Args:
        task: Task mapping. ``pipeline_yaml`` is required (a missing key
            raises ``KeyError``); ``error_message`` is optional and defaults
            to an empty string.

    Returns:
        Three sections separated by blank lines: the error text, the broken
        pipeline, and a trailing "Fixed Pipeline" header ending in a newline.
    """
    error_text = task.get("error_message", "")
    broken_pipeline = task["pipeline_yaml"]
    sections = [
        f"### Error\n{error_text}",
        f"### Broken Pipeline\n{broken_pipeline}",
        "### Fixed Pipeline (YAML only):\n",
    ]
    return "\n\n".join(sections)
|
|
|
|
def strip_code_fences(text: str) -> str:
    """Remove a Markdown code fence that wraps the whole text.

    Only a fence opening at the very start and/or a fence closing at the very
    end of the (whitespace-trimmed) text is removed; fences appearing after
    leading prose are left untouched.

    Args:
        text: Raw text, possibly wrapped in triple-backtick fences.

    Returns:
        The text without the wrapping fence lines, trimmed of surrounding
        whitespace.
    """
    trimmed = text.strip()
    if "```" not in trimmed:
        # Nothing fence-like present; the trimmed text is the answer.
        return trimmed
    # Opening fence: backticks, optional language tag, then a line break.
    without_opening = re.sub(r"^```[a-zA-Z0-9]*\s*\n", "", trimmed)
    # Closing fence: a final line consisting only of backticks.
    without_closing = re.sub(r"\n```\s*$", "", without_opening)
    return without_closing.strip()
|
|
def canonical_yaml(text: str) -> str:
    """Normalise YAML text so equivalent documents compare equal as strings.

    The text is stripped of wrapping code fences, blank lines are dropped and
    trailing whitespace is removed from each line. When PyYAML is available
    the result is parsed and re-dumped with sorted keys; if PyYAML is missing
    or parsing/dumping fails, the cleaned-up text is returned instead.

    Args:
        text: YAML text, possibly wrapped in Markdown code fences.

    Returns:
        The canonical form, or ``""`` when nothing remains after fence
        stripping.
    """
    body = strip_code_fences(text)
    if not body:
        return ""

    kept_lines = []
    for line in body.splitlines():
        if line.strip():
            kept_lines.append(line.rstrip())
    compact = "\n".join(kept_lines).strip()

    if yaml is not None:
        try:
            tree = yaml.safe_load(compact)
            return yaml.safe_dump(tree, sort_keys=True).strip()
        except Exception:
            # Best effort: unparseable text falls through to the plain form.
            pass
    return compact
|
|
|
|
def partial_match_score(completion: str, correct: str, broken: str, canonical_compare: bool) -> str:
    """Classify a completion against the reference fix and the broken input.

    Args:
        completion: Model output (already fence-stripped by the caller).
        correct: Reference (fixed) pipeline YAML.
        broken: The broken pipeline YAML that was shown to the model.
        canonical_compare: When true, equality checks are made on
            ``canonical_yaml`` forms; otherwise on whitespace-trimmed text.

    Returns:
        One of:
        ``"exact"``     - completion equals the reference;
        ``"unchanged"`` - completion equals the broken input;
        ``"partial"``   - completion contains at least one reference line
                          longer than 8 characters (raw substring test);
        ``"wrong"``     - none of the above.
    """
    c = completion.strip()
    if canonical_compare:
        c_cmp = canonical_yaml(c)
        correct_cmp = canonical_yaml(correct)
        broken_cmp = canonical_yaml(broken)
    else:
        c_cmp = c
        correct_cmp = correct.strip()
        broken_cmp = broken.strip()

    if c_cmp == correct_cmp:
        return "exact"

    # Must be tested before the partial check: an echoed broken pipeline
    # nearly always shares long lines with the reference, so testing
    # "partial" first would give it partial credit and make "unchanged"
    # effectively unreachable.
    if c_cmp == broken_cmp:
        return "unchanged"

    # Partial credit deliberately uses the raw texts (not canonical forms):
    # any sufficiently long reference line appearing verbatim counts.
    if any(
        line.strip() in c
        for line in correct.splitlines()
        if len(line.strip()) > 8
    ):
        return "partial"
    return "wrong"
|
|
|
|
def main():
    """Evaluate a LoRA adapter on every task in ``ALL_TASKS`` and print a summary.

    Steps: parse CLI options, check that the adapter directory exists (exit
    status 1 otherwise), load the base model and attach the adapter, greedily
    generate a completion per task, label it with ``partial_match_score`` and
    print exact / partial+exact rates for the easy, medium and hard buckets.
    Tasks whose ``difficulty`` is not one of those three are skipped. Task ids
    listed in ``--inspect`` additionally get a detailed dump of the raw and
    processed output next to the reference.
    """
    parser = argparse.ArgumentParser(description="Evaluate LoRA on CI/CD YAML fix tasks")
    parser.add_argument("--adapter-path", default="./cicd_rl_agent_final", type=str)
    parser.add_argument("--base-model", default=DEFAULT_BASE, type=str)
    parser.add_argument("--max-new-tokens", type=int, default=128)
    parser.add_argument(
        "--canonical-compare",
        action=argparse.BooleanOptionalAction,
        default=True,
        help="Compare predicted YAML vs correct_yaml using canonicalized YAML tree",
    )
    parser.add_argument(
        "--inspect",
        type=str,
        default="",
        help="Comma-separated task ids (e.g. easy_001,medium_002) to print raw model output vs reference",
    )
    opts = parser.parse_args()

    # Turn the comma-separated --inspect value into a set of non-empty ids.
    wanted_ids = set()
    for piece in opts.inspect.split(","):
        cleaned = piece.strip()
        if cleaned:
            wanted_ids.add(cleaned)

    if not os.path.isdir(opts.adapter_path):
        print(f"Adapter path not found: {opts.adapter_path}")
        sys.exit(1)

    # Imported here, after the adapter-path check, rather than at module top.
    import torch
    from peft import PeftModel
    from unsloth import FastLanguageModel

    print(f"Loading base: {opts.base_model}")
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=opts.base_model,
        max_seq_length=1024,
        dtype=None,
        load_in_4bit=True,
    )
    print(f"Loading LoRA: {opts.adapter_path}")
    model = PeftModel.from_pretrained(model, opts.adapter_path)
    FastLanguageModel.for_inference(model)

    # Inputs are moved to whichever device holds the model's first parameter.
    device = next(model.parameters()).device

    def generate(task: dict) -> str:
        """Return the decoded completion (prompt tokens excluded) for one task."""
        chat = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": build_prompt(task)},
        ]
        prompt_text = tokenizer.apply_chat_template(
            chat, tokenize=False, add_generation_prompt=True
        )
        encoded = tokenizer(prompt_text, return_tensors="pt").to(device)
        with torch.inference_mode():
            generated = model.generate(
                **encoded, max_new_tokens=opts.max_new_tokens, do_sample=False
            )
        # Slice off the prompt so only newly generated tokens are decoded.
        prompt_len = encoded["input_ids"].shape[1]
        return tokenizer.decode(generated[0][prompt_len:], skip_special_tokens=True)

    buckets: dict = {level: [] for level in ("easy", "medium", "hard")}
    for task in ALL_TASKS:
        level = task["difficulty"]
        if level not in buckets:
            continue

        raw_output = generate(task)
        prediction = strip_code_fences(raw_output)
        reference = task.get("correct_yaml", "")
        label = partial_match_score(
            prediction, reference, task["pipeline_yaml"], opts.canonical_compare
        )
        task_id = task["id"]

        if task_id in wanted_ids:
            if opts.canonical_compare:
                shown_pred = canonical_yaml(prediction)
                shown_ref = canonical_yaml(reference)
            else:
                shown_pred = prediction.strip()
                shown_ref = reference.strip()
            print(f"\n=== INSPECT {task_id} (label={label}) ===\n")
            print("--- raw model output ---")
            print(raw_output)
            print("--- after strip_code_fences ---")
            print(prediction)
            print("--- canonical pred ---")
            print(shown_pred)
            print("--- canonical reference (correct_yaml) ---")
            print(shown_ref)
            print("--- match ---")
            print("exact canonical match:", shown_pred == shown_ref)

        buckets[level].append({"id": task_id, "label": label})

    print("\n--- LoRA eval (GRPO / train.py prompt style) ---")
    for level in ("easy", "medium", "hard"):
        entries = buckets[level]
        if not entries:
            continue
        total = len(entries)
        labels = [entry["label"] for entry in entries]
        n_exact = labels.count("exact")
        n_partial = labels.count("partial")
        per_task = [entry["id"] + ":" + entry["label"] for entry in entries]
        print(
            f"{level.capitalize()}: n={total} exact={n_exact / total:.0%} "
            f"partial+exact={(n_exact + n_partial) / total:.0%} "
            f"(per-task: {per_task})"
        )
|
|
|
|
| if __name__ == "__main__": |
| main() |
|
|