# cicd-rl-agent / eval_lora.py
# Nikitasoni22's picture
# training issue resolved
# 048dc4f
"""
Evaluate a trained LoRA adapter on ALL_TASKS (same prompt format as train.py).

Usage (Colab):
    cd /content/cicd-rl-agent
    !python eval_lora.py --adapter-path ./cicd_rl_agent_final

Optional: --base-model (must match the model you fine-tuned).

Debug a few tasks (raw vs. canonical reference):
    !python eval_lora.py --adapter-path ./cicd_rl_agent_final --inspect easy_003,medium_001
"""
import argparse
import os
import re
import sys
try:
import yaml
except Exception:
yaml = None
sys.path.insert(0, os.path.dirname(__file__))
import unsloth # noqa: F401 # before trl/transformers
from cicd_debug_env.tasks import ALL_TASKS
# Must match train.py
# System message sent as the first chat turn for every evaluated task; it tells
# the model to reply with the corrected YAML only.
SYSTEM_PROMPT = (
    "You are an expert DevOps engineer. "
    "You receive a broken CI/CD pipeline YAML and error details. "
    "Output ONLY the corrected YAML — no explanation, no markdown fences."
)
# Base model used when --base-model is not given on the command line.
DEFAULT_BASE = "unsloth/Qwen2.5-0.5B-Instruct"
def build_prompt(task: dict) -> str:
    """Render the user-turn prompt for one task.

    The prompt has three markdown-style sections: the error message (empty
    when the task has no ``error_message`` key), the broken pipeline YAML, and
    a trailing header after which the model is expected to write the fix.

    Raises:
        KeyError: if ``task`` has no ``pipeline_yaml`` entry.
    """
    error_text = task.get("error_message", "")
    broken_yaml = task["pipeline_yaml"]
    sections = (
        f"### Error\n{error_text}",
        f"### Broken Pipeline\n{broken_yaml}",
        "### Fixed Pipeline (YAML only):\n",
    )
    # Sections are separated by exactly one blank line.
    return "\n\n".join(sections)
def strip_code_fences(text: str) -> str:
    """Return ``text`` with markdown code fences removed.

    Handling, in order:

    1. No triple-backtick in the text: return it stripped.
    2. A complete fenced block exists (an opening fence line, then a closing
       fence either at the start of a later line or at the very end of the
       text): return only the block's contents. This drops any prose the model
       wrote before or after the block, and copes with a closing fence that is
       glued to the last content line.
    3. Otherwise (e.g. output truncated before the closing fence, or only a
       trailing closing fence): strip an opening fence at the very start and
       a closing fence at the very end, if present.

    Args:
        text: Raw model output or reference YAML.

    Returns:
        The unfenced text with surrounding whitespace stripped.
    """
    t = text.strip()
    if "```" not in t:
        return t

    # Opening fence must start a line; the closing fence must either start a
    # line or be the last thing in the text, so backticks that appear inline
    # inside the YAML are not mistaken for fences.
    fenced = re.search(
        r"^[ \t]*```[^\n`]*\n(.*?)(?:^[ \t]*```|```\s*\Z)",
        t,
        flags=re.MULTILINE | re.DOTALL,
    )
    if fenced is not None:
        return fenced.group(1).strip()

    # No complete block: remove whichever lone fence sits at either edge.
    t = re.sub(r"^```[a-zA-Z0-9]*\s*\n", "", t)
    t = re.sub(r"\n```\s*$", "", t)
    return t.strip()
def canonical_yaml(text: str) -> str:
    """Normalize YAML text so formatting differences do not affect comparison.

    Code fences are removed, blank lines are dropped and trailing whitespace
    is trimmed from each remaining line. When PyYAML is importable the result
    is parsed and re-dumped with sorted keys; if parsing or dumping raises for
    any reason, the cleaned text is returned instead.

    Returns:
        The canonical form, or ``""`` when nothing is left after removing
        fences.
    """
    body = strip_code_fences(text)
    if not body:
        return ""

    kept = []
    for raw_line in body.splitlines():
        if raw_line.strip():
            kept.append(raw_line.rstrip())
    normalized = "\n".join(kept).strip()

    # PyYAML is optional for this script; without it only the textual cleanup applies.
    if yaml is None:
        return normalized

    try:
        tree = yaml.safe_load(normalized)
        dumped = yaml.safe_dump(tree, sort_keys=True)
    except Exception:
        # Best effort: text that is not valid YAML is compared as cleaned text.
        return normalized
    return dumped.strip()
def partial_match_score(completion: str, correct: str, broken: str, canonical_compare: bool) -> str:
    """Classify a model completion against the reference fix.

    Args:
        completion: Model output (code fences already removed by the caller).
        correct: Reference fixed pipeline YAML.
        broken: The broken pipeline YAML that was shown to the model.
        canonical_compare: When true, equality checks use ``canonical_yaml``
            on all three texts; otherwise they use the stripped raw texts.

    Returns:
        One of:
        ``"exact"``     - completion equals the reference;
        ``"unchanged"`` - completion equals the broken input (no fix made);
        ``"partial"``   - some reference line longer than 8 characters
                          (after stripping) occurs in the raw completion;
        ``"wrong"``     - none of the above.
    """
    c = completion.strip()
    if canonical_compare:
        c_cmp = canonical_yaml(c)
        correct_cmp = canonical_yaml(correct)
        broken_cmp = canonical_yaml(broken)
    else:
        c_cmp = c
        correct_cmp = correct.strip()
        broken_cmp = broken.strip()

    if c_cmp == correct_cmp:
        return "exact"

    # Must be tested before the "partial" heuristic: the broken pipeline
    # usually shares most of its lines with the reference, so an echoed input
    # would otherwise be reported as a partial success.
    if c_cmp == broken_cmp:
        return "unchanged"

    # The line-overlap heuristic always works on the raw texts, even when
    # canonical comparison is enabled for the equality checks above.
    if any(
        line.strip() in c
        for line in correct.splitlines()
        if len(line.strip()) > 8
    ):
        return "partial"
    return "wrong"
def main():
    """Command-line entry point: evaluate a LoRA adapter on ALL_TASKS.

    Steps: parse the CLI flags, exit with status 1 if the adapter directory is
    missing, load the base model and attach the adapter, generate one
    completion per task whose difficulty is easy/medium/hard, label it with
    ``partial_match_score``, optionally dump details for the task ids given via
    ``--inspect``, and finally print a per-difficulty summary.
    """
    parser = argparse.ArgumentParser(description="Evaluate LoRA on CI/CD YAML fix tasks")
    parser.add_argument("--adapter-path", default="./cicd_rl_agent_final", type=str)
    parser.add_argument("--base-model", default=DEFAULT_BASE, type=str)
    parser.add_argument("--max-new-tokens", type=int, default=128)
    parser.add_argument(
        "--canonical-compare",
        action=argparse.BooleanOptionalAction,
        default=True,
        help="Compare predicted YAML vs correct_yaml using canonicalized YAML tree",
    )
    parser.add_argument(
        "--inspect",
        type=str,
        default="",
        help="Comma-separated task ids (e.g. easy_001,medium_002) to print raw model output vs reference",
    )
    args = parser.parse_args()

    # Turn "a, b,,c" into {"a", "b", "c"}; an empty flag yields an empty set.
    inspect_ids = set()
    for piece in args.inspect.split(","):
        piece = piece.strip()
        if piece:
            inspect_ids.add(piece)

    if not os.path.isdir(args.adapter_path):
        print(f"Adapter path not found: {args.adapter_path}")
        sys.exit(1)

    # Heavy imports are deferred until the arguments have been validated.
    import torch
    from peft import PeftModel
    from unsloth import FastLanguageModel

    print(f"Loading base: {args.base_model}")
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=args.base_model,
        max_seq_length=1024,
        dtype=None,
        load_in_4bit=True,
    )
    print(f"Loading LoRA: {args.adapter_path}")
    model = PeftModel.from_pretrained(model, args.adapter_path)
    FastLanguageModel.for_inference(model)
    # Inputs are moved to whichever device holds the model's first parameter.
    dev = next(model.parameters()).device

    def generate(task: dict) -> str:
        """Return the decoded text the model generates for ``task`` (prompt excluded)."""
        chat = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": build_prompt(task)},
        ]
        rendered = tokenizer.apply_chat_template(
            chat, tokenize=False, add_generation_prompt=True
        )
        inputs = tokenizer(rendered, return_tensors="pt").to(dev)
        with torch.inference_mode():
            out = model.generate(
                **inputs, max_new_tokens=args.max_new_tokens, do_sample=False
            )
        # Slice off the prompt tokens so only the newly generated part is decoded.
        prompt_len = inputs["input_ids"].shape[1]
        new_tokens = out[0][prompt_len:]
        return tokenizer.decode(new_tokens, skip_special_tokens=True)

    levels = ("easy", "medium", "hard")
    by_diff: dict = {level: [] for level in levels}
    for task in ALL_TASKS:
        d = task["difficulty"]
        if d not in by_diff:
            # Tasks with any other difficulty value are not evaluated.
            continue
        raw = generate(task)
        comp = strip_code_fences(raw)
        correct = task.get("correct_yaml", "")
        label = partial_match_score(comp, correct, task["pipeline_yaml"], args.canonical_compare)
        tid = task["id"]

        if inspect_ids and tid in inspect_ids:
            if args.canonical_compare:
                pred_c = canonical_yaml(comp)
                gold_c = canonical_yaml(correct)
            else:
                pred_c = comp.strip()
                gold_c = correct.strip()
            print(f"\n=== INSPECT {tid} (label={label}) ===\n")
            print("--- raw model output ---")
            print(raw)
            print("--- after strip_code_fences ---")
            print(comp)
            print("--- canonical pred ---")
            print(pred_c)
            print("--- canonical reference (correct_yaml) ---")
            print(gold_c)
            print("--- match ---")
            print("exact canonical match:", pred_c == gold_c)

        by_diff[d].append((tid, label))

    print("\n--- LoRA eval (GRPO / train.py prompt style) ---")
    for diff in levels:
        rows = by_diff[diff]
        if rows:
            n = len(rows)
            labels = [row_label for _, row_label in rows]
            exact = labels.count("exact")
            partial = labels.count("partial")
            exact_rate = exact / n
            hit_rate = (exact + partial) / n
            per_task = [row_id + ":" + row_label for row_id, row_label in rows]
            print(
                f"{diff.capitalize()}: n={n} exact={exact_rate:.0%} partial+exact={hit_rate:.0%} "
                f"(per-task: {per_task})"
            )
# Run the evaluation only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()