Nikitasoni22
/

cicd-rl-agent

@@ -5,6 +5,9 @@ Usage (Colab):
   !python eval_lora.py --adapter-path ./cicd_rl_agent_final
 Optional: --base-model must match what you fine-tuned.
 """
 import argparse
@@ -96,7 +99,14 @@ def main():
         default=True,
         help="Compare predicted YAML vs correct_yaml using canonicalized YAML tree",
     )
     args = p.parse_args()
     if not os.path.isdir(args.adapter_path):
         print(f"Adapter path not found: {args.adapter_path}")
@@ -146,9 +156,24 @@ def main():
         comp = strip_code_fences(raw)
         correct = task.get("correct_yaml", "")
         label = partial_match_score(comp, correct, task["pipeline_yaml"], args.canonical_compare)
         by_diff[d].append(
             {
-                "id": task["id"],
                 "label": label,
             }
         )

   !python eval_lora.py --adapter-path ./cicd_rl_agent_final
 Optional: --base-model must match what you fine-tuned.
+Debug a few tasks (raw vs canonical reference):
+  !python eval_lora.py --adapter-path ./cicd_rl_agent_final --inspect easy_003,medium_001
 """
 import argparse
         default=True,
         help="Compare predicted YAML vs correct_yaml using canonicalized YAML tree",
     )
+    p.add_argument(
+        "--inspect",
+        type=str,
+        default="",
+        help="Comma-separated task ids (e.g. easy_001,medium_002) to print raw model output vs reference",
+    )
     args = p.parse_args()
+    inspect_ids = {s.strip() for s in args.inspect.split(",") if s.strip()}
     if not os.path.isdir(args.adapter_path):
         print(f"Adapter path not found: {args.adapter_path}")
         comp = strip_code_fences(raw)
         correct = task.get("correct_yaml", "")
         label = partial_match_score(comp, correct, task["pipeline_yaml"], args.canonical_compare)
+        tid = task["id"]
+        if inspect_ids and tid in inspect_ids:
+            pred_c = canonical_yaml(comp) if args.canonical_compare else comp.strip()
+            gold_c = canonical_yaml(correct) if args.canonical_compare else correct.strip()
+            print(f"\n=== INSPECT {tid} (label={label}) ===\n")
+            print("--- raw model output ---")
+            print(raw)
+            print("--- after strip_code_fences ---")
+            print(comp)
+            print("--- canonical pred ---")
+            print(pred_c)
+            print("--- canonical reference (correct_yaml) ---")
+            print(gold_c)
+            print("--- match ---")
+            print("exact canonical match:", pred_c == gold_c)
         by_diff[d].append(
             {
+                "id": tid,
                 "label": label,
             }
         )

train.py CHANGED Viewed

@@ -1,10 +1,24 @@
 """
-train.py — CICD RL Agent full training script
-Run: python train.py
 Requires: pip install unsloth trl datasets transformers
 """
 import os, re, sys
 sys.path.insert(0, os.path.dirname(__file__))
 try:
     import yaml
@@ -23,6 +37,23 @@ NUM_SAMPLES   = 512
 # GRPO: use `max_completion_length` (TRL); older examples used `max_new_tokens`.
 MAX_COMPLETION_TOKENS = 128
 from cicd_debug_env.tasks import ALL_TASKS
 from datasets import Dataset
 import random
@@ -58,6 +89,29 @@ def build_dataset():
         })
     return Dataset.from_list(records)
 def _completion_to_text(completion) -> str:
     """
     Normalize TRL/Unsloth completion payloads to plain text.
@@ -118,7 +172,8 @@ def reward_fix_correctness(completions, prompts, correct_yaml, pipeline_yaml, **
         pred_canon = _canonical_yaml(pred)
         correct_canon = _canonical_yaml(correct)
         # Strict reward: exact/canonical exact gets high reward; everything else is negative.
-        rewards.append(3.0 if pred_canon and pred_canon == correct_canon else -1.0)
     return rewards
 def reward_yaml_structure(completions, prompts, **kwargs):
@@ -137,7 +192,7 @@ def reward_yaml_structure(completions, prompts, **kwargs):
         score = 0.4 * int(starts_yaml) + 0.4 * int(has_yaml_keys) + 0.2 * int(line_count_ok)
         if has_prose_or_md:
             score -= 1.0
-        rewards.append(score)
     return rewards
 def reward_no_hallucination(completions, prompts, **kwargs):
@@ -149,27 +204,266 @@ def reward_no_hallucination(completions, prompts, **kwargs):
     for c in completions:
         lower = _completion_to_text(c).lower()
         bad_hits = sum(1 for p in bad if p in lower)
-        values.append(-2.0 if bad_hits > 0 else 0.5)
     return values
 REWARD_FUNCTIONS = [reward_fix_correctness, reward_yaml_structure, reward_no_hallucination]
 def main():
     # Colab often sets WANDB_DISABLED in the runtime env.
-    # If report_to is wandb, this env var causes a hard runtime error in Trainer callbacks.
     if os.environ.get("WANDB_DISABLED", "").strip().lower() in {"1", "true", "yes", "on"}:
-        print("Detected WANDB_DISABLED; unsetting it because report_to='wandb'.")
         os.environ.pop("WANDB_DISABLED", None)
     if USE_UNSLOTH:
         from unsloth import FastLanguageModel
         model, tokenizer = FastLanguageModel.from_pretrained(
-            model_name=MODEL_NAME, max_seq_length=1024, dtype=None, load_in_4bit=True)
         model = FastLanguageModel.get_peft_model(
-            model, r=16,
-            target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
-            lora_alpha=16, lora_dropout=0.0, bias="none",
-            use_gradient_checkpointing="unsloth", random_state=42)
     else:
         from transformers import AutoModelForCausalLM, AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
@@ -177,53 +471,118 @@ def main():
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
-    dataset = build_dataset()
-    print(f"Dataset: {len(dataset)} samples")
-    # Prefer wandb logging when available; gracefully fall back if not installed.
-    use_wandb = True
-    try:
-        import wandb  # noqa: F401
-    except Exception:
-        use_wandb = False
-        print("wandb is not installed; falling back to report_to='none'.")
-    from trl import GRPOTrainer, GRPOConfig
-    args = GRPOConfig(
-        output_dir="./cicd_rl_output",
-        per_device_train_batch_size=BATCH_SIZE,
-        gradient_accumulation_steps=GRAD_ACCUM,
-        learning_rate=5e-6, max_steps=MAX_STEPS,
-        num_generations=4, max_completion_length=MAX_COMPLETION_TOKENS,
-        logging_steps=5, save_steps=50,
-        report_to="wandb" if use_wandb else "none", remove_unused_columns=False,
-        warmup_steps=10, lr_scheduler_type="cosine", optim="adamw_8bit",
-    )
-    trainer = GRPOTrainer(
-        model=model, args=args, reward_funcs=REWARD_FUNCTIONS,
-        train_dataset=dataset, processing_class=tokenizer)
-    print("Starting GRPO training...")
-    if use_wandb:
-        import wandb
-        wandb.init(project="cicd-rl-agent", name="grpo-run-1")
-    trainer.train()
-    print("Training complete!")
     save_path = "./cicd_rl_agent_final"
-    if USE_UNSLOTH:
         model.save_pretrained(save_path)
         tokenizer.save_pretrained(save_path)
-        print(f"LoRA adapters saved to {save_path}")
-        print("Testing post-training inference...")
-        FastLanguageModel.for_inference(model)
-        test_input = tokenizer("Fix this YAML: steps:\n  - run: npm tset", return_tensors="pt").to("cuda")
-        out = model.generate(**test_input, max_new_tokens=64)
-        print(tokenizer.decode(out[0], skip_special_tokens=True))
-    else:
         model.save_pretrained(save_path)
         tokenizer.save_pretrained(save_path)
-        print(f"Model saved to {save_path}")
 if __name__ == "__main__":
     main()

 """
+train.py — CICD RL Agent: optional SFT (supervised) then GRPO (RL) on CI/CD YAML fixes.
+Default: short SFT on (prompt → correct_yaml), then GRPO with correctness-heavy rewards.
+  python train.py                          # SFT (short) + GRPO (default)
+  python train.py --stages grpo            # GRPO only (old behavior, no SFT)
+  python train.py --stages sft             # SFT only; saves ./cicd_rl_sft_lora
+  python train.py --stages sft,grpo --sft-epochs 2
+  python train.py --no-final-eval
+  python train.py --eval-timeout 90
+Console: SFT/GRPO log lines (loss/rewards + step X/Y), per-stage times and step counts, then
+a final eval of every task with correct/wrong/timeout, wall time, and reward breakdown.
 Requires: pip install unsloth trl datasets transformers
 """
+import argparse
 import os, re, sys
+import time
 sys.path.insert(0, os.path.dirname(__file__))
 try:
     import yaml
 # GRPO: use `max_completion_length` (TRL); older examples used `max_new_tokens`.
 MAX_COMPLETION_TOKENS = 128
+# SFT: teach exact gold YAML before RL polish (short run by design).
+SFT_EPOCHS = 1
+SFT_LEARNING_RATE = 2e-4
+SFT_MAX_SEQ = 1024
+SFT_DATASET_SIZE = 512
+SFT_OUTPUT = "./cicd_rl_sft_lora"
+# Post-training quick eval: mark each task correct / wrong / timeout if generation exceeds this (seconds).
+EVAL_GEN_TIMEOUT_SEC = 60.0
+# Reward mix: GRPO sums per-function rewards; keep correctness as the dominant term.
+REWARD_FIX_MATCH = 5.0
+REWARD_FIX_MISS = -1.5
+REWARD_STRUCT_SCALE = 0.2
+REWARD_HALLU_GOOD = 0.1
+REWARD_HALLU_BAD = -0.35
 from cicd_debug_env.tasks import ALL_TASKS
 from datasets import Dataset
 import random
         })
     return Dataset.from_list(records)
+def build_sft_dataset(tokenizer) -> Dataset:
+    """Supervised (prompt, assistant) = same chat format as inference; target is exact correct_yaml."""
+    easy = [t for t in ALL_TASKS if t["difficulty"] == "easy"]
+    medium = [t for t in ALL_TASKS if t["difficulty"] == "medium"]
+    hard = [t for t in ALL_TASKS if t["difficulty"] == "hard"]
+    records = []
+    for _ in range(SFT_DATASET_SIZE):
+        r = random.random()
+        task = random.choice(easy if r < 0.5 else medium if r < 0.8 else hard)
+        gold = (task.get("correct_yaml") or "").strip()
+        messages = [
+            {"role": "system", "content": SYSTEM_PROMPT},
+            {"role": "user", "content": build_prompt(task)},
+            {"role": "assistant", "content": gold},
+        ]
+        text = tokenizer.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=False
+        )
+        records.append({"text": text})
+    return Dataset.from_list(records)
 def _completion_to_text(completion) -> str:
     """
     Normalize TRL/Unsloth completion payloads to plain text.
         pred_canon = _canonical_yaml(pred)
         correct_canon = _canonical_yaml(correct)
         # Strict reward: exact/canonical exact gets high reward; everything else is negative.
+        ok = bool(pred_canon and pred_canon == correct_canon)
+        rewards.append(REWARD_FIX_MATCH if ok else REWARD_FIX_MISS)
     return rewards
 def reward_yaml_structure(completions, prompts, **kwargs):
         score = 0.4 * int(starts_yaml) + 0.4 * int(has_yaml_keys) + 0.2 * int(line_count_ok)
         if has_prose_or_md:
             score -= 1.0
+        rewards.append(score * REWARD_STRUCT_SCALE)
     return rewards
 def reward_no_hallucination(completions, prompts, **kwargs):
     for c in completions:
         lower = _completion_to_text(c).lower()
         bad_hits = sum(1 for p in bad if p in lower)
+        values.append(REWARD_HALLU_BAD if bad_hits > 0 else REWARD_HALLU_GOOD)
     return values
 REWARD_FUNCTIONS = [reward_fix_correctness, reward_yaml_structure, reward_no_hallucination]
+def _grpo_console_callback(max_steps: int, label: str = "GRPO"):
+    from transformers import TrainerCallback
+    class _GRPOConsoleLogCallback(TrainerCallback):
+        def __init__(self) -> None:
+            self._max = max_steps
+            self._label = label
+        def on_log(self, args, state, control, logs=None, **kwargs):
+            if not logs:
+                return
+            parts = [f"[{self._label} turn/step {state.global_step}/{self._max}]"]
+            for k in sorted(logs.keys()):
+                kl = k.lower()
+                if "reward" in kl or k in ("loss", "kl", "learning_rate", "train_loss") or "loss" in kl:
+                    v = logs[k]
+                    if isinstance(v, (int, float)):
+                        parts.append(f"{k}={v:.6g}")
+                    else:
+                        parts.append(f"{k}={v}")
+            print(" | ".join(parts), flush=True)
+    return _GRPOConsoleLogCallback()
+def _sft_console_callback():
+    from transformers import TrainerCallback
+    class _SFTConsoleLogCallback(TrainerCallback):
+        def on_log(self, args, state, control, logs=None, **kwargs):
+            if not logs:
+                return
+            line = f"[SFT turn/step {state.global_step}]"
+            for k, v in sorted(logs.items()):
+                if "loss" in k.lower() or "learning_rate" in k:
+                    if isinstance(v, (int, float)):
+                        line += f" {k}={v:.6g}"
+            print(line, flush=True)
+    return _SFTConsoleLogCallback()
+def _format_seconds(sec: float) -> str:
+    if sec < 60:
+        return f"{sec:.1f}s"
+    m, s = int(sec // 60), sec % 60
+    if m < 60:
+        return f"{m}m {s:.1f}s"
+    h, m = m // 60, m % 60
+    return f"{h}h {m}m {s:.0f}s"
+def _print_grpo_reward_tail(trainer) -> None:
+    hist = getattr(trainer.state, "log_history", None) or []
+    if not hist:
+        print("(No log_history available for reward summary.)", flush=True)
+        return
+    print("\n--- Last GRPO log entries (rewards) ---", flush=True)
+    for row in hist[-5:]:
+        rbits = {k: v for k, v in row.items() if "reward" in k.lower() or k == "loss"}
+        if rbits:
+            print(f"  step {row.get('step', '?')}: {rbits}", flush=True)
+def _set_inference_mode(model) -> None:
+    if USE_UNSLOTH:
+        from unsloth import FastLanguageModel
+        FastLanguageModel.for_inference(model)
+    else:
+        model.eval()
+def _generate_for_task(model, tokenizer, task: dict, max_new_tokens: int) -> str:
+    import torch
+    messages = [
+        {"role": "system", "content": SYSTEM_PROMPT},
+        {"role": "user", "content": build_prompt(task)},
+    ]
+    text = tokenizer.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+    dev = next(model.parameters()).device
+    inputs = tokenizer(text, return_tensors="pt").to(dev)
+    with torch.inference_mode():
+        out = model.generate(
+            **inputs, max_new_tokens=max_new_tokens, do_sample=False
+        )
+    return tokenizer.decode(
+        out[0][inputs["input_ids"].shape[1] :], skip_special_tokens=True
+    )
+def _eval_task_status(raw: str, task: dict, took_sec: float, timeout_sec: float) -> str:
+    if took_sec > timeout_sec:
+        return "timeout"
+    pred = _strip_markdown_fences(_completion_to_text(raw))
+    gold = (task.get("correct_yaml") or "").strip()
+    p_can = _canonical_yaml(pred)
+    g_can = _canonical_yaml(gold)
+    if p_can and g_can and p_can == g_can:
+        return "correct"
+    return "wrong"
+def run_final_task_eval(
+    model,
+    tokenizer,
+    max_new_tokens: int = MAX_COMPLETION_TOKENS,
+    timeout_sec: float = EVAL_GEN_TIMEOUT_SEC,
+) -> None:
+    """One generation per task; labels: correct, wrong, or timeout (if wall time > timeout_sec)."""
+    _set_inference_mode(model)
+    print(
+        f"\n========== EVAL: all {len(ALL_TASKS)} tasks (1 turn each; max_new_tokens={max_new_tokens}, "
+        f"timeout if wall time > {timeout_sec}s) ==========",
+        flush=True,
+    )
+    for task in ALL_TASKS:
+        tid = task.get("id", "?")
+        t0 = time.perf_counter()
+        try:
+            raw = _generate_for_task(model, tokenizer, task, max_new_tokens)
+        except Exception as e:  # noqa: BLE001
+            took = time.perf_counter() - t0
+            print(
+                f"  {tid}: error — {e!r} (after {took:.1f}s)",
+                flush=True,
+            )
+            continue
+        took = time.perf_counter() - t0
+        status = _eval_task_status(raw, task, took, timeout_sec)
+        r_fix = reward_fix_correctness(
+            [raw], [None], [task.get("correct_yaml", "")], [task["pipeline_yaml"]]
+        )[0]
+        r_stru = reward_yaml_structure([raw], [None])[0]
+        r_hallu = reward_no_hallucination([raw], [None])[0]
+        r_sum = r_fix + r_stru + r_hallu
+        print(
+            f"  {tid}: {status:7s}  |  t={took:5.2f}s  |  rewards: total={r_sum:+.2f} "
+            f"(fix={r_fix:+.2f} struct={r_stru:+.2f} no_hallu={r_hallu:+.2f})",
+            flush=True,
+        )
+    print("========== EVAL end ==========\n", flush=True)
+def _wandb_ok() -> bool:
+    try:
+        import wandb  # noqa: F401
+        return True
+    except Exception:
+        return False
+def run_sft(model, tokenizer, use_wandb: bool, sft_epochs: float):
+    """Run the supervised fine-tuning stage and save the result to ``SFT_OUTPUT``.
+
+    Builds the dataset with ``build_sft_dataset``, trains with TRL's
+    ``SFTTrainer`` for ``sft_epochs`` epochs, then writes the model and
+    tokenizer to ``SFT_OUTPUT``.
+
+    Args:
+        model: Model to train; also the object ``save_pretrained`` is called on.
+        tokenizer: Used for chat templating, as the trainer's processing class,
+            and saved alongside the model.
+        use_wandb: When true, report to wandb and start a run named
+            "sft-cicd-yaml"; otherwise ``report_to`` is "none".
+        sft_epochs: Passed to ``SFTConfig`` as ``num_train_epochs``.
+
+    Returns:
+        The ``SFTTrainer`` instance after training.
+    """
+    from trl import SFTTrainer, SFTConfig
+    sft_data = build_sft_dataset(tokenizer)
+    print(f"SFT dataset: {len(sft_data)} samples, {sft_epochs} epoch(s)")
+    sft_config = SFTConfig(
+        output_dir="./cicd_rl_sft_output",
+        per_device_train_batch_size=BATCH_SIZE,
+        gradient_accumulation_steps=GRAD_ACCUM,
+        num_train_epochs=sft_epochs,
+        learning_rate=SFT_LEARNING_RATE,
+        logging_steps=10,
+        # No intermediate checkpoints; the weights are saved explicitly after training.
+        save_strategy="no",
+        max_length=SFT_MAX_SEQ,
+        # build_sft_dataset() emits pre-rendered chat strings under the "text" key.
+        dataset_text_field="text",
+        report_to="wandb" if use_wandb else "none",
+        remove_unused_columns=False,
+        optim="adamw_8bit",
+        # Train loss on assistant tokens only (full gold YAML in the assistant turn).
+        # NOTE(review): the dataset here is plain rendered text, not a conversational
+        # "messages" dataset. Some TRL versions appear to support assistant_only_loss
+        # only for conversational data with a generation-aware chat template, and may
+        # reject or ignore this flag. TODO confirm against the installed TRL/Unsloth.
+        assistant_only_loss=True,
+    )
+    trainer = SFTTrainer(
+        model=model,
+        args=sft_config,
+        train_dataset=sft_data,
+        processing_class=tokenizer,
+        callbacks=[_sft_console_callback()],
+    )
+    if use_wandb:
+        import wandb
+        # reinit=True because this script may start a second wandb run for the GRPO stage.
+        wandb.init(project="cicd-rl-agent", name="sft-cicd-yaml", reinit=True)
+    print("Starting SFT (supervised: prompt -> correct YAML)...")
+    trainer.train()
+    model.save_pretrained(SFT_OUTPUT)
+    tokenizer.save_pretrained(SFT_OUTPUT)
+    print(f"SFT LoRA saved to {SFT_OUTPUT}")
+    return trainer
+def _post_train_smoke_unsloth(tokenizer, model) -> None:
+    import torch
+    from unsloth import FastLanguageModel
+    print("Testing post-training inference...")
+    FastLanguageModel.for_inference(model)
+    if not torch.cuda.is_available():
+        print("(CUDA not available; skip generate smoke test.)")
+        return
+    test_input = tokenizer("Fix this YAML: steps:\n  - run: npm tset", return_tensors="pt").to("cuda")
+    with torch.inference_mode():
+        out = model.generate(**test_input, max_new_tokens=64)
+    print(tokenizer.decode(out[0], skip_special_tokens=True))
 def main():
+    p = argparse.ArgumentParser(description="SFT (optional) + GRPO training for CICD YAML fix agent")
+    p.add_argument(
+        "--stages",
+        type=str,
+        default="sft,grpo",
+        help="Comma list: sft, grpo (default: sft,grpo = supervised then RL)",
+    )
+    p.add_argument("--sft-epochs", type=float, default=SFT_EPOCHS, help="SFT pass size (set 0 to skip SFT in code paths that still use --stages; prefer --stages grpo)")
+    p.add_argument(
+        "--no-final-eval",
+        action="store_true",
+        help="Skip end-of-run eval (correct / wrong / timeout per task).",
+    )
+    p.add_argument(
+        "--eval-timeout",
+        type=float,
+        default=EVAL_GEN_TIMEOUT_SEC,
+        help="Mark task eval as 'timeout' if a single generate() takes longer than this (seconds).",
+    )
+    args = p.parse_args()
+    wants = {s.strip().lower() for s in args.stages.split(",") if s.strip()}
+    if not wants.issubset({"sft", "grpo"}) or not wants:
+        print("Error: --stages must list one or more of: sft, grpo (e.g. sft,grpo or grpo)")
+        sys.exit(1)
     # Colab often sets WANDB_DISABLED in the runtime env.
     if os.environ.get("WANDB_DISABLED", "").strip().lower() in {"1", "true", "yes", "on"}:
+        print("Detected WANDB_DISABLED; unsetting it because report_to may be 'wandb'.")
         os.environ.pop("WANDB_DISABLED", None)
     if USE_UNSLOTH:
         from unsloth import FastLanguageModel
         model, tokenizer = FastLanguageModel.from_pretrained(
+            model_name=MODEL_NAME, max_seq_length=1024, dtype=None, load_in_4bit=True
+        )
         model = FastLanguageModel.get_peft_model(
+            model,
+            r=16,
+            target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
+            lora_alpha=16,
+            lora_dropout=0.0,
+            bias="none",
+            use_gradient_checkpointing="unsloth",
+            random_state=42,
+        )
     else:
         from transformers import AutoModelForCausalLM, AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
+    use_wandb = _wandb_ok()
+    if not use_wandb:
+        print("wandb is not installed; falling back to report_to='none' where applicable.")
+    if "sft" in wants and args.sft_epochs <= 0:
+        print("Error: --sft-epochs must be > 0 when SFT is in --stages")
+        sys.exit(1)
+    t_start = time.perf_counter()
+    sft_time_s = 0.0
+    grpo_time_s = 0.0
+    sft_steps = 0
+    grpo_steps = 0
+    sft_trainer = None
+    grpo_trainer = None
+    if "sft" in wants:
+        t0 = time.perf_counter()
+        sft_trainer = run_sft(model, tokenizer, use_wandb, float(args.sft_epochs))
+        sft_time_s = time.perf_counter() - t0
+        sft_steps = getattr(sft_trainer.state, "global_step", 0) if sft_trainer else 0
+        print(
+            f"--- SFT done: {sft_steps} optimizer turn(s) / step(s), time {_format_seconds(sft_time_s)} ---\n",
+            flush=True,
+        )
+    if "grpo" in wants:
+        dataset = build_dataset()
+        print(f"GRPO dataset: {len(dataset)} samples")
+        from trl import GRPOTrainer, GRPOConfig
+        grpo_args = GRPOConfig(
+            output_dir="./cicd_rl_output",
+            per_device_train_batch_size=BATCH_SIZE,
+            gradient_accumulation_steps=GRAD_ACCUM,
+            learning_rate=5e-6,
+            max_steps=MAX_STEPS,
+            num_generations=4,
+            max_completion_length=MAX_COMPLETION_TOKENS,
+            logging_steps=5,
+            save_steps=50,
+            report_to="wandb" if use_wandb else "none",
+            remove_unused_columns=False,
+            warmup_steps=10,
+            lr_scheduler_type="cosine",
+            optim="adamw_8bit",
+        )
+        grpo_trainer = GRPOTrainer(
+            model=model,
+            args=grpo_args,
+            reward_funcs=REWARD_FUNCTIONS,
+            train_dataset=dataset,
+            processing_class=tokenizer,
+            callbacks=[_grpo_console_callback(MAX_STEPS, "GRPO")],
+        )
+        print("Starting GRPO training... (rewards + loss in log lines; online reward below)\n", flush=True)
+        if use_wandb:
+            import wandb
+            wandb.init(project="cicd-rl-agent", name="grpo-cicd-yaml", reinit=True)
+        t0 = time.perf_counter()
+        grpo_trainer.train()
+        grpo_time_s = time.perf_counter() - t0
+        grpo_steps = getattr(grpo_trainer.state, "global_step", 0)
+        print("GRPO training complete!", flush=True)
+        _print_grpo_reward_tail(grpo_trainer)
+        print(
+            f"\n--- GRPO done: {grpo_steps} optimizer turn(s) / step(s) (of {MAX_STEPS} max), "
+            f'time { _format_seconds(grpo_time_s) } ---\n',
+            flush=True,
+        )
     save_path = "./cicd_rl_agent_final"
+    if "grpo" in wants:
         model.save_pretrained(save_path)
         tokenizer.save_pretrained(save_path)
+        print(f"Final LoRA saved to {save_path} (SFT+GRPO pipeline end state).")
+        if USE_UNSLOTH:
+            _post_train_smoke_unsloth(tokenizer, model)
+        else:
+            print("Non-Unsloth path: inference test skipped.")
+    elif "sft" in wants:
+        # SFT weights already written in run_sft(); also mirror to default eval path for convenience.
         model.save_pretrained(save_path)
         tokenizer.save_pretrained(save_path)
+        print(f"SFT-only run: LoRA is in {SFT_OUTPUT} and copied to {save_path} for eval_lora defaults.")
+    total_s = time.perf_counter() - t_start
+    print("\n========== TRAINING SUMMARY ==========", flush=True)
+    print(f"Total wall time: {_format_seconds(total_s)}", flush=True)
+    if sft_time_s:
+        print(
+            f"  SFT:  time={_format_seconds(sft_time_s)}  |  turn(s)/step(s) = {sft_steps}  |  (supervised, loss in [SFT turn/step ...] lines)",
+            flush=True,
+        )
+    if grpo_time_s:
+        print(
+            f"  GRPO: time={_format_seconds(grpo_time_s)}  |  turn(s)/step(s) = {grpo_steps}  |  (online rewards in [GRPO turn/step ...] lines)",
+            flush=True,
+        )
+    print(
+        "  Note: each eval task is a single user→assistant 'turn'; GRPO/SFT 'turns' = optimizer update steps.\n"
+        "========================================\n",
+        flush=True,
+    )
+    if not args.no_final_eval and (sft_time_s or grpo_time_s):
+        run_final_task_eval(
+            model, tokenizer, MAX_COMPLETION_TOKENS, timeout_sec=float(args.eval_timeout)
+        )
+    elif args.no_final_eval:
+        print("Skipped final per-task eval (--no-final-eval).", flush=True)
 if __name__ == "__main__":
     main()

train_colab.ipynb CHANGED Viewed

@@ -9,12 +9,12 @@
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {},
-      "outputs": [],
       "source": [
         "!pip install unsloth trl transformers datasets torch wandb pydantic"
-      ]
     },
     {
       "cell_type": "markdown",
@@ -25,9 +25,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {},
-      "outputs": [],
       "source": [
         "import os\n",
         "import random\n",
@@ -82,7 +80,9 @@
         "    return Dataset.from_list(records)\n",
         "\n",
         "print(f\"Loaded {len(ALL_TASKS)} tasks (easy/medium/hard). Sample task ids:\", [t['id'] for t in ALL_TASKS[:3]], \"...\")"
-      ]
     },
     {
       "cell_type": "markdown",
@@ -93,9 +93,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {},
-      "outputs": [],
       "source": [
         "import torch\n",
         "from unsloth import FastLanguageModel\n",
@@ -119,7 +117,9 @@
         ")\n",
         "if tokenizer.pad_token is None:\n",
         "    tokenizer.pad_token = tokenizer.eos_token"
-      ]
     },
     {
       "cell_type": "markdown",
@@ -130,13 +130,13 @@
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {},
-      "outputs": [],
       "source": [
         "train_dataset = build_dataset()\n",
         "print(f\"Dataset size: {len(train_dataset)} (target split ~50% easy / 30% medium / 20% hard)\")"
-      ]
     },
     {
       "cell_type": "markdown",
@@ -147,9 +147,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {},
-      "outputs": [],
       "source": [
         "def reward_fix_correctness(completions, prompts, correct_yaml, pipeline_yaml, **kwargs):\n",
         "    \"\"\"How closely the completion matches the reference `correct_yaml` (full match, partial, unchanged, or wrong).\"\"\"\n",
@@ -188,20 +186,29 @@
         "    return [-0.3 if any(p.lower() in c.lower() for p in bad) else 0.3 for c in completions]\n",
         "\n",
         "REWARD_FUNCTIONS = [reward_fix_correctness, reward_yaml_structure, reward_no_hallucination]"
-      ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "## 🚀 Configure and Run GRPO Training"
       ]
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {},
-      "outputs": [],
       "source": [
         "import wandb\n",
         "from trl import GRPOConfig, GRPOTrainer\n",
@@ -232,7 +239,9 @@
         ")\n",
         "wandb.init(project=\"cicd-rl-agent\")\n",
         "trainer.train()"
-      ]
     },
     {
       "cell_type": "markdown",
@@ -243,9 +252,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {},
-      "outputs": [],
       "source": [
         "import matplotlib.pyplot as plt\n",
         "\n",
@@ -269,7 +276,9 @@
         "plt.tight_layout()\n",
         "plt.savefig(\"reward_curve.png\", dpi=150, bbox_inches=\"tight\")\n",
         "plt.show()"
-      ]
     },
     {
       "cell_type": "markdown",
@@ -280,9 +289,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": null,
       "metadata": {},
-      "outputs": [],
       "source": [
         "def generate_yaml(model, tok, task: dict) -> str:\n",
         "    FastLanguageModel.for_inference(model)\n",
@@ -322,7 +329,9 @@
         "    print(out_train[:800])\n",
         "    print(f\"\\nBase matches correct_yaml:   {ok_base}\")\n",
         "    print(f\"Trained matches correct_yaml: {ok_train}\")"
-      ]
     }
   ],
   "metadata": {
@@ -338,4 +347,4 @@
   },
   "nbformat": 4,
   "nbformat_minor": 4
-}

     },
     {
       "cell_type": "code",
       "metadata": {},
       "source": [
         "!pip install unsloth trl transformers datasets torch wandb pydantic"
+      ],
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "markdown",
     },
     {
       "cell_type": "code",
       "metadata": {},
       "source": [
         "import os\n",
         "import random\n",
         "    return Dataset.from_list(records)\n",
         "\n",
         "print(f\"Loaded {len(ALL_TASKS)} tasks (easy/medium/hard). Sample task ids:\", [t['id'] for t in ALL_TASKS[:3]], \"...\")"
+      ],
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "markdown",
     },
     {
       "cell_type": "code",
       "metadata": {},
       "source": [
         "import torch\n",
         "from unsloth import FastLanguageModel\n",
         ")\n",
         "if tokenizer.pad_token is None:\n",
         "    tokenizer.pad_token = tokenizer.eos_token"
+      ],
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "markdown",
     },
     {
       "cell_type": "code",
       "metadata": {},
       "source": [
         "train_dataset = build_dataset()\n",
         "print(f\"Dataset size: {len(train_dataset)} (target split ~50% easy / 30% medium / 20% hard)\")"
+      ],
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "markdown",
     },
     {
       "cell_type": "code",
       "metadata": {},
       "source": [
         "def reward_fix_correctness(completions, prompts, correct_yaml, pipeline_yaml, **kwargs):\n",
         "    \"\"\"How closely the completion matches the reference `correct_yaml` (full match, partial, unchanged, or wrong).\"\"\"\n",
         "    return [-0.3 if any(p.lower() in c.lower() for p in bad) else 0.3 for c in completions]\n",
         "\n",
         "REWARD_FUNCTIONS = [reward_fix_correctness, reward_yaml_structure, reward_no_hallucination]"
+      ],
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
+        "## 🚀 Training: SFT + GRPO (recommended) or GRPO in-notebook\n",
+        "\n",
+        "**Best path (matches `train.py` in the repo):** in the repo root run:\n",
+        "`!cd $REPO_DIR && python train.py`  \n",
+        "Default is a short **supervised (SFT)** pass on exact `correct_yaml`, then **GRPO** with correctness-weighted rewards.  \n",
+        "- GRPO only (old one-stage): `python train.py --stages grpo`  \n",
+        "- SFT only: `python train.py --stages sft`  \n",
+        "- Two SFT epochs: `python train.py --sft-epochs 2`\n",
+        "\n",
+        "**Alternative below:** the next cell runs **GRPO only** in the notebook (no SFT), like older Colab flows."
       ]
     },
     {
       "cell_type": "code",
       "metadata": {},
       "source": [
         "import wandb\n",
         "from trl import GRPOConfig, GRPOTrainer\n",
         ")\n",
         "wandb.init(project=\"cicd-rl-agent\")\n",
         "trainer.train()"
+      ],
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "markdown",
     },
     {
       "cell_type": "code",
       "metadata": {},
       "source": [
         "import matplotlib.pyplot as plt\n",
         "\n",
         "plt.tight_layout()\n",
         "plt.savefig(\"reward_curve.png\", dpi=150, bbox_inches=\"tight\")\n",
         "plt.show()"
+      ],
+      "execution_count": null,
+      "outputs": []
     },
     {
       "cell_type": "markdown",
     },
     {
       "cell_type": "code",
       "metadata": {},
       "source": [
         "def generate_yaml(model, tok, task: dict) -> str:\n",
         "    FastLanguageModel.for_inference(model)\n",
         "    print(out_train[:800])\n",
         "    print(f\"\\nBase matches correct_yaml:   {ok_base}\")\n",
         "    print(f\"Trained matches correct_yaml: {ok_train}\")"
+      ],
+      "execution_count": null,
+      "outputs": []
     }
   ],
   "metadata": {
   },
   "nbformat": 4,
   "nbformat_minor": 4
+}