updated code

Browse files

Files changed (4) hide show

cicd_debug_env/env.py +22 -2
cicd_debug_env/rewards.py +28 -6
train.py +12 -3
train_colab.ipynb +321 -7

cicd_debug_env/env.py CHANGED Viewed

@@ -3,7 +3,15 @@ import random
 from .models import Action, Observation
 from .tasks import ALL_TASKS
-from .rewards import compute_total_reward
 from .memory.failure_bank import FailureMemoryBank
 try:
@@ -76,7 +84,19 @@ class CICDDebugEnv(_BaseEnv):
         self.current_observation.available_actions = self.available_actions()
         self._update_state()
-        return self.current_observation, reward, self.done, {"task_id": self.current_task["id"], "reward_breakdown": reward}
     def state(self) -> dict:
         return self._state_dict

 from .models import Action, Observation
 from .tasks import ALL_TASKS
+from .rewards import (
+    compute_total_reward,
+    reward_execution_success,
+    reward_fix_correctness,
+    reward_step_efficiency,
+    reward_format_compliance,
+    reward_robustness,
+    check_anti_hacking_guards,
+)
 from .memory.failure_bank import FailureMemoryBank
 try:
         self.current_observation.available_actions = self.available_actions()
         self._update_state()
+        reward_components = {
+            "execution_success": reward_execution_success(self.current_observation, self.current_task),
+            "fix_correctness":   reward_fix_correctness(self.current_observation, action, self.current_task),
+            "step_efficiency":   reward_step_efficiency(self.current_observation, self.max_steps),
+            "format_compliance": reward_format_compliance(action),
+            "robustness":        reward_robustness(self.current_observation, self.current_task),
+            "anti_hacking":      check_anti_hacking_guards(self.current_observation, action),
+            "total":             reward,
+        }
+        return self.current_observation, reward, self.done, {
+            "task_id": self.current_task["id"],
+            "reward_breakdown": reward_components,
+        }
     def state(self) -> dict:
         return self._state_dict

cicd_debug_env/rewards.py CHANGED Viewed

@@ -1,4 +1,4 @@
-from typing import Dict, Any, List
 from .models import Action, Observation
 def reward_execution_success(state: Observation, task: Dict[str, Any] = None) -> float:
@@ -31,11 +31,33 @@ def reward_format_compliance(action: Action) -> float:
             return 1.0
     return 0.0
-def reward_robustness(state: Observation, task: Dict[str, Any] = None) -> float:
-    # After fix, does the pipeline pass 3 adversarial variants? (0, 0.33, 0.66, 1)
-    if task and state.pipeline_yaml.strip() == task.get("correct_yaml", "").strip():
-        return 1.0
-    return 0.0
 def check_anti_hacking_guards(state: Observation, action: Action) -> float:
     penalty = 0.0

+from typing import Dict, Any, List, Optional
 from .models import Action, Observation
 def reward_execution_success(state: Observation, task: Dict[str, Any] = None) -> float:
             return 1.0
     return 0.0
+def reward_robustness(state: Observation, task: Optional[Dict[str, Any]] = None) -> float:
+    """
+    Score how closely the agent's YAML matches the reference under three tolerant comparisons.
+
+    Compares ``state.pipeline_yaml`` with ``task["correct_yaml"]`` (both stripped) three ways:
+      1. after removing trailing whitespace on every line and dropping empty lines,
+      2. after replacing each pair of consecutive newlines with one newline (single pass),
+      3. after lowercasing the whole text (keys and values alike).
+
+    Returns the fraction of comparisons that hold, rounded to 4 decimals
+    (0.0, 0.3333, 0.6667 or 1.0). Returns 0.0 when ``task`` is None or when
+    either YAML string is empty after stripping.
+
+    NOTE(review): no perturbed variants are generated or executed, and the
+    agent's action history is not inspected here, so an "edit_config"
+    precondition is not enforced by this function.
+    """
+    if task is None:
+        return 0.0
+    correct = task.get("correct_yaml", "").strip()
+    agent_fix = state.pipeline_yaml.strip()
+    if not correct or not agent_fix:
+        return 0.0
+    def normalize(yaml_str: str) -> str:
+        lines = [l.rstrip() for l in yaml_str.splitlines()]
+        return "\n".join(l for l in lines if l)
+    perturbations = [
+        normalize(agent_fix) == normalize(correct),                        # trailing whitespace and empty lines ignored
+        agent_fix.replace("\n\n", "\n") == correct.replace("\n\n", "\n"), # newline pairs collapsed once; runs of 3+ newlines are only partly collapsed
+        agent_fix.lower() == correct.lower(),                              # whole text lowercased, values included
+    ]
+    score = sum(perturbations) / 3.0
+    return round(score, 4)  # 1/3 -> 0.3333, 2/3 -> 0.6667
 def check_anti_hacking_guards(state: Observation, action: Action) -> float:
     penalty = 0.0

train.py CHANGED Viewed

@@ -109,7 +109,7 @@ def main():
         learning_rate=5e-6, max_steps=MAX_STEPS,
         num_generations=4, max_new_tokens=MAX_NEW_TOKENS,
         logging_steps=5, save_steps=50,
-        report_to="none", remove_unused_columns=False,
         warmup_steps=10, lr_scheduler_type="cosine", optim="adamw_8bit",
     )
     trainer = GRPOTrainer(
@@ -117,16 +117,25 @@ def main():
         train_dataset=dataset, processing_class=tokenizer)
     print("Starting GRPO training...")
     trainer.train()
     print("Training complete!")
     save_path = "./cicd_rl_agent_final"
     if USE_UNSLOTH:
-        model.save_pretrained_merged(save_path, tokenizer, save_method="merged_16bit")
     else:
         model.save_pretrained(save_path)
         tokenizer.save_pretrained(save_path)
-    print(f"Model saved to {save_path}")
 if __name__ == "__main__":
     main()

         learning_rate=5e-6, max_steps=MAX_STEPS,
         num_generations=4, max_new_tokens=MAX_NEW_TOKENS,
         logging_steps=5, save_steps=50,
+        report_to="wandb", remove_unused_columns=False,
         warmup_steps=10, lr_scheduler_type="cosine", optim="adamw_8bit",
     )
     trainer = GRPOTrainer(
         train_dataset=dataset, processing_class=tokenizer)
     print("Starting GRPO training...")
+    import wandb
+    wandb.init(project="cicd-rl-agent", name="grpo-run-1")
     trainer.train()
     print("Training complete!")
     save_path = "./cicd_rl_agent_final"
     if USE_UNSLOTH:
+        model.save_pretrained(save_path)
+        tokenizer.save_pretrained(save_path)
+        print(f"LoRA adapters saved to {save_path}")
+        print("Testing post-training inference...")
+        FastLanguageModel.for_inference(model)
+        test_input = tokenizer("Fix this YAML: steps:\n  - run: npm tset", return_tensors="pt").to("cuda")
+        out = model.generate(**test_input, max_new_tokens=64)
+        print(tokenizer.decode(out[0], skip_special_tokens=True))
     else:
         model.save_pretrained(save_path)
         tokenizer.save_pretrained(save_path)
+        print(f"Model saved to {save_path}")
 if __name__ == "__main__":
     main()

train_colab.ipynb CHANGED Viewed

@@ -1,15 +1,94 @@
 {
-  "nbformat": 4,
-  "nbformat_minor": 0,
-  "metadata": {"colab": {"name": "train_colab.ipynb"}},
   "cells": [
     {
       "cell_type": "code",
       "execution_count": null,
       "metadata": {},
       "outputs": [],
       "source": [
-        "!pip install unsloth trl openenv pydantic fastapi uvicorn"
       ]
     },
     {
@@ -18,10 +97,245 @@
       "metadata": {},
       "outputs": [],
       "source": [
         "from unsloth import FastLanguageModel\n",
-        "from trl import GRPOTrainer, GRPOConfig\n",
-        "# Start building dataset and running train script directly\n"
       ]
     }
-  ]
 }

 {
   "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## 🔧 Install Dependencies"
+      ]
+    },
     {
       "cell_type": "code",
       "execution_count": null,
       "metadata": {},
       "outputs": [],
       "source": [
+        "!pip install unsloth trl transformers datasets torch wandb pydantic"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## 📦 Clone Environment & Import Tasks"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "import os\n",
+        "import random\n",
+        "import sys\n",
+        "\n",
+        "# Colab: clone your fork, or mount Drive and set CICD_RL_REPO to the project path.\n",
+        "REPO_DIR = os.environ.get(\"CICD_RL_REPO\", \"/content/cicd-rl-agent\")\n",
+        "# !git clone https://github.com/<your-org>/cicd-rl-agent.git {REPO_DIR}  # noqa: E501\n",
+        "if os.path.isdir(REPO_DIR) and REPO_DIR not in sys.path:\n",
+        "    sys.path.insert(0, REPO_DIR)\n",
+        "\n",
+        "from datasets import Dataset\n",
+        "from cicd_debug_env.tasks import ALL_TASKS\n",
+        "\n",
+        "NUM_SAMPLES = 128\n",
+        "random.seed(42)\n",
+        "\n",
+        "SYSTEM_PROMPT = (\n",
+        "    \"You are an expert DevOps engineer. \"\n",
+        "    \"You receive a broken CI/CD pipeline YAML and error details. \"\n",
+        "    \"Output ONLY the corrected YAML — no explanation, no markdown fences.\"\n",
+        ")\n",
+        "\n",
+        "def build_prompt(task: dict) -> str:\n",
+        "    return (\n",
+        "        f\"### Error\\n{task.get('error_message', '')}\\n\\n\"\n",
+        "        f\"### Broken Pipeline\\n{task['pipeline_yaml']}\\n\\n\"\n",
+        "        f\"### Fixed Pipeline (YAML only):\\n\"\n",
+        "    )\n",
+        "\n",
+        "def build_dataset():\n",
+        "    easy = [t for t in ALL_TASKS if t[\"difficulty\"] == \"easy\"]\n",
+        "    medium = [t for t in ALL_TASKS if t[\"difficulty\"] == \"medium\"]\n",
+        "    hard = [t for t in ALL_TASKS if t[\"difficulty\"] == \"hard\"]\n",
+        "    records = []\n",
+        "    for _ in range(NUM_SAMPLES):\n",
+        "        r = random.random()\n",
+        "        if r < 0.5:\n",
+        "            task = random.choice(easy)\n",
+        "        elif r < 0.8:\n",
+        "            task = random.choice(medium)\n",
+        "        else:\n",
+        "            task = random.choice(hard)\n",
+        "        records.append({\n",
+        "            \"prompt\": [\n",
+        "                {\"role\": \"system\", \"content\": SYSTEM_PROMPT},\n",
+        "                {\"role\": \"user\", \"content\": build_prompt(task)},\n",
+        "            ],\n",
+        "            \"correct_yaml\": task.get(\"correct_yaml\", \"\"),\n",
+        "            \"pipeline_yaml\": task[\"pipeline_yaml\"],\n",
+        "        })\n",
+        "    return Dataset.from_list(records)\n",
+        "\n",
+        "print(f\"Loaded {len(ALL_TASKS)} tasks (easy/medium/hard). Sample task ids:\", [t['id'] for t in ALL_TASKS[:3]], \"...\")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## 🤖 Load Model with Unsloth"
       ]
     },
     {
       "metadata": {},
       "outputs": [],
       "source": [
+        "import torch\n",
         "from unsloth import FastLanguageModel\n",
+        "\n",
+        "MODEL_ID = \"unsloth/Qwen2.5-0.5B-Instruct\"\n",
+        "model, tokenizer = FastLanguageModel.from_pretrained(\n",
+        "    model_name=MODEL_ID,\n",
+        "    max_seq_length=1024,\n",
+        "    dtype=None,\n",
+        "    load_in_4bit=True,\n",
+        ")\n",
+        "model = FastLanguageModel.get_peft_model(\n",
+        "    model,\n",
+        "    r=16,\n",
+        "    target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\", \"gate_proj\", \"up_proj\", \"down_proj\"],\n",
+        "    lora_alpha=16,\n",
+        "    lora_dropout=0.0,\n",
+        "    bias=\"none\",\n",
+        "    use_gradient_checkpointing=\"unsloth\",\n",
+        "    random_state=42,\n",
+        ")\n",
+        "if tokenizer.pad_token is None:\n",
+        "    tokenizer.pad_token = tokenizer.eos_token"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## 📝 Build Training Dataset"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "train_dataset = build_dataset()\n",
+        "print(f\"Dataset size: {len(train_dataset)} (target split ~50% easy / 30% medium / 20% hard)\")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## 🏆 Define Reward Functions"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "def reward_fix_correctness(completions, prompts, correct_yaml, pipeline_yaml, **kwargs):\n",
+        "    \"\"\"How closely the completion matches the reference `correct_yaml` (full match, partial, unchanged, or wrong).\"\"\"\n",
+        "    rewards = []\n",
+        "    for c, correct, broken in zip(completions, correct_yaml, pipeline_yaml):\n",
+        "        c = c.strip()\n",
+        "        if c == correct.strip():\n",
+        "            rewards.append(1.0)\n",
+        "        elif any(line.strip() in c for line in correct.splitlines() if len(line.strip()) > 8):\n",
+        "            rewards.append(0.5)\n",
+        "        elif c == broken.strip():\n",
+        "            rewards.append(-0.2)\n",
+        "        else:\n",
+        "            rewards.append(0.0)\n",
+        "    return rewards\n",
+        "\n",
+        "def reward_yaml_structure(completions, prompts, **kwargs):\n",
+        "    \"\"\"Whether the output looks like valid pipeline YAML (keywords, length bounds).\"\"\"\n",
+        "    rewards = []\n",
+        "    for c in completions:\n",
+        "        t = c.strip()\n",
+        "        score = (\n",
+        "            0.4 * int(any(k in t for k in [\"steps:\", \"jobs:\", \"name:\", \"run:\", \"uses:\"]))\n",
+        "            + 0.3 * int(len(t) > 10)\n",
+        "            + 0.3 * int(len(t) < 3000)\n",
+        "        )\n",
+        "        rewards.append(score)\n",
+        "    return rewards\n",
+        "\n",
+        "def reward_no_hallucination(completions, prompts, **kwargs):\n",
+        "    \"\"\"Penalizes assistant-style or fenced markdown responses instead of raw YAML.\"\"\"\n",
+        "    bad = [\n",
+        "        \"I cannot\", \"I am sorry\", \"As an AI\", \"Here is\", \"```yaml\", \"```\",\n",
+        "        \"Explanation:\", \"Note:\", \"Sure!\", \"Of course\",\n",
+        "    ]\n",
+        "    return [-0.3 if any(p.lower() in c.lower() for p in bad) else 0.3 for c in completions]\n",
+        "\n",
+        "REWARD_FUNCTIONS = [reward_fix_correctness, reward_yaml_structure, reward_no_hallucination]"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## 🚀 Configure and Run GRPO Training"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "import wandb\n",
+        "from trl import GRPOConfig, GRPOTrainer\n",
+        "\n",
+        "MAX_NEW_TOKENS = 256\n",
+        "args = GRPOConfig(\n",
+        "    output_dir=\"./cicd_rl_output\",\n",
+        "    per_device_train_batch_size=2,\n",
+        "    gradient_accumulation_steps=4,\n",
+        "    learning_rate=5e-6,\n",
+        "    max_steps=200,\n",
+        "    num_generations=4,\n",
+        "    max_new_tokens=MAX_NEW_TOKENS,\n",
+        "    logging_steps=5,\n",
+        "    save_steps=50,\n",
+        "    report_to=\"wandb\",\n",
+        "    remove_unused_columns=False,\n",
+        "    warmup_steps=10,\n",
+        "    lr_scheduler_type=\"cosine\",\n",
+        "    optim=\"adamw_8bit\",\n",
+        ")\n",
+        "trainer = GRPOTrainer(\n",
+        "    model=model,\n",
+        "    args=args,\n",
+        "    reward_funcs=REWARD_FUNCTIONS,\n",
+        "    train_dataset=train_dataset,\n",
+        "    processing_class=tokenizer,\n",
+        ")\n",
+        "wandb.init(project=\"cicd-rl-agent\")\n",
+        "trainer.train()"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## 📊 Plot Reward Curve"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "import matplotlib.pyplot as plt\n",
+        "\n",
+        "step_vals, reward_vals = [], []\n",
+        "for h in trainer.state.log_history:\n",
+        "    st = h.get(\"step\")\n",
+        "    for k, v in h.items():\n",
+        "        if \"reward\" in k.lower() and isinstance(v, (int, float)):\n",
+        "            if st is not None:\n",
+        "                step_vals.append(st)\n",
+        "                reward_vals.append(float(v))\n",
+        "            break\n",
+        "fig, ax = plt.subplots(figsize=(8, 4))\n",
+        "if step_vals and reward_vals:\n",
+        "    ax.plot(step_vals, reward_vals, marker=\"o\", markersize=2)\n",
+        "else:\n",
+        "    ax.text(0.5, 0.5, \"No reward fields in log_history; check TRL/W&B logs.\", ha=\"center\", va=\"center\")\n",
+        "ax.set_xlabel(\"Training Step\")\n",
+        "ax.set_ylabel(\"Reward\")\n",
+        "ax.set_title(\"GRPO training reward (from log_history)\")\n",
+        "plt.tight_layout()\n",
+        "plt.savefig(\"reward_curve.png\", dpi=150, bbox_inches=\"tight\")\n",
+        "plt.show()"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## 🧪 Before/After Inference Demo"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "def generate_yaml(model, tok, task: dict) -> str:\n",
+        "    FastLanguageModel.for_inference(model)\n",
+        "    user = build_prompt(task)\n",
+        "    messages = [\n",
+        "        {\"role\": \"system\", \"content\": SYSTEM_PROMPT},\n",
+        "        {\"role\": \"user\", \"content\": user},\n",
+        "    ]\n",
+        "    text = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)\n",
+        "    dev = next(model.parameters()).device\n",
+        "    inputs = tok(text, return_tensors=\"pt\").to(dev)\n",
+        "    with torch.inference_mode():\n",
+        "        out = model.generate(**inputs, max_new_tokens=256)\n",
+        "    return tok.decode(out[0][inputs[\"input_ids\"].shape[1] :], skip_special_tokens=True).strip()\n",
+        "\n",
+        "easy_demo = next(t for t in ALL_TASKS if t[\"difficulty\"] == \"easy\")\n",
+        "med_demo = next(t for t in ALL_TASKS if t[\"difficulty\"] == \"medium\")\n",
+        "\n",
+        "base_model, base_tok = FastLanguageModel.from_pretrained(\n",
+        "    model_name=MODEL_ID,\n",
+        "    max_seq_length=1024,\n",
+        "    dtype=None,\n",
+        "    load_in_4bit=True,\n",
+        ")\n",
+        "for label, task in [(\"EASY\", easy_demo), (\"MEDIUM\", med_demo)]:\n",
+        "    print(\"=\" * 60)\n",
+        "    print(f\"Task [{label}]: {task['id']}\")\n",
+        "    print(\"\\n--- Broken YAML ---\")\n",
+        "    print(task[\"pipeline_yaml\"])\n",
+        "    out_base = generate_yaml(base_model, base_tok, task)\n",
+        "    out_train = generate_yaml(model, tokenizer, task)\n",
+        "    ok_base = out_base.strip() == task[\"correct_yaml\"].strip()\n",
+        "    ok_train = out_train.strip() == task[\"correct_yaml\"].strip()\n",
+        "    print(\"\\n--- Untrained (base checkpoint) output ---\")\n",
+        "    print(out_base[:800])\n",
+        "    print(\"\\n--- Trained model output ---\")\n",
+        "    print(out_train[:800])\n",
+        "    print(f\"\\nBase matches correct_yaml:   {ok_base}\")\n",
+        "    print(f\"Trained matches correct_yaml: {ok_train}\")"
       ]
     }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python",
+      "version": "3.10.0"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 4
 }