Spaces:

junaid0600
/

sql-db-engineer-agent

Sleeping

App Files Files Community

junaid0600 committed on Apr 25

Commit

4edd88e

verified ·

1 Parent(s): 028dbb9

Update training/train_agent.py

Browse files

Files changed (1) hide show

training/train_agent.py +286 -285

training/train_agent.py CHANGED Viewed

@@ -1,372 +1,373 @@
 """
 training/train_agent.py — SQL Database Engineer Agent
-FIXED: Uses local DatabaseSimulator for rewards (no HF Space calls)
-- No shared singleton state
-- Real delta rewards (0.0 for wrong actions, 40-75pts for correct)
-- Clear reward difference teaches model to prefer create_index over inspect_query
 """
-import os, json, sys, time
-from pathlib import Path
-# ── GPU check ─────────────────────────────────────────────────
-UNSLOTH_AVAILABLE = False
 try:
-    import torch
-    if not torch.cuda.is_available():
-        print("❌ No GPU. Unsloth requires CUDA GPU.")
-        sys.exit(1)
     from unsloth import FastLanguageModel
     from trl import GRPOTrainer, GRPOConfig
-    from datasets import Dataset
     UNSLOTH_AVAILABLE = True
-    print(f"✅ GPU: {torch.cuda.get_device_name(0)}")
-    print(f"✅ VRAM: {torch.cuda.get_device_properties(0).total_memory/1e9:.1f}GB")
-except ImportError as e:
-    print(f"❌ {e}\nRun: pip install unsloth trl transformers datasets accelerate")
-    sys.exit(1)
-# Add project root so we can import DatabaseSimulator
-sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from env.db_simulator import DatabaseSimulator
-# ── Config ────────────────────────────────────────────────────
 ENV_URL    = os.getenv("ENV_URL",    "https://junaid0600-sql-db-engineer-agent.hf.space")
 HF_TOKEN   = os.getenv("HF_TOKEN",  "")
-MODEL_NAME = os.getenv("MODEL_NAME", "unsloth/Qwen2.5-1.5B-Instruct")
 OUTPUT_DIR = os.getenv("OUTPUT_DIR", "./sdea-trained")
-MAX_STEPS  = int(os.getenv("MAX_STEPS", "100"))
-print(f"\n[CONFIG] Model:     {MODEL_NAME}")
-print(f"[CONFIG] Max steps: {MAX_STEPS}")
-print(f"[CONFIG] Output:    {OUTPUT_DIR}\n")
-# ── System prompt ─────────────────────────────────────────────
-SYSTEM_PROMPT = """You are a senior database engineer fixing slow database queries.
-You will see slow queries and table structures. Choose the BEST action.
-Key insight: create_index on the RIGHT columns fixes slow queries.
-Wrong columns = no improvement. Right columns = massive improvement.
-Respond with ONLY valid JSON:
-{"action_type": "create_index", "payload": {"table": "TABLE_NAME", "columns": ["COL1", "COL2"]}}
-Available actions: inspect_query, analyze_indexes, create_index, rewrite_query, analyze_statistics, submit_report"""
-# ── Load all 15 scenarios ─────────────────────────────────────
def load_all_scenarios() -> list:
    """Load every scenario from the three difficulty files in ``<project>/dataset``.

    The project root is taken to be the parent of the directory containing
    this script. Files are read as UTF-8. A file that is missing, is not
    valid JSON, or does not hold a JSON list is reported on stdout and
    skipped, so one bad file cannot abort the import of this module.

    Returns:
        A flat list with the scenarios of all readable files, in the order
        easy -> medium -> hard. Empty when nothing could be loaded.
    """
    base = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    scenarios = []
    for fname in ["easy_scenarios.json", "medium_scenarios.json", "hard_scenarios.json"]:
        path = os.path.join(base, "dataset", fname)
        try:
            # Explicit encoding: the platform default is not UTF-8 everywhere.
            with open(path, encoding="utf-8") as f:
                data = json.load(f)
        except FileNotFoundError:
            print(f"  ⚠️ {fname} not found")
            continue
        except json.JSONDecodeError as e:
            print(f"  ⚠️ {fname} is not valid JSON ({e}), skipping")
            continue
        if not isinstance(data, list):
            # extend() on a dict would silently add its keys as "scenarios".
            print(f"  ⚠️ {fname} does not contain a list of scenarios, skipping")
            continue
        scenarios.extend(data)
        print(f"  ✅ Loaded {len(data)} from {fname}")
    print(f"  Total: {len(scenarios)} scenarios\n")
    return scenarios
-ALL_SCENARIOS = load_all_scenarios()
-# ── Parse LLM output ─────────────────────────────────────────
def parse_action(text: str) -> "dict | None":
    """Parse raw LLM output into an action dict.

    Accepts bare JSON as well as JSON wrapped in a Markdown code fence
    (with or without a ``json`` language tag, with or without a closing
    fence, optionally preceded by prose).

    Args:
        text: The raw completion text.

    Returns:
        The decoded dict when it is a JSON object containing both
        ``"action_type"`` and ``"payload"``; otherwise ``None``, which the
        caller treats as invalid output and penalizes.
    """
    if not isinstance(text, str):
        return None

    candidate = text.strip()
    if "```" in candidate:
        # Content of the first fenced block. Splitting on the bare fence
        # (instead of "```json" first and "```" second) keeps the closing
        # fence from replacing the payload with the empty string after it.
        pieces = candidate.split("```")
        candidate = (pieces[1] if len(pieces) > 1 else pieces[0]).lstrip()
        if candidate[:4].lower() == "json":
            candidate = candidate[4:]  # drop the language tag

    try:
        data = json.loads(candidate.strip())
    except (ValueError, RecursionError):
        return None

    # Must be an object: `"action_type" in data` is a substring test on str.
    if isinstance(data, dict) and "action_type" in data and "payload" in data:
        return data
    return None  # None = invalid JSON = penalized
-# ── LOCAL reward function using DatabaseSimulator ─────────────
def compute_local_reward(action: dict, scenario: dict) -> tuple:
    """Score one action against a fresh, local ``DatabaseSimulator``.

    A new simulator is built from ``scenario`` on every call, so no state is
    shared between calls and no HTTP request is made.

    The reward is the sum of four parts, clamped to [0.001, 0.999]:
      1. a fixed per-action-type step reward (0.001 for unknown types),
      2. up to 0.70 proportional to the achieved share of the possible
         improvement (``100 - baseline``, at least 1.0),
      3. a milestone bonus of 0.15 / 0.25 / 0.40 once that share reaches
         25% / 50% / 75%,
      4. a -0.15 penalty for a ``create_index`` whose reported delta is <= 0.

    Args:
        action: Dict with ``"action_type"`` and ``"payload"``.
        scenario: Scenario dict passed to ``DatabaseSimulator``.

    Returns:
        ``(reward, improvement, milestone_bonus)`` where ``improvement`` is
        the non-negative change of the simulator's performance score.
    """
    sim = DatabaseSimulator(scenario)
    baseline = sim.get_performance_score()

    action_type = action.get("action_type", "")
    payload = action.get("payload", {})

    # Only these action types are forwarded to the simulator. inspect_query,
    # analyze_indexes, submit_report and unknown types leave it untouched.
    applied_actions = (
        "create_index",
        "rewrite_query",
        "analyze_statistics",
        "partition_table",
    )
    delta = 0.0
    if action_type in applied_actions:
        delta = sim.apply_action(action_type, payload).get("delta", 0.0)

    improvement = max(0.0, sim.get_performance_score() - baseline)
    max_possible = max(1.0, 100.0 - baseline)
    ratio = improvement / max_possible

    # 1. Step reward — differs per action type.
    step_rewards = {
        "inspect_query":     0.10,
        "analyze_indexes":   0.10,
        "create_index":      0.15,
        "rewrite_query":     0.20,
        "analyze_statistics":0.08,
        "partition_table":   0.15,
        "submit_report":     0.05,
    }
    step_r = step_rewards.get(action_type, 0.001)

    # 2. Delta reward — proportional to the actual improvement.
    delta_r = min(0.70, ratio * 0.70)

    # 3. Milestone bonus — highest threshold reached wins.
    if ratio >= 0.75:
        milestone_r = 0.40
    elif ratio >= 0.50:
        milestone_r = 0.25
    elif ratio >= 0.25:
        milestone_r = 0.15
    else:
        milestone_r = 0.0

    # 4. Penalty for an index that did not help.
    wrong_index_pen = -0.15 if (action_type == "create_index" and delta <= 0.0) else 0.0

    total = step_r + delta_r + milestone_r + wrong_index_pen
    total = max(0.001, min(0.999, total))
    return total, improvement, milestone_r
-# ── GRPO reward function ──────────────────────────────────────
 def reward_fn(prompts, completions, **kwargs):
     """
-    LOCAL reward — no HTTP calls, no shared state.
-    Each completion gets its own fresh DatabaseSimulator.
-    Reward differences:
-      inspect_query (always):     0.10 + 0.0  = 0.10
-      create_index (wrong col):   0.15 - 0.15 = 0.001
-      create_index (right col):   0.15 + 0.60 = 0.75+
-    GRPO will learn: right create_index >> inspect_query >> wrong create_index
     """
-    rewards = []
     for i, (prompt, completion) in enumerate(zip(prompts, completions)):
         try:
-            # Get text
-            if isinstance(completion, list):
-                text = completion[0].get("content", "") if completion else ""
-            else:
-                text = str(completion)
-            # Pick scenario (rotate through all)
-            scenario = ALL_SCENARIOS[i % len(ALL_SCENARIOS)]
-            # Parse action
-            action = parse_action(text)
-            if action is None:
-                # Invalid JSON output — penalize
-                rewards.append(0.001)
-                print(f"  [REWARD] scenario={scenario['id']} | "
-                      f"INVALID JSON | score=0.001")
-                continue
-            # Compute reward locally
-            score, improvement, milestone = compute_local_reward(action, scenario)
             rewards.append(score)
-            print(f"  [REWARD] scenario={scenario['id']} | "
-                  f"action={action.get('action_type')} | "
-                  f"improvement=+{improvement:.1f}pts | "
-                  f"milestone=+{milestone:.2f} | "
-                  f"score={score:.3f}")
         except Exception as e:
-            print(f"  [REWARD] Error: {e}")
             rewards.append(0.001)
     return rewards
-# ── Build dataset ─────────────────────────────────────────────
def build_dataset() -> Dataset:
    """Turn every entry of ``ALL_SCENARIOS`` into one training example.

    Each example holds the full text prompt (system prompt plus a rendering
    of the scenario's tables, slow queries, index hints and scores) and the
    scenario's id under ``"scenario_id"``.

    Returns:
        A ``Dataset`` built from the list of examples.
    """

    def _render_prompt(scn):
        # Serialize the structured parts once, then splice them into the text.
        tables_json = json.dumps(scn.get("tables", []))
        queries_json = json.dumps(scn.get("slow_queries", []))
        hints_json = json.dumps(scn.get("missing_index_hints", []))
        return (
            f"{SYSTEM_PROMPT}\n\n"
            f"=== DATABASE STATE ===\n"
            f"Scenario: {scn['id']}\n"
            f"Description: {scn.get('description','')}\n"
            f"Tables: {tables_json}\n"
            f"Slow Queries: {queries_json}\n"
            f"Missing Index Hints: {hints_json}\n"
            f"Performance: {scn.get('performance_score_baseline',0)}/100 "
            f"→ Target: {scn.get('target_score',85)}/100\n\n"
            f"What action should you take? Output JSON only:"
        )

    rows = [
        {"prompt": _render_prompt(scn), "scenario_id": scn["id"]}
        for scn in ALL_SCENARIOS
    ]
    print(f"  ✅ Dataset: {len(rows)} examples")
    return Dataset.from_list(rows)
-# ── Generate plots ────────────────────────────────────────────
def generate_plots(trainer):
    """Plot the training loss from ``trainer.state.log_history``.

    Only log entries that contain a ``"loss"`` key are used. When there are
    none, a warning is printed and nothing is written. Otherwise the curve
    is saved to ``loss_curve.png`` in the current working directory.

    Args:
        trainer: Object exposing ``state.log_history`` as an iterable of
            dict-like log entries.
    """
    logs = [entry for entry in trainer.state.log_history if "loss" in entry]
    if not logs:
        print("⚠️ No logs for plotting")
        return

    # Imported only when there is something to draw, so a missing matplotlib
    # cannot break a run that has nothing to plot.
    import matplotlib
    matplotlib.use("Agg")  # headless backend: no display needed
    import matplotlib.pyplot as plt

    steps = [entry.get("step", i) for i, entry in enumerate(logs)]
    losses = [entry.get("loss", 0) for entry in logs]

    fig, ax = plt.subplots(1, 1, figsize=(8, 4))
    try:
        fig.suptitle("GRPO Training — SQL Database Engineer Agent",
                     fontsize=13, fontweight="bold")
        ax.plot(steps, losses, "b-o", lw=2, ms=4)
        ax.set_xlabel("Training Step")
        ax.set_ylabel("Loss")
        ax.set_title("Training Loss (↓ = model learning DBA pattern)")
        ax.grid(True, alpha=0.3)
        # `losses` is non-empty here because `logs` is non-empty.
        ax.annotate(f"Start: {losses[0]:.4f}",
                    xy=(steps[0], losses[0]),
                    xytext=(steps[0]+1, losses[0]*1.1),
                    fontsize=9, color="red")
        ax.annotate(f"End: {losses[-1]:.4f}",
                    xy=(steps[-1], losses[-1]),
                    xytext=(steps[-1]-5, losses[-1]*1.1),
                    fontsize=9, color="green")
        fig.tight_layout()
        fig.savefig("loss_curve.png", dpi=150, bbox_inches="tight")
    finally:
        # Release the figure; pyplot keeps it alive otherwise.
        plt.close(fig)

    print("✅ loss_curve.png saved")
    print(f"   Loss: {losses[0]:.4f} → {losses[-1]:.4f}")
-# ── Main ──────────────────────────────────────────────────────
def train():
    """Run the full GRPO fine-tuning pipeline and save the result.

    Steps: abort if no scenarios were loaded, load the 4-bit base model,
    attach LoRA adapters, build the dataset, train with ``reward_fn`` as the
    reward, save model and tokenizer to ``<OUTPUT_DIR>/final``, then write
    the loss plot and print the follow-up commands.
    """
    if not ALL_SCENARIOS:
        print("❌ No scenarios found. Check dataset/ folder.")
        sys.exit(1)

    print(f"⏳ Loading {MODEL_NAME}...")
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=MODEL_NAME,
        max_seq_length=2048,
        load_in_4bit=True,
        token=HF_TOKEN or None,
    )

    # LoRA adapter settings, gathered in one place.
    lora_settings = {
        "r": 16,
        "lora_alpha": 16,
        "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj",
                           "gate_proj", "up_proj", "down_proj"],
        "lora_dropout": 0,
        "bias": "none",
        "use_gradient_checkpointing": "unsloth",
        "random_state": 42,
    }
    model = FastLanguageModel.get_peft_model(model, **lora_settings)
    print("✅ Model ready\n")

    dataset = build_dataset()

    grpo_settings = {
        "output_dir": OUTPUT_DIR,
        "max_steps": MAX_STEPS,
        "per_device_train_batch_size": 1,
        "gradient_accumulation_steps": 4,
        "learning_rate": 5e-6,
        "max_completion_length": 150,
        "num_generations": 4,   # compare 4 actions per step
        "temperature": 1.0,
        "logging_steps": 1,
        "save_steps": 25,
        "save_total_limit": 3,
        "warmup_ratio": 0.1,
        "report_to": "none",
        "remove_unused_columns": False,
    }
    config = GRPOConfig(**grpo_settings)

    trainer = GRPOTrainer(
        model=model,
        tokenizer=tokenizer,
        reward_funcs=reward_fn,
        args=config,
        train_dataset=dataset,
    )

    print(f"🏋️  GRPO training — {MAX_STEPS} steps")
    print("Watch for: improvement > 0 and score > 0.5 on create_index\n")
    trainer.train()
    print("\n✅ Training complete!")

    final_dir = f"{OUTPUT_DIR}/final"
    Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
    model.save_pretrained(final_dir)
    tokenizer.save_pretrained(final_dir)
    print(f"✅ Saved to {final_dir}")

    generate_plots(trainer)

    rule = "=" * 50
    print("\n" + rule)
    print("NEXT: python training/evaluate_agent.py")
    print("THEN: git add loss_curve.png reward_curve.png")
    print(rule)
 if __name__ == "__main__":
-    train()

 """
 training/train_agent.py — SQL Database Engineer Agent
+Unsloth + GRPO training script.
+Run on venue GPU (April 25-26) with compute credits.
+FIXES applied:
+  1. Robust JSON extraction via regex (kills PARSE FALLBACK)
+  2. task_id from kwargs directly — not from kwargs["batch"] (kills only-easy_s001)
+  3. Reward calls /grader (stateless) instead of /reset+/step (kills race condition + flat 0.500)
+  4. Format bonus so valid JSON gets non-zero reward even before agent learns DBA actions
 """
+import os
+import re
+import json
+import requests
+from datasets import Dataset
+# ── Try importing Unsloth (GPU only) ─────────────────────────
 try:
     from unsloth import FastLanguageModel
     from trl import GRPOTrainer, GRPOConfig
     UNSLOTH_AVAILABLE = True
+except ImportError:
+    UNSLOTH_AVAILABLE = False
+    print("⚠️  Unsloth not available. Run: pip install unsloth trl")
+# ─────────────────────────────────────────────
+#  CONFIG
+# ─────────────────────────────────────────────
 ENV_URL    = os.getenv("ENV_URL",    "https://junaid0600-sql-db-engineer-agent.hf.space")
 HF_TOKEN   = os.getenv("HF_TOKEN",  "")
+MODEL_NAME = os.getenv("MODEL_NAME", "unsloth/Qwen2.5-7B-Instruct")
 OUTPUT_DIR = os.getenv("OUTPUT_DIR", "./sdea-trained")
+# Valid Round 2 action types — model must use one of these
+VALID_ACTION_TYPES = {
+    "inspect_query", "analyze_indexes", "create_index",
+    "rewrite_query", "add_column", "drop_index",
+    "partition_table", "analyze_statistics",
+    "request_hint", "submit_report",
+}
+SYSTEM_PROMPT = """You are a senior database engineer.
+Given a database scenario with slow queries, choose the BEST single action to improve performance.
+Investigation pattern (follow this order):
+1. Use inspect_query to understand WHY a query is slow (scan type, rows examined)
+2. Use analyze_indexes to see what indexes exist and what is missing
+3. Use create_index to add the missing index on WHERE/JOIN columns
+4. Use rewrite_query if the SQL itself is inefficient
+5. Use partition_table for tables with 1M+ rows and range queries
+6. Use submit_report when performance target is reached
+RESPOND WITH VALID JSON ONLY. No explanation. No markdown. No preamble.
+Examples:
+{"action_type": "inspect_query", "payload": {"query_id": "q1"}}
+{"action_type": "analyze_indexes", "payload": {"table": "users"}}
+{"action_type": "create_index", "payload": {"table": "users", "columns": ["email"]}}
+{"action_type": "create_index", "payload": {"table": "orders", "columns": ["user_id", "status"]}}
+{"action_type": "submit_report", "payload": {"summary": "Added composite index on orders(user_id, status). Performance improved from 5.0 to 85.0."}}"""
+# ─────────────────────────────────────────────
+#  JSON EXTRACTION  (FIX 1 — kills PARSE FALLBACK)
+# ─────────────────────────────────────────────
+def _extract_json(text: str) -> dict | None:
+    """
+    Robustly extract a JSON object from model output.
+    Handles: pure JSON, markdown blocks, JSON buried in text, partial JSON.
+    Returns parsed dict or None if nothing parseable found.
+    """
+    if not text:
+        return None
+    # Strip common markdown wrappers
+    text = text.strip()
+    text = re.sub(r"```(?:json)?", "", text).replace("```", "").strip()
+    # Try 1: entire text is valid JSON
+    try:
+        obj = json.loads(text)
+        if isinstance(obj, dict) and "action_type" in obj:
+            return obj
+    except json.JSONDecodeError:
+        pass
+    # Try 2: find outermost {...} block using regex (handles extra text around JSON)
+    matches = re.findall(r'\{[^{}]*(?:\{[^{}]*\}[^{}]*)?\}', text, re.DOTALL)
+    for m in matches:
+        try:
+            obj = json.loads(m)
+            if isinstance(obj, dict) and "action_type" in obj:
+                return obj
+        except json.JSONDecodeError:
+            continue
+    # Try 3: greedy — find first { to last }
+    start = text.find("{")
+    end   = text.rfind("}")
+    if start != -1 and end != -1 and end > start:
+        try:
+            obj = json.loads(text[start:end + 1])
+            if isinstance(obj, dict) and "action_type" in obj:
+                return obj
+        except json.JSONDecodeError:
+            pass
+    return None
+def _is_valid_action(action: dict) -> bool:
+    """Check action has correct structure before sending to /grader."""
+    if not isinstance(action, dict):
+        return False
+    if "action_type" not in action:
+        return False
+    if action["action_type"] not in VALID_ACTION_TYPES:
+        return False
+    if "payload" not in action or not isinstance(action.get("payload"), dict):
+        return False
+    return True
+# ─────────────────────────────────────────────
+#  REWARD FUNCTION  (FIX 2 + FIX 3)
+# ─────────────────────────────────────────────
 def reward_fn(prompts, completions, **kwargs):
     """
+    GRPO reward function — calls /grader (STATELESS).
+    FIX 2: task_ids from kwargs["task_id"] directly (TRL passes dataset
+            columns as direct kwargs, NOT inside a "batch" key).
+    FIX 3: calls /grader instead of /reset + /step.
+            /grader is stateless — no race condition, no global env mutation,
+            no flat reward from concurrent resets overwriting each other.
     """
+    rewards  = []
+    # ── FIX 2: correct task_id extraction ────────────────────────
+    # TRL GRPO passes dataset columns directly as kwargs.
+    # With num_generations=4, each task_id is repeated 4x in the list.
+    raw_task_ids = kwargs.get("task_id", [])
+    if isinstance(raw_task_ids, str):
+        raw_task_ids = [raw_task_ids]
     for i, (prompt, completion) in enumerate(zip(prompts, completions)):
+        task_id = (
+            raw_task_ids[i]
+            if i < len(raw_task_ids)
+            else "easy_s001"
+        )
+        # ── Extract text from completion ──────────────────────────
+        if isinstance(completion, list):
+            # Standard TRL format: [{"role": "assistant", "content": "..."}]
+            text = completion[0].get("content", "") if completion else ""
+        elif isinstance(completion, dict):
+            text = completion.get("content", "")
+        else:
+            text = str(completion)
+        # ── FIX 1: robust JSON parse ──────────────────────────────
+        action = _extract_json(text)
+        if action is None:
+            # Complete parse failure — 0.001 (not 0.0, avoids GRPO div-by-zero)
+            rewards.append(0.001)
+            continue
+        # Format bonus: valid JSON with correct structure = small positive signal
+        # This gives the model SOMETHING to learn from even before it learns
+        # the right actions, avoiding the all-zero gradient problem.
+        if not _is_valid_action(action):
+            # JSON parsed but action_type is wrong/missing
+            rewards.append(0.05)
+            continue
+        # ── FIX 3: stateless /grader call ────────────────────────
         try:
+            resp = requests.post(
+                f"{ENV_URL}/grader",
+                json={"task_id": task_id, "action": action},
+                timeout=20,
+            )
+            resp.raise_for_status()
+            score = float(resp.json().get("score", 0.001))
+            score = max(0.001, min(0.999, score))
             rewards.append(score)
+        except requests.exceptions.Timeout:
+            rewards.append(0.05)   # grader timed out — give format credit
         except Exception as e:
+            print(f"[reward_fn] grader call failed for {task_id}: {e}")
             rewards.append(0.001)
     return rewards
+# ─────────────────────────────────────────────
+#  BUILD TRAINING DATASET
+# ─────────────────────────────────────────────
def build_dataset():
    """Build the GRPO training set from the scenario JSON files.

    Scenario sources, in order of preference:
      1. ``dataset/{easy,medium,hard}_scenarios.json`` — looked up relative
         to the current working directory first and, failing that, relative
         to the project root (the parent of this script's directory), so the
         script also works when started from another directory.
      2. The environment's ``GET /tasks`` endpoint (ids containing ``"_s"``).
      3. One built-in minimal scenario, used whenever the first two sources
         yield nothing — including the case where ``/tasks`` answers but
         returns no usable task — so training never starts on an empty set.

    Returns:
        A ``Dataset`` with one row per scenario:
        ``{"prompt": <str>, "task_id": <scenario id>}``.
    """
    try:
        project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    except NameError:
        # __file__ is undefined when the code is pasted into a notebook cell.
        project_root = os.getcwd()

    scenarios = []
    for fname in [
        "dataset/easy_scenarios.json",
        "dataset/medium_scenarios.json",
        "dataset/hard_scenarios.json",
    ]:
        for path in (fname, os.path.join(project_root, fname)):
            try:
                with open(path, encoding="utf-8") as f:
                    loaded = json.load(f)
            except FileNotFoundError:
                continue
            scenarios.extend(loaded)
            print(f"  Loaded {len(loaded)} scenarios from {fname}")
            break
        else:
            print(f"  {fname} not found, skipping")

    if not scenarios:
        print("  Falling back to /tasks endpoint...")
        try:
            resp = requests.get(f"{ENV_URL}/tasks", timeout=15)
            resp.raise_for_status()
            tasks = resp.json().get("tasks", [])
            scenarios = [{"id": t["id"], "description": t.get("description", "")}
                         for t in tasks if "_s" in t["id"]]
        except Exception as e:
            print(f"  /tasks fallback failed: {e}")

    if not scenarios:
        # Minimal fallback so training doesn't run on an empty dataset.
        scenarios = [{"id": "easy_s001",
                      "description": "User lookup query taking 2s. Add index.",
                      "tables": [{"name": "users", "rows": 10000, "indexes": ["PRIMARY"]}],
                      "slow_queries": [{"id": "q1", "sql": "SELECT * FROM users WHERE email=?", "avg_ms": 2000}],
                      "performance_score_baseline": 8.0,
                      "target_score": 80.0}]

    examples = []
    for s in scenarios:
        # Compact separators keep the prompt short.
        tables_txt = json.dumps(s.get("tables", []), separators=(",", ":"))
        queries_txt = json.dumps(s.get("slow_queries", []), separators=(",", ":"))
        baseline = s.get("performance_score_baseline", s.get("performance_score", 0))
        target = s.get("target_score", 85)
        max_steps = s.get("max_steps", 50)

        prompt = (
            f"{SYSTEM_PROMPT}\n\n"
            f"=== DATABASE SCENARIO ===\n"
            f"Scenario ID: {s.get('id', 'unknown')}\n"
            f"Description: {s.get('description', '')}\n"
            f"Tables: {tables_txt}\n"
            f"Slow Queries: {queries_txt}\n"
            f"Current Performance Score: {baseline} / 100\n"
            f"Target Performance Score: {target} / 100\n"
            f"Step Budget: {max_steps}\n\n"
            f"What is your FIRST action?"
        )
        examples.append({
            "prompt":  prompt,
            "task_id": s.get("id", "easy_s001"),
        })

    print(f"Built {len(examples)} training examples from {len(scenarios)} scenarios")
    return Dataset.from_list(examples)
+# ─────────────────────────────────────────────
+#  REWARD WRAPPER  (FIX 2 continued)
+# ─────────────────────────────────────────────
def reward_wrapper(prompts, completions, **kwargs):
    """Reward callable handed to ``GRPOTrainer``; delegates to ``reward_fn``.

    All keyword arguments are forwarded untouched. The dataset columns
    (``task_id`` among them) are expected to arrive as direct keyword
    arguments, not nested under a ``"batch"`` key, so nothing is unpacked
    here.
    """
    scores = reward_fn(prompts, completions, **kwargs)
    return scores
+# ─────────────────────────────────────────────
+#  MAIN TRAINING
+# ─────────────────────────────────────────────
def train():
    """Fine-tune the model with GRPO against the remote environment.

    Returns early when Unsloth could not be imported. Otherwise: probe the
    environment's ``/health`` endpoint (a failure is only reported), load
    the 4-bit base model, attach LoRA adapters, build the dataset, train
    with ``reward_wrapper`` as the reward, and save model and tokenizer to
    ``<OUTPUT_DIR>/final``.
    """
    if not UNSLOTH_AVAILABLE:
        print("Cannot train — Unsloth not installed")
        print("Run: pip install 'unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git' trl transformers datasets accelerate")
        return

    print(f"🚀 Loading model: {MODEL_NAME}")
    print(f"🌐 Environment:   {ENV_URL}")

    # Reachability probe: informational only, training proceeds either way.
    try:
        health = requests.get(f"{ENV_URL}/health", timeout=10)
        print(f"✅ Environment health: {health.json()}")
    except Exception as e:
        print(f"⚠️  Cannot reach environment at {ENV_URL}: {e}")
        print("   Training will likely fail — check ENV_URL")

    # Base model, 4-bit quantized.
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=MODEL_NAME,
        max_seq_length=4096,
        load_in_4bit=True,
        token=HF_TOKEN or None,
    )

    # LoRA adapters.
    lora_settings = {
        "r": 16,
        "lora_alpha": 16,
        "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj",
                           "gate_proj", "up_proj", "down_proj"],
        "lora_dropout": 0,
        "bias": "none",
        "use_gradient_checkpointing": "unsloth",
    }
    model = FastLanguageModel.get_peft_model(model, **lora_settings)

    dataset = build_dataset()

    grpo_settings = {
        "output_dir": OUTPUT_DIR,
        "num_train_epochs": 3,
        "per_device_train_batch_size": 2,
        "gradient_accumulation_steps": 8,
        "learning_rate": 5e-5,
        "max_completion_length": 256,
        "num_generations": 4,
        "logging_steps": 5,
        "save_steps": 50,
        "warmup_ratio": 0.1,
        "report_to": "none",
    }
    config = GRPOConfig(**grpo_settings)

    trainer = GRPOTrainer(
        model=model,
        tokenizer=tokenizer,
        reward_funcs=reward_wrapper,
        args=config,
        train_dataset=dataset,
    )

    banner = (
        "🏋️  Starting GRPO training...",
        "   Expected reward progression:",
        "   Steps  10: ~0.05-0.15 (model still outputting free text)",
        "   Steps  50: ~0.20-0.35 (learning JSON format)",
        "   Steps 100: ~0.35-0.50 (learning correct action types)",
        "   Steps 200: ~0.55-0.70 (learning DBA investigation pattern)",
        "   Steps 300: ~0.70-0.82 (strategic multi-action planning)",
    )
    for line in banner:
        print(line)

    trainer.train()

    # Persist the trained adapters and the tokenizer side by side.
    final_dir = f"{OUTPUT_DIR}/final"
    model.save_pretrained(final_dir)
    tokenizer.save_pretrained(final_dir)
    print(f"✅ Training complete. Model saved to {final_dir}")
 if __name__ == "__main__":
+    train()