Spaces:

junaid0600
/

sql-db-engineer-agent

Sleeping

App Files Files Community

junaid0600 committed on Apr 25

Commit

bb2cfec

verified ·

1 Parent(s): 188eb9c

Update training/train_agent.py

Browse files

Files changed (1) hide show

training/train_agent.py +266 -347

training/train_agent.py CHANGED Viewed

@@ -1,184 +1,228 @@
 """
 training/train_agent.py — SQL Database Engineer Agent
-Unsloth + GRPO training script.
-Run on venue GPU (April 25-26) with compute credits.
-FREE T4 (Colab):  MODEL_NAME=unsloth/Qwen2.5-1.5B-Instruct  (default)
-VENUE A100:       set ENV_VAR MODEL_NAME=unsloth/Qwen2.5-7B-Instruct
 """
-import os
-import json
-import requests
-import sys
-import re
 from pathlib import Path
-# ── Try importing Unsloth (GPU only) ─────────────────────────
 try:
     from unsloth import FastLanguageModel
     from trl import GRPOTrainer, GRPOConfig
-    import torch
     UNSLOTH_AVAILABLE = True
-    print("Unsloth + TRL loaded successfully")
-except ImportError:
-    UNSLOTH_AVAILABLE = False
-    print("Unsloth not available. Run: pip install unsloth trl")
-# ─────────────────────────────────────────────
-#  CONFIG — change MODEL_NAME via env var at venue
-# ─────────────────────────────────────────────
 ENV_URL    = os.getenv("ENV_URL",    "https://junaid0600-sql-db-engineer-agent.hf.space")
 HF_TOKEN   = os.getenv("HF_TOKEN",  "")
-MODEL_NAME = os.getenv("MODEL_NAME", "unsloth/Qwen2.5-1.5B-Instruct")  # 1.5B for free T4
 OUTPUT_DIR = os.getenv("OUTPUT_DIR", "./sdea-trained")
-MAX_STEPS  = int(os.getenv("MAX_STEPS", "100"))  # increase to 300+ at venue
-print(f"[CONFIG] Model:      {MODEL_NAME}")
-print(f"[CONFIG] Output:     {OUTPUT_DIR}")
-print(f"[CONFIG] Max steps:  {MAX_STEPS}")
-print(f"[CONFIG] ENV URL:    {ENV_URL}")
-# ─────────────────────────────────────────────
-#  SYSTEM PROMPT
-# ─────────────────────────────────────────────
-SYSTEM_PROMPT = """You are a senior database engineer.
-Given the current database state with slow queries, choose the BEST action to improve performance.
-Think step by step:
-1. If you have not inspected queries yet -> use inspect_query
-2. If you have not analyzed indexes -> use analyze_indexes
-3. If you know which index is missing -> use create_index
-4. If query can be rewritten better -> use rewrite_query
-5. If table is huge (1M+ rows) -> use partition_table
-6. When performance target is reached -> use submit_report
-Respond with JSON only — no explanation, no markdown:
-{"action_type": "...", "payload": {...}}"""
-# ─────────────────────────────────────────────
-#  REWARD FUNCTION (calls live HF Space)
-# ─────────────────────────────────────────────
-def parse_action(text: str) -> dict | None:
-    """Parse LLM output into action dict. Returns None on failure."""
     try:
         text = text.strip()
-        if "```" in text:
-            text = text.split("```")[1]
-            if text.startswith("json"):
-                text = text[4:]
         text = text.strip()
-        # Try direct JSON first
         data = json.loads(text)
-        if "action_type" in data:
             return data
     except Exception:
-        # Try extracting first JSON object from mixed text output
-        match = re.search(r"\{[\s\S]*\}", text)
-        if match:
-            try:
-                data = json.loads(match.group(0))
-                if "action_type" in data:
-                    return data
-            except Exception:
-                pass
-    return None
-def _extract_task_id_from_prompt(prompt_text: str) -> str | None:
-    """Fallback extractor when GRPO doesn't pass task_id column."""
-    match = re.search(r"-\s*Scenario:\s*([a-z]+_[a-z]?\d+)", prompt_text, flags=re.IGNORECASE)
-    if match:
-        return match.group(1)
-    return None
 def reward_fn(prompts, completions, **kwargs):
     """
-    GRPO reward function — calls /grader on live environment.
-    Returns list of float rewards, one per completion.
-    Score always between 0.001 and 0.999.
     """
     rewards = []
-    task_ids = kwargs.get("task_ids")
-    if not task_ids:
-        # GRPO can pass dataset columns directly as kwargs, not always via batch.
-        task_ids = kwargs.get("task_id")
-    if not task_ids:
-        task_ids = ["easy_s001"] * len(prompts)
-    if isinstance(task_ids, str):
-        task_ids = [task_ids] * len(prompts)
     for i, (prompt, completion) in enumerate(zip(prompts, completions)):
         try:
-            # Get completion text
             if isinstance(completion, list):
                 text = completion[0].get("content", "") if completion else ""
             else:
                 text = str(completion)
-            # Parse into action
             action = parse_action(text)
-            task_id = task_ids[i] if i < len(task_ids) else "easy_s001"
-            if not task_id:
-                task_id = _extract_task_id_from_prompt(str(prompt)) or "easy_s001"
-            task_id = str(task_id)
             if action is None:
                 rewards.append(0.001)
-                print(f"  [REWARD] task={task_id} | action=parse_failed | score=0.001")
                 continue
-            # Use environment step reward so dense + milestone logic is used.
-            # This also guarantees the sampled task_id actually drives reward.
-            difficulty = "easy"
-            if str(task_id).startswith("medium_"):
-                difficulty = "medium"
-            elif str(task_id).startswith("hard_"):
-                difficulty = "hard"
-            reset_resp = requests.post(
-                f"{ENV_URL}/reset",
-                json={"difficulty": difficulty, "task_id": task_id},
-                timeout=15,
-                headers={"Content-Type": "application/json"},
-            )
-            if reset_resp.status_code != 200:
-                raise RuntimeError(f"/reset failed for {task_id}: {reset_resp.status_code}")
-            step_resp = requests.post(
-                f"{ENV_URL}/step",
-                json=action,
-                timeout=15,
-                headers={"Content-Type": "application/json"},
-            )
-            if step_resp.status_code == 200:
-                score = step_resp.json().get("reward", {}).get("score", 0.001)
-                score = max(0.001, min(0.999, float(score)))
-            else:
-                # Fallback to grader for robustness.
-                grader_resp = requests.post(
-                    f"{ENV_URL}/grader",
-                    json={"task_id": task_id, "action": action},
-                    timeout=15,
-                    headers={"Content-Type": "application/json"},
-                )
-                if grader_resp.status_code == 200:
-                    score = grader_resp.json().get("score", 0.001)
-                    score = max(0.001, min(0.999, float(score)))
-                else:
-                    score = 0.001
-            action_name = str(action.get("action_type", "unknown"))
             rewards.append(score)
-            print(f"  [REWARD] task={task_id} | action={action_name} | score={score:.3f}")
-        except json.JSONDecodeError:
-            rewards.append(0.001)
         except Exception as e:
             print(f"  [REWARD] Error: {e}")
             rewards.append(0.001)
@@ -186,210 +230,113 @@ def reward_fn(prompts, completions, **kwargs):
     return rewards
-# ─────────────────────────────────────────────
-#  BUILD TRAINING DATASET
-# ─────────────────────────────────────────────
def build_dataset():
    """Build the GRPO training dataset from the scenario definitions.

    Scenario lists are read from the ``dataset/*_scenarios.json`` files
    (paths are relative to the current working directory). If none of them
    yields a scenario, the task list is fetched from ``{ENV_URL}/tasks``
    instead. The process exits with status 1 when that request fails or
    returns no tasks, so training never starts on an empty dataset.

    Returns:
        A ``datasets.Dataset`` with one row per scenario, holding the
        rendered ``prompt`` and the scenario's ``task_id``.
    """
    scenarios = []
    for fname in (
        "dataset/easy_scenarios.json",
        "dataset/medium_scenarios.json",
        "dataset/hard_scenarios.json",
    ):
        try:
            # Explicit encoding: do not depend on the platform locale.
            with open(fname, encoding="utf-8") as f:
                data = json.load(f)
        except FileNotFoundError:
            print(f"{fname} not found, skipping")
            continue
        scenarios.extend(data)
        print(f"  Loaded {len(data)} scenarios from {fname}")

    if not scenarios:
        print("No local scenarios found. Fetching from live environment...")
        try:
            resp = requests.get(f"{ENV_URL}/tasks", timeout=15)
            # Surface HTTP errors instead of parsing an error body as a task list.
            resp.raise_for_status()
            tasks = resp.json().get("tasks", [])
            scenarios = [{"id": t["id"], "description": t["description"]} for t in tasks]
            print(f"  Fetched {len(scenarios)} tasks from HF Space")
        except Exception as e:
            print(f"Could not fetch tasks: {e}")
            sys.exit(1)
        if not scenarios:
            print("Could not fetch tasks: environment returned no tasks")
            sys.exit(1)

    examples = []
    for s in scenarios:
        prompt = f"""{SYSTEM_PROMPT}
Current Database State:
- Scenario: {s.get('id', 'unknown')}
- Description: {s.get('description', '')}
- Tables: {json.dumps(s.get('tables', []))}
- Slow Queries: {json.dumps(s.get('slow_queries', []))}
- Performance Score: {s.get('performance_score_baseline', 0)} / 100
- Target Score: {s.get('target_score', 85)}
What is your next action?"""
        examples.append({
            "prompt":  prompt,
            "task_id": s.get("id", "easy_s001"),
        })

    # Difficulty is encoded in the task id prefix; anything else counts as easy.
    diff_counts = {"easy": 0, "medium": 0, "hard": 0}
    for ex in examples:
        tid = str(ex["task_id"])
        if tid.startswith("medium_"):
            diff_counts["medium"] += 1
        elif tid.startswith("hard_"):
            diff_counts["hard"] += 1
        else:
            diff_counts["easy"] += 1
    print(f"  Built {len(examples)} training examples total")
    print(f"  Difficulty mix: easy={diff_counts['easy']} medium={diff_counts['medium']} hard={diff_counts['hard']}")

    from datasets import Dataset
    return Dataset.from_list(examples)
-# ─────────────────────────────────────────────
-#  INFERENCE TEST — run immediately after save
-# ─────────────────────────────────────────────
def test_inference(model, tokenizer):
    """Run one generation with the just-trained model as a smoke test.

    The model object passed in is switched to inference mode and asked for an
    action on a fixed sample scenario. Note that this exercises the in-memory
    model; nothing is reloaded from the save directory here.

    Args:
        model: The trained model returned by the training run.
        tokenizer: The tokenizer that belongs to ``model``.

    Returns:
        ``True`` when generation and decoding complete without raising,
        ``False`` when any exception occurs. An output that does not parse
        as an action is reported with a warning but does not change the
        return value.
    """
    print("\n[INFERENCE TEST] Testing saved model...")
    try:
        FastLanguageModel.for_inference(model)
        test_prompt = f"""{SYSTEM_PROMPT}
Current Database State:
- Scenario: easy_s001
- Description: User lookup query taking 2s on 10K users table
- Tables: [{{"name": "users", "rows": 10000, "indexes": ["PRIMARY"]}}]
- Slow Queries: [{{"id": "q1", "sql": "SELECT * FROM users WHERE email=?", "avg_ms": 2000}}]
- Performance Score: 8.0 / 100
- Target Score: 80.0
What is your next action?"""
        inputs = tokenizer(
            test_prompt,
            return_tensors="pt",
            truncation=True,
            max_length=1024
        ).to(model.device)
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens  = 100,
                temperature     = 0.3,
                do_sample       = True,
                pad_token_id    = tokenizer.eos_token_id,
            )
        # Decode only the newly generated tokens, not the echoed prompt.
        response = tokenizer.decode(
            outputs[0][inputs["input_ids"].shape[1]:],
            skip_special_tokens=True
        ).strip()
        print(f"[INFERENCE TEST] Model output:\n  {response}")
        # Validate output
        action = parse_action(response)
        print(f"[INFERENCE TEST] Parsed action: {action}")
        if action is None:
            # Generation worked, but the text is not a usable action; make
            # that visible instead of hiding it behind the PASSED line.
            print("[INFERENCE TEST] WARNING — output did not parse as an action")
        print("[INFERENCE TEST] PASSED — model saved correctly!")
        return True
    except Exception as e:
        print(f"[INFERENCE TEST] FAILED: {e}")
        print("[INFERENCE TEST] Check model save path. Do NOT proceed without fixing this.")
        return False
-# ─────────────────────────────────────────────
-#  MAIN TRAINING
-# ─────────────────────────────────────────────
 def train():
-    if not UNSLOTH_AVAILABLE:
-        print(" Cannot train — Unsloth not installed or no GPU found")
-        print("Run: pip install unsloth trl transformers datasets accelerate")
-        return
-    print(f"\n Loading model: {MODEL_NAME}")
-    print(f" Environment:   {ENV_URL}\n")
-    # Verify environment is reachable
-    try:
-        r = requests.get(f"{ENV_URL}/health", timeout=10)
-        version = r.json().get("version", "?")
-        print(f" Environment reachable — version {version}")
-    except Exception as e:
-        print(f" Cannot reach environment at {ENV_URL}: {e}")
-        print("Check ENV_URL and make sure HF Space is running.")
         sys.exit(1)
-    # ── Load model ───────────────────────────────────────────
     model, tokenizer = FastLanguageModel.from_pretrained(
         model_name     = MODEL_NAME,
         max_seq_length = 2048,
-        load_in_4bit   = True,   # QLoRA — required for T4
-        dtype          = None,   # Auto detect
         token          = HF_TOKEN or None,
     )
-    print(" Model loaded")
-    # ── Apply LoRA adapters ──────────────────────────────────
     model = FastLanguageModel.get_peft_model(
         model,
-        r              = 16,
-        lora_alpha     = 16,
-        target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
-                          "gate_proj", "up_proj", "down_proj"],
-        lora_dropout   = 0,
-        bias           = "none",
-        use_gradient_checkpointing = "unsloth",
-        random_state   = 42,
     )
-    print(" LoRA adapters applied")
-    # ── Build dataset ────────────────────────────────────────
-    print("\n[DATASET] Building training dataset...")
     dataset = build_dataset()
-    print(f" Dataset ready: {len(dataset)} examples")
-    # ── Reward wrapper ───────────────────────────────────────
-    def reward_wrapper(prompts, completions, **kwargs):
-        batch = kwargs.get("batch", [])
-        if batch and hasattr(batch[0], "get"):
-            task_ids = [b.get("task_id", "easy_s001") for b in batch]
-        elif "task_id" in kwargs and kwargs["task_id"]:
-            task_ids = kwargs["task_id"]
-        else:
-            task_ids = ["easy_s001"] * len(prompts)
-        return reward_fn(prompts, completions, task_ids=task_ids)
-    # ── GRPO config ──────────────────────────────────────────
-    # NOTE: batch_size=1, num_generations=2 for free T4
-    # At venue A100: increase to batch_size=2, num_generations=4
     config = GRPOConfig(
         output_dir                  = OUTPUT_DIR,
         max_steps                   = MAX_STEPS,
-        per_device_train_batch_size = 1,   # 1 for T4, 2 for A100
-        gradient_accumulation_steps = 8,
         learning_rate               = 5e-6,
-        max_completion_length       = 256,
-        num_generations             = 2,   # 2 for T4, 4 for A100
-        temperature                 = 0.8,
-        logging_steps               = 5,
-        save_steps                  = 50,
-        save_total_limit            = 2,
         warmup_ratio                = 0.1,
         report_to                   = "none",
         remove_unused_columns       = False,
@@ -398,56 +345,28 @@ def train():
     trainer = GRPOTrainer(
         model         = model,
         tokenizer     = tokenizer,
-        reward_funcs  = reward_wrapper,
         args          = config,
         train_dataset = dataset,
     )
-    # ── Train ────────────────────────────────────────────────
-    print(f"\n🏋️  Starting GRPO training — {MAX_STEPS} steps...")
-    print("Watch the 'reward' column — it should increase over time.\n")
     trainer.train()
-    print("\n Training complete!")
-    # ── Save — ADAPTER ONLY (correct way for QLoRA) ──────────
-    # DO NOT call merge_and_unload() on 4-bit model
-    # DO NOT upcast to 16-bit and merge naively
-    # CORRECT: save adapter weights only, load with from_pretrained later
-    print(f"\n[SAVE] Saving adapter to {OUTPUT_DIR}/final ...")
     Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
     model.save_pretrained(f"{OUTPUT_DIR}/final")
     tokenizer.save_pretrained(f"{OUTPUT_DIR}/final")
-    # Save config for reference
-    with open(f"{OUTPUT_DIR}/final/training_config.json", "w") as f:
-        json.dump({
-            "model_name":   MODEL_NAME,
-            "max_steps":    MAX_STEPS,
-            "save_method":  "adapter_only_qlora",
-            "lora_r":       16,
-            "lora_alpha":   16,
-        }, f, indent=2)
-    print(f" Adapter saved to {OUTPUT_DIR}/final")
-    # ── IMMEDIATE inference test (required) ──────────────────
-    passed = test_inference(model, tokenizer)
-    # ── Summary ──────────────────────────────────────────────
-    print("\n" + "="*60)
-    print("TRAINING COMPLETE")
-    print("="*60)
-    print(f"  Model:         {MODEL_NAME}")
-    print(f"  Steps:         {MAX_STEPS}")
-    print(f"  Saved to:      {OUTPUT_DIR}/final")
-    print(f"  Save method:   Adapter only (QLoRA safe)")
-    print(f"  Inference test: {' PASSED' if passed else ' FAILED'}")
-    print("="*60)
-    print("\nNext steps:")
-    print("  1. python training/evaluate_agent.py")
-    print("  2. Open reward_curve.png — show to judges")
-    print("  3. git add reward_curve.png && git commit && git push")
-    print("="*60)
 if __name__ == "__main__":
-    train()

 """
 training/train_agent.py — SQL Database Engineer Agent
+FIXED: Uses local DatabaseSimulator for rewards (no HF Space calls)
+- No shared singleton state
+- Real delta rewards (0.0 for wrong actions, 40-75pts for correct)
+- Clear reward difference teaches model to prefer create_index over inspect_query
 """
+import os, json, sys, time
 from pathlib import Path
+# ── GPU check ─────────────────────────────────────────────────
+UNSLOTH_AVAILABLE = False
 try:
+    import torch
+    if not torch.cuda.is_available():
+        print("❌ No GPU. Unsloth requires CUDA GPU.")
+        sys.exit(1)
     from unsloth import FastLanguageModel
     from trl import GRPOTrainer, GRPOConfig
+    from datasets import Dataset
     UNSLOTH_AVAILABLE = True
+    print(f"✅ GPU: {torch.cuda.get_device_name(0)}")
+    print(f"✅ VRAM: {torch.cuda.get_device_properties(0).total_memory/1e9:.1f}GB")
+except ImportError as e:
+    print(f"❌ {e}\nRun: pip install unsloth trl transformers datasets accelerate")
+    sys.exit(1)
+# Add project root so we can import DatabaseSimulator
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+from env.db_simulator import DatabaseSimulator
+# ── Config ────────────────────────────────────────────────────
 ENV_URL    = os.getenv("ENV_URL",    "https://junaid0600-sql-db-engineer-agent.hf.space")
 HF_TOKEN   = os.getenv("HF_TOKEN",  "")
+MODEL_NAME = os.getenv("MODEL_NAME", "unsloth/Qwen2.5-1.5B-Instruct")
 OUTPUT_DIR = os.getenv("OUTPUT_DIR", "./sdea-trained")
+MAX_STEPS  = int(os.getenv("MAX_STEPS", "100"))
+print(f"\n[CONFIG] Model:     {MODEL_NAME}")
+print(f"[CONFIG] Max steps: {MAX_STEPS}")
+print(f"[CONFIG] Output:    {OUTPUT_DIR}\n")
+# ── System prompt ─────────────────────────────────────────────
+SYSTEM_PROMPT = """You are a senior database engineer fixing slow database queries.
+You will see slow queries and table structures. Choose the BEST action.
+Key insight: create_index on the RIGHT columns fixes slow queries.
+Wrong columns = no improvement. Right columns = massive improvement.
+Respond with ONLY valid JSON:
+{"action_type": "create_index", "payload": {"table": "TABLE_NAME", "columns": ["COL1", "COL2"]}}
+Available actions: inspect_query, analyze_indexes, create_index, rewrite_query, analyze_statistics, submit_report"""
+# ── Load all 15 scenarios ─────────────────────────────────────
def load_all_scenarios(base_dir=None) -> list:
    """Load the easy, medium and hard scenario lists from ``<base_dir>/dataset``.

    Args:
        base_dir: Directory that contains the ``dataset`` folder. Defaults to
            the parent of the directory holding this file, which is what the
            original hard-coded lookup used.

    Returns:
        All scenarios from the files that exist, in easy, medium, hard order.
        A missing file is reported and skipped; the result may be empty.
    """
    if base_dir is None:
        base_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

    scenarios = []
    for fname in ("easy_scenarios.json", "medium_scenarios.json", "hard_scenarios.json"):
        path = os.path.join(base_dir, "dataset", fname)
        try:
            # Explicit encoding: do not depend on the platform locale.
            with open(path, encoding="utf-8") as f:
                data = json.load(f)
        except FileNotFoundError:
            print(f"  ⚠️ {fname} not found")
            continue
        scenarios.extend(data)
        print(f"  ✅ Loaded {len(data)} from {fname}")
    print(f"  Total: {len(scenarios)} scenarios\n")
    return scenarios
+ALL_SCENARIOS = load_all_scenarios()
+# ── Parse LLM output ─────────────────────────────────────────
def parse_action(text: str) -> "dict | None":
    """Parse LLM output into an action dict.

    The text is stripped and, if it contains a Markdown code fence, only the
    content between the first pair of fences is kept (a leading ``json``
    language tag is dropped). The result must be a JSON object holding both
    ``action_type`` and ``payload``.

    Args:
        text: Raw completion text produced by the model.

    Returns:
        The decoded dict, or ``None`` when the input is not a string, is not
        valid JSON, or is not an object with both required keys. Callers
        treat ``None`` as an invalid answer and penalize it.
    """
    if not isinstance(text, str):
        return None

    candidate = text.strip()
    if "```" in candidate:
        # Take the text between the opening and closing fence. Splitting on
        # "```json" and then on "```" again would select the (empty) text
        # after the closing fence and reject every json-tagged answer.
        candidate = candidate.split("```")[1]
        if candidate[:4].lower() == "json":
            candidate = candidate[4:]
        candidate = candidate.strip()

    try:
        data = json.loads(candidate)
    except (ValueError, RecursionError):
        return None

    # Require a real object: a JSON string containing the key names would
    # otherwise satisfy the ``in`` checks.
    if isinstance(data, dict) and "action_type" in data and "payload" in data:
        return data
    return None
+# ── LOCAL reward function using DatabaseSimulator ─────────────
def compute_local_reward(action: dict, scenario: dict) -> tuple:
    """Score one action against a fresh ``DatabaseSimulator`` for *scenario*.

    A new simulator is created on every call, so no state carries over
    between completions and no HTTP request is made.

    Args:
        action: Parsed action with ``action_type`` and ``payload`` keys.
        scenario: Scenario definition handed to ``DatabaseSimulator``.

    Returns:
        ``(reward, improvement, milestone_bonus)``:
        ``reward`` is the total clamped to ``[0.001, 0.999]``;
        ``improvement`` is the non-negative change in the simulator's
        performance score caused by the action;
        ``milestone_bonus`` is the bonus component (0.0, 0.15, 0.25 or 0.40).
    """
    sim = DatabaseSimulator(scenario)
    baseline = sim.get_performance_score()

    action_type = action.get("action_type", "")
    payload = action.get("payload", {})

    # Only these actions are forwarded to the simulator. Investigation
    # actions, submit_report and unknown actions leave it untouched, so their
    # improvement is 0 by construction.
    applied_actions = (
        "create_index",
        "rewrite_query",
        "analyze_statistics",
        "partition_table",
    )
    delta = 0.0
    if action_type in applied_actions:
        delta = sim.apply_action(action_type, payload).get("delta", 0.0)

    improvement = max(0.0, sim.get_performance_score() - baseline)
    max_possible = max(1.0, 100.0 - baseline)  # floor avoids division by zero
    progress = improvement / max_possible

    # 1. Step reward: flat amount per action type; unknown actions get 0.001.
    step_rewards = {
        "inspect_query":      0.10,
        "analyze_indexes":    0.10,
        "create_index":       0.15,
        "rewrite_query":      0.20,
        "analyze_statistics": 0.08,
        "partition_table":    0.15,
        "submit_report":      0.05,
    }
    step_r = step_rewards.get(action_type, 0.001)

    # 2. Delta reward: proportional to the share of possible gain achieved.
    delta_r = min(0.70, progress * 0.70)

    # 3. Milestone bonus: highest threshold reached wins.
    if progress >= 0.75:
        milestone_r = 0.40
    elif progress >= 0.50:
        milestone_r = 0.25
    elif progress >= 0.25:
        milestone_r = 0.15
    else:
        milestone_r = 0.0

    # 4. Penalty for an index that did not help (cancels the step reward).
    wrong_index_pen = 0.0
    if action_type == "create_index" and delta <= 0.0:
        wrong_index_pen = -0.15

    total = step_r + delta_r + milestone_r + wrong_index_pen
    total = max(0.001, min(0.999, total))
    return total, improvement, milestone_r
+# ── GRPO reward function ──────────────────────────────────────
 def reward_fn(prompts, completions, **kwargs):
     """
+    LOCAL reward — no HTTP calls, no shared state.
+    Each completion gets its own fresh DatabaseSimulator.
+    Reward differences:
+      inspect_query (always):     0.10 + 0.0  = 0.10
+      create_index (wrong col):   0.15 - 0.15 = 0.001
+      create_index (right col):   0.15 + 0.60 = 0.75+
+    GRPO will learn: right create_index >> inspect_query >> wrong create_index
     """
     rewards = []
     for i, (prompt, completion) in enumerate(zip(prompts, completions)):
         try:
+            # Get text
             if isinstance(completion, list):
                 text = completion[0].get("content", "") if completion else ""
             else:
                 text = str(completion)
+            # Pick scenario (rotate through all)
+            scenario = ALL_SCENARIOS[i % len(ALL_SCENARIOS)]
+            # Parse action
             action = parse_action(text)
             if action is None:
+                # Invalid JSON output — penalize
                 rewards.append(0.001)
+                print(f"  [REWARD] scenario={scenario['id']} | "
+                      f"INVALID JSON | score=0.001")
                 continue
+            # Compute reward locally
+            score, improvement, milestone = compute_local_reward(action, scenario)
             rewards.append(score)
+            print(f"  [REWARD] scenario={scenario['id']} | "
+                  f"action={action.get('action_type')} | "
+                  f"improvement=+{improvement:.1f}pts | "
+                  f"milestone=+{milestone:.2f} | "
+                  f"score={score:.3f}")
         except Exception as e:
             print(f"  [REWARD] Error: {e}")
             rewards.append(0.001)
     return rewards
+# ── Build dataset ─────────────────────────────────────────────
def build_dataset() -> Dataset:
    """Render one training row per entry of ``ALL_SCENARIOS``.

    Each row holds the full ``prompt`` (system prompt followed by the
    scenario's tables, slow queries, index hints and score targets) and the
    scenario's id under ``scenario_id``.
    """

    def _to_row(s):
        # Serialize the structured fields first, then assemble the prompt.
        tables_str = json.dumps(s.get("tables", []))
        queries_str = json.dumps(s.get("slow_queries", []))
        hints_str = json.dumps(s.get("missing_index_hints", []))
        pieces = [
            f"{SYSTEM_PROMPT}\n\n",
            f"=== DATABASE STATE ===\n",
            f"Scenario: {s['id']}\n",
            f"Description: {s.get('description','')}\n",
            f"Tables: {tables_str}\n",
            f"Slow Queries: {queries_str}\n",
            f"Missing Index Hints: {hints_str}\n",
            f"Performance: {s.get('performance_score_baseline',0)}/100 ",
            f"→ Target: {s.get('target_score',85)}/100\n\n",
            f"What action should you take? Output JSON only:",
        ]
        return {
            "prompt":      "".join(pieces),
            "scenario_id": s["id"],
        }

    examples = [_to_row(s) for s in ALL_SCENARIOS]
    print(f"  ✅ Dataset: {len(examples)} examples")
    return Dataset.from_list(examples)
+# ── Generate plots ────────────────────────────────────────────
def generate_plots(trainer):
    """Plot the training-loss curve from the trainer's log history.

    Reads the entries of ``trainer.state.log_history`` that contain a
    ``loss`` key and writes the curve to ``loss_curve.png`` in the current
    working directory. Prints a notice and returns without writing a file
    when there are no such entries.

    Args:
        trainer: A trainer exposing ``state.log_history`` as a list of dicts.
    """
    import matplotlib
    matplotlib.use("Agg")  # select the backend before pyplot is imported
    import matplotlib.pyplot as plt

    logs = [entry for entry in trainer.state.log_history if "loss" in entry]
    if not logs:
        print("⚠️ No logs for plotting")
        return

    # Fall back to the list position when an entry carries no step number.
    steps = [entry.get("step", i) for i, entry in enumerate(logs)]
    losses = [entry.get("loss", 0) for entry in logs]

    fig, ax = plt.subplots(1, 1, figsize=(8, 4))
    try:
        fig.suptitle("GRPO Training — SQL Database Engineer Agent",
                     fontsize=13, fontweight="bold")
        ax.plot(steps, losses, "b-o", lw=2, ms=4)
        ax.set_xlabel("Training Step")
        ax.set_ylabel("Loss")
        ax.set_title("Training Loss (↓ = model learning DBA pattern)")
        ax.grid(True, alpha=0.3)
        # ``logs`` is non-empty here, so first and last values always exist.
        ax.annotate(f"Start: {losses[0]:.4f}",
                    xy=(steps[0], losses[0]),
                    xytext=(steps[0]+1, losses[0]*1.1),
                    fontsize=9, color="red")
        ax.annotate(f"End: {losses[-1]:.4f}",
                    xy=(steps[-1], losses[-1]),
                    xytext=(steps[-1]-5, losses[-1]*1.1),
                    fontsize=9, color="green")
        plt.tight_layout()
        plt.savefig("loss_curve.png", dpi=150, bbox_inches="tight")
    finally:
        # Release the figure so it does not stay registered with pyplot.
        plt.close(fig)
    print("✅ loss_curve.png saved")
    print(f"   Loss: {losses[0]:.4f} → {losses[-1]:.4f}")
+# ── Main ──────────────────────────────────────────────────────
 def train():
+    if not ALL_SCENARIOS:
+        print("❌ No scenarios found. Check dataset/ folder.")
         sys.exit(1)
+    print(f"⏳ Loading {MODEL_NAME}...")
     model, tokenizer = FastLanguageModel.from_pretrained(
         model_name     = MODEL_NAME,
         max_seq_length = 2048,
+        load_in_4bit   = True,
         token          = HF_TOKEN or None,
     )
     model = FastLanguageModel.get_peft_model(
         model,
+        r=16, lora_alpha=16,
+        target_modules=["q_proj","k_proj","v_proj","o_proj",
+                        "gate_proj","up_proj","down_proj"],
+        lora_dropout=0, bias="none",
+        use_gradient_checkpointing="unsloth",
+        random_state=42,
     )
+    print("✅ Model ready\n")
     dataset = build_dataset()
     config = GRPOConfig(
         output_dir                  = OUTPUT_DIR,
         max_steps                   = MAX_STEPS,
+        per_device_train_batch_size = 1,
+        gradient_accumulation_steps = 4,
         learning_rate               = 5e-6,
+        max_completion_length       = 150,
+        num_generations             = 4,   # compare 4 actions per step
+        temperature                 = 1.0,
+        logging_steps               = 1,
+        save_steps                  = 25,
+        save_total_limit            = 3,
         warmup_ratio                = 0.1,
         report_to                   = "none",
         remove_unused_columns       = False,
     trainer = GRPOTrainer(
         model         = model,
         tokenizer     = tokenizer,
+        reward_funcs  = reward_fn,
         args          = config,
         train_dataset = dataset,
     )
+    print(f"🏋️  GRPO training — {MAX_STEPS} steps")
+    print("Watch for: improvement > 0 and score > 0.5 on create_index\n")
     trainer.train()
+    print("\n✅ Training complete!")
     Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
     model.save_pretrained(f"{OUTPUT_DIR}/final")
     tokenizer.save_pretrained(f"{OUTPUT_DIR}/final")
+    print(f"✅ Saved to {OUTPUT_DIR}/final")
+    generate_plots(trainer)
+    print("\n" + "="*50)
+    print("NEXT: python training/evaluate_agent.py")
+    print("THEN: git add loss_curve.png reward_curve.png")
+    print("="*50)
 if __name__ == "__main__":
+    train()