Spaces:

junaid0600
/

sql-db-engineer-agent

Sleeping

App Files Files Community

junaid0600 commited on Apr 25

Commit

44e9354

verified ·

1 Parent(s): 86abfc1

Update training/train_agent.py

Browse files

Files changed (1) hide show

training/train_agent.py +330 -225

training/train_agent.py CHANGED Viewed

@@ -1,98 +1,116 @@
 """
 training/train_agent.py — SQL Database Engineer Agent
-Fixed version: reward_fn runs FULL EPISODES via /reset + /step
-This gives real delta rewards, milestones, and meaningful learning signal.
-FREE T4 (Colab/Kaggle): MODEL_NAME=unsloth/Qwen2.5-1.5B-Instruct
-VENUE A100:             MODEL_NAME=unsloth/Qwen2.5-7B-Instruct
 """
-import os, json, requests, sys, time
 from pathlib import Path
-# ── GPU check + imports ───────────────────────────────────────
-UNSLOTH_AVAILABLE = False
 try:
-    import torch
-    if not torch.cuda.is_available():
-        print("❌ No GPU found. Unsloth requires GPU.")
-        sys.exit(1)
     from unsloth import FastLanguageModel
     from trl import GRPOTrainer, GRPOConfig
-    from datasets import Dataset
     UNSLOTH_AVAILABLE = True
-    print(f"✅ GPU: {torch.cuda.get_device_name(0)}")
-    print(f"✅ VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
-except ImportError as e:
-    print(f"❌ Import error: {e}")
-    print("Run: pip install unsloth trl transformers datasets accelerate")
-    sys.exit(1)
-# ── Config ────────────────────────────────────────────────────
 ENV_URL    = os.getenv("ENV_URL",    "https://junaid0600-sql-db-engineer-agent.hf.space")
 HF_TOKEN   = os.getenv("HF_TOKEN",  "")
-MODEL_NAME = os.getenv("MODEL_NAME", "unsloth/Qwen2.5-1.5B-Instruct")
 OUTPUT_DIR = os.getenv("OUTPUT_DIR", "./sdea-trained")
-MAX_STEPS  = int(os.getenv("MAX_STEPS", "100"))
-print(f"\n[CONFIG] Model:     {MODEL_NAME}")
-print(f"[CONFIG] ENV URL:   {ENV_URL}")
-print(f"[CONFIG] Max steps: {MAX_STEPS}")
-print(f"[CONFIG] Output:    {OUTPUT_DIR}\n")
-# ── System prompt ─────────────────────────────────────────────
 SYSTEM_PROMPT = """You are a senior database engineer.
-Given the current database state, choose the BEST next action.
-Rules:
-1. First action MUST be inspect_query to see what's slow
-2. Then analyze_indexes to see what's missing
-3. Then create_index with correct table and columns
-4. Then analyze_statistics to update planner
-5. Finally submit_report when performance target is reached
-Respond with ONLY valid JSON — no markdown, no explanation:
-{"action_type": "create_index", "payload": {"table": "users", "columns": ["email"]}}"""
-# ── All 15 Round 2 scenario IDs ───────────────────────────────
-ALL_SCENARIOS = [
-    "easy_s001", "easy_s002", "easy_s003", "easy_s004", "easy_s005",
-    "medium_s001", "medium_s002", "medium_s003", "medium_s004", "medium_s005",
-    "hard_s001", "hard_s002", "hard_s003", "hard_s004", "hard_s005",
-]
-# ── Parse LLM output → action dict ───────────────────────────
-def parse_action(text: str) -> dict:
     try:
         text = text.strip()
-        # Strip markdown
-        for marker in ["```json", "```"]:
-            if marker in text:
-                parts = text.split(marker)
-                text = parts[1] if len(parts) > 1 else parts[0]
         text = text.strip()
         data = json.loads(text)
-        if "action_type" in data and "payload" in data:
             return data
     except Exception:
-        pass
-    # Safe fallback
-    return {"action_type": "inspect_query", "payload": {"query_id": "q1"}}
-# ── REWARD FUNCTION — runs FULL EPISODE ───────────────────────
 def reward_fn(prompts, completions, **kwargs):
     """
-    KEY FIX: Runs a full episode per completion.
-    1. /reset with scenario
-    2. Parse LLM output as action
-    3. /step with that action
-    4. Get REAL reward including delta + milestones
-    5. /step with submit_report to get terminal score
-    Returns real rewards — not constant 0.5
     """
     rewards = []
-    batch = kwargs.get("batch", [])
     for i, (prompt, completion) in enumerate(zip(prompts, completions)):
         try:
@@ -102,62 +120,65 @@ def reward_fn(prompts, completions, **kwargs):
             else:
                 text = str(completion)
-            # Pick scenario — rotate through all 15
-            scenario_id = ALL_SCENARIOS[i % len(ALL_SCENARIOS)]
-            # Parse LLM output
             action = parse_action(text)
-            # Step 1: Reset environment for this scenario
-            r = requests.post(f"{ENV_URL}/reset",
-                json={"task_id": scenario_id}, timeout=15)
-            if r.status_code != 200:
-                rewards.append(0.001)
-                continue
-            obs      = r.json()
-            baseline = obs.get("current_context", {}).get("performance_score", 0)
-            # Step 2: Submit the LLM's action
-            r2 = requests.post(f"{ENV_URL}/step",
-                json=action, timeout=15)
-            if r2.status_code != 200:
                 rewards.append(0.001)
                 continue
-            data       = r2.json()
-            step_score = data.get("reward", {}).get("score", 0.001)
-            db_delta   = data.get("info", {}).get("db_delta", 0)
-            perf_score = data.get("info", {}).get("performance_score", baseline)
-            milestones = data.get("info", {}).get("milestones", [])
-            done       = data.get("done", False)
-            # Step 3: If not done, submit report to get terminal score
-            if not done:
-                r3 = requests.post(f"{ENV_URL}/step",
-                    json={"action_type": "submit_report",
-                          "payload": {"summary": "Training episode complete."}},
-                    timeout=15)
-                if r3.status_code == 200:
-                    final_data = r3.json()
-                    terminal   = final_data.get("reward", {}).get("score", step_score)
-                    # Combine step reward + terminal
-                    final_score = (step_score * 0.4) + (terminal * 0.6)
-                else:
-                    final_score = step_score
             else:
-                final_score = step_score
-            # Clamp
-            final_score = max(0.001, min(0.999, final_score))
-            rewards.append(final_score)
-            print(f"  [REWARD] scenario={scenario_id} | "
-                  f"action={action.get('action_type')} | "
-                  f"db_delta=+{db_delta:.1f} | "
-                  f"milestones={milestones} | "
-                  f"score={final_score:.3f}")
         except Exception as e:
             print(f"  [REWARD] Error: {e}")
             rewards.append(0.001)
@@ -165,149 +186,210 @@ def reward_fn(prompts, completions, **kwargs):
     return rewards
-# ── Build dataset with all 15 scenarios ───────────────────────
 def build_dataset():
     scenarios = []
-    for fname in ["dataset/easy_scenarios.json",
-                  "dataset/medium_scenarios.json",
-                  "dataset/hard_scenarios.json"]:
         try:
             with open(fname) as f:
                 data = json.load(f)
                 scenarios.extend(data)
-                print(f"  Loaded {len(data)} from {fname}")
         except FileNotFoundError:
-            print(f"  ⚠️ {fname} not found")
     if not scenarios:
-        print("  Fetching from live environment...")
-        resp = requests.get(f"{ENV_URL}/tasks", timeout=15)
-        tasks = resp.json().get("tasks", [])
-        scenarios = [t for t in tasks if t["id"].startswith(("easy_s","medium_s","hard_s"))]
     examples = []
-    for i, s in enumerate(scenarios):
-        # Build rich prompt with full DB state
-        tables_str = json.dumps(s.get("tables", []))
-        queries_str = json.dumps(s.get("slow_queries", []))
-        prompt = (
-            f"{SYSTEM_PROMPT}\n\n"
-            f"Scenario: {s.get('id')}\n"
-            f"Description: {s.get('description','')}\n"
-            f"Tables: {tables_str}\n"
-            f"Slow Queries: {queries_str}\n"
-            f"Performance: {s.get('performance_score_baseline',0)}/100 "
-            f"(target: {s.get('target_score',85)})\n\n"
-            f"What is your FIRST action?"
-        )
         examples.append({
-            "prompt":      prompt,
-            "task_id":     s.get("id", ALL_SCENARIOS[i % len(ALL_SCENARIOS)]),
-            "scenario_id": s.get("id", ALL_SCENARIOS[i % len(ALL_SCENARIOS)]),
         })
-    print(f"  ✅ Dataset: {len(examples)} examples")
     return Dataset.from_list(examples)
-# ── Generate loss + reward curve from training logs ───────────
-def generate_training_plots(trainer):
-    import matplotlib
-    matplotlib.use("Agg")
-    import matplotlib.pyplot as plt
-    logs    = [l for l in trainer.state.log_history if "loss" in l]
-    if not logs:
-        print("⚠️ No training logs found for plotting")
-        return
-    steps   = [l.get("step", i)   for i, l in enumerate(logs)]
-    losses  = [l.get("loss", 0)   for l in logs]
-    rewards = [l.get("reward", l.get("train_loss", 0)) for l in logs]
-    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
-    fig.suptitle("GRPO Training — SQL Database Engineer Agent",
-                 fontsize=13, fontweight="bold")
-    ax1.plot(steps, losses, "b-o", lw=2, ms=4, label="Loss")
-    ax1.set_xlabel("Training Step")
-    ax1.set_ylabel("Loss")
-    ax1.set_title("Training Loss (↓ = learning)")
-    ax1.grid(True, alpha=0.3)
-    ax1.legend()
-    ax2.plot(steps, rewards, "g-o", lw=2, ms=4, label="Reward")
-    ax2.set_xlabel("Training Step")
-    ax2.set_ylabel("Reward")
-    ax2.set_title("Reward During Training (↑ = improving)")
-    ax2.grid(True, alpha=0.3)
-    ax2.legend()
-    plt.tight_layout()
-    plt.savefig("loss_curve.png", dpi=150, bbox_inches="tight")
-    print("✅ loss_curve.png saved")
-    # Print summary
-    if losses:
-        print(f"  Loss:   {losses[0]:.4f} → {losses[-1]:.4f}")
-    if rewards:
-        valid_r = [r for r in rewards if r > 0]
-        if valid_r:
-            print(f"  Reward: {valid_r[0]:.4f} → {valid_r[-1]:.4f}")
-# ── Main training ─────────────────────────────────────────────
 def train():
-    # Verify environment
     try:
         r = requests.get(f"{ENV_URL}/health", timeout=10)
-        print(f"✅ Environment live: v{r.json().get('version','?')}\n")
     except Exception as e:
-        print(f"❌ Cannot reach {ENV_URL}: {e}")
         sys.exit(1)
-    # Load model
-    print(f"⏳ Loading {MODEL_NAME}...")
     model, tokenizer = FastLanguageModel.from_pretrained(
         model_name     = MODEL_NAME,
         max_seq_length = 2048,
-        load_in_4bit   = True,
         token          = HF_TOKEN or None,
     )
     model = FastLanguageModel.get_peft_model(
         model,
         r              = 16,
         lora_alpha     = 16,
-        target_modules = ["q_proj","k_proj","v_proj","o_proj",
-                          "gate_proj","up_proj","down_proj"],
         lora_dropout   = 0,
         bias           = "none",
         use_gradient_checkpointing = "unsloth",
         random_state   = 42,
     )
-    print("✅ Model + LoRA ready\n")
-    # Dataset
-    print("⏳ Building dataset...")
     dataset = build_dataset()
-    # Reward wrapper
     def reward_wrapper(prompts, completions, **kwargs):
-        return reward_fn(prompts, completions, **kwargs)
-    # GRPO config
     config = GRPOConfig(
         output_dir                  = OUTPUT_DIR,
         max_steps                   = MAX_STEPS,
-        per_device_train_batch_size = 1,
-        gradient_accumulation_steps = 4,
         learning_rate               = 5e-6,
-        max_completion_length       = 200,
-        num_generations             = 2,
-        temperature                 = 0.9,
-        logging_steps               = 1,
-        save_steps                  = 25,
-        save_total_limit            = 3,
         warmup_ratio                = 0.1,
         report_to                   = "none",
         remove_unused_columns       = False,
@@ -321,27 +403,50 @@ def train():
         train_dataset = dataset,
     )
-    print(f"🏋️  Starting GRPO — {MAX_STEPS} steps")
-    print("Watch for: db_delta > 0 and milestones in logs\n")
     trainer.train()
-    print("\n✅ Training complete!")
-    # Save
     Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
     model.save_pretrained(f"{OUTPUT_DIR}/final")
     tokenizer.save_pretrained(f"{OUTPUT_DIR}/final")
-    print(f"✅ Saved to {OUTPUT_DIR}/final")
-    # Generate plots
-    generate_training_plots(trainer)
-    print("\n" + "="*55)
-    print("NEXT STEPS:")
-    print("  python training/evaluate_agent.py")
-    print("  git add loss_curve.png reward_curve.png")
-    print("  git commit -m 'Real training evidence'")
-    print("  git push origin main")
-    print("="*55)
 if __name__ == "__main__":

 """
 training/train_agent.py — SQL Database Engineer Agent
+Unsloth + GRPO training script.
+Run on venue GPU (April 25-26) with compute credits.
+FREE T4 (Colab):  MODEL_NAME=unsloth/Qwen2.5-1.5B-Instruct  (default)
+VENUE A100:       set ENV_VAR MODEL_NAME=unsloth/Qwen2.5-7B-Instruct
 """
+import os
+import json
+import requests
+import sys
+import re
 from pathlib import Path
+# ── Try importing Unsloth (GPU only) ─────────────────────────
 try:
     from unsloth import FastLanguageModel
     from trl import GRPOTrainer, GRPOConfig
+    import torch
     UNSLOTH_AVAILABLE = True
+    print("Unsloth + TRL loaded successfully")
+except ImportError:
+    UNSLOTH_AVAILABLE = False
+    print("Unsloth not available. Run: pip install unsloth trl")
+# ─────────────────────────────────────────────
+#  CONFIG — change MODEL_NAME via env var at venue
+# ─────────────────────────────────────────────
 ENV_URL    = os.getenv("ENV_URL",    "https://junaid0600-sql-db-engineer-agent.hf.space")
 HF_TOKEN   = os.getenv("HF_TOKEN",  "")
+MODEL_NAME = os.getenv("MODEL_NAME", "unsloth/Qwen2.5-1.5B-Instruct")  # 1.5B for free T4
 OUTPUT_DIR = os.getenv("OUTPUT_DIR", "./sdea-trained")
+MAX_STEPS  = int(os.getenv("MAX_STEPS", "100"))  # increase to 300+ at venue
+print(f"[CONFIG] Model:      {MODEL_NAME}")
+print(f"[CONFIG] Output:     {OUTPUT_DIR}")
+print(f"[CONFIG] Max steps:  {MAX_STEPS}")
+print(f"[CONFIG] ENV URL:    {ENV_URL}")
+# ─────────────────────────────────────────────
+#  SYSTEM PROMPT
+# ─────────────────────────────────────────────
 SYSTEM_PROMPT = """You are a senior database engineer.
+Given the current database state with slow queries, choose the BEST action to improve performance.
+Think step by step:
+1. If you have not inspected queries yet -> use inspect_query
+2. If you have not analyzed indexes -> use analyze_indexes
+3. If you know which index is missing -> use create_index
+4. If query can be rewritten better -> use rewrite_query
+5. If table is huge (1M+ rows) -> use partition_table
+6. When performance target is reached -> use submit_report
+Respond with JSON only — no explanation, no markdown:
+{"action_type": "...", "payload": {...}}"""
+# ─────────────────────────────────────────────
+#  REWARD FUNCTION (calls live HF Space)
+# ─────────────────────────────────────────────
+def parse_action(text: str) -> dict | None:
+    """Parse LLM output into action dict. Returns None on failure."""
     try:
         text = text.strip()
+        if "```" in text:
+            text = text.split("```")[1]
+            if text.startswith("json"):
+                text = text[4:]
         text = text.strip()
+        # Try direct JSON first
         data = json.loads(text)
+        if "action_type" in data:
             return data
     except Exception:
+        # Try extracting first JSON object from mixed text output
+        match = re.search(r"\{[\s\S]*\}", text)
+        if match:
+            try:
+                data = json.loads(match.group(0))
+                if "action_type" in data:
+                    return data
+            except Exception:
+                pass
+    return None
+def _extract_task_id_from_prompt(prompt_text: str) -> str | None:
+    """Fallback extractor when GRPO doesn't pass task_id column."""
+    match = re.search(r"-\s*Scenario:\s*([a-z]+_[a-z]?\d+)", prompt_text, flags=re.IGNORECASE)
+    if match:
+        return match.group(1)
+    return None
 def reward_fn(prompts, completions, **kwargs):
     """
+    GRPO reward function — calls /grader on live environment.
+    Returns list of float rewards, one per completion.
+    Score always between 0.001 and 0.999.
     """
     rewards = []
+    task_ids = kwargs.get("task_ids")
+    if not task_ids:
+        # GRPO can pass dataset columns directly as kwargs, not always via batch.
+        task_ids = kwargs.get("task_id")
+    if not task_ids:
+        task_ids = ["easy_s001"] * len(prompts)
+    if isinstance(task_ids, str):
+        task_ids = [task_ids] * len(prompts)
     for i, (prompt, completion) in enumerate(zip(prompts, completions)):
         try:
             else:
                 text = str(completion)
+            # Parse into action
             action = parse_action(text)
+            task_id = task_ids[i] if i < len(task_ids) else "easy_s001"
+            if not task_id:
+                task_id = _extract_task_id_from_prompt(str(prompt)) or "easy_s001"
+            task_id = str(task_id)
+            if action is None:
                 rewards.append(0.001)
+                print(f"  [REWARD] task={task_id} | action=parse_failed | score=0.001")
                 continue
+            # Use environment step reward so dense + milestone logic is used.
+            # This also guarantees the sampled task_id actually drives reward.
+            difficulty = "easy"
+            if str(task_id).startswith("medium_"):
+                difficulty = "medium"
+            elif str(task_id).startswith("hard_"):
+                difficulty = "hard"
+            reset_resp = requests.post(
+                f"{ENV_URL}/reset",
+                json={"difficulty": difficulty, "task_id": task_id},
+                timeout=15,
+                headers={"Content-Type": "application/json"},
+            )
+            if reset_resp.status_code != 200:
+                raise RuntimeError(f"/reset failed for {task_id}: {reset_resp.status_code}")
+            step_resp = requests.post(
+                f"{ENV_URL}/step",
+                json=action,
+                timeout=15,
+                headers={"Content-Type": "application/json"},
+            )
+            if step_resp.status_code == 200:
+                score = step_resp.json().get("reward", {}).get("score", 0.001)
+                score = max(0.001, min(0.999, float(score)))
             else:
+                # Fallback to grader for robustness.
+                grader_resp = requests.post(
+                    f"{ENV_URL}/grader",
+                    json={"task_id": task_id, "action": action},
+                    timeout=15,
+                    headers={"Content-Type": "application/json"},
+                )
+                if grader_resp.status_code == 200:
+                    score = grader_resp.json().get("score", 0.001)
+                    score = max(0.001, min(0.999, float(score)))
+                else:
+                    score = 0.001
+            action_name = str(action.get("action_type", "unknown"))
+            rewards.append(score)
+            print(f"  [REWARD] task={task_id} | action={action_name} | score={score:.3f}")
+        except json.JSONDecodeError:
+            rewards.append(0.001)
         except Exception as e:
             print(f"  [REWARD] Error: {e}")
             rewards.append(0.001)
     return rewards
+# ─────────────────────────────────────────────
+#  BUILD TRAINING DATASET
+# ─────────────────────────────────────────────
 def build_dataset():
+    """Build training examples from all 15 Round 2 scenarios."""
     scenarios = []
+    for fname in [
+        "dataset/easy_scenarios.json",
+        "dataset/medium_scenarios.json",
+        "dataset/hard_scenarios.json"
+    ]:
         try:
             with open(fname) as f:
                 data = json.load(f)
                 scenarios.extend(data)
+                print(f"  Loaded {len(data)} scenarios from {fname}")
         except FileNotFoundError:
+            print(f"{fname} not found, skipping")
     if not scenarios:
+        print("No local scenarios found. Fetching from live environment...")
+        try:
+            resp = requests.get(f"{ENV_URL}/tasks", timeout=15)
+            tasks = resp.json().get("tasks", [])
+            scenarios = [{"id": t["id"], "description": t["description"]} for t in tasks]
+            print(f"  Fetched {len(scenarios)} tasks from HF Space")
+        except Exception as e:
+            print(f"Could not fetch tasks: {e}")
+            sys.exit(1)
     examples = []
+    for s in scenarios:
+        prompt = f"""{SYSTEM_PROMPT}
+Current Database State:
+- Scenario: {s.get('id', 'unknown')}
+- Description: {s.get('description', '')}
+- Tables: {json.dumps(s.get('tables', []))}
+- Slow Queries: {json.dumps(s.get('slow_queries', []))}
+- Performance Score: {s.get('performance_score_baseline', 0)} / 100
+- Target Score: {s.get('target_score', 85)}
+What is your next action?"""
         examples.append({
+            "prompt":  prompt,
+            "task_id": s.get("id", "easy_s001"),
         })
+    diff_counts = {"easy": 0, "medium": 0, "hard": 0}
+    for ex in examples:
+        tid = ex["task_id"]
+        if str(tid).startswith("medium_"):
+            diff_counts["medium"] += 1
+        elif str(tid).startswith("hard_"):
+            diff_counts["hard"] += 1
+        else:
+            diff_counts["easy"] += 1
+    print(f"  Built {len(examples)} training examples total")
+    print(f"  Difficulty mix: easy={diff_counts['easy']} medium={diff_counts['medium']} hard={diff_counts['hard']}")
+    from datasets import Dataset
     return Dataset.from_list(examples)
+# ─────────────────────────────────────────────
+#  INFERENCE TEST — run immediately after save
+# ─────────────────────────────────────────────
+def test_inference(model, tokenizer):
+    """
+    REQUIRED: Test inference immediately after saving.
+    If this fails, the model was not saved correctly.
+    """
+    print("\n[INFERENCE TEST] Testing saved model...")
+    try:
+        FastLanguageModel.for_inference(model)
+        test_prompt = f"""{SYSTEM_PROMPT}
+Current Database State:
+- Scenario: easy_s001
+- Description: User lookup query taking 2s on 10K users table
+- Tables: [{{"name": "users", "rows": 10000, "indexes": ["PRIMARY"]}}]
+- Slow Queries: [{{"id": "q1", "sql": "SELECT * FROM users WHERE email=?", "avg_ms": 2000}}]
+- Performance Score: 8.0 / 100
+- Target Score: 80.0
+What is your next action?"""
+        inputs = tokenizer(
+            test_prompt,
+            return_tensors="pt",
+            truncation=True,
+            max_length=1024
+        ).to(model.device)
+        with torch.no_grad():
+            outputs = model.generate(
+                **inputs,
+                max_new_tokens  = 100,
+                temperature     = 0.3,
+                do_sample       = True,
+                pad_token_id    = tokenizer.eos_token_id,
+            )
+        response = tokenizer.decode(
+            outputs[0][inputs["input_ids"].shape[1]:],
+            skip_special_tokens=True
+        ).strip()
+        print(f"[INFERENCE TEST] Model output:\n  {response}")
+        # Validate output
+        action = parse_action(response)
+        print(f"[INFERENCE TEST] Parsed action: {action}")
+        print("[INFERENCE TEST] PASSED — model saved correctly!")
+        return True
+    except Exception as e:
+        print(f"[INFERENCE TEST] FAILED: {e}")
+        print("[INFERENCE TEST] Check model save path. Do NOT proceed without fixing this.")
+        return False
+# ─────────────────────────────────────────────
+#  MAIN TRAINING
+# ─────────────────────────────────────────────
 def train():
+    if not UNSLOTH_AVAILABLE:
+        print(" Cannot train — Unsloth not installed or no GPU found")
+        print("Run: pip install unsloth trl transformers datasets accelerate")
+        return
+    print(f"\n Loading model: {MODEL_NAME}")
+    print(f" Environment:   {ENV_URL}\n")
+    # Verify environment is reachable
     try:
         r = requests.get(f"{ENV_URL}/health", timeout=10)
+        version = r.json().get("version", "?")
+        print(f" Environment reachable — version {version}")
     except Exception as e:
+        print(f" Cannot reach environment at {ENV_URL}: {e}")
+        print("Check ENV_URL and make sure HF Space is running.")
         sys.exit(1)
+    # ── Load model ───────────────────────────────────────────
     model, tokenizer = FastLanguageModel.from_pretrained(
         model_name     = MODEL_NAME,
         max_seq_length = 2048,
+        load_in_4bit   = True,   # QLoRA — required for T4
+        dtype          = None,   # Auto detect
         token          = HF_TOKEN or None,
     )
+    print(" Model loaded")
+    # ── Apply LoRA adapters ──────────────────────────────────
     model = FastLanguageModel.get_peft_model(
         model,
         r              = 16,
         lora_alpha     = 16,
+        target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
+                          "gate_proj", "up_proj", "down_proj"],
         lora_dropout   = 0,
         bias           = "none",
         use_gradient_checkpointing = "unsloth",
         random_state   = 42,
     )
+    print(" LoRA adapters applied")
+    # ── Build dataset ────────────────────────────────────────
+    print("\n[DATASET] Building training dataset...")
     dataset = build_dataset()
+    print(f" Dataset ready: {len(dataset)} examples")
+    # ── Reward wrapper ───────────────────────────────────────
     def reward_wrapper(prompts, completions, **kwargs):
+        batch = kwargs.get("batch", [])
+        if batch and hasattr(batch[0], "get"):
+            task_ids = [b.get("task_id", "easy_s001") for b in batch]
+        elif "task_id" in kwargs and kwargs["task_id"]:
+            task_ids = kwargs["task_id"]
+        else:
+            task_ids = ["easy_s001"] * len(prompts)
+        return reward_fn(prompts, completions, task_ids=task_ids)
+    # ── GRPO config ──────────────────────────────────────────
+    # NOTE: batch_size=1, num_generations=2 for free T4
+    # At venue A100: increase to batch_size=2, num_generations=4
     config = GRPOConfig(
         output_dir                  = OUTPUT_DIR,
         max_steps                   = MAX_STEPS,
+        per_device_train_batch_size = 1,   # 1 for T4, 2 for A100
+        gradient_accumulation_steps = 8,
         learning_rate               = 5e-6,
+        max_completion_length       = 256,
+        num_generations             = 2,   # 2 for T4, 4 for A100
+        temperature                 = 0.8,
+        logging_steps               = 5,
+        save_steps                  = 50,
+        save_total_limit            = 2,
         warmup_ratio                = 0.1,
         report_to                   = "none",
         remove_unused_columns       = False,
         train_dataset = dataset,
     )
+    # ── Train ────────────────────────────────────────────────
+    print(f"\n🏋️  Starting GRPO training — {MAX_STEPS} steps...")
+    print("Watch the 'reward' column — it should increase over time.\n")
     trainer.train()
+    print("\n Training complete!")
+    # ── Save — ADAPTER ONLY (correct way for QLoRA) ──────────
+    # DO NOT call merge_and_unload() on 4-bit model
+    # DO NOT upcast to 16-bit and merge naively
+    # CORRECT: save adapter weights only, load with from_pretrained later
+    print(f"\n[SAVE] Saving adapter to {OUTPUT_DIR}/final ...")
     Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
     model.save_pretrained(f"{OUTPUT_DIR}/final")
     tokenizer.save_pretrained(f"{OUTPUT_DIR}/final")
+    # Save config for reference
+    with open(f"{OUTPUT_DIR}/final/training_config.json", "w") as f:
+        json.dump({
+            "model_name":   MODEL_NAME,
+            "max_steps":    MAX_STEPS,
+            "save_method":  "adapter_only_qlora",
+            "lora_r":       16,
+            "lora_alpha":   16,
+        }, f, indent=2)
+    print(f" Adapter saved to {OUTPUT_DIR}/final")
+    # ── IMMEDIATE inference test (required) ──────────────────
+    passed = test_inference(model, tokenizer)
+    # ── Summary ──────────────────────────────────────────────
+    print("\n" + "="*60)
+    print("TRAINING COMPLETE")
+    print("="*60)
+    print(f"  Model:         {MODEL_NAME}")
+    print(f"  Steps:         {MAX_STEPS}")
+    print(f"  Saved to:      {OUTPUT_DIR}/final")
+    print(f"  Save method:   Adapter only (QLoRA safe)")
+    print(f"  Inference test: {' PASSED' if passed else ' FAILED'}")
+    print("="*60)
+    print("\nNext steps:")
+    print("  1. python training/evaluate_agent.py")
+    print("  2. Open reward_curve.png — show to judges")
+    print("  3. git add reward_curve.png && git commit && git push")
+    print("="*60)
 if __name__ == "__main__":