Spaces:

junaid0600
/

sql-db-engineer-agent

Sleeping

App Files Files Community

junaid0600 commited on Apr 25

Commit

987f2db

verified ·

1 Parent(s): a7802a8

Update training/train_agent.py

Browse files

Files changed (1) hide show

training/train_agent.py +243 -70

training/train_agent.py CHANGED Viewed

@@ -2,41 +2,56 @@
 training/train_agent.py — SQL Database Engineer Agent
 Unsloth + GRPO training script.
 Run on venue GPU (April 25-26) with compute credits.
-ENV_URL points to live HF Space environment.
 """
 import os
 import json
 import requests
-from datasets import Dataset
 # ── Try importing Unsloth (GPU only) ─────────────────────────
 try:
     from unsloth import FastLanguageModel
     from trl import GRPOTrainer, GRPOConfig
     UNSLOTH_AVAILABLE = True
 except ImportError:
     UNSLOTH_AVAILABLE = False
-    print("⚠️  Unsloth not available. Run: pip install unsloth trl")
 # ─────────────────────────────────────────────
-#  CONFIG
 # ─────────────────────────────────────────────
 ENV_URL    = os.getenv("ENV_URL",    "https://junaid0600-sql-db-engineer-agent.hf.space")
 HF_TOKEN   = os.getenv("HF_TOKEN",  "")
-MODEL_NAME = os.getenv("MODEL_NAME", "unsloth/Qwen2.5-7B-Instruct")
 OUTPUT_DIR = os.getenv("OUTPUT_DIR", "./sdea-trained")
-SYSTEM_PROMPT = """You are a senior database engineer.
 Given the current database state with slow queries, choose the BEST action to improve performance.
 Think step by step:
-1. If you haven't inspected queries yet → use inspect_query
-2. If you haven't analyzed indexes → use analyze_indexes
-3. If you know which index is missing → use create_index
-4. If query can be rewritten better → use rewrite_query
-5. If table is huge (1M+ rows) → use partition_table
-6. When performance target is reached → use submit_report
 Respond with JSON only — no explanation, no markdown:
 {"action_type": "...", "payload": {...}}"""
@@ -46,37 +61,65 @@ Respond with JSON only — no explanation, no markdown:
 #  REWARD FUNCTION (calls live HF Space)
 # ─────────────────────────────────────────────
 def reward_fn(prompts, completions, **kwargs):
     """
-    GRPO reward function — calls /step on live environment.
     Returns list of float rewards, one per completion.
     """
     rewards = []
     task_ids = kwargs.get("task_ids", ["easy_s001"] * len(prompts))
     for i, (prompt, completion) in enumerate(zip(prompts, completions)):
         try:
-            # Parse action from model output
-            text = completion[0]["content"] if isinstance(completion, list) else str(completion)
-            text = text.strip().replace("```json", "").replace("```", "").strip()
-            action = json.loads(text)
-            # Reset environment for this task
             task_id = task_ids[i] if i < len(task_ids) else "easy_s001"
-            requests.post(f"{ENV_URL}/reset",
-                json={"task_id": task_id}, timeout=15)
-            # Submit action and get reward
-            resp = requests.post(f"{ENV_URL}/step",
-                json=action, timeout=15)
-            data = resp.json()
-            score = data.get("reward", {}).get("score", 0.001)
-            rewards.append(float(score))
         except json.JSONDecodeError:
-            rewards.append(0.001)  # Invalid JSON output
         except Exception as e:
-            print(f"Reward fn error: {e}")
             rewards.append(0.001)
     return rewards
@@ -90,21 +133,29 @@ def build_dataset():
     """Build training examples from all 15 Round 2 scenarios."""
     scenarios = []
-    # Load all scenario files
-    for fname in ["dataset/easy_scenarios.json",
-                  "dataset/medium_scenarios.json",
-                  "dataset/hard_scenarios.json"]:
         try:
             with open(fname) as f:
-                scenarios.extend(json.load(f))
         except FileNotFoundError:
             print(f"{fname} not found, skipping")
     if not scenarios:
-        # Fallback: fetch from live environment
-        resp = requests.get(f"{ENV_URL}/tasks", timeout=15)
-        tasks = resp.json().get("tasks", [])
-        scenarios = [{"id": t["id"], "description": t["description"]} for t in tasks]
     examples = []
     for s in scenarios:
@@ -121,36 +172,109 @@ Current Database State:
 What is your next action?"""
         examples.append({
-            "prompt":   prompt,
-            "task_id":  s.get("id", "easy_s001"),
         })
-    print(f"Built {len(examples)} training examples")
     return Dataset.from_list(examples)
 # ─────────────────────────────────────────────
 #  MAIN TRAINING
 # ─────────────────────────────────────────────
 def train():
     if not UNSLOTH_AVAILABLE:
-        print("Cannot train — Unsloth not installed")
         print("Run: pip install unsloth trl transformers datasets accelerate")
         return
-    print(f"🚀 Loading model: {MODEL_NAME}")
-    print(f"🌐 Environment: {ENV_URL}")
-    # Load model with Unsloth 4-bit quantization
     model, tokenizer = FastLanguageModel.from_pretrained(
         model_name     = MODEL_NAME,
-        max_seq_length = 4096,
-        load_in_4bit   = True,
         token          = HF_TOKEN or None,
     )
-    # Add LoRA adapters
     model = FastLanguageModel.get_peft_model(
         model,
         r              = 16,
@@ -160,48 +284,97 @@ def train():
         lora_dropout   = 0,
         bias           = "none",
         use_gradient_checkpointing = "unsloth",
     )
-    # Build dataset
     dataset = build_dataset()
-    # GRPO config
     config = GRPOConfig(
         output_dir                  = OUTPUT_DIR,
-        num_train_epochs            = 3,
-        per_device_train_batch_size = 2,
         gradient_accumulation_steps = 8,
-        learning_rate               = 5e-5,
         max_completion_length       = 256,
-        num_generations             = 4,
-        logging_steps               = 10,
         save_steps                  = 50,
         warmup_ratio                = 0.1,
         report_to                   = "none",
     )
-    # Reward function wrapper
-    def reward_wrapper(prompts, completions, **kwargs):
-        task_ids = [ex.get("task_id", "easy_s001") for ex in kwargs.get("batch", [])]
-        return reward_fn(prompts, completions, task_ids=task_ids)
-    # Train
     trainer = GRPOTrainer(
-        model        = model,
-        tokenizer    = tokenizer,
-        reward_funcs = reward_wrapper,
-        args         = config,
         train_dataset = dataset,
     )
-    print("🏋️  Starting GRPO training...")
     trainer.train()
-    # Save
     model.save_pretrained(f"{OUTPUT_DIR}/final")
     tokenizer.save_pretrained(f"{OUTPUT_DIR}/final")
-    print(f"Training complete. Model saved to {OUTPUT_DIR}/final")
 if __name__ == "__main__":
-    train()

 training/train_agent.py — SQL Database Engineer Agent
 Unsloth + GRPO training script.
 Run on venue GPU (April 25-26) with compute credits.
+FREE T4 (Colab):  MODEL_NAME=unsloth/Qwen2.5-1.5B-Instruct  (default)
+VENUE A100:       set ENV_VAR MODEL_NAME=unsloth/Qwen2.5-7B-Instruct
 """
 import os
 import json
 import requests
+import sys
+from pathlib import Path
 # ── Try importing Unsloth (GPU only) ─────────────────────────
 try:
     from unsloth import FastLanguageModel
     from trl import GRPOTrainer, GRPOConfig
+    import torch
     UNSLOTH_AVAILABLE = True
+    print("Unsloth + TRL loaded successfully")
 except ImportError:
     UNSLOTH_AVAILABLE = False
+    print("Unsloth not available. Run: pip install unsloth trl")
 # ─────────────────────────────────────────────
+#  CONFIG — change MODEL_NAME via env var at venue
 # ─────────────────────────────────────────────
 ENV_URL    = os.getenv("ENV_URL",    "https://junaid0600-sql-db-engineer-agent.hf.space")
 HF_TOKEN   = os.getenv("HF_TOKEN",  "")
+MODEL_NAME = os.getenv("MODEL_NAME", "unsloth/Qwen2.5-1.5B-Instruct")  # 1.5B for free T4
 OUTPUT_DIR = os.getenv("OUTPUT_DIR", "./sdea-trained")
+MAX_STEPS  = int(os.getenv("MAX_STEPS", "100"))  # increase to 300+ at venue
+print(f"[CONFIG] Model:      {MODEL_NAME}")
+print(f"[CONFIG] Output:     {OUTPUT_DIR}")
+print(f"[CONFIG] Max steps:  {MAX_STEPS}")
+print(f"[CONFIG] ENV URL:    {ENV_URL}")
+# ─────────────────────────────────────────────
+#  SYSTEM PROMPT
+# ─────────────────────────────────────────────
+SYSTEM_PROMPT = """You are a senior database engineer.
 Given the current database state with slow queries, choose the BEST action to improve performance.
 Think step by step:
+1. If you have not inspected queries yet -> use inspect_query
+2. If you have not analyzed indexes -> use analyze_indexes
+3. If you know which index is missing -> use create_index
+4. If query can be rewritten better -> use rewrite_query
+5. If table is huge (1M+ rows) -> use partition_table
+6. When performance target is reached -> use submit_report
 Respond with JSON only — no explanation, no markdown:
 {"action_type": "...", "payload": {...}}"""
 #  REWARD FUNCTION (calls live HF Space)
 # ─────────────────────────────────────────────
+def parse_action(text: str) -> dict:
+    """Parse LLM output into action dict. Returns safe fallback on failure."""
+    try:
+        text = text.strip()
+        if "```" in text:
+            text = text.split("```")[1]
+            if text.startswith("json"):
+                text = text[4:]
+        text = text.strip()
+        data = json.loads(text)
+        if "action_type" in data:
+            return data
+    except Exception:
+        pass
+    # Safe fallback — never crashes
+    return {"action_type": "inspect_query", "payload": {"query_id": "q1"}}
 def reward_fn(prompts, completions, **kwargs):
     """
+    GRPO reward function — calls /grader on live environment.
     Returns list of float rewards, one per completion.
+    Score always between 0.001 and 0.999.
     """
     rewards = []
     task_ids = kwargs.get("task_ids", ["easy_s001"] * len(prompts))
     for i, (prompt, completion) in enumerate(zip(prompts, completions)):
         try:
+            # Get completion text
+            if isinstance(completion, list):
+                text = completion[0].get("content", "") if completion else ""
+            else:
+                text = str(completion)
+            # Parse into action
+            action = parse_action(text)
             task_id = task_ids[i] if i < len(task_ids) else "easy_s001"
+            # Call grader endpoint
+            resp = requests.post(
+                f"{ENV_URL}/grader",
+                json={"task_id": task_id, "action": action},
+                timeout=15,
+                headers={"Content-Type": "application/json"}
+            )
+            if resp.status_code == 200:
+                score = resp.json().get("score", 0.001)
+                score = max(0.001, min(0.999, float(score)))
+            else:
+                score = 0.001
+            rewards.append(score)
+            print(f"  [REWARD] task={task_id} | action={action.get('action_type')} | score={score:.3f}")
         except json.JSONDecodeError:
+            rewards.append(0.001)
         except Exception as e:
+            print(f"  [REWARD] Error: {e}")
             rewards.append(0.001)
     return rewards
     """Build training examples from all 15 Round 2 scenarios."""
     scenarios = []
+    for fname in [
+        "dataset/easy_scenarios.json",
+        "dataset/medium_scenarios.json",
+        "dataset/hard_scenarios.json"
+    ]:
         try:
             with open(fname) as f:
+                data = json.load(f)
+                scenarios.extend(data)
+                print(f"  Loaded {len(data)} scenarios from {fname}")
         except FileNotFoundError:
             print(f"{fname} not found, skipping")
     if not scenarios:
+        print("No local scenarios found. Fetching from live environment...")
+        try:
+            resp = requests.get(f"{ENV_URL}/tasks", timeout=15)
+            tasks = resp.json().get("tasks", [])
+            scenarios = [{"id": t["id"], "description": t["description"]} for t in tasks]
+            print(f"  Fetched {len(scenarios)} tasks from HF Space")
+        except Exception as e:
+            print(f"Could not fetch tasks: {e}")
+            sys.exit(1)
     examples = []
     for s in scenarios:
 What is your next action?"""
         examples.append({
+            "prompt":  prompt,
+            "task_id": s.get("id", "easy_s001"),
         })
+    print(f"  Built {len(examples)} training examples total")
+    from datasets import Dataset
     return Dataset.from_list(examples)
+# ─────────────────────────────────────────────
+#  INFERENCE TEST — run immediately after save
+# ─────────────────────────────────────────────
+def test_inference(model, tokenizer):
+    """
+    REQUIRED: Test inference immediately after saving.
+    If this fails, the model was not saved correctly.
+    """
+    print("\n[INFERENCE TEST] Testing saved model...")
+    try:
+        FastLanguageModel.for_inference(model)
+        test_prompt = f"""{SYSTEM_PROMPT}
+Current Database State:
+- Scenario: easy_s001
+- Description: User lookup query taking 2s on 10K users table
+- Tables: [{{"name": "users", "rows": 10000, "indexes": ["PRIMARY"]}}]
+- Slow Queries: [{{"id": "q1", "sql": "SELECT * FROM users WHERE email=?", "avg_ms": 2000}}]
+- Performance Score: 8.0 / 100
+- Target Score: 80.0
+What is your next action?"""
+        inputs = tokenizer(
+            test_prompt,
+            return_tensors="pt",
+            truncation=True,
+            max_length=1024
+        ).to(model.device)
+        with torch.no_grad():
+            outputs = model.generate(
+                **inputs,
+                max_new_tokens  = 100,
+                temperature     = 0.3,
+                do_sample       = True,
+                pad_token_id    = tokenizer.eos_token_id,
+            )
+        response = tokenizer.decode(
+            outputs[0][inputs["input_ids"].shape[1]:],
+            skip_special_tokens=True
+        ).strip()
+        print(f"[INFERENCE TEST] Model output:\n  {response}")
+        # Validate output
+        action = parse_action(response)
+        print(f"[INFERENCE TEST] Parsed action: {action}")
+        print("[INFERENCE TEST] PASSED — model saved correctly!")
+        return True
+    except Exception as e:
+        print(f"[INFERENCE TEST] FAILED: {e}")
+        print("[INFERENCE TEST] Check model save path. Do NOT proceed without fixing this.")
+        return False
 # ─────────────────────────────────────────────
 #  MAIN TRAINING
 # ─────────────────────────────────────────────
 def train():
     if not UNSLOTH_AVAILABLE:
+        print(" Cannot train — Unsloth not installed or no GPU found")
         print("Run: pip install unsloth trl transformers datasets accelerate")
         return
+    print(f"\n Loading model: {MODEL_NAME}")
+    print(f" Environment:   {ENV_URL}\n")
+    # Verify environment is reachable
+    try:
+        r = requests.get(f"{ENV_URL}/health", timeout=10)
+        version = r.json().get("version", "?")
+        print(f" Environment reachable — version {version}")
+    except Exception as e:
+        print(f" Cannot reach environment at {ENV_URL}: {e}")
+        print("Check ENV_URL and make sure HF Space is running.")
+        sys.exit(1)
+    # ── Load model ───────────────────────────────────────────
     model, tokenizer = FastLanguageModel.from_pretrained(
         model_name     = MODEL_NAME,
+        max_seq_length = 2048,
+        load_in_4bit   = True,   # QLoRA — required for T4
+        dtype          = None,   # Auto detect
         token          = HF_TOKEN or None,
     )
+    print(" Model loaded")
+    # ── Apply LoRA adapters ──────────────────────────────────
     model = FastLanguageModel.get_peft_model(
         model,
         r              = 16,
         lora_dropout   = 0,
         bias           = "none",
         use_gradient_checkpointing = "unsloth",
+        random_state   = 42,
     )
+    print(" LoRA adapters applied")
+    # ── Build dataset ────────────────────────────────────────
+    print("\n[DATASET] Building training dataset...")
     dataset = build_dataset()
+    print(f" Dataset ready: {len(dataset)} examples")
+    # ── Reward wrapper ───────────────────────────────────────
+    def reward_wrapper(prompts, completions, **kwargs):
+        batch = kwargs.get("batch", [])
+        if batch and hasattr(batch[0], "get"):
+            task_ids = [b.get("task_id", "easy_s001") for b in batch]
+        else:
+            task_ids = ["easy_s001"] * len(prompts)
+        return reward_fn(prompts, completions, task_ids=task_ids)
+    # ── GRPO config ──────────────────────────────────────────
+    # NOTE: batch_size=1, num_generations=2 for free T4
+    # At venue A100: increase to batch_size=2, num_generations=4
     config = GRPOConfig(
         output_dir                  = OUTPUT_DIR,
+        max_steps                   = MAX_STEPS,
+        per_device_train_batch_size = 1,   # 1 for T4, 2 for A100
         gradient_accumulation_steps = 8,
+        learning_rate               = 5e-6,
         max_completion_length       = 256,
+        num_generations             = 2,   # 2 for T4, 4 for A100
+        temperature                 = 0.8,
+        logging_steps               = 5,
         save_steps                  = 50,
+        save_total_limit            = 2,
         warmup_ratio                = 0.1,
         report_to                   = "none",
+        remove_unused_columns       = False,
     )
     trainer = GRPOTrainer(
+        model         = model,
+        tokenizer     = tokenizer,
+        reward_funcs  = reward_wrapper,
+        args          = config,
         train_dataset = dataset,
     )
+    # ── Train ────────────────────────────────────────────────
+    print(f"\n🏋️  Starting GRPO training — {MAX_STEPS} steps...")
+    print("Watch the 'reward' column — it should increase over time.\n")
     trainer.train()
+    print("\n Training complete!")
+    # ── Save — ADAPTER ONLY (correct way for QLoRA) ──────────
+    # DO NOT call merge_and_unload() on 4-bit model
+    # DO NOT upcast to 16-bit and merge naively
+    # CORRECT: save adapter weights only, load with from_pretrained later
+    print(f"\n[SAVE] Saving adapter to {OUTPUT_DIR}/final ...")
+    Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
     model.save_pretrained(f"{OUTPUT_DIR}/final")
     tokenizer.save_pretrained(f"{OUTPUT_DIR}/final")
+    # Save config for reference
+    with open(f"{OUTPUT_DIR}/final/training_config.json", "w") as f:
+        json.dump({
+            "model_name":   MODEL_NAME,
+            "max_steps":    MAX_STEPS,
+            "save_method":  "adapter_only_qlora",
+            "lora_r":       16,
+            "lora_alpha":   16,
+        }, f, indent=2)
+    print(f" Adapter saved to {OUTPUT_DIR}/final")
+    # ── IMMEDIATE inference test (required) ──────────────────
+    passed = test_inference(model, tokenizer)
+    # ── Summary ──────────────────────────────────────────────
+    print("\n" + "="*60)
+    print("TRAINING COMPLETE")
+    print("="*60)
+    print(f"  Model:         {MODEL_NAME}")
+    print(f"  Steps:         {MAX_STEPS}")
+    print(f"  Saved to:      {OUTPUT_DIR}/final")
+    print(f"  Save method:   Adapter only (QLoRA safe)")
+    print(f"  Inference test: {' PASSED' if passed else ' FAILED'}")
+    print("="*60)
+    print("\nNext steps:")
+    print("  1. python training/evaluate_agent.py")
+    print("  2. Open reward_curve.png — show to judges")
+    print("  3. git add reward_curve.png && git commit && git push")
+    print("="*60)
 if __name__ == "__main__":
+    train()