"""
training/train_agent.py — SQL Database Engineer Agent
Unsloth + GRPO training script.
Run on venue GPU (April 25-26) with compute credits.

FREE T4 (Colab):  MODEL_NAME=unsloth/Qwen2.5-1.5B-Instruct  (default)
VENUE A100:       set ENV_VAR MODEL_NAME=unsloth/Qwen2.5-7B-Instruct
"""

import os
import json
import requests
import sys
import re
from pathlib import Path

# ── Try importing Unsloth (GPU only) ─────────────────────────
try:
    from unsloth import FastLanguageModel
    from trl import GRPOTrainer, GRPOConfig
    import torch
    UNSLOTH_AVAILABLE = True
    print("Unsloth + TRL loaded successfully")
except ImportError:
    UNSLOTH_AVAILABLE = False
    print("Unsloth not available. Run: pip install unsloth trl")

# ─────────────────────────────────────────────
#  CONFIG — change MODEL_NAME via env var at venue
# ─────────────────────────────────────────────

ENV_URL    = os.getenv("ENV_URL",    "https://junaid0600-sql-db-engineer-agent.hf.space")
HF_TOKEN   = os.getenv("HF_TOKEN",  "")
MODEL_NAME = os.getenv("MODEL_NAME", "unsloth/Qwen2.5-1.5B-Instruct")  # 1.5B for free T4
OUTPUT_DIR = os.getenv("OUTPUT_DIR", "./sdea-trained")
MAX_STEPS  = int(os.getenv("MAX_STEPS", "100"))  # increase to 300+ at venue

print(f"[CONFIG] Model:      {MODEL_NAME}")
print(f"[CONFIG] Output:     {OUTPUT_DIR}")
print(f"[CONFIG] Max steps:  {MAX_STEPS}")
print(f"[CONFIG] ENV URL:    {ENV_URL}")

# ─────────────────────────────────────────────
#  SYSTEM PROMPT
# ─────────────────────────────────────────────

SYSTEM_PROMPT = """You are a senior database engineer.
Given the current database state with slow queries, choose the BEST action to improve performance.
Think step by step:
1. If you have not inspected queries yet -> use inspect_query
2. If you have not analyzed indexes -> use analyze_indexes
3. If you know which index is missing -> use create_index
4. If query can be rewritten better -> use rewrite_query
5. If table is huge (1M+ rows) -> use partition_table
6. When performance target is reached -> use submit_report

Respond with JSON only — no explanation, no markdown:
{"action_type": "...", "payload": {...}}"""


# ─────────────────────────────────────────────
#  REWARD FUNCTION (calls live HF Space)
# ─────────────────────────────────────────────

def parse_action(text: str) -> dict | None:
    """Parse LLM output into action dict. Returns None on failure."""
    try:
        text = text.strip()
        if "```" in text:
            text = text.split("```")[1]
            if text.startswith("json"):
                text = text[4:]
        text = text.strip()
        # Try direct JSON first
        data = json.loads(text)
        if "action_type" in data:
            return data
    except Exception:
        # Try extracting first JSON object from mixed text output
        match = re.search(r"\{[\s\S]*\}", text)
        if match:
            try:
                data = json.loads(match.group(0))
                if "action_type" in data:
                    return data
            except Exception:
                pass
    return None


def _extract_task_id_from_prompt(prompt_text: str) -> str | None:
    """Fallback extractor when GRPO doesn't pass task_id column."""
    match = re.search(r"-\s*Scenario:\s*([a-z]+_[a-z]?\d+)", prompt_text, flags=re.IGNORECASE)
    if match:
        return match.group(1)
    return None


def reward_fn(prompts, completions, **kwargs):
    """
    GRPO reward function — calls /grader on live environment.
    Returns list of float rewards, one per completion.
    Score always between 0.001 and 0.999.
    """
    rewards = []
    task_ids = kwargs.get("task_ids")
    if not task_ids:
        # GRPO can pass dataset columns directly as kwargs, not always via batch.
        task_ids = kwargs.get("task_id")
    if not task_ids:
        task_ids = ["easy_s001"] * len(prompts)
    if isinstance(task_ids, str):
        task_ids = [task_ids] * len(prompts)

    for i, (prompt, completion) in enumerate(zip(prompts, completions)):
        try:
            # Get completion text
            if isinstance(completion, list):
                text = completion[0].get("content", "") if completion else ""
            else:
                text = str(completion)

            # Parse into action
            action = parse_action(text)
            task_id = task_ids[i] if i < len(task_ids) else "easy_s001"
            if not task_id:
                task_id = _extract_task_id_from_prompt(str(prompt)) or "easy_s001"
            task_id = str(task_id)

            if action is None:
                rewards.append(0.001)
                print(f"  [REWARD] task={task_id} | action=parse_failed | score=0.001")
                continue

            # Use environment step reward so dense + milestone logic is used.
            # This also guarantees the sampled task_id actually drives reward.
            difficulty = "easy"
            if str(task_id).startswith("medium_"):
                difficulty = "medium"
            elif str(task_id).startswith("hard_"):
                difficulty = "hard"

            reset_resp = requests.post(
                f"{ENV_URL}/reset",
                json={"difficulty": difficulty, "task_id": task_id},
                timeout=15,
                headers={"Content-Type": "application/json"},
            )
            if reset_resp.status_code != 200:
                raise RuntimeError(f"/reset failed for {task_id}: {reset_resp.status_code}")

            step_resp = requests.post(
                f"{ENV_URL}/step",
                json=action,
                timeout=15,
                headers={"Content-Type": "application/json"},
            )
            if step_resp.status_code == 200:
                score = step_resp.json().get("reward", {}).get("score", 0.001)
                score = max(0.001, min(0.999, float(score)))
            else:
                # Fallback to grader for robustness.
                grader_resp = requests.post(
                    f"{ENV_URL}/grader",
                    json={"task_id": task_id, "action": action},
                    timeout=15,
                    headers={"Content-Type": "application/json"},
                )
                if grader_resp.status_code == 200:
                    score = grader_resp.json().get("score", 0.001)
                    score = max(0.001, min(0.999, float(score)))
                else:
                    score = 0.001

            action_name = str(action.get("action_type", "unknown"))

            rewards.append(score)
            print(f"  [REWARD] task={task_id} | action={action_name} | score={score:.3f}")

        except json.JSONDecodeError:
            rewards.append(0.001)
        except Exception as e:
            print(f"  [REWARD] Error: {e}")
            rewards.append(0.001)

    return rewards


# ─────────────────────────────────────────────
#  BUILD TRAINING DATASET
# ─────────────────────────────────────────────

def build_dataset():
    """Build training examples from all 15 Round 2 scenarios."""
    scenarios = []

    for fname in [
        "dataset/easy_scenarios.json",
        "dataset/medium_scenarios.json",
        "dataset/hard_scenarios.json"
    ]:
        try:
            with open(fname) as f:
                data = json.load(f)
                scenarios.extend(data)
                print(f"  Loaded {len(data)} scenarios from {fname}")
        except FileNotFoundError:
            print(f"{fname} not found, skipping")

    if not scenarios:
        print("No local scenarios found. Fetching from live environment...")
        try:
            resp = requests.get(f"{ENV_URL}/tasks", timeout=15)
            tasks = resp.json().get("tasks", [])
            scenarios = [{"id": t["id"], "description": t["description"]} for t in tasks]
            print(f"  Fetched {len(scenarios)} tasks from HF Space")
        except Exception as e:
            print(f"Could not fetch tasks: {e}")
            sys.exit(1)

    examples = []
    for s in scenarios:
        prompt = f"""{SYSTEM_PROMPT}

Current Database State:
- Scenario: {s.get('id', 'unknown')}
- Description: {s.get('description', '')}
- Tables: {json.dumps(s.get('tables', []))}
- Slow Queries: {json.dumps(s.get('slow_queries', []))}
- Performance Score: {s.get('performance_score_baseline', 0)} / 100
- Target Score: {s.get('target_score', 85)}

What is your next action?"""

        examples.append({
            "prompt":  prompt,
            "task_id": s.get("id", "easy_s001"),
        })

    diff_counts = {"easy": 0, "medium": 0, "hard": 0}
    for ex in examples:
        tid = ex["task_id"]
        if str(tid).startswith("medium_"):
            diff_counts["medium"] += 1
        elif str(tid).startswith("hard_"):
            diff_counts["hard"] += 1
        else:
            diff_counts["easy"] += 1
    print(f"  Built {len(examples)} training examples total")
    print(f"  Difficulty mix: easy={diff_counts['easy']} medium={diff_counts['medium']} hard={diff_counts['hard']}")
    from datasets import Dataset
    return Dataset.from_list(examples)


# ─────────────────────────────────────────────
#  INFERENCE TEST — run immediately after save
# ─────────────────────────────────────────────

def test_inference(model, tokenizer):
    """
    REQUIRED: Test inference immediately after saving.
    If this fails, the model was not saved correctly.
    """
    print("\n[INFERENCE TEST] Testing saved model...")
    try:
        FastLanguageModel.for_inference(model)

        test_prompt = f"""{SYSTEM_PROMPT}

Current Database State:
- Scenario: easy_s001
- Description: User lookup query taking 2s on 10K users table
- Tables: [{{"name": "users", "rows": 10000, "indexes": ["PRIMARY"]}}]
- Slow Queries: [{{"id": "q1", "sql": "SELECT * FROM users WHERE email=?", "avg_ms": 2000}}]
- Performance Score: 8.0 / 100
- Target Score: 80.0

What is your next action?"""

        inputs = tokenizer(
            test_prompt,
            return_tensors="pt",
            truncation=True,
            max_length=1024
        ).to(model.device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens  = 100,
                temperature     = 0.3,
                do_sample       = True,
                pad_token_id    = tokenizer.eos_token_id,
            )

        response = tokenizer.decode(
            outputs[0][inputs["input_ids"].shape[1]:],
            skip_special_tokens=True
        ).strip()

        print(f"[INFERENCE TEST] Model output:\n  {response}")

        # Validate output
        action = parse_action(response)
        print(f"[INFERENCE TEST] Parsed action: {action}")
        print("[INFERENCE TEST] PASSED — model saved correctly!")
        return True

    except Exception as e:
        print(f"[INFERENCE TEST] FAILED: {e}")
        print("[INFERENCE TEST] Check model save path. Do NOT proceed without fixing this.")
        return False


# ─────────────────────────────────────────────
#  MAIN TRAINING
# ─────────────────────────────────────────────

def train():
    if not UNSLOTH_AVAILABLE:
        print(" Cannot train — Unsloth not installed or no GPU found")
        print("Run: pip install unsloth trl transformers datasets accelerate")
        return

    print(f"\n Loading model: {MODEL_NAME}")
    print(f" Environment:   {ENV_URL}\n")

    # Verify environment is reachable
    try:
        r = requests.get(f"{ENV_URL}/health", timeout=10)
        version = r.json().get("version", "?")
        print(f" Environment reachable — version {version}")
    except Exception as e:
        print(f" Cannot reach environment at {ENV_URL}: {e}")
        print("Check ENV_URL and make sure HF Space is running.")
        sys.exit(1)

    # ── Load model ───────────────────────────────────────────
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name     = MODEL_NAME,
        max_seq_length = 2048,
        load_in_4bit   = True,   # QLoRA — required for T4
        dtype          = None,   # Auto detect
        token          = HF_TOKEN or None,
    )
    print(" Model loaded")

    # ── Apply LoRA adapters ──────────────────────────────────
    model = FastLanguageModel.get_peft_model(
        model,
        r              = 16,
        lora_alpha     = 16,
        target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                          "gate_proj", "up_proj", "down_proj"],
        lora_dropout   = 0,
        bias           = "none",
        use_gradient_checkpointing = "unsloth",
        random_state   = 42,
    )
    print(" LoRA adapters applied")

    # ── Build dataset ────────────────────────────────────────
    print("\n[DATASET] Building training dataset...")
    dataset = build_dataset()
    print(f" Dataset ready: {len(dataset)} examples")

    # ── Reward wrapper ───────────────────────────────────────
    def reward_wrapper(prompts, completions, **kwargs):
        batch = kwargs.get("batch", [])
        if batch and hasattr(batch[0], "get"):
            task_ids = [b.get("task_id", "easy_s001") for b in batch]
        elif "task_id" in kwargs and kwargs["task_id"]:
            task_ids = kwargs["task_id"]
        else:
            task_ids = ["easy_s001"] * len(prompts)
        return reward_fn(prompts, completions, task_ids=task_ids)

    # ── GRPO config ──────────────────────────────────────────
    # NOTE: batch_size=1, num_generations=2 for free T4
    # At venue A100: increase to batch_size=2, num_generations=4
    config = GRPOConfig(
        output_dir                  = OUTPUT_DIR,
        max_steps                   = MAX_STEPS,
        per_device_train_batch_size = 1,   # 1 for T4, 2 for A100
        gradient_accumulation_steps = 8,
        learning_rate               = 5e-6,
        max_completion_length       = 256,
        num_generations             = 2,   # 2 for T4, 4 for A100
        temperature                 = 0.8,
        logging_steps               = 5,
        save_steps                  = 50,
        save_total_limit            = 2,
        warmup_ratio                = 0.1,
        report_to                   = "none",
        remove_unused_columns       = False,
    )

    trainer = GRPOTrainer(
        model         = model,
        tokenizer     = tokenizer,
        reward_funcs  = reward_wrapper,
        args          = config,
        train_dataset = dataset,
    )

    # ── Train ────────────────────────────────────────────────
    print(f"\n🏋️  Starting GRPO training — {MAX_STEPS} steps...")
    print("Watch the 'reward' column — it should increase over time.\n")
    trainer.train()
    print("\n Training complete!")

    # ── Save — ADAPTER ONLY (correct way for QLoRA) ──────────
    # DO NOT call merge_and_unload() on 4-bit model
    # DO NOT upcast to 16-bit and merge naively
    # CORRECT: save adapter weights only, load with from_pretrained later
    print(f"\n[SAVE] Saving adapter to {OUTPUT_DIR}/final ...")
    Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
    model.save_pretrained(f"{OUTPUT_DIR}/final")
    tokenizer.save_pretrained(f"{OUTPUT_DIR}/final")

    # Save config for reference
    with open(f"{OUTPUT_DIR}/final/training_config.json", "w") as f:
        json.dump({
            "model_name":   MODEL_NAME,
            "max_steps":    MAX_STEPS,
            "save_method":  "adapter_only_qlora",
            "lora_r":       16,
            "lora_alpha":   16,
        }, f, indent=2)
    print(f" Adapter saved to {OUTPUT_DIR}/final")

    # ── IMMEDIATE inference test (required) ──────────────────
    passed = test_inference(model, tokenizer)

    # ── Summary ──────────────────────────────────────────────
    print("\n" + "="*60)
    print("TRAINING COMPLETE")
    print("="*60)
    print(f"  Model:         {MODEL_NAME}")
    print(f"  Steps:         {MAX_STEPS}")
    print(f"  Saved to:      {OUTPUT_DIR}/final")
    print(f"  Save method:   Adapter only (QLoRA safe)")
    print(f"  Inference test: {' PASSED' if passed else ' FAILED'}")
    print("="*60)
    print("\nNext steps:")
    print("  1. python training/evaluate_agent.py")
    print("  2. Open reward_curve.png — show to judges")
    print("  3. git add reward_curve.png && git commit && git push")
    print("="*60)


if __name__ == "__main__":
    train()