# models/loader.py
"""Model/tokenizer loading and generation helpers for agent roles.

Loads causal LMs from the Hugging Face Hub (optionally 4-bit quantized
when a GPU is available), caches them per model name, and exposes a
simple role-keyed generation helper.
"""
import logging

import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    GenerationConfig,
)

from backend.agents import ROLE_PROMPTS

logger = logging.getLogger(__name__)

# Optional quantization config (used only if a GPU is available).
QUANTIZATION_CONFIG = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Maps agent role -> Hub model id; unknown roles fall back to the default
# model inside generate_with_model().
MODEL_REGISTRY = {
    "ceo": "Qwen/Qwen3-0.6B",
    "planner": "Qwen/Qwen3-0.6B",
    "manager": "Qwen/Qwen3-0.6B",
    "debugger": "Qwen/Qwen3-0.6B",
    "business_analyst": "Qwen/Qwen3-0.6B",
    "ux_ui_designer": "Qwen/Qwen3-0.6B",
    "worker_backend_coder": "Qwen/Qwen3-0.6B",
    "worker_front_end_coder": "Qwen/Qwen3-0.6B",
    "worker_tester": "Qwen/Qwen3-0.6B",
    "code_analyst": "Qwen/Qwen3-0.6B",
}

# Cache of loaded models/tokenizers keyed by model name, so each model
# is loaded from the Hub at most once per process.
_MODEL_CACHE = {}

# Explicit generation config (avoids model-specific overrides).
GENERATION_CONFIG = GenerationConfig(
    max_new_tokens=512,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    top_k=50,
    repetition_penalty=1.1,
)


def get_model_and_tokenizer(model_name):
    """Load a model and its tokenizer from the Hugging Face Hub.

    Results are cached in ``_MODEL_CACHE`` so repeated calls for the same
    ``model_name`` do not reload the model.

    Args:
        model_name: Hub model identifier (e.g. ``"Qwen/Qwen3-0.6B"``).

    Returns:
        Tuple ``(model, tokenizer)``.
    """
    if model_name in _MODEL_CACHE:
        cached = _MODEL_CACHE[model_name]
        return cached["model"], cached["tokenizer"]

    logger.info("Loading model: %s...", model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    # Ensure a dedicated pad token exists (distinct from EOS), so that
    # padding positions can be masked without also masking end-of-sequence.
    num_added = 0
    if tokenizer.pad_token is None or tokenizer.pad_token == tokenizer.eos_token:
        num_added = tokenizer.add_special_tokens({"pad_token": "<|pad|>"})

    # Load model with GPU/CPU awareness; quantize only when a GPU exists
    # (bitsandbytes 4-bit requires CUDA).
    use_gpu = torch.cuda.is_available()
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto" if use_gpu else None,
        quantization_config=QUANTIZATION_CONFIG if use_gpu else None,
        trust_remote_code=True,
    )

    # Resize embeddings only when tokens were actually added — resizing a
    # quantized model unnecessarily is wasteful and can be fragile.
    if num_added:
        model.resize_token_embeddings(len(tokenizer))

    # Explicitly move to CPU if no GPU.
    if not use_gpu:
        model.to("cpu")

    # Inference-only usage: make eval mode explicit (disables dropout etc.).
    model.eval()

    _MODEL_CACHE[model_name] = {"model": model, "tokenizer": tokenizer}
    return model, tokenizer


def generate_with_model(agent_role, prompt, generation_config=None):
    """Generate a response using the model registered for ``agent_role``.

    Args:
        agent_role: Key into ``MODEL_REGISTRY``; unknown roles fall back to
            ``"Qwen/Qwen3-0.6B"``.
        prompt: User prompt text appended after the role's system prompt.
        generation_config: Optional ``GenerationConfig``; defaults to the
            module-level ``GENERATION_CONFIG``. (Resolved via a ``None``
            sentinel so the shared config object is never a mutable
            default argument.)

    Returns:
        The decoded completion with the prompt tokens stripped.
    """
    if generation_config is None:
        generation_config = GENERATION_CONFIG

    model_name = MODEL_REGISTRY.get(agent_role, "Qwen/Qwen3-0.6B")
    model, tokenizer = get_model_and_tokenizer(model_name)

    full_prompt = (
        f"You are a helpful assistant. {ROLE_PROMPTS.get(agent_role, '')}"
        f"\n\nUser prompt: {prompt}"
    )

    # Use tokenizer(...) to get both input_ids and attention_mask.
    inputs = tokenizer(
        full_prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    input_ids = inputs["input_ids"].to(model.device)
    attention_mask = inputs["attention_mask"].to(model.device)

    with torch.no_grad():
        output = model.generate(
            input_ids,
            attention_mask=attention_mask,  # ensures padding is ignored
            generation_config=generation_config,
            pad_token_id=tokenizer.pad_token_id,
        )

    # Slice off the prompt tokens to avoid prompt-echo issues.
    generated_tokens = output[0][input_ids.shape[-1]:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()