"""
Nimo's Personal Coder Agent - HuggingFace Spaces Demo

A fine-tuned LLM for code generation, deployed on HuggingFace Spaces.
"""

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

# Hugging Face Hub repositories used by load_model()
BASE_MODEL_ID = "Qwen/Qwen2.5-Coder-0.5B-Instruct"  # base checkpoint
MODEL_ID = "CaptainNimo/nimos-coder-agent-v2"  # adapter applied on top of the base model

# Shared model/tokenizer handles; both stay None until load_model() assigns them.
model = tokenizer = None


def load_model():
    """Load the tokenizer, the base model and the fine-tuned adapter.

    The results are stored in the module-level ``model`` and ``tokenizer``
    globals and also returned.

    When CUDA is available the base model is loaded 4-bit quantized (NF4,
    bfloat16 compute) with ``device_map="auto"``; otherwise it is loaded in
    float32 on the CPU.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    global model, tokenizer

    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, trust_remote_code=True)
    # Reuse the end-of-sequence token for padding.
    tokenizer.pad_token = tokenizer.eos_token

    print("Loading base model...")
    # Only the device-specific options differ between the two paths.
    if torch.cuda.is_available():
        device_options = {
            "quantization_config": BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.bfloat16,
            ),
            "device_map": "auto",
        }
    else:
        # CPU fallback (slower but works)
        device_options = {
            "torch_dtype": torch.float32,
            "device_map": "cpu",
        }
    backbone = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL_ID,
        trust_remote_code=True,
        **device_options,
    )

    print("Loading fine-tuned adapter...")
    model = PeftModel.from_pretrained(backbone, MODEL_ID)
    model.eval()  # inference only

    print("Model loaded successfully!")
    return model, tokenizer


def generate_code(instruction: str, context: str = "", max_tokens: int = 256, temperature: float = 0.7):
    """Generate code for an instruction using the loaded model.

    The prompt consists of an ``### Instruction:`` section, an optional
    ``### Input:`` section (only when ``context`` is non-blank) and a trailing
    ``### Response:`` header for the model to complete.

    Args:
        instruction: Description of the code to produce.
        context: Optional existing code or extra input for the prompt.
        max_tokens: Upper bound on the number of newly generated tokens.
        temperature: Sampling temperature. Values <= 0 select greedy decoding
            instead of sampling.

    Returns:
        The generated text with surrounding whitespace stripped, or a short
        status message when the model is not loaded yet or the instruction is
        blank.
    """
    if model is None or tokenizer is None:
        return "Model is loading, please wait..."

    # Tolerate None coming from a caller/UI component that was left empty.
    instruction = instruction or ""
    context = context or ""
    if not instruction.strip():
        return "Please enter an instruction."

    # Build prompt
    sections = [f"### Instruction:\n{instruction}"]
    if context.strip():
        sections.append(f"### Input:\n{context}")
    sections.append("### Response:\n")
    prompt = "\n\n".join(sections)

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_len = inputs["input_ids"].shape[-1]

    gen_kwargs = {
        "max_new_tokens": int(max_tokens),
        "pad_token_id": tokenizer.eos_token_id,
    }
    if temperature > 0:
        gen_kwargs.update(do_sample=True, temperature=temperature, top_p=0.9)
    else:
        # Sampling requires a strictly positive temperature; decode greedily.
        gen_kwargs["do_sample"] = False

    with torch.no_grad():
        outputs = model.generate(**inputs, **gen_kwargs)

    # The output sequence starts with the prompt tokens. Slice them off rather
    # than splitting the decoded text on "### Response:", which would truncate
    # any completion that happens to contain that marker itself.
    completion_ids = outputs[0][prompt_len:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True).strip()


# Example prompts shown in the UI. Each row is [instruction, context]; the
# context is an empty string for prompts that need no existing code.
EXAMPLES = [
    [task, snippet]
    for task, snippet in (
        ("Write a Python function to check if a number is prime", ""),
        ("Create a JavaScript function to debounce API calls", ""),
        ("Write a SQL query to find the top 5 customers by sales", ""),
        ("Fix the bug in this code", "def factorial(n):\n    return n * factorial(n-1)"),
        ("Add error handling to this function", "def divide(a, b):\n    return a / b"),
    )
]

# Load the model eagerly, before the UI is constructed and launched.
print("Initializing Nimo's Coder Agent...")
load_model()

# Build the Gradio UI: inputs on the left, generated code on the right.
with gr.Blocks(title="Nimo's Coder Agent", theme=gr.themes.Soft()) as demo:
    # Page header with project links
    gr.Markdown(
        """
        # Nimo's Personal Coder Agent

        A fine-tuned LLM for code generation, debugging, and code review.

        **Model**: Qwen2.5-Coder-0.5B + QLoRA fine-tuned on CodeAlpaca-20k

        [GitHub](https://github.com/CaptainNimo/nimos-personal-coder-agent) |
        [Model](https://huggingface.co/CaptainNimo/nimos-coder-agent-v2)
        """
    )

    with gr.Row():
        # Left column: prompt inputs and generation settings
        with gr.Column():
            instruction = gr.Textbox(
                lines=2,
                label="What code do you need?",
                placeholder="e.g., Write a Python function to sort a list...",
            )
            context = gr.Textbox(
                lines=4,
                label="Context/Existing Code (optional)",
                placeholder="Paste code here for debugging or refactoring...",
            )
            with gr.Row():
                max_tokens = gr.Slider(
                    minimum=64,
                    maximum=512,
                    value=256,
                    step=32,
                    label="Max Length",
                )
                temperature = gr.Slider(
                    minimum=0.1,
                    maximum=1.5,
                    value=0.7,
                    step=0.1,
                    label="Creativity",
                )

            btn = gr.Button(value="Generate Code", variant="primary")

        # Right column: model output
        with gr.Column():
            output = gr.Code(language="python", label="Generated Code", lines=15)

    # Clickable examples that fill the instruction and context boxes
    gr.Examples(examples=EXAMPLES, inputs=[instruction, context])

    # Run generation with the current input values when the button is clicked
    btn.click(
        fn=generate_code,
        inputs=[instruction, context, max_tokens, temperature],
        outputs=output,
    )

    gr.Markdown(value="---\n*Fine-tuned by Nimo using QLoRA on free Google Colab GPU*")

demo.launch()