"""Gradio Space comparing base TinyLlama vs. a LoRA math fine-tune on GSM8K.

Loads both models lazily on the first GPU call, generates an answer to a
math word problem from each, and shows them side by side next to the
reference answer from the GSM8K test split.
"""

import spaces
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from datasets import load_dataset

# Model configuration
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
ADAPTER_PATH = "sumedh/tinyllama-lora-math-adapter-v3"

# Load tokenizer globally — both models share the same base vocabulary.
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

# Load the first 50 GSM8K test examples for the "Load Example" feature.
print("Loading test dataset...")
test_data = load_dataset("openai/gsm8k", "main", split="test[:50]")

# Models are loaded lazily on the first GPU call to keep Space startup fast.
base_model = None
tuned_model = None


def load_models():
    """Load both models onto the GPU if not already resident.

    Returns:
        tuple: ``(base_model, tuned_model)``, each in eval mode. The tuned
        model has its LoRA weights merged so inference needs no adapter hooks.
    """
    global base_model, tuned_model

    if base_model is None:
        print("Loading base model...")
        base_model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True,
        ).eval()

    if tuned_model is None:
        print("Loading fine-tuned model...")
        tmp_model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True,
        )
        tuned_model = PeftModel.from_pretrained(tmp_model, ADAPTER_PATH)
        # Merge LoRA deltas into the base weights and drop the adapter wrapper.
        tuned_model = tuned_model.merge_and_unload().eval()

    print("Models loaded!")
    return base_model, tuned_model


def _generate(model, prompt):
    """Generate a completion for *prompt* with *model* and return the answer text.

    Tokenizes on the model's own device (the two models may be placed
    differently by ``device_map="auto"``), decodes the full sequence, and
    strips everything up to and including the last ``### Response:`` marker
    since the decoded text echoes the prompt.
    """
    token_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
    with torch.no_grad():
        output = model.generate(
            token_ids,
            max_new_tokens=256,
            pad_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.1,
        )
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    if "### Response:" in response:
        response = response.split("### Response:")[-1].strip()
    return response


@spaces.GPU
def generate_responses(question):
    """Generate responses from both models - runs on GPU.

    Args:
        question: The math word problem to answer.

    Returns:
        tuple: ``(base_response, tuned_response)``; an error message and an
        empty string when *question* is blank.
    """
    if not question.strip():
        return "Please enter a question.", ""

    # Load models if not already loaded (first call pays the ~30s cost).
    base, tuned = load_models()

    prompt = f"### Instruction:\n{question}\n### Response:\n"
    # Same prompt through both models; shared helper keeps the pipelines identical.
    return _generate(base, prompt), _generate(tuned, prompt)


def load_example(idx):
    """Return the (question, answer) pair at *idx* from the test dataset.

    Out-of-range indices yield a pair of empty strings rather than raising.
    """
    idx = int(idx)
    if 0 <= idx < len(test_data):
        question = test_data[idx]["question"]
        answer = test_data[idx]["answer"]
        return question, answer
    return "", ""


def run_comparison(question, reference):
    """Run the full comparison and pass the reference answer through.

    Returns:
        tuple: ``(base_response, tuned_response, reference)``.
    """
    if not question.strip():
        return "Please enter a question.", "", ""

    base_response, tuned_response = generate_responses(question)
    return base_response, tuned_response, reference


# Create Gradio interface
with gr.Blocks(title="TinyLlama Math Fine-tuning Demo") as demo:
    gr.Markdown("""
    # 🧮 TinyLlama Math Fine-tuning Demo

    Compare the performance of **base TinyLlama** vs **fine-tuned TinyLlama** on math word problems.

    - **Base Model**: TinyLlama-1.1B-Chat-v1.0 (no math training)
    - **Fine-tuned Model**: LoRA adapter trained on GSM8K dataset (7,473 examples)

    *Note: First run may take ~30s to load models.*
    """)

    with gr.Row():
        with gr.Column(scale=2):
            question_input = gr.Textbox(
                label="Math Question",
                placeholder="Enter a math word problem...",
                lines=4,
            )
            reference_input = gr.Textbox(
                label="Reference Answer (optional)",
                placeholder="The correct answer will appear here when loading examples",
                lines=4,
            )
        with gr.Column(scale=1):
            gr.Markdown("### Load Example")
            example_slider = gr.Slider(
                minimum=0,
                maximum=49,
                step=1,
                value=0,
                label="Example Index (0-49)",
            )
            load_btn = gr.Button("Load Example", variant="secondary")

    compare_btn = gr.Button("Compare Models", variant="primary", size="lg")

    gr.Markdown("---")

    with gr.Row():
        with gr.Column():
            gr.Markdown("### 🔴 Base Model Response")
            base_output = gr.Textbox(label="", lines=10, show_label=False)
        with gr.Column():
            gr.Markdown("### 🟢 Fine-tuned Model Response")
            tuned_output = gr.Textbox(label="", lines=10, show_label=False)
        with gr.Column():
            gr.Markdown("### ✅ Correct Answer")
            reference_output = gr.Textbox(label="", lines=10, show_label=False)

    # Event handlers
    load_btn.click(
        fn=load_example,
        inputs=[example_slider],
        outputs=[question_input, reference_input],
    )
    compare_btn.click(
        fn=run_comparison,
        inputs=[question_input, reference_input],
        outputs=[base_output, tuned_output, reference_output],
    )

    gr.Markdown("""
    ---
    ### About
    This demo showcases the effect of fine-tuning a small language model (TinyLlama 1.1B) on math word problems.

    - **Dataset**: [GSM8K](https://huggingface.co/datasets/openai/gsm8k) - Grade School Math 8K
    - **Method**: LoRA (Low-Rank Adaptation)
    - **Training**: 5 epochs on 7,473 examples
    """)

if __name__ == "__main__":
    demo.launch()