"""Gradio GUI for local text generation with a small chat LLM.

Loads TinyLlama-1.1B-Chat on demand (button-triggered) and serves a simple
text-generation interface with tunable sampling parameters. Designed for
CPU inference (float32 weights).
"""

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import time
import random

# Model configuration - compact model chosen for efficient CPU inference.
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Global model components, populated by load_model(); None until loaded.
tokenizer = None
model = None
text_generator = None


def load_model():
    """Load the tokenizer/model and build the text-generation pipeline.

    Returns:
        A status string (success or error) suitable for display in the UI.
    """
    global tokenizer, model, text_generator
    try:
        print(f"Loading model: {MODEL_NAME}")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        # BUGFIX: assign the pad token *before* building the pipeline so the
        # pipeline actually picks it up (the original assigned it afterwards,
        # too late to take effect).
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch.float32,  # float32 for CPU inference
            # NOTE(review): device_map="auto" requires `accelerate` to be
            # installed - confirm it is in the deployment requirements.
            device_map="auto",
        )
        # Default generation settings; per-request values are passed as call
        # kwargs in generate_text() and override these.
        text_generator = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            max_new_tokens=512,
            temperature=0.7,
            top_p=0.95,
            do_sample=True,
        )
        return "✅ Model loaded successfully!"
    except Exception as e:
        # Surface the failure in the status box instead of crashing the app.
        return f"❌ Error loading model: {str(e)}"


def format_prompt(prompt, system_prompt=None):
    """Wrap a user prompt in the model's chat-template delimiter tokens.

    Args:
        prompt: The user's message text.
        system_prompt: Optional system instruction; omitted from the template
            when falsy (None or empty string).

    Returns:
        The delimited prompt string ending with the assistant turn marker.
    """
    if system_prompt:
        return f"<|system|>\n{system_prompt}\n<|user|>\n{prompt}\n<|assistant|>"
    return f"<|user|>\n{prompt}\n<|assistant|>"


def generate_text(
    prompt,
    max_length=200,
    temperature=0.7,
    top_p=0.95,
    repetition_penalty=1.1,
    system_prompt="You are a helpful AI assistant. Provide clear and concise answers."
):
    """Generate a response for `prompt` using the loaded pipeline.

    Args:
        prompt: User input text; blank input yields a warning message.
        max_length: Maximum number of new tokens to generate.
        temperature: Sampling temperature.
        top_p: Nucleus-sampling probability mass.
        repetition_penalty: Penalty applied to repeated tokens.
        system_prompt: System instruction prepended via the chat template.

    Returns:
        Markdown-formatted response text (or a warning/error message).
    """
    global text_generator

    # Guard clauses: model not loaded yet, or empty prompt.
    if text_generator is None:
        return "⚠️ Please load the model first using the 'Load Model' button."
    if not prompt.strip():
        return "⚠️ Please enter a prompt."

    try:
        formatted_prompt = format_prompt(prompt, system_prompt)

        # BUGFIX: removed the original's assignments to attributes of the
        # pipeline object (text_generator.max_new_tokens = ... etc.) - they
        # have no effect; generation parameters are the call kwargs below.
        start_time = time.time()
        result = text_generator(
            formatted_prompt,
            max_new_tokens=max_length,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
        generation_time = time.time() - start_time

        # The pipeline echoes the prompt; keep only the assistant's turn.
        generated_text = result[0]["generated_text"]
        if "<|assistant|>" in generated_text:
            response = generated_text.split("<|assistant|>")[-1].strip()
        else:
            response = generated_text

        return (
            f"**Response:**\n{response}\n\n---\n"
            f"*Generated in {generation_time:.2f} seconds*"
        )
    except Exception as e:
        return f"❌ Error during generation: {str(e)}"


def clear_chat():
    """Reset the prompt box and the output panel (two outputs)."""
    return "", ""


# Custom theme for the interface.
custom_theme = gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="indigo",
    neutral_hue="slate",
    font=gr.themes.GoogleFont("Inter"),
    text_size="lg",
    spacing_size="lg",
    radius_size="md",
).set(
    button_primary_background_fill="*primary_600",
    button_primary_background_fill_hover="*primary_700",
    block_title_text_weight="600",
)

# Build the Gradio interface.
# BUGFIX: the theme must be passed to gr.Blocks(), not to demo.launch();
# the original built custom_theme but never applied it.
with gr.Blocks(theme=custom_theme) as demo:
    gr.Markdown(
        """
        # 🤖 Smol LLM Inference GUI

        **Built with [anycoder](https://huggingface.co/spaces/akhaliq/anycoder)** - Efficient text generation using TinyLlama

        This application runs a compact language model locally for text generation.
        Perfect for chat, completion tasks, and creative writing.
        """
    )

    with gr.Row():
        with gr.Column(scale=2):
            # Model loading section.
            with gr.Group():
                gr.Markdown("### 📦 Model Management")
                model_status = gr.Textbox(
                    label="Model Status",
                    value="Model not loaded. Click 'Load Model' to start.",
                    interactive=False,
                )
                load_btn = gr.Button(
                    "🔄 Load Model",
                    variant="primary",
                    size="lg",
                )

            # Generation parameters.
            gr.Markdown("### ⚙️ Generation Parameters")
            with gr.Row():
                max_length = gr.Slider(
                    minimum=50,
                    maximum=1024,
                    value=200,
                    step=50,
                    label="Max Tokens",
                )
                temperature = gr.Slider(
                    minimum=0.1,
                    maximum=2.0,
                    value=0.7,
                    step=0.1,
                    label="Temperature",
                )
            with gr.Row():
                top_p = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.95,
                    step=0.05,
                    label="Top-p",
                )
                repetition_penalty = gr.Slider(
                    minimum=1.0,
                    maximum=2.0,
                    value=1.1,
                    step=0.1,
                    label="Repetition Penalty",
                )

            system_prompt = gr.Textbox(
                label="System Prompt",
                value="You are a helpful AI assistant. Provide clear and concise answers.",
                lines=3,
                placeholder="Enter a system prompt to guide the model's behavior...",
            )

        with gr.Column(scale=3):
            # Main generation interface.
            with gr.Group():
                gr.Markdown("### 💬 Text Generation")
                prompt_input = gr.Textbox(
                    label="Enter your prompt",
                    placeholder="Type your message here...",
                    lines=4,
                    autofocus=True,
                )
                with gr.Row():
                    generate_btn = gr.Button(
                        "🚀 Generate",
                        variant="primary",
                        size="lg",
                    )
                    clear_btn = gr.Button(
                        "🗑️ Clear",
                        variant="secondary",
                    )
                output_text = gr.Markdown(
                    label="Generated Response",
                    value="*Response will appear here...*",
                )

            # Example prompts.
            with gr.Accordion("📝 Example Prompts", open=False):
                gr.Examples(
                    examples=[
                        ["Write a short story about a robot discovering music."],
                        ["Explain quantum computing in simple terms."],
                        ["Create a poem about the changing seasons."],
                        ["What are the benefits of renewable energy?"],
                        ["Write a Python function to calculate fibonacci numbers."],
                        ["Describe the perfect day in your own words."],
                        ["Explain the concept of machine learning to a beginner."],
                        ["Create a dialogue between two friends planning a trip."],
                    ],
                    inputs=[prompt_input],
                    label="Click an example to get started",
                )

    # Footer links (the original passed a non-existent `footer_links` kwarg
    # to launch(); rendered here as markdown instead so the links survive).
    gr.Markdown(
        "[Built with anycoder](https://huggingface.co/spaces/akhaliq/anycoder) | "
        "[TinyLlama Model](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0) | "
        "[Gradio](https://gradio.app)"
    )

    # Event handlers.
    # BUGFIX: removed the invalid `api_visibility` kwarg from every event
    # binding - Gradio event methods do not accept it and raise TypeError.
    load_btn.click(
        fn=load_model,
        outputs=[model_status],
    )

    generate_btn.click(
        fn=generate_text,
        inputs=[
            prompt_input,
            max_length,
            temperature,
            top_p,
            repetition_penalty,
            system_prompt,
        ],
        outputs=[output_text],
    )

    # BUGFIX: clear_chat returns two values; wire both the prompt box and
    # the output panel (the original wired only prompt_input).
    clear_btn.click(
        fn=clear_chat,
        outputs=[prompt_input, output_text],
    )

    # Allow Enter key to trigger generation.
    prompt_input.submit(
        fn=generate_text,
        inputs=[
            prompt_input,
            max_length,
            temperature,
            top_p,
            repetition_penalty,
            system_prompt,
        ],
        outputs=[output_text],
    )

# Launch the application.
demo.launch(
    share=False,
    show_error=True,
)