import gradio as gr
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import torch
import os

# Custom CSS for ChatGPT-like appearance
custom_css = """
body, .gradio-container {
    background-color: #0d0d0d !important;
    color: #e5e5e5 !important;
    font-family: 'Inter', -apple-system, BlinkMacSystemFont, sans-serif !important;
}

#chatbot {
    border: none !important;
    background: transparent !important;
}

.message.user {
    background-color: #2f2f2f !important;
    border-radius: 18px !important;
    padding: 12px 16px !important;
    margin: 8px 0 !important;
    max-width: 85% !important;
    align-self: flex-end !important;
}

.message.bot {
    background-color: transparent !important;
    padding: 12px 0 !important;
    margin: 8px 0 !important;
    max-width: 90% !important;
}

#input-container {
    background: #1a1a1a !important;
    border: 1px solid #333 !important;
    border-radius: 12px !important;
    padding: 8px !important;
    margin-top: 20px !important;
}

#send-button {
    background-color: #ffffff !important;
    color: #000000 !important;
    border-radius: 8px !important;
    font-weight: 600 !important;
}

#sidebar {
    background-color: #000000 !important;
    border-right: 1px solid #222 !important;
    padding: 20px !important;
}

.gr-button-secondary {
    background-color: #222 !important;
    color: white !important;
    border: 1px solid #333 !important;
}

footer {
    display: none !important;
}
"""

# Global cache for models
models_cache = {}

def get_pipeline(model_id):
    if model_id not in models_cache:
        print(f"Loading model {model_id}...")
        try:
            pipe = pipeline(
                "text-generation",
                model=model_id,
                device_map="auto",
                trust_remote_code=True,
                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            )
            models_cache[model_id] = pipe
        except Exception as e:
            raise gr.Error(f"Failed to load model {model_id} locally: {str(e)}")
    return models_cache[model_id]

def respond(
    message,
    history,
    model_id,
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    pipe = get_pipeline(model_id)

    # Convert history to chat format for tokenizer
    messages = [{"role": "system", "content": system_message}]
    for user_msg, bot_msg in history:
        if user_msg: messages.append({"role": "user", "content": user_msg})
        if bot_msg: messages.append({"role": "assistant", "content": bot_msg})
    
    messages.append({"role": "user", "content": message})

    # Generate using the pipeline
    try:
        # Prompt construction depends on model chat template
        # Many small models use a specific format
        prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        
        # We'll use the pipeline's built-in handling but for streaming we need to do it manually or use a ThreadedGenerator
        # Since Gradio expects a generator for streaming, let's use the simplest streaming approach
        
        outputs = pipe(
            prompt,
            max_new_tokens=max_tokens,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            pad_token_id=pipe.tokenizer.eos_token_id,
        )
        
        full_response = outputs[0]['generated_text']
        # Extract only the newly generated part
        response = full_response[len(prompt):]
        yield response
        
    except Exception as e:
        yield f"Error during generation: {str(e)}"

with gr.Blocks(theme=gr.themes.Soft(primary_hue="gray"), css=custom_css) as demo:
    with gr.Row():
        # Sidebar for settings
        with gr.Column(scale=1, elem_id="sidebar"):
            gr.Markdown("## 🛠️ Settings")
            
            model_id = gr.Dropdown(
                choices=[
                    "onedevelopment/oneai-1.2-38m",
                    "onedevelopment/oneai-1-35m"
                ],
                value="onedevelopment/oneai-1.2-38m",
                label="Select Model",
                interactive=True
            )
            
            system_message = gr.Textbox(
                value="You are a helpful and advanced AI assistant named OneAI.",
                label="System Prompt",
                lines=3
            )
            
            with gr.Accordion("Advanced Parameters", open=False):
                max_tokens = gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max Tokens")
                temperature = gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature")
                top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.9, step=0.05, label="Top-p")
            
            gr.Markdown("---")
            gr.Markdown("Models run locally on Space CPU/GPU.")
            
        # Main Chat Area
        with gr.Column(scale=4):
            gr.Markdown("# 💬 OneAI Chat")
            
            chatbot = gr.Chatbot(
                height=650,
                elem_id="chatbot",
                show_label=False,
                bubble_full_width=False,
                type="messages"
            )
            
            with gr.Row(elem_id="input-container"):
                msg = gr.Textbox(
                    placeholder="Ask OneAI anything...",
                    show_label=False,
                    scale=9,
                    container=False
                )
                submit_btn = gr.Button("↑", scale=1, variant="primary", elem_id="send-button")

            gr.ClearButton([msg, chatbot], variant="secondary")

    # Linking components
    def chat_echo(message, history):
        history.append({"role": "user", "content": message})
        return "", history

    def bot_response(history, model_id, system_message, max_tokens, temperature, top_p):
        user_message = history[-1]["content"]
        
        legacy_history = []
        for i in range(0, len(history) - 1, 2):
            if i + 1 < len(history):
                legacy_history.append([history[i]["content"], history[i+1]["content"]])
        
        history.append({"role": "assistant", "content": ""})
        
        response_gen = respond(
            user_message, 
            legacy_history, 
            model_id, 
            system_message, 
            max_tokens, 
            temperature, 
            top_p
        )
        
        for partial_response in response_gen:
            history[-1]["content"] = partial_response
            yield history

    msg.submit(chat_echo, [msg, chatbot], [msg, chatbot], queue=False, api_name=False).then(
        bot_response, [chatbot, model_id, system_message, max_tokens, temperature, top_p], chatbot, api_name=False
    )
    submit_btn.click(chat_echo, [msg, chatbot], [msg, chatbot], queue=False, api_name=False).then(
        bot_response, [chatbot, model_id, system_message, max_tokens, temperature, top_p], chatbot, api_name=False
    )

if __name__ == "__main__":
    demo.launch()