import gradio as gr
import torch
from threading import Thread
from transformers import pipeline, TextIteratorStreamer

# Build the shared text-generation pipeline once, at import time, on the CPU.
# NOTE(review): the repo id points at a GGUF checkpoint; the transformers GGUF
# docs describe loading such weights via a `gguf_file` argument -- confirm this
# call resolves the model without one.
pipe = pipeline(
    task="text-generation",
    device_map="cpu",
    model="MaziyarPanahi/gemma-2b-it-GGUF",
)

def _content_to_text(content):
    """Flatten one chat message's content into plain text.

    Accepts a plain string, a structured part such as
    ``{"type": "text", "text": "..."}``, or a list of such parts/strings.
    Anything that carries no text (``None``, file parts, unknown objects)
    yields an empty string.
    """
    if isinstance(content, str):
        return content
    if isinstance(content, dict):
        text = content.get("text")
        return text if isinstance(text, str) else ""
    if isinstance(content, list):
        return "".join(_content_to_text(part) for part in content)
    return ""


def _history_to_messages(history):
    """Convert a Gradio chat history into a list of role/content dicts.

    Supports both history layouts: ``(user, assistant)`` pairs and
    messages-style ``{"role": ..., "content": ...}`` dicts. Entries with no
    text, or with a role other than ``user``/``assistant``, are skipped so
    they are never handed to the chat template.
    """
    messages = []
    for turn in history or []:
        if isinstance(turn, dict):
            # Messages-style entry: unpacking it as a pair would iterate the
            # dict's *keys*, so read the fields explicitly instead.
            pairs = [(turn.get("role"), turn.get("content"))]
        else:
            user_msg, bot_msg = turn
            pairs = [("user", user_msg), ("assistant", bot_msg)]

        for role, content in pairs:
            text = _content_to_text(content)
            if role in ("user", "assistant") and text:
                messages.append({"role": role, "content": text})
    return messages


def generate_response(message, history):
    """Stream the model's reply to ``message`` given the prior ``history``.

    Args:
        message: The new user message.
        history: Previous turns, either as ``(user, assistant)`` pairs or as
            messages-style dicts with ``role`` and ``content`` keys.

    Yields:
        The reply text accumulated so far, once per chunk produced by the
        streamer. If generation raises, the error text is appended to the
        stream as ``[Error: ...]`` instead of propagating.
    """
    messages = _history_to_messages(history)
    messages.append({"role": "user", "content": message})

    streamer = TextIteratorStreamer(
        pipe.tokenizer,
        skip_prompt=True,
        skip_special_tokens=True
    )

    generation_kwargs = dict(
        text_inputs=messages,
        streamer=streamer,
        max_new_tokens=256,
        temperature=0.7,
        top_p=0.9,
        do_sample=True
    )

    def run_generation():
        try:
            with torch.no_grad():
                pipe(**generation_kwargs)
        except Exception as e:
            print("Error:", e)
            # Surface the failure in the chat and close the stream so the
            # consuming loop below does not wait forever.
            streamer.text_queue.put(f"\n[Error: {e}]")
            streamer.end()

    # Generation runs on a worker thread; this generator consumes the
    # streamer's chunks as they arrive.
    Thread(target=run_generation).start()

    partial_text = ""

    for new_text in streamer:
        partial_text += new_text
        yield partial_text


# Chat UI: each submitted message is answered by streaming generate_response.
demo = gr.ChatInterface(
    generate_response,
    examples=["Explain AI simply", "Write Python hello world", "What is IoT?"],
    cache_examples=False,
    title="Gemma 2B GGUF Chatbot",
    description="🚀 Running GGUF quantized Gemma on Hugging Face Spaces",
)

# Launch the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()