"""Gradio chat UI that streams replies from a Gemma 2B text-generation pipeline."""

from threading import Thread
from typing import Dict, Iterator, List

import gradio as gr
import torch
from transformers import TextIteratorStreamer, pipeline

# ✅ Load GGUF model
# NOTE(review): this repository hosts GGUF checkpoints; transformers usually
# needs an explicit `gguf_file=` to load those. Confirm that this call actually
# resolves weights in the deployment environment.
pipe = pipeline(
    "text-generation",
    model="MaziyarPanahi/gemma-2b-it-GGUF",
    device_map="cpu",
)


def _history_to_messages(history) -> List[Dict[str, str]]:
    """Convert Gradio chat history into a list of role/content dicts.

    Two history shapes are accepted:

    * pair style: ``[(user_msg, bot_msg), ...]``
    * messages style: ``[{"role": ..., "content": ...}, ...]``

    Entries whose content is not plain text are dropped, because they are
    passed on to a text-only pipeline.
    """
    messages: List[Dict[str, str]] = []
    for turn in history or []:
        if isinstance(turn, dict):
            role = turn.get("role")
            content = turn.get("content")
            if role in ("user", "assistant") and isinstance(content, str):
                messages.append({"role": role, "content": content})
        elif isinstance(turn, (list, tuple)) and len(turn) == 2:
            user_msg, bot_msg = turn
            # Skip incomplete pairs as a whole so user/assistant turns keep
            # alternating in the resulting conversation.
            if isinstance(user_msg, str) and isinstance(bot_msg, str):
                messages.append({"role": "user", "content": user_msg})
                messages.append({"role": "assistant", "content": bot_msg})
    return messages


def generate_response(message, history) -> Iterator[str]:
    """Stream the model's reply to ``message`` given the prior ``history``.

    Args:
        message: The latest user message.
        history: Previous conversation turns as supplied by
            ``gr.ChatInterface`` (pair style or messages style).

    Yields:
        The reply text accumulated so far, growing with each streamed chunk.
        If generation fails, the text ends with an ``[Error: ...]`` marker
        instead of raising.
    """
    messages = _history_to_messages(history)
    messages.append({"role": "user", "content": message})

    streamer = TextIteratorStreamer(
        pipe.tokenizer,
        skip_prompt=True,
        skip_special_tokens=True,
    )

    generation_kwargs = dict(
        text_inputs=messages,
        streamer=streamer,
        max_new_tokens=256,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
    )

    def run_generation():
        # Runs on a worker thread so this generator can consume the streamer
        # while the pipeline is still producing text.
        try:
            with torch.no_grad():
                pipe(**generation_kwargs)
        except Exception as e:
            # Thread boundary: surface the failure in the chat and close the
            # stream, otherwise the consumer loop below would wait forever.
            print("Error:", e)
            streamer.text_queue.put(f"\n[Error: {e}]")
            streamer.end()

    Thread(target=run_generation).start()

    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        yield partial_text


# 🎨 Gradio UI
demo = gr.ChatInterface(
    fn=generate_response,
    title="Gemma 2B GGUF Chatbot",
    description="🚀 Running GGUF quantized Gemma on Hugging Face Spaces",
    examples=[
        "Explain AI simply",
        "Write Python hello world",
        "What is IoT?",
    ],
    cache_examples=False,
)

if __name__ == "__main__":
    demo.launch()