"""Gradio chat app serving Meta's OPT-6.7B (8-bit quantized) with streamed replies."""

import gradio as gr
import time
import os  # NOTE(review): unused in this file; kept in case external tooling relies on it
from typing import Iterator, List, Dict


class ChatbotHandler:
    """Owns the OPT-6.7B tokenizer/model/pipeline and produces streamed replies.

    The model is loaded eagerly in ``__init__`` so the UI can report readiness
    through ``model_loaded``. If transformers is missing or loading fails, the
    handler stays in a degraded state (``chat_pipeline is None``) and callers
    fall back to a "still loading" message.
    """

    def __init__(self):
        self.model_name = "facebook/opt-6.7b"  # Smaller, faster 6.7B model instead of 13B
        self.tokenizer = None
        self.model = None
        self.chat_pipeline = None
        self.max_length = 512  # Reduced for speed
        self.temperature = 0.7
        self.model_loaded = False
        self.system_prompt = """You are a helpful, friendly, and knowledgeable AI assistant. You provide clear, accurate, and thoughtful responses. You are engaging and try to be helpful while being honest about your limitations. Always maintain a positive and supportive tone in your conversations."""
        # Initialize the model
        self.initialize_model()

    def initialize_model(self) -> bool:
        """Initialize the Hugging Face model with quantization for speed.

        Returns:
            bool: True when the pipeline is ready; False when transformers is
            unavailable or loading raised.
        """
        try:
            from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
            import torch
        except ImportError:
            print("Transformers library not available. Please install the required dependencies.")
            return False

        try:
            # BUG FIX: this message was split by a raw newline inside the string
            # literal, which is a SyntaxError in Python.
            print("Loading OPT-6.7B model with 8-bit quantization... This should be faster.")

            # Configure 8-bit quantization for speed
            quantization_config = BitsAndBytesConfig(
                load_in_8bit=True,
                llm_int8_enable_fp32_cpu_offload=True,
            )

            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, use_fast=True)
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                quantization_config=quantization_config,
                device_map="auto",  # Automatically distribute across available GPUs
                torch_dtype=torch.float16,
                low_cpu_mem_usage=True,
            )

            # Set pad token if not present (OPT ships without one)
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token

            # Create pipeline for text generation with optimized settings
            self.chat_pipeline = pipeline(
                "text-generation",
                model=self.model,
                tokenizer=self.tokenizer,
                device_map="auto",
                max_length=self.max_length,
                temperature=self.temperature,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
                truncation=True,
                use_fast=True,
            )

            print("Model loaded successfully!")
            self.model_loaded = True
            return True
        except Exception as e:
            print(f"Error loading model: {str(e)}")
            return False

    def get_response(self, message: str, history: List[Dict]) -> Iterator[str]:
        """Yield progressively longer prefixes of the model's reply.

        This is a generator: Gradio consumes the yielded strings to simulate
        streaming. ``history`` is a list of ``{"role", "content"}`` dicts; only
        the last two entries are folded into the prompt for speed.
        """
        if not self.chat_pipeline:
            # BUG FIX: this function is a generator, so a bare `return value`
            # silently discards the message — it must be yielded to reach callers.
            yield "Model not loaded. Please try again later."
            return

        try:
            # Prepare conversation history as a single prompt string
            # (limit to last 2 messages for speed)
            conversation = self.system_prompt + "\n"
            for msg in history[-2:]:
                if msg["role"] == "user":
                    conversation += f"User: {msg['content']}\n"
                elif msg["role"] == "assistant":
                    conversation += f"Assistant: {msg['content']}\n"

            # Add current message
            conversation += f"User: {message}\nAssistant:"

            # Generate response with settings tuned for latency
            start_time = time.time()
            outputs = self.chat_pipeline(
                conversation,
                max_new_tokens=50,  # Shorter responses for speed
                num_return_sequences=1,
                return_full_text=False,
                do_sample=True,
                temperature=self.temperature,
                top_p=0.9,  # Add top_p for better quality
                repetition_penalty=1.1,  # Reduce repetition
            )
            end_time = time.time()
            print(f"Response generated in {end_time - start_time:.2f} seconds")

            response = outputs[0]['generated_text'].strip()

            # Clean up response: strip a leaked role prefix, or bail out if the
            # model started speaking as the user.
            if response.startswith("Assistant:"):
                response = response[len("Assistant:"):].strip()
            elif response.startswith("User:"):
                response = "I apologize, but I seem to have gotten confused. How can I help you?"

            # Limit response length for speed
            if len(response) > 200:
                response = response[:200] + "..."

            # Stream in 3-word chunks for a fast but smooth typing effect
            words = response.split()
            current_response = ""
            chunk_size = 3
            for i in range(0, len(words), chunk_size):
                chunk = words[i:i + chunk_size]
                current_response += " ".join(chunk) + " "
                yield current_response.strip()
                time.sleep(0.01)  # Very short delay for smooth streaming
        except Exception as e:
            # BUG FIX: this string literal was broken across a raw newline.
            yield f"I apologize, but I encountered an error. Please try again. Error: {str(e)}"


# Initialize chatbot handler (module-level singleton shared by all handlers)
chat_handler = ChatbotHandler()


def respond_stream(message: str, history: List[Dict]):
    """Generator event handler: yields ("", updated_history) tuples for Gradio.

    Copies ``history`` before mutating so the component state is never aliased,
    and appends the user turn immediately so it cannot disappear mid-stream.
    """
    if not message.strip():
        # BUG FIX: this function is a generator; `return "", history` never
        # reaches Gradio. Yield the unchanged state instead, then stop.
        yield "", history
        return

    # Create a copy of history to avoid mutation issues
    current_history = history.copy()

    # Always add user message first to prevent disappearing chats
    current_history.append({"role": "user", "content": message})

    # Check if model is initialized
    if not chat_handler.chat_pipeline:
        current_history.append({"role": "assistant", "content": "The chatbot model is still loading. Please wait a moment and try again."})
        # BUG FIX: same generator issue — yield, don't return, the update.
        yield "", current_history
        return

    # Get streaming response with error handling
    full_response = ""
    assistant_added = False
    try:
        # Don't include the just-appended user message in the model context
        for chunk in chat_handler.get_response(message, current_history[:-1]):
            full_response = chunk
            # Add the assistant message on the first chunk, then update in place
            if not assistant_added:
                current_history.append({"role": "assistant", "content": full_response})
                assistant_added = True
            else:
                current_history[-1]["content"] = full_response
            yield "", current_history
    except Exception:
        # If streaming fails, surface a fallback response instead of crashing
        error_msg = "I apologize, but I encountered an error. Please try again."
        if not assistant_added:
            current_history.append({"role": "assistant", "content": error_msg})
        else:
            current_history[-1]["content"] = error_msg
        yield "", current_history


def clear_history():
    """Clear the chat history."""
    return []


def update_model_settings(temp, max_len):
    """Apply slider values to the shared handler and describe the new state."""
    chat_handler.temperature = temp
    chat_handler.max_length = max_len
    # BUG FIX: the confirmation string hard-coded "max_length=17,366" instead
    # of echoing the value actually applied.
    return f"Settings updated: temp={temp}, max_length={max_len}"


# Create the interface
with gr.Blocks(theme=gr.themes.Soft(), title="Fast AI Chatbot with OPT-6.7B") as demo:
    # Header
    # NOTE(review): the original HTML markup was lost in extraction (only the
    # visible text survived); this reconstructs a minimal equivalent — confirm
    # against the original styling if available.
    gr.HTML("""
    <div style="text-align: center;">
        <h1>⚡ Fast AI Chatbot</h1>
        <p>Powered by OPT-6.7B with 8-bit quantization • Built with anycoder</p>
    </div>
    """)

    # Status indicator
    if chat_handler.model_loaded:
        status_msg = "✅ Chatbot is ready! Responses should take 1-3 seconds."
        status_color = "#28a745"
    else:
        status_msg = "⏳ Loading OPT-6.7B model with quantization... Should be faster than before."
        status_color = "#ffc107"

    gr.HTML(f"""
    <div style="text-align: center; color: {status_color};">
        {status_msg}
    </div>
    """)

    # Model settings
    with gr.Accordion("Settings", open=False):
        with gr.Row():
            temperature = gr.Slider(
                minimum=0.1,
                maximum=2.0,
                value=0.7,
                step=0.1,
                label="Temperature",
                info="Higher values make responses more creative",
            )
            max_length = gr.Slider(
                minimum=256,
                maximum=1024,
                value=512,
                step=64,
                label="Max Length",
                info="Maximum context length (lower = faster)",
            )

    # Chatbot component
    chatbot = gr.Chatbot(
        type="messages",
        label="Conversation",
        height=500,
        show_copy_button=True,
        bubble_full_width=False,
        avatar_images=(None, "https://huggingface.co/datasets/huggingface/avatars/resolve/main/bot-avatar.png"),
    )

    # Input section
    with gr.Row():
        msg = gr.Textbox(
            label="Your Message",
            placeholder="Type your message here and press Enter...",
            container=False,
            scale=4,
        )
        submit_btn = gr.Button("Send", variant="primary", scale=1)

    # Control buttons
    with gr.Row():
        clear_btn = gr.Button("Clear Chat", variant="secondary")
        refresh_btn = gr.Button("Refresh Settings", variant="secondary")

    # Example questions
    with gr.Accordion("Example Questions", open=False):
        gr.Examples(
            examples=[
                "What's the difference between AI and machine learning?",
                "Can you explain quantum computing in simple terms?",
                "Help me write a professional email.",
                "What are some good books to learn programming?",
                "Can you help me brainstorm ideas for a project?",
                "Explain the concept of blockchain technology.",
            ],
            inputs=msg,
            label="Click an example to start chatting",
        )

    # Footer (markup reconstructed — see NOTE above)
    gr.HTML("""
    <div style="text-align: center;">
        <p>This chatbot uses Meta's OPT-6.7B model with 8-bit quantization for fast responses (1-3 seconds). It's completely free to use!</p>
        <p>Speed optimizations: Smaller model, quantization, shorter responses, optimized parameters.</p>
    </div>
    """)

    # Event handlers

    # Chat functionality (Enter key and Send button share one streaming handler)
    msg.submit(
        respond_stream,
        inputs=[msg, chatbot],
        outputs=[msg, chatbot],
    )
    submit_btn.click(
        respond_stream,
        inputs=[msg, chatbot],
        outputs=[msg, chatbot],
    )

    # Clear chat
    clear_btn.click(clear_history, outputs=chatbot)

    # Update model settings whenever either slider moves
    temperature.change(
        update_model_settings,
        inputs=[temperature, max_length],
        outputs=[],
    )
    max_length.change(
        update_model_settings,
        inputs=[temperature, max_length],
        outputs=[],
    )

    # Refresh settings (useful for debugging)
    refresh_btn.click(
        lambda: f"Settings: temp={chat_handler.temperature}, max_length={chat_handler.max_length}",
        outputs=[],
    )


# Launch the app
if __name__ == "__main__":
    demo.launch(share=True)