# dark-app-74 / app.py
# Uploaded to the Hugging Face Hub via huggingface_hub (commit af5ca25, verified)
import gradio as gr
import time
import os
from typing import List, Dict
class ChatbotHandler:
    """Owns the OPT-6.7B chat model and produces streamed replies.

    The heavyweight transformers/torch imports happen lazily inside
    initialize_model() so this module can still be imported (and the UI
    can show a "still loading" banner) when those packages are absent.
    """

    def __init__(self):
        # Generation knobs; the UI sliders mutate temperature/max_length
        # on this shared instance at runtime.
        self.model_name = "facebook/opt-6.7b"  # Smaller, faster 6.7B model instead of 13B
        self.tokenizer = None
        self.model = None
        self.chat_pipeline = None
        self.max_length = 512  # Reduced for speed
        self.temperature = 0.7
        self.model_loaded = False
        self.system_prompt = """You are a helpful, friendly, and knowledgeable AI assistant.
You provide clear, accurate, and thoughtful responses. You are engaging and try to be
helpful while being honest about your limitations. Always maintain a positive and
supportive tone in your conversations."""
        # Load eagerly so the app is (ideally) ready by the time the UI is built.
        self.initialize_model()

    def initialize_model(self):
        """Load tokenizer, model and text-generation pipeline.

        Returns:
            bool: True when the pipeline is ready; False when the
            transformers stack is unavailable or loading raised.
        """
        try:
            from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
            import torch
        except ImportError:
            print("Transformers library not available. Please install the required dependencies.")
            return False
        try:
            print("Loading OPT-6.7B model with 8-bit quantization... This should be faster.")
            # Configure 8-bit quantization for speed / lower VRAM use.
            quantization_config = BitsAndBytesConfig(
                load_in_8bit=True,
                llm_int8_enable_fp32_cpu_offload=True
            )
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, use_fast=True)
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                quantization_config=quantization_config,
                device_map="auto",  # Automatically distribute across available GPUs
                torch_dtype=torch.float16,
                low_cpu_mem_usage=True
            )
            # OPT ships without a pad token; fall back to EOS so padding works.
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token
            # Create pipeline for text generation with optimized settings.
            self.chat_pipeline = pipeline(
                "text-generation",
                model=self.model,
                tokenizer=self.tokenizer,
                device_map="auto",
                max_length=self.max_length,
                temperature=self.temperature,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
                truncation=True,
                use_fast=True
            )
            print("Model loaded successfully!")
            self.model_loaded = True
            return True
        except Exception as e:
            print(f"Error loading model: {str(e)}")
            return False

    def get_response(self, message: str, history: List[Dict]):
        """Generate a reply and stream it back in ~3-word chunks.

        This is a generator: it yields progressively longer prefixes of
        the final reply (the caller re-renders the last message on each
        yield).

        Args:
            message: Newest user message (not yet part of ``history``).
            history: Prior turns as {"role": ..., "content": ...} dicts.

        Yields:
            str: Growing prefix of the assistant's reply, or a single
            error/unavailable message.
        """
        if not self.chat_pipeline:
            # BUG FIX: this used to be ``return "<msg>"`` — inside a
            # generator a return value only rides on StopIteration, so
            # callers iterating the generator saw nothing at all.
            yield "Model not loaded. Please try again later."
            return
        try:
            # Build the prompt: system prompt + recent history + new message.
            conversation = self.system_prompt + "\n"
            # BUG FIX: the stated intent is "last 2 exchanges", but [-2:]
            # kept only the last 2 *messages* (one exchange). An exchange
            # is a user+assistant pair, so keep the last 4 messages.
            for msg in history[-4:]:
                if msg["role"] == "user":
                    conversation += f"User: {msg['content']}\n"
                elif msg["role"] == "assistant":
                    conversation += f"Assistant: {msg['content']}\n"
            # Add current message
            conversation += f"User: {message}\nAssistant:"
            # Generate with settings tuned for latency over length.
            start_time = time.time()
            outputs = self.chat_pipeline(
                conversation,
                max_new_tokens=50,  # Shorter responses for speed
                num_return_sequences=1,
                return_full_text=False,
                do_sample=True,
                temperature=self.temperature,
                top_p=0.9,  # Nucleus sampling for better quality
                repetition_penalty=1.1  # Reduce repetition
            )
            end_time = time.time()
            print(f"Response generated in {end_time - start_time:.2f} seconds")
            response = outputs[0]['generated_text'].strip()
            # Strip any role prefix the model may have echoed back.
            if response.startswith("Assistant:"):
                response = response[10:].strip()
            elif response.startswith("User:"):
                response = "I apologize, but I seem to have gotten confused. How can I help you?"
            # Hard cap the reply length for speed.
            if len(response) > 200:
                response = response[:200] + "..."
            # Stream in 3-word chunks with a tiny delay for smoothness.
            words = response.split()
            current_response = ""
            chunk_size = 3  # Yield every 3 words for faster streaming
            for i in range(0, len(words), chunk_size):
                chunk = words[i:i + chunk_size]
                current_response += " ".join(chunk) + " "
                yield current_response.strip()
                time.sleep(0.01)  # Very short delay for smooth streaming
        except Exception as e:
            yield f"I apologize, but I encountered an error. Please try again. Error: {str(e)}"
# Initialize chatbot handler
# Module-level singleton shared by respond_stream() and the settings
# callbacks. Constructing it here runs initialize_model(), so the (slow)
# model download/load happens at import time, before the UI is built.
chat_handler = ChatbotHandler()
def respond_stream(message: str, history: List[Dict]):
    """Stream the assistant's reply into the Gradio chatbot.

    Generator wired to the textbox submit / Send button. Each yield is a
    ``(textbox_value, updated_history)`` pair; the textbox value is always
    ``""`` so the input box is cleared while the reply streams in.

    Args:
        message: Raw text from the input box.
        history: Current chat history in "messages" format.
    """
    if not message.strip():
        # BUG FIX: this was ``return "", history`` — but this function is a
        # generator, so a return value only rides on StopIteration and the
        # UI received an empty stream (nothing updated). Yield instead.
        yield "", history
        return
    # Create a copy of history to avoid mutation issues
    current_history = history.copy()
    # Always add user message first to prevent disappearing chats
    current_history.append({"role": "user", "content": message})
    # Check if model is initialized
    if not chat_handler.chat_pipeline:
        current_history.append({"role": "assistant", "content": "The chatbot model is still loading. Please wait a moment and try again."})
        # BUG FIX: same generator-return bug as above — yield, don't return.
        yield "", current_history
        return
    # Get streaming response with error handling
    full_response = ""
    assistant_added = False
    try:
        # Exclude the just-appended user turn: get_response() adds the
        # current message to the prompt itself.
        for chunk in chat_handler.get_response(message, current_history[:-1]):
            full_response = chunk
            # First chunk appends the assistant message; later chunks
            # overwrite its content in place.
            if not assistant_added:
                current_history.append({"role": "assistant", "content": full_response})
                assistant_added = True
            else:
                current_history[-1]["content"] = full_response
            yield "", current_history
    except Exception:
        # If streaming fails, add a fallback response
        error_msg = "I apologize, but I encountered an error. Please try again."
        if not assistant_added:
            current_history.append({"role": "assistant", "content": error_msg})
        else:
            current_history[-1]["content"] = error_msg
        yield "", current_history
def clear_history():
    """Return a fresh empty message list, wiping the chatbot display."""
    return list()
def update_model_settings(temp, max_len):
    """Apply slider values to the shared ChatbotHandler instance.

    Args:
        temp: New sampling temperature.
        max_len: New maximum context length.

    Returns:
        str: Human-readable confirmation of the applied settings.

    Note: the pipeline was built with the original max_length, so the new
    value only affects attributes read at generation time (temperature).
    """
    chat_handler.temperature = temp
    chat_handler.max_length = max_len
    # BUG FIX: the confirmation string hard-coded "max_length=18,626";
    # report the value that was actually applied.
    return f"Settings updated: temp={temp}, max_length={max_len}"
# Create the interface.
# Layout: header HTML, a one-shot status banner, a settings accordion,
# the chatbot display, the input row, control buttons, examples, footer,
# then the event wiring at the bottom.
with gr.Blocks(theme=gr.themes.Soft(), title="Fast AI Chatbot with OPT-6.7B") as demo:
    # Header (static HTML banner).
    gr.HTML("""
<div style='text-align: center; padding: 20px;'>
<h1>⚡ Fast AI Chatbot</h1>
<p style='color: #666;'>Powered by OPT-6.7B with 8-bit quantization • Built with <a href='https://huggingface.co/spaces/akhaliq/anycoder' target='_blank' style='color: #007bff; text-decoration: none;'>anycoder</a></p>
</div>
""")
    # Status indicator — evaluated once at UI build time, so it reflects
    # whether the model finished loading during import, not live status.
    if chat_handler.model_loaded:
        status_msg = "✅ Chatbot is ready! Responses should take 1-3 seconds."
        status_color = "#28a745"
    else:
        status_msg = "⏳ Loading OPT-6.7B model with quantization... Should be faster than before."
        status_color = "#ffc107"
    gr.HTML(f"""
<div style='text-align: center; padding: 10px; background-color: {status_color}15; border: 1px solid {status_color}30; border-radius: 5px; margin: 10px 0;'>
<p style='color: {status_color}; margin: 0;'>{status_msg}</p>
</div>
""")
    # Model settings (collapsed by default).
    with gr.Accordion("Settings", open=False):
        with gr.Row():
            temperature = gr.Slider(
                minimum=0.1,
                maximum=2.0,
                value=0.7,
                step=0.1,
                label="Temperature",
                info="Higher values make responses more creative"
            )
            max_length = gr.Slider(
                minimum=256,
                maximum=1024,
                value=512,
                step=64,
                label="Max Length",
                info="Maximum context length (lower = faster)"
            )
    # Chatbot component in "messages" (role/content dict) format, matching
    # what respond_stream() appends.
    chatbot = gr.Chatbot(
        type="messages",
        label="Conversation",
        height=500,
        show_copy_button=True,
        bubble_full_width=False,
        avatar_images=(None, "https://huggingface.co/datasets/huggingface/avatars/resolve/main/bot-avatar.png")
    )
    # Input section: textbox plus an explicit Send button.
    with gr.Row():
        msg = gr.Textbox(
            label="Your Message",
            placeholder="Type your message here and press Enter...",
            container=False,
            scale=4
        )
        submit_btn = gr.Button("Send", variant="primary", scale=1)
    # Control buttons.
    with gr.Row():
        clear_btn = gr.Button("Clear Chat", variant="secondary")
        refresh_btn = gr.Button("Refresh Settings", variant="secondary")
    # Example questions (click-to-fill the textbox).
    with gr.Accordion("Example Questions", open=False):
        gr.Examples(
            examples=[
                "What's the difference between AI and machine learning?",
                "Can you explain quantum computing in simple terms?",
                "Help me write a professional email.",
                "What are some good books to learn programming?",
                "Can you help me brainstorm ideas for a project?",
                "Explain the concept of blockchain technology."
            ],
            inputs=msg,
            label="Click an example to start chatting"
        )
    # Footer.
    gr.HTML("""
<div style='text-align: center; padding: 10px; color: #888; font-size: 0.9em;'>
<p>This chatbot uses Meta's OPT-6.7B model with 8-bit quantization for fast responses (1-3 seconds). It's completely free to use!</p>
<p><strong>Speed optimizations:</strong> Smaller model, quantization, shorter responses, optimized parameters.</p>
</div>
""")
    # Event handlers
    # Chat functionality: Enter in the textbox and the Send button both
    # stream through respond_stream, which clears msg and updates chatbot.
    msg.submit(
        respond_stream,
        inputs=[msg, chatbot],
        outputs=[msg, chatbot]
    )
    submit_btn.click(
        respond_stream,
        inputs=[msg, chatbot],
        outputs=[msg, chatbot]
    )
    # Clear chat.
    clear_btn.click(clear_history, outputs=chatbot)
    # Update model settings whenever either slider moves.
    # NOTE(review): outputs=[] discards the confirmation string returned by
    # update_model_settings — confirm that is intended.
    temperature.change(
        update_model_settings,
        inputs=[temperature, max_length],
        outputs=[]
    )
    max_length.change(
        update_model_settings,
        inputs=[temperature, max_length],
        outputs=[]
    )
    # Refresh settings (useful for debugging).
    # NOTE(review): with outputs=[] the returned string is never shown
    # anywhere — this button currently has no visible effect.
    refresh_btn.click(
        lambda: f"Settings: temp={chat_handler.temperature}, max_length={chat_handler.max_length}",
        outputs=[]
    )
# Launch the app when run as a script (not when imported).
if __name__ == "__main__":
    # share=True requests a public Gradio link in addition to the local server.
    demo.launch(share=True)