# Fast AI Chatbot — Gradio Space running OPT-6.7B with 8-bit quantization.
import os
import time
from typing import Dict, Iterator, List

import gradio as gr
class ChatbotHandler:
    """Loads an 8-bit-quantized OPT-6.7B causal LM and streams chat replies.

    Attributes are populated by initialize_model(); if loading fails (missing
    dependencies, no GPU, download error) chat_pipeline stays None and
    get_response degrades gracefully.
    """

    def __init__(self):
        self.model_name = "facebook/opt-6.7b"  # Smaller, faster 6.7B model instead of 13B
        self.tokenizer = None
        self.model = None
        self.chat_pipeline = None
        self.max_length = 512  # Reduced context length for speed
        self.temperature = 0.7
        self.model_loaded = False
        self.system_prompt = """You are a helpful, friendly, and knowledgeable AI assistant.
You provide clear, accurate, and thoughtful responses. You are engaging and try to be
helpful while being honest about your limitations. Always maintain a positive and
supportive tone in your conversations."""
        # Eagerly load the model so the app is ready (or visibly loading) at startup.
        self.initialize_model()

    def initialize_model(self):
        """Load the tokenizer, quantized model, and text-generation pipeline.

        Returns:
            bool: True on success; False if dependencies are missing or any
            step of loading raises.
        """
        try:
            from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
            import torch
        except ImportError:
            print("Transformers library not available. Please install the required dependencies.")
            return False
        try:
            print("Loading OPT-6.7B model with 8-bit quantization... This should be faster.")
            # 8-bit weights cut GPU memory roughly in half; fp32 CPU offload
            # lets oversized layers spill to CPU instead of failing.
            quantization_config = BitsAndBytesConfig(
                load_in_8bit=True,
                llm_int8_enable_fp32_cpu_offload=True
            )
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, use_fast=True)
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                quantization_config=quantization_config,
                device_map="auto",  # Automatically distribute across available GPUs
                torch_dtype=torch.float16,
                low_cpu_mem_usage=True
            )
            # OPT ships without a dedicated pad token; reuse EOS so padding /
            # truncation in the pipeline works.
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token
            # BUG FIX: do not pass device_map="auto" here — the model object is
            # already dispatched by from_pretrained, and re-specifying a device
            # for an accelerate-dispatched model conflicts. Also dropped
            # use_fast=True: it is a tokenizer-loading option, meaningless when
            # a tokenizer instance is supplied.
            self.chat_pipeline = pipeline(
                "text-generation",
                model=self.model,
                tokenizer=self.tokenizer,
                max_length=self.max_length,
                temperature=self.temperature,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
                truncation=True
            )
            print("Model loaded successfully!")
            self.model_loaded = True
            return True
        except Exception as e:
            print(f"Error loading model: {str(e)}")
            return False

    def get_response(self, message: str, history: List[Dict]) -> Iterator[str]:
        """Stream a reply to *message*, given recent *history* (messages format).

        Yields progressively longer prefixes of the response so the UI can
        render a typing effect.
        """
        if not self.chat_pipeline:
            # BUG FIX: this function is a generator, so the original
            # `return "Model not loaded..."` discarded the message (a
            # generator's return value is never yielded). Yield it instead.
            yield "Model not loaded. Please try again later."
            return
        try:
            # Build a single prompt string: system prompt + recent turns.
            conversation = self.system_prompt + "\n"
            # Only the last two messages are kept to bound prompt length for speed.
            for msg in history[-2:]:
                if msg["role"] == "user":
                    conversation += f"User: {msg['content']}\n"
                elif msg["role"] == "assistant":
                    conversation += f"Assistant: {msg['content']}\n"
            # Add the current user turn and cue the model to answer.
            conversation += f"User: {message}\nAssistant:"
            start_time = time.time()
            outputs = self.chat_pipeline(
                conversation,
                max_new_tokens=50,  # Shorter responses for speed
                num_return_sequences=1,
                return_full_text=False,
                do_sample=True,
                temperature=self.temperature,
                top_p=0.9,  # Nucleus sampling for better quality
                repetition_penalty=1.1  # Reduce repetition
            )
            end_time = time.time()
            print(f"Response generated in {end_time - start_time:.2f} seconds")
            response = outputs[0]['generated_text'].strip()
            # Strip any role prefix the model echoed back.
            if response.startswith("Assistant:"):
                response = response[len("Assistant:"):].strip()
            elif response.startswith("User:"):
                response = "I apologize, but I seem to have gotten confused. How can I help you?"
            # Hard cap on response length for speed.
            if len(response) > 200:
                response = response[:200] + "..."
            # Stream the reply a few words at a time for a smooth typing effect.
            words = response.split()
            current_response = ""
            chunk_size = 3  # Yield every 3 words for faster streaming
            for i in range(0, len(words), chunk_size):
                chunk = words[i:i + chunk_size]
                current_response += " ".join(chunk) + " "
                yield current_response.strip()
                time.sleep(0.01)  # Very short delay for smooth streaming
        except Exception as e:
            yield f"I apologize, but I encountered an error. Please try again. Error: {str(e)}"
# Module-level singleton; attempts to load the model at import time so the UI
# below can report ready/loading status via chat_handler.model_loaded.
chat_handler = ChatbotHandler()
def respond_stream(message: str, history: List[Dict]):
    """Stream the chatbot's reply, yielding (textbox_value, history) pairs.

    The first element is always "" so the input box is cleared; the second is
    the messages-format list that Gradio's Chatbot component renders.
    """
    # BUG FIX: this function is a generator (it contains `yield`), so the
    # original bare `return "", history` statements produced NOTHING — Gradio
    # received no update at all. Each exit path must yield its result.
    if not message.strip():
        yield "", history
        return
    # Work on a copy so Gradio's state object is never mutated in place.
    current_history = history.copy()
    # Record the user turn immediately so it never disappears from the UI.
    current_history.append({"role": "user", "content": message})
    # Model still loading: answer with a static notice.
    if not chat_handler.chat_pipeline:
        current_history.append({"role": "assistant", "content": "The chatbot model is still loading. Please wait a moment and try again."})
        yield "", current_history
        return
    full_response = ""
    assistant_added = False
    try:
        # Exclude the just-appended user message from context; get_response
        # appends the current message to the prompt itself.
        for chunk in chat_handler.get_response(message, current_history[:-1]):
            full_response = chunk
            # First chunk appends the assistant turn; later chunks update it.
            if not assistant_added:
                current_history.append({"role": "assistant", "content": full_response})
                assistant_added = True
            else:
                current_history[-1]["content"] = full_response
            yield "", current_history
    except Exception:
        # If streaming fails, surface a static apology so the UI never hangs.
        error_msg = "I apologize, but I encountered an error. Please try again."
        if not assistant_added:
            current_history.append({"role": "assistant", "content": error_msg})
        else:
            current_history[-1]["content"] = error_msg
        yield "", current_history
def clear_history():
    """Reset the Chatbot component to an empty conversation."""
    empty_conversation = []
    return empty_conversation
def update_model_settings(temp, max_len):
    """Apply the Settings sliders to the shared ChatbotHandler.

    Note: temperature is re-read per generation in get_response; max_length
    only takes effect if the pipeline is rebuilt.
    """
    chat_handler.temperature = temp
    chat_handler.max_length = max_len
    # BUG FIX: the status string previously hard-coded a bogus value
    # ("max_length=23,370") instead of interpolating the new setting.
    return f"Settings updated: temp={temp}, max_length={max_len}"
# Build the Gradio interface; `demo` is launched by the __main__ guard below.
# (Indentation of this whole section was reconstructed — the source arrived flattened.)
with gr.Blocks(theme=gr.themes.Soft(), title="Fast AI Chatbot with OPT-6.7B") as demo:
    # Header
    gr.HTML("""
    <div style='text-align: center; padding: 20px;'>
        <h1>⚡ Fast AI Chatbot</h1>
        <p style='color: #666;'>Powered by OPT-6.7B with 8-bit quantization • Built with <a href='https://huggingface.co/spaces/akhaliq/anycoder' target='_blank' style='color: #007bff; text-decoration: none;'>anycoder</a></p>
    </div>
    """)

    # Status banner reflecting whether the model finished loading at import time.
    if chat_handler.model_loaded:
        status_msg = "✅ Chatbot is ready! Responses should take 1-3 seconds."
        status_color = "#28a745"
    else:
        status_msg = "⏳ Loading OPT-6.7B model with quantization... Should be faster than before."
        status_color = "#ffc107"
    gr.HTML(f"""
    <div style='text-align: center; padding: 10px; background-color: {status_color}15; border: 1px solid {status_color}30; border-radius: 5px; margin: 10px 0;'>
        <p style='color: {status_color}; margin: 0;'>{status_msg}</p>
    </div>
    """)

    # Generation settings exposed to the user.
    with gr.Accordion("Settings", open=False):
        with gr.Row():
            temperature = gr.Slider(
                minimum=0.1,
                maximum=2.0,
                value=0.7,
                step=0.1,
                label="Temperature",
                info="Higher values make responses more creative"
            )
            max_length = gr.Slider(
                minimum=256,
                maximum=1024,
                value=512,
                step=64,
                label="Max Length",
                info="Maximum context length (lower = faster)"
            )

    # Conversation display (messages format: list of {"role", "content"} dicts,
    # matching what respond_stream yields).
    chatbot = gr.Chatbot(
        type="messages",
        label="Conversation",
        height=500,
        show_copy_button=True,
        bubble_full_width=False,
        avatar_images=(None, "https://huggingface.co/datasets/huggingface/avatars/resolve/main/bot-avatar.png")
    )

    # Input section
    with gr.Row():
        msg = gr.Textbox(
            label="Your Message",
            placeholder="Type your message here and press Enter...",
            container=False,
            scale=4
        )
        submit_btn = gr.Button("Send", variant="primary", scale=1)

    # Control buttons
    with gr.Row():
        clear_btn = gr.Button("Clear Chat", variant="secondary")
        refresh_btn = gr.Button("Refresh Settings", variant="secondary")

    # Clickable example prompts that populate the textbox.
    with gr.Accordion("Example Questions", open=False):
        gr.Examples(
            examples=[
                "What's the difference between AI and machine learning?",
                "Can you explain quantum computing in simple terms?",
                "Help me write a professional email.",
                "What are some good books to learn programming?",
                "Can you help me brainstorm ideas for a project?",
                "Explain the concept of blockchain technology."
            ],
            inputs=msg,
            label="Click an example to start chatting"
        )

    # Footer
    gr.HTML("""
    <div style='text-align: center; padding: 10px; color: #888; font-size: 0.9em;'>
        <p>This chatbot uses Meta's OPT-6.7B model with 8-bit quantization for fast responses (1-3 seconds). It's completely free to use!</p>
        <p><strong>Speed optimizations:</strong> Smaller model, quantization, shorter responses, optimized parameters.</p>
    </div>
    """)

    # Event wiring: Enter key and Send button both stream into the chatbot,
    # clearing the textbox via respond_stream's first yielded value.
    msg.submit(
        respond_stream,
        inputs=[msg, chatbot],
        outputs=[msg, chatbot]
    )
    submit_btn.click(
        respond_stream,
        inputs=[msg, chatbot],
        outputs=[msg, chatbot]
    )

    # Clear chat
    clear_btn.click(clear_history, outputs=chatbot)

    # Push slider changes into the shared handler.
    temperature.change(
        update_model_settings,
        inputs=[temperature, max_length],
        outputs=[]
    )
    max_length.change(
        update_model_settings,
        inputs=[temperature, max_length],
        outputs=[]
    )

    # NOTE(review): this lambda returns a string but declares no outputs, so the
    # value is discarded — it only serves as a server-side debug hook.
    refresh_btn.click(
        lambda: f"Settings: temp={chat_handler.temperature}, max_length={chat_handler.max_length}",
        outputs=[]
    )
# Launch the app only when executed as a script (not when imported).
if __name__ == "__main__":
    demo.launch(share=True)