AiCoderv2 committed on
Commit
af5ca25
·
verified ·
1 Parent(s): 6558d3e

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +73 -50
app.py CHANGED
@@ -5,11 +5,11 @@ from typing import List, Dict
5
 
6
  class ChatbotHandler:
7
  def __init__(self):
8
- self.model_name = "facebook/opt-13b" # 13B parameter model (close to 15B)
9
  self.tokenizer = None
10
  self.model = None
11
  self.chat_pipeline = None
12
- self.max_length = 1000
13
  self.temperature = 0.7
14
  self.model_loaded = False
15
  self.system_prompt = """You are a helpful, friendly, and knowledgeable AI assistant.
@@ -21,20 +21,29 @@ class ChatbotHandler:
21
  self.initialize_model()
22
 
23
  def initialize_model(self):
24
- """Initialize the Hugging Face model."""
25
  try:
26
- from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 
27
  except ImportError:
28
  print("Transformers library not available. Please install the required dependencies.")
29
  return False
30
 
31
  try:
32
- print("Loading OPT-13B model... This may take a very long time and require significant memory.")
 
 
 
 
 
 
 
33
  self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, use_fast=True)
34
  self.model = AutoModelForCausalLM.from_pretrained(
35
  self.model_name,
 
36
  device_map="auto", # Automatically distribute across available GPUs
37
- torch_dtype="auto",
38
  low_cpu_mem_usage=True
39
  )
40
 
@@ -42,7 +51,7 @@ class ChatbotHandler:
42
  if self.tokenizer.pad_token is None:
43
  self.tokenizer.pad_token = self.tokenizer.eos_token
44
 
45
- # Create pipeline for text generation
46
  self.chat_pipeline = pipeline(
47
  "text-generation",
48
  model=self.model,
@@ -52,7 +61,8 @@ class ChatbotHandler:
52
  temperature=self.temperature,
53
  do_sample=True,
54
  pad_token_id=self.tokenizer.eos_token_id,
55
- truncation=True
 
56
  )
57
  print("Model loaded successfully!")
58
  self.model_loaded = True
@@ -62,16 +72,16 @@ class ChatbotHandler:
62
  return False
63
 
64
  def get_response(self, message: str, history: List[Dict]) -> str:
65
- """Get response from the model."""
66
  if not self.chat_pipeline:
67
  return "Model not loaded. Please try again later."
68
 
69
  try:
70
- # Prepare conversation history as a single string
71
  conversation = self.system_prompt + "\n"
72
 
73
- # Add recent history (limit to last 3 exchanges to save memory)
74
- for msg in history[-3:]:
75
  if msg["role"] == "user":
76
  conversation += f"User: {msg['content']}\n"
77
  elif msg["role"] == "assistant":
@@ -80,15 +90,20 @@ class ChatbotHandler:
80
  # Add current message
81
  conversation += f"User: {message}\nAssistant:"
82
 
83
- # Generate response with memory constraints
 
84
  outputs = self.chat_pipeline(
85
  conversation,
86
- max_new_tokens=100, # Shorter responses to save memory
87
  num_return_sequences=1,
88
  return_full_text=False,
89
  do_sample=True,
90
- temperature=self.temperature
 
 
91
  )
 
 
92
 
93
  response = outputs[0]['generated_text'].strip()
94
 
@@ -98,56 +113,64 @@ class ChatbotHandler:
98
  elif response.startswith("User:"):
99
  response = "I apologize, but I seem to have gotten confused. How can I help you?"
100
 
101
- # Limit response length
102
- if len(response) > 500:
103
- response = response[:500] + "..."
104
 
105
- # Simulate streaming by yielding chunks
106
  words = response.split()
107
  current_response = ""
108
- for word in words:
109
- current_response += word + " "
 
 
110
  yield current_response.strip()
111
- time.sleep(0.02) # Faster streaming
112
 
113
  except Exception as e:
114
- yield f"I apologize, but I encountered an error generating a response. Please try asking your question again. Error: {str(e)}"
115
 
116
  # Initialize chatbot handler
117
  chat_handler = ChatbotHandler()
118
 
119
  def respond_stream(message: str, history: List[Dict]):
120
- """Generate streaming response from the model."""
121
  if not message.strip():
122
  return "", history
123
 
 
 
 
124
  # Always add user message first to prevent disappearing chats
125
- history = history + [{"role": "user", "content": message}]
126
 
127
  # Check if model is initialized
128
  if not chat_handler.chat_pipeline:
129
- history = history + [{"role": "assistant", "content": "The chatbot model is still loading. Please wait a moment and try again."}]
130
- return "", history
131
 
132
- # Get streaming response
133
  full_response = ""
 
 
134
  try:
135
- for chunk in chat_handler.get_response(message, history[:-1]): # Don't include current user message in context
136
  full_response = chunk
137
- # Update the last assistant message
138
- if len(history) > 0 and history[-1].get("role") == "assistant":
139
- history[-1]["content"] = full_response
 
140
  else:
141
- history = history + [{"role": "assistant", "content": full_response}]
142
- yield "", history
143
  except Exception as e:
144
  # If streaming fails, add a fallback response
145
  error_msg = "I apologize, but I encountered an error. Please try again."
146
- if len(history) > 0 and history[-1].get("role") == "assistant":
147
- history[-1]["content"] = error_msg
148
  else:
149
- history = history + [{"role": "assistant", "content": error_msg}]
150
- yield "", history
151
 
152
  def clear_history():
153
  """Clear the chat history."""
@@ -160,22 +183,22 @@ def update_model_settings(temp, max_len):
160
  return f"Settings updated: temp={temp}, max_length={max_len}"
161
 
162
  # Create the interface
163
- with gr.Blocks(theme=gr.themes.Soft(), title="AI Chatbot with OPT-13B") as demo:
164
 
165
  # Header
166
  gr.HTML("""
167
  <div style='text-align: center; padding: 20px;'>
168
- <h1>🤖 AI Chatbot</h1>
169
- <p style='color: #666;'>Powered by OPT-13B (13B parameters) • Built with <a href='https://huggingface.co/spaces/akhaliq/anycoder' target='_blank' style='color: #007bff; text-decoration: none;'>anycoder</a></p>
170
  </div>
171
  """)
172
 
173
  # Status indicator
174
  if chat_handler.model_loaded:
175
- status_msg = "✅ Chatbot is ready! Start chatting below."
176
  status_color = "#28a745"
177
  else:
178
- status_msg = "⏳ Loading OPT-13B model... This may take 10-20 minutes and requires significant memory."
179
  status_color = "#ffc107"
180
 
181
  gr.HTML(f"""
@@ -196,12 +219,12 @@ with gr.Blocks(theme=gr.themes.Soft(), title="AI Chatbot with OPT-13B") as demo:
196
  info="Higher values make responses more creative"
197
  )
198
  max_length = gr.Slider(
199
- minimum=500,
200
- maximum=2000,
201
- value=1000,
202
- step=100,
203
  label="Max Length",
204
- info="Maximum context length"
205
  )
206
 
207
  # Chatbot component
@@ -247,8 +270,8 @@ with gr.Blocks(theme=gr.themes.Soft(), title="AI Chatbot with OPT-13B") as demo:
247
  # Footer
248
  gr.HTML("""
249
  <div style='text-align: center; padding: 10px; color: #888; font-size: 0.9em;'>
250
- <p>This chatbot uses Meta's OPT-13B model (13 billion parameters) from Hugging Face. It's completely free to use!</p>
251
- <p><strong>Note:</strong> This large model requires significant computational resources and may take time to load and respond.</p>
252
  </div>
253
  """)
254
 
 
5
 
6
  class ChatbotHandler:
7
  def __init__(self):
8
+ self.model_name = "facebook/opt-6.7b" # Smaller, faster 6.7B model instead of 13B
9
  self.tokenizer = None
10
  self.model = None
11
  self.chat_pipeline = None
12
+ self.max_length = 512 # Reduced for speed
13
  self.temperature = 0.7
14
  self.model_loaded = False
15
  self.system_prompt = """You are a helpful, friendly, and knowledgeable AI assistant.
 
21
  self.initialize_model()
22
 
23
  def initialize_model(self):
24
+ """Initialize the Hugging Face model with quantization for speed."""
25
  try:
26
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
27
+ import torch
28
  except ImportError:
29
  print("Transformers library not available. Please install the required dependencies.")
30
  return False
31
 
32
  try:
33
+ print("Loading OPT-6.7B model with 8-bit quantization... This should be faster.")
34
+
35
+ # Configure 8-bit quantization for speed
36
+ quantization_config = BitsAndBytesConfig(
37
+ load_in_8bit=True,
38
+ llm_int8_enable_fp32_cpu_offload=True
39
+ )
40
+
41
  self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, use_fast=True)
42
  self.model = AutoModelForCausalLM.from_pretrained(
43
  self.model_name,
44
+ quantization_config=quantization_config,
45
  device_map="auto", # Automatically distribute across available GPUs
46
+ torch_dtype=torch.float16,
47
  low_cpu_mem_usage=True
48
  )
49
 
 
51
  if self.tokenizer.pad_token is None:
52
  self.tokenizer.pad_token = self.tokenizer.eos_token
53
 
54
+ # Create pipeline for text generation with optimized settings
55
  self.chat_pipeline = pipeline(
56
  "text-generation",
57
  model=self.model,
 
61
  temperature=self.temperature,
62
  do_sample=True,
63
  pad_token_id=self.tokenizer.eos_token_id,
64
+ truncation=True,
65
+ use_fast=True
66
  )
67
  print("Model loaded successfully!")
68
  self.model_loaded = True
 
72
  return False
73
 
74
  def get_response(self, message: str, history: List[Dict]) -> str:
75
+ """Get response from the model with optimized settings."""
76
  if not self.chat_pipeline:
77
  return "Model not loaded. Please try again later."
78
 
79
  try:
80
+ # Prepare conversation history as a single string (limit to last 2 exchanges for speed)
81
  conversation = self.system_prompt + "\n"
82
 
83
+ # Add recent history (limit to last 2 exchanges for speed)
84
+ for msg in history[-2:]:
85
  if msg["role"] == "user":
86
  conversation += f"User: {msg['content']}\n"
87
  elif msg["role"] == "assistant":
 
90
  # Add current message
91
  conversation += f"User: {message}\nAssistant:"
92
 
93
+ # Generate response with optimized settings for speed
94
+ start_time = time.time()
95
  outputs = self.chat_pipeline(
96
  conversation,
97
+ max_new_tokens=50, # Shorter responses for speed
98
  num_return_sequences=1,
99
  return_full_text=False,
100
  do_sample=True,
101
+ temperature=self.temperature,
102
+ top_p=0.9, # Add top_p for better quality
103
+ repetition_penalty=1.1 # Reduce repetition
104
  )
105
+ end_time = time.time()
106
+ print(f"Response generated in {end_time - start_time:.2f} seconds")
107
 
108
  response = outputs[0]['generated_text'].strip()
109
 
 
113
  elif response.startswith("User:"):
114
  response = "I apologize, but I seem to have gotten confused. How can I help you?"
115
 
116
+ # Limit response length for speed
117
+ if len(response) > 200:
118
+ response = response[:200] + "..."
119
 
120
+ # Faster streaming (yield larger chunks)
121
  words = response.split()
122
  current_response = ""
123
+ chunk_size = 3 # Yield every 3 words for faster streaming
124
+ for i in range(0, len(words), chunk_size):
125
+ chunk = words[i:i + chunk_size]
126
+ current_response += " ".join(chunk) + " "
127
  yield current_response.strip()
128
+ time.sleep(0.01) # Very short delay for smooth streaming
129
 
130
  except Exception as e:
131
+ yield f"I apologize, but I encountered an error. Please try again. Error: {str(e)}"
132
 
133
  # Initialize chatbot handler
134
  chat_handler = ChatbotHandler()
135
 
136
  def respond_stream(message: str, history: List[Dict]):
137
+ """Generate streaming response from the model with fixed history management."""
138
  if not message.strip():
139
  return "", history
140
 
141
+ # Create a copy of history to avoid mutation issues
142
+ current_history = history.copy()
143
+
144
  # Always add user message first to prevent disappearing chats
145
+ current_history.append({"role": "user", "content": message})
146
 
147
  # Check if model is initialized
148
  if not chat_handler.chat_pipeline:
149
+ current_history.append({"role": "assistant", "content": "The chatbot model is still loading. Please wait a moment and try again."})
150
+ return "", current_history
151
 
152
+ # Get streaming response with error handling
153
  full_response = ""
154
+ assistant_added = False
155
+
156
  try:
157
+ for chunk in chat_handler.get_response(message, current_history[:-1]): # Don't include current user message in context
158
  full_response = chunk
159
+ # Update or add the assistant message
160
+ if not assistant_added:
161
+ current_history.append({"role": "assistant", "content": full_response})
162
+ assistant_added = True
163
  else:
164
+ current_history[-1]["content"] = full_response
165
+ yield "", current_history
166
  except Exception as e:
167
  # If streaming fails, add a fallback response
168
  error_msg = "I apologize, but I encountered an error. Please try again."
169
+ if not assistant_added:
170
+ current_history.append({"role": "assistant", "content": error_msg})
171
  else:
172
+ current_history[-1]["content"] = error_msg
173
+ yield "", current_history
174
 
175
  def clear_history():
176
  """Clear the chat history."""
 
183
  return f"Settings updated: temp={temp}, max_length={max_len}"
184
 
185
  # Create the interface
186
+ with gr.Blocks(theme=gr.themes.Soft(), title="Fast AI Chatbot with OPT-6.7B") as demo:
187
 
188
  # Header
189
  gr.HTML("""
190
  <div style='text-align: center; padding: 20px;'>
191
+ <h1> Fast AI Chatbot</h1>
192
+ <p style='color: #666;'>Powered by OPT-6.7B with 8-bit quantization • Built with <a href='https://huggingface.co/spaces/akhaliq/anycoder' target='_blank' style='color: #007bff; text-decoration: none;'>anycoder</a></p>
193
  </div>
194
  """)
195
 
196
  # Status indicator
197
  if chat_handler.model_loaded:
198
+ status_msg = "✅ Chatbot is ready! Responses should take 1-3 seconds."
199
  status_color = "#28a745"
200
  else:
201
+ status_msg = "⏳ Loading OPT-6.7B model with quantization... Should be faster than before."
202
  status_color = "#ffc107"
203
 
204
  gr.HTML(f"""
 
219
  info="Higher values make responses more creative"
220
  )
221
  max_length = gr.Slider(
222
+ minimum=256,
223
+ maximum=1024,
224
+ value=512,
225
+ step=64,
226
  label="Max Length",
227
+ info="Maximum context length (lower = faster)"
228
  )
229
 
230
  # Chatbot component
 
270
  # Footer
271
  gr.HTML("""
272
  <div style='text-align: center; padding: 10px; color: #888; font-size: 0.9em;'>
273
+ <p>This chatbot uses Meta's OPT-6.7B model with 8-bit quantization for fast responses (1-3 seconds). It's completely free to use!</p>
274
+ <p><strong>Speed optimizations:</strong> Smaller model, quantization, shorter responses, optimized parameters.</p>
275
  </div>
276
  """)
277