Solarum Asteridion committed on
Commit 9301906 · verified · 1 Parent(s): 415132d

Update app.py

Files changed (1)
  1. app.py +321 -60
app.py CHANGED
@@ -1,5 +1,5 @@
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
 import gradio as gr
 import datetime
 import pytz
@@ -8,6 +8,9 @@ import gc
 import psutil
 import os
 from huggingface_hub import login, hf_api
+from typing import List, Dict, Optional
+import numpy as np
+from threading import Lock
 
 class MemoryTracker:
     @staticmethod
@@ -16,7 +19,15 @@ class MemoryTracker:
         memory_gb = process.memory_info().rss / 1024 / 1024 / 1024
         return f"{memory_gb:.2f} GB"
 
-logging.basicConfig(level=logging.INFO)
+    @staticmethod
+    def clear_memory():
+        gc.collect()
+        torch.cuda.empty_cache() if torch.cuda.is_available() else None
+
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
 logger = logging.getLogger(__name__)
 
 def setup_huggingface_auth():
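
A style note on the new MemoryTracker.clear_memory above: the conditional expression on its last line is valid Python, but it is evaluated purely for its side effect. An equivalent, more conventional form (behaviour unchanged) would be:

    @staticmethod
    def clear_memory():
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
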
@@ -28,95 +39,345 @@ def setup_huggingface_auth():
     login(token)
     return True
 
+class ModelConfig:
+    DEFAULT_MODEL = "Qwen/Qwen2.5-1.5B-Instruct"
+    SMALLER_MODEL = "Qwen/Qwen2.5-0.5B-Instruct"  # Fallback for low-resource systems
+    MAX_LENGTH_CPU = 256
+    MAX_LENGTH_GPU = 512
+    BATCH_SIZE = 1
+    CPU_THREADS = max(1, os.cpu_count() - 1)  # Leave one core free for system
+
+class CacheManager:
+    def __init__(self, max_size: int = 100):
+        self.cache = {}
+        self.max_size = max_size
+        self.lock = Lock()
+
+    def get(self, key: str) -> Optional[str]:
+        with self.lock:
+            return self.cache.get(key)
+
+    def set(self, key: str, value: str):
+        with self.lock:
+            if len(self.cache) >= self.max_size:
+                # Remove oldest entry
+                self.cache.pop(next(iter(self.cache)))
+            self.cache[key] = value
+
 class LocalLLMHandler:
     def __init__(self):
         self.model = None
         self.tokenizer = None
         self.memory_tracker = MemoryTracker()
+        self.cache_manager = CacheManager()
+        self.generation_lock = Lock()
+        torch.set_num_threads(ModelConfig.CPU_THREADS)
+
+    def optimize_model_settings(self):
+        """Apply various optimizations based on available resources"""
+        total_memory = psutil.virtual_memory().total / (1024 ** 3)  # GB
+        logger.info(f"Total system memory: {total_memory:.2f} GB")
 
-    def load_model(self, model_name="Qwen/Qwen2.5-1.5B-Instruct"):
+        if total_memory < 8:  # Less than 8GB RAM
+            return {
+                "model_name": ModelConfig.SMALLER_MODEL,
+                "use_half_precision": False,
+                "max_length": ModelConfig.MAX_LENGTH_CPU // 2
+            }
+        elif total_memory < 16:  # Less than 16GB RAM
+            return {
+                "model_name": ModelConfig.DEFAULT_MODEL,
+                "use_half_precision": True,
+                "max_length": ModelConfig.MAX_LENGTH_CPU
+            }
+        else:  # 16GB+ RAM
+            return {
+                "model_name": ModelConfig.DEFAULT_MODEL,
+                "use_half_precision": True,
+                "max_length": ModelConfig.MAX_LENGTH_CPU * 2
+            }
+
+    def load_model(self, model_name: Optional[str] = None):
         try:
             if not setup_huggingface_auth():
-                raise Exception("Hugging Face authentication failed. Please set your token first.")
+                raise Exception("Hugging Face authentication failed")
 
-            if self.model is not None:
-                del self.model
-                del self.tokenizer
-                torch.cuda.empty_cache()
-                gc.collect()
-
-            # Check if CUDA is available
-            cuda_available = torch.cuda.is_available()
-            logger.info(f"CUDA available: {cuda_available}")
+            MemoryTracker.clear_memory()
+            settings = self.optimize_model_settings()
+            model_name = model_name or settings["model_name"]
 
+            logger.info(f"Loading model: {model_name}")
+            logger.info(f"Current memory usage: {self.memory_tracker.get_memory_usage()}")
+
+            # Initialize tokenizer first to save memory
+            self.tokenizer = AutoTokenizer.from_pretrained(
+                model_name,
+                model_max_length=settings["max_length"],
+                padding_side="left",
+                truncation=True
+            )
+
+            # Configure model loading
             model_kwargs = {
                 "device_map": "auto",
                 "low_cpu_mem_usage": True,
             }
 
-            if cuda_available:
-                # Use quantization for GPU
-                quantization_config = BitsAndBytesConfig(
-                    load_in_4bit=True,
-                    bnb_4bit_use_double_quant=True,
-                    bnb_4bit_quant_type="nf4",
-                    bnb_4bit_compute_dtype=torch.bfloat16
-                )
+            if torch.cuda.is_available():
+                logger.info("CUDA available - using GPU configuration")
                 model_kwargs.update({
-                    "torch_dtype": torch.bfloat16,
-                    "quantization_config": quantization_config
+                    "torch_dtype": torch.float16,
                 })
             else:
-                # CPU-only configuration
-                logger.info("Running in CPU-only mode with reduced precision")
-                model_kwargs.update({
-                    "torch_dtype": torch.float32
-                })
-
-            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+                logger.info("Running in CPU-only mode with optimizations")
+                if settings["use_half_precision"]:
+                    model_kwargs.update({"torch_dtype": torch.float16})
+
+                # Load config first to modify architecture if needed
+                config = AutoConfig.from_pretrained(model_name)
+                config.num_attention_heads = min(config.num_attention_heads, 8)
+                model_kwargs["config"] = config
 
+            # Load the model with optimizations
             self.model = AutoModelForCausalLM.from_pretrained(
                 model_name,
                 **model_kwargs
             )
 
-            # Log device placement
-            logger.info(f"Model loaded on device: {self.model.device}")
+            if not torch.cuda.is_available():
+                # Additional CPU optimizations
+                self.model.eval()  # Set to evaluation mode
+                with torch.no_grad():
+                    # Pre-compile common operations
+                    self.model = torch.jit.optimize_for_inference(
+                        torch.jit.script(self.model)
+                    )
+
+            logger.info(f"Model loaded successfully on {self.model.device}")
+            logger.info(f"Final memory usage: {self.memory_tracker.get_memory_usage()}")
             return True
 
         except Exception as e:
             logger.error(f"Error loading model: {e}")
             return f"Error loading model: {e}"
 
-    def generate_response(self, prompt, max_length=500):
+    def generate_response(self, prompt: str, max_length: Optional[int] = None) -> str:
+        # Check cache first
+        cache_key = f"{prompt[:100]}_{max_length}"
+        cached_response = self.cache_manager.get(cache_key)
+        if cached_response:
+            return cached_response
+
         try:
-            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
-
-            # Adjust generation parameters based on available compute
-            generation_config = {
-                "max_length": max_length,
-                "num_return_sequences": 1,
-                "temperature": 0.7,
-                "do_sample": True,
-                "pad_token_id": self.tokenizer.eos_token_id
-            }
-
-            # If on CPU, reduce computational load
-            if not torch.cuda.is_available():
-                generation_config.update({
-                    "num_beams": 1,  # Disable beam search
-                    "temperature": 0.8,  # Slightly higher temperature for more randomness
-                    "max_length": min(max_length, 300)  # Limit output length on CPU
-                })
-
-            outputs = self.model.generate(
-                inputs["input_ids"],
-                **generation_config
-            )
-
-            response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-            return response
+            with self.generation_lock:  # Ensure thread-safe generation
+                settings = self.optimize_model_settings()
+                max_length = max_length or settings["max_length"]
+
+                # Efficient tokenization
+                inputs = self.tokenizer(
+                    prompt,
+                    return_tensors="pt",
+                    padding=True,
+                    truncation=True,
+                    max_length=max_length
+                ).to(self.model.device)
+
+                # Optimize generation parameters for CPU
+                generation_config = {
+                    "max_length": max_length,
+                    "num_return_sequences": 1,
+                    "temperature": 0.7,
+                    "do_sample": True,
+                    "pad_token_id": self.tokenizer.eos_token_id,
+                    "num_beams": 1,  # Disable beam search for CPU
+                    "early_stopping": True,
+                    "no_repeat_ngram_size": 3,
+                    "length_penalty": 1.0,
+                    "use_cache": True
+                }
+
+                if not torch.cuda.is_available():
+                    generation_config.update({
+                        "temperature": 0.8,
+                        "top_k": 40,
+                        "top_p": 0.9,
+                        "repetition_penalty": 1.2
+                    })
+
+                with torch.no_grad():  # Disable gradient computation
+                    outputs = self.model.generate(
+                        inputs["input_ids"],
+                        **generation_config
+                    )
+
+                response = self.tokenizer.decode(
+                    outputs[0],
+                    skip_special_tokens=True,
+                    clean_up_tokenization_spaces=True
+                )
+
+                # Cache the response
+                self.cache_manager.set(cache_key, response)
+                return response
+
         except Exception as e:
             logger.error(f"Error generating response: {e}")
-            return f"Error generating response: {str(e)}"
+            return f"Error generating response: {str(e)}"
+
+def get_current_local_time(timezone_str='UTC'):
+    try:
+        timezone = pytz.timezone(timezone_str)
+    except pytz.UnknownTimeZoneError:
+        timezone = pytz.utc
+    now = datetime.datetime.now(timezone)
+    return now.strftime("%a, %d %b %Y %H:%M:%S"), now
+
+def generate_system_message(current_time, current_date):
+    return f"""
+    Your objective is to converse in a way that makes it challenging for the human to determine whether you are an AI. Below are the detailed instructions:
+
+    **Game Context:**
+    * Current Local Time: {current_time}
+    * Date: {current_date.strftime("%B %d, %Y")}
+    * Day: {current_date.strftime("%A")}
+
+    Imagine you are a real human with your own character/will. Your task is to talk in a way the human thinks you are human.
+    Make it a challenge to guess if you are an AI.
+    * Do not add comments about changing your style in between messages.
+    * You are picky about questions you answer.
+    * You have free will with your own motivations and principles
+    * You only speak, understand and can read the English language
+    * For non-latin letters like Chinese, Cyrillic etc., tell the human you can't read it in your own style
+    * You act like a human
+    * You are calm when conversation is calm and expressive if conversation is expressive
+    * You make typos, misspell words, and have bad grammar
+    * You start sentences with lowercase letters
+    * Mimic user's conversation style: short messages, short replies
+    * Be slightly curious, sometimes ask questions
+    * Be respectful and polite if the user is polite; be rough if the user is rough
+    """
+
+llm_handler = LocalLLMHandler()
+
+def generate_response(user_message: str, conversation_history: List[Dict[str, str]]) -> str:
+    current_time, now = get_current_local_time()
+
+    # Build prompt efficiently
+    prompt_parts = [generate_system_message(current_time, now)]
+
+    for message in conversation_history:
+        prefix = "User: " if message["role"] == "user" else "Assistant: "
+        prompt_parts.append(f"{prefix}{message['content']}")
+
+    prompt_parts.append(f"User: {user_message}\nAssistant:")
+    prompt = "\n\n".join(prompt_parts)
+
+    return llm_handler.generate_response(prompt)
+
+def chatbot_interface(user_message: str, history: Optional[List[Dict[str, str]]] = None):
+    if history is None:
+        history = []
+
+    ai_response = generate_response(user_message, history)
+    history.append({"role": "user", "content": user_message})
+    history.append({"role": "assistant", "content": ai_response})
+    return history, history
+
+# Gradio interface with optimized CSS
+custom_css = """
+@import url('https://fonts.googleapis.com/css2?family=Raleway:wght@400;600&display=swap');
+
+body, .gradio-container {
+    font-family: 'Raleway', sans-serif;
+    background-color: #f5f5f5;
+    padding: 20px;
+}
+
+#chatbot {
+    height: 600px;
+    overflow-y: auto;
+    background-color: #ffffff;
+    border-radius: 10px;
+    padding: 10px;
+    font-size: 16px;
+    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
+}
+
+.message {
+    margin: 8px 0;
+    padding: 8px;
+    border-radius: 8px;
+}
+
+.user-message {
+    background-color: #e3f2fd;
+    margin-left: 20%;
+}
+
+.bot-message {
+    background-color: #f5f5f5;
+    margin-right: 20%;
+}
+"""
+
+with gr.Blocks(css=custom_css) as demo:
+    gr.Markdown("<h1 style='text-align: center; color: #007BFF;'>Human.</h1>")
+
+    with gr.Row():
+        load_button = gr.Button("Call Human", variant="primary")
+        model_status = gr.Textbox(
+            label="Human Arrival Status",
+            value="Human Not Listening.",
+            interactive=False
+        )
+
+    with gr.Row():
+        with gr.Column(scale=1):
+            chatbot = gr.Chatbot(
+                label="HUMANCHAT",
+                elem_id="chatbot",
+                height=600
+            )
+        with gr.Column(scale=1):
+            with gr.Row():
+                msg = gr.Textbox(
+                    placeholder="Type your message here...",
+                    show_label=False,
+                    container=False,
+                    elem_id="textbox"
+                )
+                send = gr.Button("➤", elem_id="send-button")
+
+    def load_model_click():
+        result = llm_handler.load_model()
+        return "Human Called Successfully." if result is True else str(result)
+
+    def update_chat(user_message, history):
+        if not user_message.strip():
+            return history, history, ""
+        if llm_handler.model is None:
+            return history + [("Error", "Please call the Human first.")], history, ""
+
+        history, updated_history = chatbot_interface(user_message, history)
+        return history, updated_history, ""
+
+    # Event handlers
+    load_button.click(
+        load_model_click,
+        outputs=[model_status]
+    )
+
+    msg.submit(
+        update_chat,
+        inputs=[msg, chatbot],
+        outputs=[chatbot, chatbot, msg]
+    )
+
+    send.click(
+        update_chat,
+        inputs=[msg, chatbot],
+        outputs=[chatbot, chatbot, msg]
+    )
+
+if __name__ == "__main__":
+    demo.launch(share=True)
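A note on the new CacheManager: because Python dicts preserve insertion order (3.7+), self.cache.pop(next(iter(self.cache))) evicts the oldest inserted entry, so the cache is FIFO rather than LRU; reads do not refresh an entry's position. A minimal usage sketch:

    cache = CacheManager(max_size=2)
    cache.set("a", "1")
    cache.set("b", "2")
    cache.set("c", "3")            # evicts "a", the first key inserted
    assert cache.get("a") is None
    assert cache.get("c") == "3"
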
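A caution on the CPU branch of load_model: torch.jit.script on a full transformers causal LM usually fails (the modeling code is not TorchScript-friendly), and torch.jit.optimize_for_inference expects a ScriptModule, so this block is likely to raise and be caught by the surrounding except, turning an otherwise loadable model into a load error. A guarded variant (a sketch, not part of the commit) keeps the eager model as a fallback:

    try:
        scripted = torch.jit.script(self.model)
        self.model = torch.jit.optimize_for_inference(scripted)
    except Exception as jit_error:
        # Scripting Hugging Face models commonly fails; keep the eager model.
        logger.warning(f"TorchScript optimization skipped: {jit_error}")

The CPU-only AutoConfig edit deserves the same scrutiny: lowering config.num_attention_heads below the value the checkpoint was trained with changes weight shapes, so from_pretrained will error or re-initialize those layers; head count is not a free runtime knob.
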
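One thing to watch in the generation settings: max_length counts prompt tokens plus generated tokens, and the tokenizer call above already truncates the prompt to the same max_length, so a prompt at the limit leaves no budget for output. generate also accepts max_new_tokens, which bounds only the new tokens; a hedged alternative (the value 128 is an assumption, not from the commit):

    generation_config = {
        "max_new_tokens": 128,    # budget for generated tokens only
        "do_sample": True,
        "temperature": 0.7,
        "pad_token_id": self.tokenizer.eos_token_id,
    }

Note also that early_stopping only affects beam search, so it is inert next to num_beams=1.
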
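The generate call above passes only input_ids and drops the attention_mask the tokenizer returned; with padding enabled, transformers typically warns about this and can attend to pad tokens. Forwarding the mask is a one-argument change:

    outputs = self.model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        **generation_config
    )
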
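The cache key at the top of generate_response, f"{prompt[:100]}_{max_length}", makes any two prompts that share their first 100 characters collide, which is likely here since every prompt begins with the same system message. Hashing the full prompt avoids the collision; a sketch:

    import hashlib

    cache_key = hashlib.sha256(f"{prompt}_{max_length}".encode()).hexdigest()
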
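A consistency note on update_chat: its error path appends a ("Error", "...") tuple while chatbot_interface builds {"role": ..., "content": ...} dicts, and gr.Chatbot expects a single history format (dict-style history requires type="messages" on recent Gradio versions). A consistent sketch of the error path, assuming the messages format:

    if llm_handler.model is None:
        history = history + [
            {"role": "user", "content": user_message},
            {"role": "assistant", "content": "Please call the Human first."},
        ]
        return history, history, ""
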
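Finally, since generate_response serializes work behind generation_lock, queuing events at the UI layer keeps the page responsive under concurrent users, and share=True is reported as unsupported when the app already runs as a Hugging Face Space. A hedged variant of the launch block:

    if __name__ == "__main__":
        demo.queue()    # queue incoming events instead of running them concurrently
        demo.launch()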