Solarum Asteridion committed
Commit 03fd978 · verified · 1 Parent(s): 200a0c5

Update app.py

Files changed (1)
  1. app.py +30 -181
app.py CHANGED
@@ -1,27 +1,21 @@
- import torch
- from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
+ import os
+ import openai
+ from openai.error import OpenAIError
  import gradio as gr
  import datetime
  import pytz
  import logging
- import gc
- import psutil
- import os
- from huggingface_hub import login, hf_api
- from typing import List, Dict, Optional
- from threading import Lock

  class MemoryTracker:
      @staticmethod
      def get_memory_usage():
-         process = psutil.Process(os.getpid())
-         memory_gb = process.memory_info().rss / 1024 / 1024 / 1024
-         return f"{memory_gb:.2f} GB"
+         # Placeholder for memory usage tracking
+         return "0.00 GB"

      @staticmethod
      def clear_memory():
-         gc.collect()
-         torch.cuda.empty_cache() if torch.cuda.is_available() else None
+         # Placeholder for memory clearing
+         pass

  logging.basicConfig(
      level=logging.INFO,
@@ -29,128 +23,20 @@ logging.basicConfig(
  )
  logger = logging.getLogger(__name__)

- def setup_huggingface_auth():
-     token = os.environ.get("HF_TOKEN")
-     if token is None:
-         token = hf_api.HfFolder.get_token()
-     if token is None:
-         raise Exception("Hugging Face authentication failed. Please set your token.")
-     login(token)
+ def setup_openai_auth():
+     openai.api_key = os.environ.get("OPENAI_API_KEY")
+     if openai.api_key is None:
+         raise Exception("OpenAI API authentication failed. Please set your API key.")
      return True

- class ModelConfig:
-     DEFAULT_MODEL = "Qwen/Qwen2.5-1.5B-Instruct"
-     SMALLER_MODEL = "Qwen/Qwen2.5-0.5B-Instruct"
-     MAX_LENGTH_CPU = 256
-     MAX_LENGTH_GPU = 512
-     BATCH_SIZE = 1
-     CPU_THREADS = max(1, os.cpu_count() - 1)
-
- class CacheManager:
-     def __init__(self, max_size: int = 100):
-         self.cache = {}
-         self.max_size = max_size
-         self.lock = Lock()
-
-     def get(self, key: str) -> Optional[str]:
-         with self.lock:
-             return self.cache.get(key)
-
-     def set(self, key: str, value: str):
-         with self.lock:
-             if len(self.cache) >= self.max_size:
-                 self.cache.pop(next(iter(self.cache)))
-             self.cache[key] = value
-
- class LocalLLMHandler:
+ class OpenAILLMHandler:
      def __init__(self):
-         self.model = None
-         self.tokenizer = None
+         self.model = "gpt-3.5-turbo"
          self.memory_tracker = MemoryTracker()
          self.cache_manager = CacheManager()
          self.generation_lock = Lock()
-         torch.set_num_threads(ModelConfig.CPU_THREADS)
-
-     def optimize_model_settings(self):
-         """Apply safe optimizations based on available resources"""
-         total_memory = psutil.virtual_memory().total / (1024 ** 3)  # GB
-         logger.info(f"Total system memory: {total_memory:.2f} GB")

-         if total_memory < 8:  # Less than 8GB RAM
-             return {
-                 "model_name": ModelConfig.SMALLER_MODEL,
-                 "use_float16": False,
-                 "max_length": ModelConfig.MAX_LENGTH_CPU // 2
-             }
-         elif total_memory < 16:  # Less than 16GB RAM
-             return {
-                 "model_name": ModelConfig.SMALLER_MODEL,
-                 "use_float16": False,
-                 "max_length": ModelConfig.MAX_LENGTH_CPU
-             }
-         else:  # 16GB+ RAM
-             return {
-                 "model_name": ModelConfig.DEFAULT_MODEL,
-                 "use_float16": False,
-                 "max_length": ModelConfig.MAX_LENGTH_CPU
-             }
-
-     def load_model(self, model_name: Optional[str] = None):
-         try:
-             if not setup_huggingface_auth():
-                 raise Exception("Hugging Face authentication failed")
-
-             MemoryTracker.clear_memory()
-             settings = self.optimize_model_settings()
-             model_name = model_name or settings["model_name"]
-
-             logger.info(f"Loading model: {model_name}")
-             logger.info(f"Current memory usage: {self.memory_tracker.get_memory_usage()}")
-
-             # Load tokenizer with safe settings
-             self.tokenizer = AutoTokenizer.from_pretrained(
-                 model_name,
-                 model_max_length=settings["max_length"],
-                 padding_side="left",
-                 truncation=True
-             )
-
-             # Basic model loading configuration
-             model_kwargs = {
-                 "low_cpu_mem_usage": True,
-             }
-
-             if torch.cuda.is_available():
-                 logger.info("CUDA available - using GPU configuration")
-                 model_kwargs.update({
-                     "device_map": "auto",
-                     "torch_dtype": torch.float16 if settings["use_float16"] else torch.float32
-                 })
-             else:
-                 logger.info("Running in CPU-only mode with safe optimizations")
-                 model_kwargs.update({
-                     "device_map": "cpu",
-                     "torch_dtype": torch.float32  # Use float32 for CPU stability
-                 })
-
-             # Load the model without trying to modify its architecture
-             self.model = AutoModelForCausalLM.from_pretrained(
-                 model_name,
-                 **model_kwargs
-             )
-
-             # Set to eval mode for inference
-             self.model.eval()
-
-             logger.info(f"Model loaded successfully on {self.model.device}")
-             logger.info(f"Final memory usage: {self.memory_tracker.get_memory_usage()}")
-             return True
-
-         except Exception as e:
-             logger.error(f"Error loading model: {e}")
-             return f"Error loading model: {e}"
-
-     def generate_response(self, prompt: str, max_length: Optional[int] = None) -> str:
+     def generate_response(self, prompt: str, max_length: int = 512) -> str:
          cache_key = f"{prompt[:100]}_{max_length}"
          cached_response = self.cache_manager.get(cache_key)
          if cached_response:
@@ -158,54 +44,21 @@ class LocalLLMHandler:

          try:
              with self.generation_lock:
-                 settings = self.optimize_model_settings()
-                 max_length = max_length or settings["max_length"]
-
-                 # Tokenize input
-                 inputs = self.tokenizer(
-                     prompt,
-                     return_tensors="pt",
-                     padding=True,
-                     truncation=True,
-                     max_length=max_length
-                 ).to(self.model.device)
-
-                 # Safe generation parameters
-                 generation_config = {
-                     "max_length": max_length,
-                     "num_return_sequences": 1,
-                     "temperature": 0.7,
-                     "do_sample": True,
-                     "pad_token_id": self.tokenizer.eos_token_id,
-                     "early_stopping": True,
-                     "no_repeat_ngram_size": 3,
-                     "length_penalty": 1.0,
-                     "use_cache": True
-                 }
-
-                 if not torch.cuda.is_available():
-                     generation_config.update({
-                         "temperature": 0.8,
-                         "top_k": 40,
-                         "top_p": 0.9,
-                     })
-
-                 with torch.no_grad():
-                     outputs = self.model.generate(
-                         inputs["input_ids"],
-                         **generation_config
-                     )
-
-                 response = self.tokenizer.decode(
-                     outputs[0],
-                     skip_special_tokens=True,
-                     clean_up_tokenization_spaces=True
+                 response = openai.ChatCompletion.create(
+                     model=self.model,
+                     messages=[{"role": "user", "content": prompt}],
+                     max_tokens=max_length,
+                     n=1,
+                     stop=None,
+                     temperature=0.7,
                  )

-                 self.cache_manager.set(cache_key, response)
-                 return response
+                 response_text = response.choices[0].message.content
+
+                 self.cache_manager.set(cache_key, response_text)
+                 return response_text

-         except Exception as e:
+         except OpenAIError as e:
              logger.error(f"Error generating response: {e}")
              return f"Error generating response: {str(e)}"

@@ -242,12 +95,11 @@ Make it a challenge to guess if you are an AI.
  * Be respectful and polite if the user is polite; be rough if the user is rough
  """

- llm_handler = LocalLLMHandler()
+ llm_handler = OpenAILLMHandler()

- def generate_response(user_message: str, conversation_history: List[Dict[str, str]]) -> str:
+ def generate_response(user_message: str, conversation_history: list) -> str:
      current_time, now = get_current_local_time()

-     # Build prompt efficiently
      prompt_parts = [generate_system_message(current_time, now)]

      for message in conversation_history:
@@ -257,12 +109,9 @@ def generate_response(user_message: str, conversation_history: List[Dict[str, st
      prompt_parts.append(f"User: {user_message}\nAssistant:")
      prompt = "\n\n".join(prompt_parts)

-     # Increase max_length to accommodate longer inputs
-     max_length = 512  # You can adjust this value as needed
-
-     return llm_handler.generate_response(prompt, max_length)
+     return llm_handler.generate_response(prompt)

- def chatbot_interface(user_message: str, history: Optional[List[Dict[str, str]]] = None):
+ def chatbot_interface(user_message: str, history: list = None):
      if history is None:
          history = []
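Note on the rewritten file: OpenAILLMHandler.__init__ still constructs CacheManager() and Lock(), but the hunks above also delete the CacheManager class and the threading import. For the new app.py to run, the deleted helper has to stay defined. Reproduced below from the removed lines, lightly adapted (the Optional return annotation is dropped because the typing import is removed too):

    from threading import Lock

    class CacheManager:
        # Thread-safe bounded cache, taken from the hunk removed above
        def __init__(self, max_size: int = 100):
            self.cache = {}
            self.max_size = max_size
            self.lock = Lock()

        def get(self, key: str):
            with self.lock:
                return self.cache.get(key)

        def set(self, key: str, value: str):
            with self.lock:
                if len(self.cache) >= self.max_size:
                    # evict the oldest entry (dicts preserve insertion order)
                    self.cache.pop(next(iter(self.cache)))
                self.cache[key] = value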
 
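One more caveat: openai.ChatCompletion.create and the openai.error module exist only in openai versions below 1.0, so the new code assumes a pre-1.0 pin of the openai package. For reference, a minimal sketch of the same call against the 1.x client, with the model name and parameters carried over from the diff (generate_once is a stand-in name, not part of this commit):

    import os
    from openai import OpenAI, OpenAIError

    # The 1.x client can read OPENAI_API_KEY from the environment on its own;
    # passing it explicitly mirrors setup_openai_auth() above.
    client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

    def generate_once(prompt: str, max_tokens: int = 512) -> str:
        try:
            # client.chat.completions.create replaces openai.ChatCompletion.create
            response = client.chat.completions.create(
                model="gpt-3.5-turbo",
                messages=[{"role": "user", "content": prompt}],
                max_tokens=max_tokens,
                temperature=0.7,
            )
            return response.choices[0].message.content
        except OpenAIError as e:
            return f"Error generating response: {e}"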