Solarum Asteridion committed on
Commit 5a1081c · verified · 1 Parent(s): b2d02e9

Update app.py

Files changed (1)
  1. app.py +26 -41
app.py CHANGED
@@ -9,7 +9,6 @@ import psutil
 import os
 from huggingface_hub import login, hf_api
 from typing import List, Dict, Optional
-import numpy as np
 from threading import Lock
 
 class MemoryTracker:
@@ -41,12 +40,12 @@ def setup_huggingface_auth():
 
 class ModelConfig:
     DEFAULT_MODEL = "Qwen/Qwen2.5-1.5B-Instruct"
-    SMALLER_MODEL = "Qwen/Qwen2.5-0.5B-Instruct" # Fallback for low-resource systems
-    MAX_LENGTH_CPU = 384
+    SMALLER_MODEL = "Qwen/Qwen2.5-0.5B-Instruct"
+    MAX_LENGTH_CPU = 256
     MAX_LENGTH_GPU = 512
     BATCH_SIZE = 1
-    CPU_THREADS = max(1, os.cpu_count() - 1) # Leave one core free for system
-
+    CPU_THREADS = max(1, os.cpu_count() - 1)
+
 class CacheManager:
     def __init__(self, max_size: int = 100):
         self.cache = {}
@@ -60,7 +59,6 @@ class CacheManager:
     def set(self, key: str, value: str):
         with self.lock:
             if len(self.cache) >= self.max_size:
-                # Remove oldest entry
                 self.cache.pop(next(iter(self.cache)))
             self.cache[key] = value
 
@@ -74,27 +72,27 @@ class LocalLLMHandler:
         torch.set_num_threads(ModelConfig.CPU_THREADS)
 
     def optimize_model_settings(self):
-        """Apply various optimizations based on available resources"""
+        """Apply safe optimizations based on available resources"""
         total_memory = psutil.virtual_memory().total / (1024 ** 3) # GB
         logger.info(f"Total system memory: {total_memory:.2f} GB")
 
         if total_memory < 8: # Less than 8GB RAM
             return {
                 "model_name": ModelConfig.SMALLER_MODEL,
-                "use_half_precision": False,
+                "use_float16": False,
                 "max_length": ModelConfig.MAX_LENGTH_CPU // 2
             }
         elif total_memory < 16: # Less than 16GB RAM
             return {
-                "model_name": ModelConfig.DEFAULT_MODEL,
-                "use_half_precision": True,
+                "model_name": ModelConfig.SMALLER_MODEL,
+                "use_float16": False,
                 "max_length": ModelConfig.MAX_LENGTH_CPU
             }
         else: # 16GB+ RAM
             return {
                 "model_name": ModelConfig.DEFAULT_MODEL,
-                "use_half_precision": True,
-                "max_length": ModelConfig.MAX_LENGTH_CPU * 2
+                "use_float16": False,
+                "max_length": ModelConfig.MAX_LENGTH_CPU
             }
 
     def load_model(self, model_name: Optional[str] = None):
@@ -109,7 +107,7 @@ class LocalLLMHandler:
             logger.info(f"Loading model: {model_name}")
             logger.info(f"Current memory usage: {self.memory_tracker.get_memory_usage()}")
 
-            # Initialize tokenizer first to save memory
+            # Load tokenizer with safe settings
            self.tokenizer = AutoTokenizer.from_pretrained(
                 model_name,
                 model_max_length=settings["max_length"],
@@ -117,41 +115,32 @@ class LocalLLMHandler:
                 truncation=True
             )
 
-            # Configure model loading
+            # Basic model loading configuration
             model_kwargs = {
-                "device_map": "auto",
                 "low_cpu_mem_usage": True,
             }
 
             if torch.cuda.is_available():
                 logger.info("CUDA available - using GPU configuration")
                 model_kwargs.update({
-                    "torch_dtype": torch.float16,
+                    "device_map": "auto",
+                    "torch_dtype": torch.float16 if settings["use_float16"] else torch.float32
                 })
             else:
-                logger.info("Running in CPU-only mode with optimizations")
-                if settings["use_half_precision"]:
-                    model_kwargs.update({"torch_dtype": torch.float16})
-
-                # Load config first to modify architecture if needed
-                config = AutoConfig.from_pretrained(model_name)
-                config.num_attention_heads = min(config.num_attention_heads, 8)
-                model_kwargs["config"] = config
+                logger.info("Running in CPU-only mode with safe optimizations")
+                model_kwargs.update({
+                    "device_map": "cpu",
+                    "torch_dtype": torch.float32 # Use float32 for CPU stability
+                })
 
-            # Load the model with optimizations
+            # Load the model without trying to modify its architecture
             self.model = AutoModelForCausalLM.from_pretrained(
                 model_name,
                 **model_kwargs
             )
 
-            if not torch.cuda.is_available():
-                # Additional CPU optimizations
-                self.model.eval() # Set to evaluation mode
-                with torch.no_grad():
-                    # Pre-compile common operations
-                    self.model = torch.jit.optimize_for_inference(
-                        torch.jit.script(self.model)
-                    )
+            # Set to eval mode for inference
+            self.model.eval()
 
             logger.info(f"Model loaded successfully on {self.model.device}")
             logger.info(f"Final memory usage: {self.memory_tracker.get_memory_usage()}")
@@ -162,18 +151,17 @@ class LocalLLMHandler:
             return f"Error loading model: {e}"
 
     def generate_response(self, prompt: str, max_length: Optional[int] = None) -> str:
-        # Check cache first
         cache_key = f"{prompt[:100]}_{max_length}"
         cached_response = self.cache_manager.get(cache_key)
         if cached_response:
             return cached_response
 
         try:
-            with self.generation_lock: # Ensure thread-safe generation
+            with self.generation_lock:
                 settings = self.optimize_model_settings()
                 max_length = max_length or settings["max_length"]
 
-                # Efficient tokenization
+                # Tokenize input
                 inputs = self.tokenizer(
                     prompt,
                     return_tensors="pt",
@@ -182,14 +170,13 @@ class LocalLLMHandler:
                     max_length=max_length
                 ).to(self.model.device)
 
-                # Optimize generation parameters for CPU
+                # Safe generation parameters
                 generation_config = {
                     "max_length": max_length,
                     "num_return_sequences": 1,
                     "temperature": 0.7,
                     "do_sample": True,
                     "pad_token_id": self.tokenizer.eos_token_id,
-                    "num_beams": 1, # Disable beam search for CPU
                     "early_stopping": True,
                     "no_repeat_ngram_size": 3,
                     "length_penalty": 1.0,
@@ -201,10 +188,9 @@
                         "temperature": 0.8,
                         "top_k": 40,
                         "top_p": 0.9,
-                        "repetition_penalty": 1.2
                     })
 
-                with torch.no_grad(): # Disable gradient computation
+                with torch.no_grad():
                     outputs = self.model.generate(
                         inputs["input_ids"],
                         **generation_config
@@ -216,7 +202,6 @@
                     clean_up_tokenization_spaces=True
                 )
 
-                # Cache the response
                 self.cache_manager.set(cache_key, response)
                 return response
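
For context, a minimal standalone sketch (not part of this commit) of the "safe CPU" path the updated load_model() and generate_response() settle on: float32 weights, an explicit CPU device map, low_cpu_mem_usage, eval() before generation, and sampling under torch.no_grad(). The model name and lengths mirror ModelConfig above; everything else is an illustrative assumption rather than the app's exact code.

# Illustrative sketch only -- mirrors the CPU-only configuration introduced in this commit.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen2.5-1.5B-Instruct"  # ModelConfig.DEFAULT_MODEL above

tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length=256)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    device_map="cpu",            # pin to CPU, as in the updated else-branch
    torch_dtype=torch.float32,   # float32 on CPU for stability
)
model.eval()  # plain eval mode, replacing the removed torch.jit scripting

inputs = tokenizer("Hello!", return_tensors="pt", truncation=True, max_length=256)
with torch.no_grad():
    outputs = model.generate(
        inputs["input_ids"],
        max_length=256,
        do_sample=True,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id,
    )
print(tokenizer.decode(outputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True))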