phxdev committed on
Commit
9bc41b0
·
verified ·
1 Parent(s): 6df4177

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -8
app.py CHANGED
@@ -80,14 +80,19 @@ class CreedBrattonAI:
80
 
81
  # Load model with proper device handling
82
  if self.device == "cuda":
 
83
  self.model = AutoModelForCausalLM.from_pretrained(
84
  model_name,
85
  torch_dtype=torch.float16, # Use float16 for GPU efficiency
86
- device_map="auto", # Auto device mapping for GPU
87
  trust_remote_code=True,
88
  low_cpu_mem_usage=True
89
  )
 
 
 
90
  else:
 
91
  self.model = AutoModelForCausalLM.from_pretrained(
92
  model_name,
93
  torch_dtype=torch.float32, # Use float32 for CPU
@@ -104,9 +109,13 @@ class CreedBrattonAI:
104
 
105
  self.model.eval()
106
 
 
 
 
 
107
  self.model_loaded = True
108
  self.loading = False
109
- print(f"βœ… Creed's consciousness loaded on {self.device}!")
110
 
111
  # GPU memory info
112
  if self.device == "cuda" and torch.cuda.is_available():
@@ -128,8 +137,9 @@ class CreedBrattonAI:
128
  self.model = AutoModelForCausalLM.from_pretrained(
129
  base_model,
130
  torch_dtype=torch.float16,
131
- device_map="auto"
132
  )
 
133
  else:
134
  self.model = AutoModelForCausalLM.from_pretrained(
135
  base_model,
@@ -140,23 +150,38 @@ class CreedBrattonAI:
140
 
141
  self.model.eval()
142
  self.model_loaded = True
143
- print(f"βœ… Fallback model loaded on {self.device}")
 
144
  except Exception as fallback_error:
145
  print(f"❌ Fallback also failed: {fallback_error}")
146
  self.loading = False
147
 
148
  @spaces.GPU if SPACES_AVAILABLE else lambda func: func
149
  def generate_response_gpu(self, conversation: str) -> str:
150
- """Generate response using the loaded model - back to working version"""
151
 
152
  if not self.model_loaded:
153
  return "❌ Model not loaded"
154
 
155
  try:
 
 
 
 
 
 
 
 
 
 
 
 
156
  # Simple tokenization that was working before
157
  inputs = self.tokenizer.encode(conversation, return_tensors="pt")
158
- if self.device == "cuda":
159
- inputs = inputs.to(self.device)
 
 
160
 
161
  # Generate response with original settings that worked
162
  with torch.no_grad():
@@ -180,6 +205,7 @@ class CreedBrattonAI:
180
  return self._clean_response(response)
181
 
182
  except Exception as e:
 
183
  return f"🎸 *Creed scratches his head* Something weird happened... {str(e)[:100]}"
184
 
185
  def generate_response(self, message: str, history: List[List[str]]) -> Iterator[str]:
@@ -342,6 +368,8 @@ def main():
342
 
343
  # Memory status if GPU available
344
  if torch.cuda.is_available() and creed_ai.model_loaded:
 
 
345
  print(f"πŸ”₯ Final GPU Memory: {torch.cuda.memory_allocated() // 1024**2} MB allocated")
346
  print(f"πŸ“Š GPU Memory Reserved: {torch.cuda.memory_reserved() // 1024**2} MB reserved")
347
 
@@ -678,10 +706,11 @@ def main():
678
  ) as demo:
679
 
680
  # Modern header
 
681
  gr.HTML(f"""
682
  <div class="header">
683
  <h1>🎸 Creed Bratton AI</h1>
684
- <p>Powered by phxdev/creed-qwen-0.5b-lora β€’ Running on {'πŸš€ GPU' if creed_ai.device == 'cuda' else 'πŸ–₯️ CPU'}</p>
685
  </div>
686
  """)
687
 
 
80
 
81
  # Load model with proper device handling
82
  if self.device == "cuda":
83
+ print("πŸ€– Loading model for GPU...")
84
  self.model = AutoModelForCausalLM.from_pretrained(
85
  model_name,
86
  torch_dtype=torch.float16, # Use float16 for GPU efficiency
87
+ device_map=None, # Don't use auto device mapping in ZeroGPU
88
  trust_remote_code=True,
89
  low_cpu_mem_usage=True
90
  )
91
+ # Explicitly move to CUDA
92
+ print("πŸ”§ Explicitly moving model to CUDA...")
93
+ self.model = self.model.to(self.device)
94
  else:
95
+ print("πŸ€– Loading model for CPU...")
96
  self.model = AutoModelForCausalLM.from_pretrained(
97
  model_name,
98
  torch_dtype=torch.float32, # Use float32 for CPU
 
109
 
110
  self.model.eval()
111
 
112
+ # Verify final device placement
113
+ final_device = next(self.model.parameters()).device
114
+ print(f"🎯 Model final device: {final_device}")
115
+
116
  self.model_loaded = True
117
  self.loading = False
118
+ print(f"βœ… Creed's consciousness loaded on {final_device}!")
119
 
120
  # GPU memory info
121
  if self.device == "cuda" and torch.cuda.is_available():
 
137
  self.model = AutoModelForCausalLM.from_pretrained(
138
  base_model,
139
  torch_dtype=torch.float16,
140
+ device_map=None # Don't use auto in ZeroGPU
141
  )
142
+ self.model = self.model.to(self.device)
143
  else:
144
  self.model = AutoModelForCausalLM.from_pretrained(
145
  base_model,
 
150
 
151
  self.model.eval()
152
  self.model_loaded = True
153
+ fallback_device = next(self.model.parameters()).device
154
+ print(f"βœ… Fallback model loaded on {fallback_device}")
155
  except Exception as fallback_error:
156
  print(f"❌ Fallback also failed: {fallback_error}")
157
  self.loading = False
158
 
159
  @spaces.GPU if SPACES_AVAILABLE else lambda func: func
160
  def generate_response_gpu(self, conversation: str) -> str:
161
+ """Generate response using the loaded model with proper device handling"""
162
 
163
  if not self.model_loaded:
164
  return "❌ Model not loaded"
165
 
166
  try:
167
+ # Always ensure model is on the correct device in ZeroGPU
168
+ current_model_device = next(self.model.parameters()).device
169
+ print(f"πŸ” Current model device: {current_model_device}")
170
+
171
+ if self.device == "cuda" and current_model_device.type != "cuda":
172
+ print(f"πŸ”„ Moving model from {current_model_device} to {self.device}")
173
+ self.model = self.model.to(self.device)
174
+
175
+ # Verify model device after potential move
176
+ actual_device = next(self.model.parameters()).device
177
+ print(f"🎯 Model now on: {actual_device}")
178
+
179
  # Simple tokenization that was working before
180
  inputs = self.tokenizer.encode(conversation, return_tensors="pt")
181
+
182
+ # Put inputs on same device as model
183
+ inputs = inputs.to(actual_device)
184
+ print(f"πŸ” Inputs device: {inputs.device}")
185
 
186
  # Generate response with original settings that worked
187
  with torch.no_grad():
 
205
  return self._clean_response(response)
206
 
207
  except Exception as e:
208
+ print(f"❌ Generation error: {e}")
209
  return f"🎸 *Creed scratches his head* Something weird happened... {str(e)[:100]}"
210
 
211
  def generate_response(self, message: str, history: List[List[str]]) -> Iterator[str]:
 
368
 
369
  # Memory status if GPU available
370
  if torch.cuda.is_available() and creed_ai.model_loaded:
371
+ actual_model_device = next(creed_ai.model.parameters()).device
372
+ print(f"🎯 Model actually on: {actual_model_device}")
373
  print(f"πŸ”₯ Final GPU Memory: {torch.cuda.memory_allocated() // 1024**2} MB allocated")
374
  print(f"πŸ“Š GPU Memory Reserved: {torch.cuda.memory_reserved() // 1024**2} MB reserved")
375
 
 
706
  ) as demo:
707
 
708
  # Modern header
709
+ actual_device = next(creed_ai.model.parameters()).device if creed_ai.model_loaded else creed_ai.device
710
  gr.HTML(f"""
711
  <div class="header">
712
  <h1>🎸 Creed Bratton AI</h1>
713
+ <p>Powered by phxdev/creed-qwen-0.5b-lora β€’ Running on {'πŸš€ GPU' if 'cuda' in str(actual_device) else 'πŸ–₯️ CPU'} ({actual_device})</p>
714
  </div>
715
  """)
716