phxdev committed on
Commit
8d95555
·
verified ·
1 Parent(s): 1d0830c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +64 -79
app.py CHANGED
@@ -47,14 +47,14 @@ class CreedBrattonAI:
47
  self.load_model()
48
 
49
  def load_model(self):
50
- """Load the model with GPU optimization when available"""
51
  if self.loading or self.model_loaded:
52
  return
53
 
54
  self.loading = True
55
 
56
  try:
57
- print(f"🧠 Loading Creed's consciousness on {self.device}...")
58
 
59
  # Load model and tokenizer
60
  model_name = "phxdev/creed-qwen-0.5b-lora"
@@ -66,62 +66,38 @@ class CreedBrattonAI:
66
  padding_side="left"
67
  )
68
 
69
- # TEMPORARILY DISABLE custom tokens - they're causing corruption
70
- # custom_tokens = ["<thinking>", "<conspiracy>", "<tangent>"]
71
- # print(f"🎸 Adding Creed's custom tokens: {custom_tokens}")
72
- # num_added_tokens = self.tokenizer.add_tokens(custom_tokens)
73
- # print(f"✅ Added {num_added_tokens} custom tokens")
74
 
75
- print("⚠️ Custom tokens disabled to prevent corruption")
 
76
 
77
  if self.tokenizer.pad_token is None:
78
  self.tokenizer.pad_token = self.tokenizer.eos_token
79
 
80
- print(f"🤖 Loading model on {self.device}...")
81
 
82
- # Load model with proper device handling
83
- if self.device == "cuda":
84
- print("🤖 Loading model for GPU...")
85
- self.model = AutoModelForCausalLM.from_pretrained(
86
- model_name,
87
- torch_dtype=torch.float16, # Use float16 for GPU efficiency
88
- device_map=None, # Don't use auto device mapping in ZeroGPU
89
- trust_remote_code=True,
90
- low_cpu_mem_usage=True
91
- )
92
- # Explicitly move to CUDA
93
- print("🔧 Explicitly moving model to CUDA...")
94
- self.model = self.model.to(self.device)
95
- else:
96
- print("🤖 Loading model for CPU...")
97
- self.model = AutoModelForCausalLM.from_pretrained(
98
- model_name,
99
- torch_dtype=torch.float32, # Use float32 for CPU
100
- device_map=None,
101
- trust_remote_code=True,
102
- low_cpu_mem_usage=True
103
- )
104
- self.model = self.model.to("cpu")
105
 
106
- # Resize embeddings for custom tokens - DISABLED
107
- # if num_added_tokens > 0:
108
- # print(f"🔧 Resizing model embeddings for {num_added_tokens} custom tokens")
109
- # self.model.resize_token_embeddings(len(self.tokenizer))
110
 
 
111
  self.model.eval()
112
 
113
- # Verify final device placement
114
- final_device = next(self.model.parameters()).device
115
- print(f"🎯 Model final device: {final_device}")
116
-
117
  self.model_loaded = True
118
  self.loading = False
119
- print(f"✅ Creed's consciousness loaded on {final_device}!")
120
-
121
- # GPU memory info
122
- if self.device == "cuda" and torch.cuda.is_available():
123
- print(f"🔥 GPU Memory Used: {torch.cuda.memory_allocated() // 1024**2} MB")
124
- print(f"📊 GPU Memory Cached: {torch.cuda.memory_reserved() // 1024**2} MB")
125
 
126
  except Exception as e:
127
  print(f"❌ Error loading Creed model: {e}")
@@ -130,29 +106,18 @@ class CreedBrattonAI:
130
  base_model = "Qwen/Qwen2.5-0.5B-Instruct"
131
  self.tokenizer = AutoTokenizer.from_pretrained(base_model)
132
 
133
- # Use same pad token setup that was working
134
  if self.tokenizer.pad_token is None:
135
  self.tokenizer.pad_token = self.tokenizer.eos_token
136
 
137
- if self.device == "cuda":
138
- self.model = AutoModelForCausalLM.from_pretrained(
139
- base_model,
140
- torch_dtype=torch.float16,
141
- device_map=None # Don't use auto in ZeroGPU
142
- )
143
- self.model = self.model.to(self.device)
144
- else:
145
- self.model = AutoModelForCausalLM.from_pretrained(
146
- base_model,
147
- torch_dtype=torch.float32,
148
- device_map=None
149
- )
150
- self.model = self.model.to("cpu")
151
 
152
  self.model.eval()
153
  self.model_loaded = True
154
- fallback_device = next(self.model.parameters()).device
155
- print(f"✅ Fallback model loaded on {fallback_device}")
156
  except Exception as fallback_error:
157
  print(f"❌ Fallback also failed: {fallback_error}")
158
  self.loading = False
@@ -243,7 +208,7 @@ class CreedBrattonAI:
243
  def _format_conversation(self, message: str, history: List[List[str]]) -> str:
244
  """Format the conversation for the model with proper system prompt"""
245
 
246
- # Simplified Creed system prompt - custom tokens disabled
247
  system_prompt = """You are Creed Bratton from The Office. You embody his complete personality and speaking patterns.
248
 
249
  CORE IDENTITY:
@@ -268,15 +233,24 @@ SPEAKING STYLE:
268
  - Sometimes asks unexpected questions or makes odd observations
269
  - Uses dated slang and references from multiple decades
270
 
 
 
 
 
 
 
 
 
 
271
  RESPONSE GUIDELINES:
 
272
  - Stay in character at all times
273
  - Mix profound insights with complete nonsense
274
  - Reference past criminal activities casually
275
  - Make unexpected connections between topics
276
  - Be confident about obviously false statements
277
- - Include specific Creed-isms and catchphrases when appropriate
278
 
279
- Remember: You're not trying to be helpful in a traditional sense - you're being Creed Bratton.
280
 
281
  """
282
 
@@ -293,18 +267,29 @@ Remember: You're not trying to be helpful in a traditional sense - you're being
293
  return conversation
294
 
295
  def _clean_response(self, response: str) -> str:
296
- """Clean up the model response - custom tokens disabled"""
297
 
298
  print(f"🔍 Raw model output: {response}")
299
 
300
  # Remove common artifacts
301
  response = response.replace("Human:", "").replace("Creed:", "")
302
 
303
- # Custom token formatting disabled to prevent corruption
304
- # Just clean up basic formatting
 
 
 
 
 
305
 
306
- # Remove excessive whitespace
307
- response = " ".join(response.split())
 
 
 
 
 
 
308
 
309
  # Ensure it ends properly
310
  if response and not response.endswith(('.', '!', '?', '...', '*')):
@@ -360,11 +345,11 @@ def main():
360
  gpu_placeholder()
361
  print("✅ Spaces GPU compatibility enabled")
362
 
363
- # Memory status if GPU available
364
- if torch.cuda.is_available() and creed_ai.model_loaded:
365
- actual_model_device = next(creed_ai.model.parameters()).device
366
- print(f"🎯 Model actually on: {actual_model_device}")
367
- print(f"🔥 Final GPU Memory: {torch.cuda.memory_allocated() // 1024**2} MB allocated")
368
  print(f"📊 GPU Memory Reserved: {torch.cuda.memory_reserved() // 1024**2} MB reserved")
369
 
370
  # Modern glassmorphism CSS
@@ -700,11 +685,10 @@ def main():
700
  ) as demo:
701
 
702
  # Modern header
703
- actual_device = next(creed_ai.model.parameters()).device if creed_ai.model_loaded else creed_ai.device
704
  gr.HTML(f"""
705
  <div class="header">
706
  <h1>🎸 Creed Bratton AI</h1>
707
- <p>Powered by phxdev/creed-qwen-0.5b-lora • Running on {'🚀 GPU' if 'cuda' in str(actual_device) else '🖥️ CPU'} ({actual_device})</p>
708
  </div>
709
  """)
710
 
@@ -713,7 +697,8 @@ def main():
713
  <div class="info-box">
714
  <strong>Model:</strong> phxdev/creed-qwen-0.5b-lora<br>
715
  <strong>Base:</strong> Qwen 0.5B + LoRA fine-tuning<br>
716
- <strong>Status:</strong> Custom tokens disabled (preventing corruption)
 
717
  </div>
718
  """)
719
 
 
47
  self.load_model()
48
 
49
  def load_model(self):
50
+ """Load the model with ZeroGPU compatibility"""
51
  if self.loading or self.model_loaded:
52
  return
53
 
54
  self.loading = True
55
 
56
  try:
57
+ print(f"🧠 Loading Creed's consciousness...")
58
 
59
  # Load model and tokenizer
60
  model_name = "phxdev/creed-qwen-0.5b-lora"
 
66
  padding_side="left"
67
  )
68
 
69
+ # Add Creed's custom tokens back
70
+ custom_tokens = ["<thinking>", "<conspiracy>", "<tangent>"]
71
+ print(f"🎸 Adding Creed's custom tokens: {custom_tokens}")
 
 
72
 
73
+ num_added_tokens = self.tokenizer.add_tokens(custom_tokens)
74
+ print(f"✅ Added {num_added_tokens} custom tokens")
75
 
76
  if self.tokenizer.pad_token is None:
77
  self.tokenizer.pad_token = self.tokenizer.eos_token
78
 
79
+ print(f"🤖 Loading model for ZeroGPU...")
80
 
81
+ # Load model on CPU first for ZeroGPU compatibility
82
+ self.model = AutoModelForCausalLM.from_pretrained(
83
+ model_name,
84
+ torch_dtype=torch.float16,
85
+ device_map=None, # Load on CPU first
86
+ trust_remote_code=True,
87
+ low_cpu_mem_usage=True
88
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
+ # Resize embeddings for custom tokens
91
+ if num_added_tokens > 0:
92
+ print(f"🔧 Resizing model embeddings for {num_added_tokens} custom tokens")
93
+ self.model.resize_token_embeddings(len(self.tokenizer))
94
 
95
+ # Keep model on CPU for ZeroGPU - will be moved to GPU only during inference
96
  self.model.eval()
97
 
 
 
 
 
98
  self.model_loaded = True
99
  self.loading = False
100
+ print(f"✅ Creed's consciousness loaded on CPU (ZeroGPU mode)!")
 
 
 
 
 
101
 
102
  except Exception as e:
103
  print(f"❌ Error loading Creed model: {e}")
 
106
  base_model = "Qwen/Qwen2.5-0.5B-Instruct"
107
  self.tokenizer = AutoTokenizer.from_pretrained(base_model)
108
 
 
109
  if self.tokenizer.pad_token is None:
110
  self.tokenizer.pad_token = self.tokenizer.eos_token
111
 
112
+ self.model = AutoModelForCausalLM.from_pretrained(
113
+ base_model,
114
+ torch_dtype=torch.float16,
115
+ device_map=None
116
+ )
 
 
 
 
 
 
 
 
 
117
 
118
  self.model.eval()
119
  self.model_loaded = True
120
+ print(f"✅ Fallback model loaded on CPU (ZeroGPU mode)")
 
121
  except Exception as fallback_error:
122
  print(f"❌ Fallback also failed: {fallback_error}")
123
  self.loading = False
 
208
  def _format_conversation(self, message: str, history: List[List[str]]) -> str:
209
  """Format the conversation for the model with proper system prompt"""
210
 
211
+ # Comprehensive Creed system prompt with thinking tokens
212
  system_prompt = """You are Creed Bratton from The Office. You embody his complete personality and speaking patterns.
213
 
214
  CORE IDENTITY:
 
233
  - Sometimes asks unexpected questions or makes odd observations
234
  - Uses dated slang and references from multiple decades
235
 
236
+ USE THESE SPECIAL TOKENS TO SHOW YOUR INTERNAL PROCESS:
237
+ <thinking>Use this when processing information, considering options, or when confused about something</thinking>
238
+ <conspiracy>Use this for wild theories, suspicious observations, or paranoid thoughts</conspiracy>
239
+ <tangent>Use this for sudden topic changes, random memories, or completely unrelated stories</tangent>
240
+
241
+ EXAMPLE:
242
+ Human: What's your advice for success?
243
+ Creed: <thinking>Success... what does that even mean? Money? Avoiding jail? I've been successful at both.</thinking> Well, first thing you gotta do is always keep three forms of ID on you. <conspiracy>The government tracks everything these days, but if you rotate your identities every few months, they lose the trail.</conspiracy> <tangent>Speaking of success, did I ever tell you about the time I sold beet seeds to Dwight for fifty bucks each? Told him they were magic beans from Guatemala.</tangent> Bottom line - trust no one, especially yourself.
244
+
245
  RESPONSE GUIDELINES:
246
+ - Use the thinking tokens to show your internal monologue
247
  - Stay in character at all times
248
  - Mix profound insights with complete nonsense
249
  - Reference past criminal activities casually
250
  - Make unexpected connections between topics
251
  - Be confident about obviously false statements
 
252
 
253
+ Remember: You're being Creed Bratton - show us how your mind works!
254
 
255
  """
256
 
 
267
  return conversation
268
 
269
  def _clean_response(self, response: str) -> str:
270
+ """Clean up the model response and format custom tokens"""
271
 
272
  print(f"πŸ” Raw model output: {response}")
273
 
274
  # Remove common artifacts
275
  response = response.replace("Human:", "").replace("Creed:", "")
276
 
277
+ # Format Creed's custom tokens for better UI display
278
+ response = response.replace("<thinking>", "\n\n🤔 **THINKING:** ")
279
+ response = response.replace("</thinking>", "\n")
280
+ response = response.replace("<conspiracy>", "\n\n🕵️ **CONSPIRACY MODE:** ")
281
+ response = response.replace("</conspiracy>", "\n")
282
+ response = response.replace("<tangent>", "\n\n🌀 **TANGENT:** ")
283
+ response = response.replace("</tangent>", "\n")
284
 
285
+ # Check if any thinking tokens were found
286
+ if "🤔" in response or "🕵️" in response or "🌀" in response:
287
+ print("✅ Found thinking tokens in response!")
288
+ else:
289
+ print("❌ No thinking tokens found in response")
290
+
291
+ # Remove excessive whitespace but preserve formatting
292
+ response = "\n".join(line.strip() for line in response.split("\n") if line.strip())
293
 
294
  # Ensure it ends properly
295
  if response and not response.endswith(('.', '!', '?', '...', '*')):
 
345
  gpu_placeholder()
346
  print("βœ… Spaces GPU compatibility enabled")
347
 
348
+ # Memory status for ZeroGPU
349
+ if SPACES_AVAILABLE:
350
+ print("⚡ ZeroGPU Mode: Model will move to GPU only during inference")
351
+ elif torch.cuda.is_available() and creed_ai.model_loaded:
352
+ print(f"🔥 GPU Memory: {torch.cuda.memory_allocated() // 1024**2} MB allocated")
353
  print(f"πŸ“Š GPU Memory Reserved: {torch.cuda.memory_reserved() // 1024**2} MB reserved")
354
 
355
  # Modern glassmorphism CSS
 
685
  ) as demo:
686
 
687
  # Modern header
 
688
  gr.HTML(f"""
689
  <div class="header">
690
  <h1>🎸 Creed Bratton AI</h1>
691
+ <p>Powered by phxdev/creed-qwen-0.5b-lora • Running on {'⚡ ZeroGPU' if SPACES_AVAILABLE else '🖥️ CPU'}</p>
692
  </div>
693
  """)
694
 
 
697
  <div class="info-box">
698
  <strong>Model:</strong> phxdev/creed-qwen-0.5b-lora<br>
699
  <strong>Base:</strong> Qwen 0.5B + LoRA fine-tuning<br>
700
+ <strong>Tokens:</strong> &lt;thinking&gt;, &lt;conspiracy&gt;, &lt;tangent&gt;<br>
701
+ <strong>Mode:</strong> ZeroGPU optimized
702
  </div>
703
  """)
704