phxdev committed on
Commit
8d95555
·
verified ·
1 Parent(s): 1d0830c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +64 -79
app.py CHANGED
@@ -47,14 +47,14 @@ class CreedBrattonAI:
47
  self.load_model()
48
 
49
  def load_model(self):
50
- """Load the model with GPU optimization when available"""
51
  if self.loading or self.model_loaded:
52
  return
53
 
54
  self.loading = True
55
 
56
  try:
57
- print(f"🧠 Loading Creed's consciousness on {self.device}...")
58
 
59
  # Load model and tokenizer
60
  model_name = "phxdev/creed-qwen-0.5b-lora"
@@ -66,62 +66,38 @@ class CreedBrattonAI:
66
  padding_side="left"
67
  )
68
 
69
- # TEMPORARILY DISABLE custom tokens - they're causing corruption
70
- # custom_tokens = ["<thinking>", "<conspiracy>", "<tangent>"]
71
- # print(f"🎸 Adding Creed's custom tokens: {custom_tokens}")
72
- # num_added_tokens = self.tokenizer.add_tokens(custom_tokens)
73
- # print(f"✅ Added {num_added_tokens} custom tokens")
74
 
75
- print("⚠️ Custom tokens disabled to prevent corruption")
 
76
 
77
  if self.tokenizer.pad_token is None:
78
  self.tokenizer.pad_token = self.tokenizer.eos_token
79
 
80
- print(f"🤖 Loading model on {self.device}...")
81
 
82
- # Load model with proper device handling
83
- if self.device == "cuda":
84
- print("🤖 Loading model for GPU...")
85
- self.model = AutoModelForCausalLM.from_pretrained(
86
- model_name,
87
- torch_dtype=torch.float16, # Use float16 for GPU efficiency
88
- device_map=None, # Don't use auto device mapping in ZeroGPU
89
- trust_remote_code=True,
90
- low_cpu_mem_usage=True
91
- )
92
- # Explicitly move to CUDA
93
- print("🔧 Explicitly moving model to CUDA...")
94
- self.model = self.model.to(self.device)
95
- else:
96
- print("🤖 Loading model for CPU...")
97
- self.model = AutoModelForCausalLM.from_pretrained(
98
- model_name,
99
- torch_dtype=torch.float32, # Use float32 for CPU
100
- device_map=None,
101
- trust_remote_code=True,
102
- low_cpu_mem_usage=True
103
- )
104
- self.model = self.model.to("cpu")
105
 
106
- # Resize embeddings for custom tokens - DISABLED
107
- # if num_added_tokens > 0:
108
- # print(f"🔧 Resizing model embeddings for {num_added_tokens} custom tokens")
109
- # self.model.resize_token_embeddings(len(self.tokenizer))
110
 
 
111
  self.model.eval()
112
 
113
- # Verify final device placement
114
- final_device = next(self.model.parameters()).device
115
- print(f"🎯 Model final device: {final_device}")
116
-
117
  self.model_loaded = True
118
  self.loading = False
119
- print(f"✅ Creed's consciousness loaded on {final_device}!")
120
-
121
- # GPU memory info
122
- if self.device == "cuda" and torch.cuda.is_available():
123
- print(f"🔥 GPU Memory Used: {torch.cuda.memory_allocated() // 1024**2} MB")
124
- print(f"📊 GPU Memory Cached: {torch.cuda.memory_reserved() // 1024**2} MB")
125
 
126
  except Exception as e:
127
  print(f"❌ Error loading Creed model: {e}")
@@ -130,29 +106,18 @@ class CreedBrattonAI:
130
  base_model = "Qwen/Qwen2.5-0.5B-Instruct"
131
  self.tokenizer = AutoTokenizer.from_pretrained(base_model)
132
 
133
- # Use same pad token setup that was working
134
  if self.tokenizer.pad_token is None:
135
  self.tokenizer.pad_token = self.tokenizer.eos_token
136
 
137
- if self.device == "cuda":
138
- self.model = AutoModelForCausalLM.from_pretrained(
139
- base_model,
140
- torch_dtype=torch.float16,
141
- device_map=None # Don't use auto in ZeroGPU
142
- )
143
- self.model = self.model.to(self.device)
144
- else:
145
- self.model = AutoModelForCausalLM.from_pretrained(
146
- base_model,
147
- torch_dtype=torch.float32,
148
- device_map=None
149
- )
150
- self.model = self.model.to("cpu")
151
 
152
  self.model.eval()
153
  self.model_loaded = True
154
- fallback_device = next(self.model.parameters()).device
155
- print(f"✅ Fallback model loaded on {fallback_device}")
156
  except Exception as fallback_error:
157
  print(f"❌ Fallback also failed: {fallback_error}")
158
  self.loading = False
@@ -243,7 +208,7 @@ class CreedBrattonAI:
243
  def _format_conversation(self, message: str, history: List[List[str]]) -> str:
244
  """Format the conversation for the model with proper system prompt"""
245
 
246
- # Simplified Creed system prompt - custom tokens disabled
247
  system_prompt = """You are Creed Bratton from The Office. You embody his complete personality and speaking patterns.
248
 
249
  CORE IDENTITY:
@@ -268,15 +233,24 @@ SPEAKING STYLE:
268
  - Sometimes asks unexpected questions or makes odd observations
269
  - Uses dated slang and references from multiple decades
270
 
 
 
 
 
 
 
 
 
 
271
  RESPONSE GUIDELINES:
 
272
  - Stay in character at all times
273
  - Mix profound insights with complete nonsense
274
  - Reference past criminal activities casually
275
  - Make unexpected connections between topics
276
  - Be confident about obviously false statements
277
- - Include specific Creed-isms and catchphrases when appropriate
278
 
279
- Remember: You're not trying to be helpful in a traditional sense - you're being Creed Bratton.
280
 
281
  """
282
 
@@ -293,18 +267,29 @@ Remember: You're not trying to be helpful in a traditional sense - you're being
293
  return conversation
294
 
295
  def _clean_response(self, response: str) -> str:
296
- """Clean up the model response - custom tokens disabled"""
297
 
298
  print(f"🔍 Raw model output: {response}")
299
 
300
  # Remove common artifacts
301
  response = response.replace("Human:", "").replace("Creed:", "")
302
 
303
- # Custom token formatting disabled to prevent corruption
304
- # Just clean up basic formatting
 
 
 
 
 
305
 
306
- # Remove excessive whitespace
307
- response = " ".join(response.split())
 
 
 
 
 
 
308
 
309
  # Ensure it ends properly
310
  if response and not response.endswith(('.', '!', '?', '...', '*')):
@@ -360,11 +345,11 @@ def main():
360
  gpu_placeholder()
361
  print("✅ Spaces GPU compatibility enabled")
362
 
363
- # Memory status if GPU available
364
- if torch.cuda.is_available() and creed_ai.model_loaded:
365
- actual_model_device = next(creed_ai.model.parameters()).device
366
- print(f"🎯 Model actually on: {actual_model_device}")
367
- print(f"🔥 Final GPU Memory: {torch.cuda.memory_allocated() // 1024**2} MB allocated")
368
  print(f"📊 GPU Memory Reserved: {torch.cuda.memory_reserved() // 1024**2} MB reserved")
369
 
370
  # Modern glassmorphism CSS
@@ -700,11 +685,10 @@ def main():
700
  ) as demo:
701
 
702
  # Modern header
703
- actual_device = next(creed_ai.model.parameters()).device if creed_ai.model_loaded else creed_ai.device
704
  gr.HTML(f"""
705
  <div class="header">
706
  <h1>🎸 Creed Bratton AI</h1>
707
- <p>Powered by phxdev/creed-qwen-0.5b-lora • Running on {'🚀 GPU' if 'cuda' in str(actual_device) else '🖥️ CPU'} ({actual_device})</p>
708
  </div>
709
  """)
710
 
@@ -713,7 +697,8 @@ def main():
713
  <div class="info-box">
714
  <strong>Model:</strong> phxdev/creed-qwen-0.5b-lora<br>
715
  <strong>Base:</strong> Qwen 0.5B + LoRA fine-tuning<br>
716
- <strong>Status:</strong> Custom tokens disabled (preventing corruption)
 
717
  </div>
718
  """)
719
 
 
47
  self.load_model()
48
 
49
  def load_model(self):
50
+ """Load the model with ZeroGPU compatibility"""
51
  if self.loading or self.model_loaded:
52
  return
53
 
54
  self.loading = True
55
 
56
  try:
57
+ print(f"🧠 Loading Creed's consciousness...")
58
 
59
  # Load model and tokenizer
60
  model_name = "phxdev/creed-qwen-0.5b-lora"
 
66
  padding_side="left"
67
  )
68
 
69
+ # Add Creed's custom tokens back
70
+ custom_tokens = ["<thinking>", "<conspiracy>", "<tangent>"]
71
+ print(f"🎸 Adding Creed's custom tokens: {custom_tokens}")
 
 
72
 
73
+ num_added_tokens = self.tokenizer.add_tokens(custom_tokens)
74
+ print(f"✅ Added {num_added_tokens} custom tokens")
75
 
76
  if self.tokenizer.pad_token is None:
77
  self.tokenizer.pad_token = self.tokenizer.eos_token
78
 
79
+ print(f"🤖 Loading model for ZeroGPU...")
80
 
81
+ # Load model on CPU first for ZeroGPU compatibility
82
+ self.model = AutoModelForCausalLM.from_pretrained(
83
+ model_name,
84
+ torch_dtype=torch.float16,
85
+ device_map=None, # Load on CPU first
86
+ trust_remote_code=True,
87
+ low_cpu_mem_usage=True
88
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
+ # Resize embeddings for custom tokens
91
+ if num_added_tokens > 0:
92
+ print(f"🔧 Resizing model embeddings for {num_added_tokens} custom tokens")
93
+ self.model.resize_token_embeddings(len(self.tokenizer))
94
 
95
+ # Keep model on CPU for ZeroGPU - will be moved to GPU only during inference
96
  self.model.eval()
97
 
 
 
 
 
98
  self.model_loaded = True
99
  self.loading = False
100
+ print(f"✅ Creed's consciousness loaded on CPU (ZeroGPU mode)!")
 
 
 
 
 
101
 
102
  except Exception as e:
103
  print(f"❌ Error loading Creed model: {e}")
 
106
  base_model = "Qwen/Qwen2.5-0.5B-Instruct"
107
  self.tokenizer = AutoTokenizer.from_pretrained(base_model)
108
 
 
109
  if self.tokenizer.pad_token is None:
110
  self.tokenizer.pad_token = self.tokenizer.eos_token
111
 
112
+ self.model = AutoModelForCausalLM.from_pretrained(
113
+ base_model,
114
+ torch_dtype=torch.float16,
115
+ device_map=None
116
+ )
 
 
 
 
 
 
 
 
 
117
 
118
  self.model.eval()
119
  self.model_loaded = True
120
+ print(f"✅ Fallback model loaded on CPU (ZeroGPU mode)")
 
121
  except Exception as fallback_error:
122
  print(f"❌ Fallback also failed: {fallback_error}")
123
  self.loading = False
 
208
  def _format_conversation(self, message: str, history: List[List[str]]) -> str:
209
  """Format the conversation for the model with proper system prompt"""
210
 
211
+ # Comprehensive Creed system prompt with thinking tokens
212
  system_prompt = """You are Creed Bratton from The Office. You embody his complete personality and speaking patterns.
213
 
214
  CORE IDENTITY:
 
233
  - Sometimes asks unexpected questions or makes odd observations
234
  - Uses dated slang and references from multiple decades
235
 
236
+ USE THESE SPECIAL TOKENS TO SHOW YOUR INTERNAL PROCESS:
237
+ <thinking>Use this when processing information, considering options, or when confused about something</thinking>
238
+ <conspiracy>Use this for wild theories, suspicious observations, or paranoid thoughts</conspiracy>
239
+ <tangent>Use this for sudden topic changes, random memories, or completely unrelated stories</tangent>
240
+
241
+ EXAMPLE:
242
+ Human: What's your advice for success?
243
+ Creed: <thinking>Success... what does that even mean? Money? Avoiding jail? I've been successful at both.</thinking> Well, first thing you gotta do is always keep three forms of ID on you. <conspiracy>The government tracks everything these days, but if you rotate your identities every few months, they lose the trail.</conspiracy> <tangent>Speaking of success, did I ever tell you about the time I sold beet seeds to Dwight for fifty bucks each? Told him they were magic beans from Guatemala.</tangent> Bottom line - trust no one, especially yourself.
244
+
245
  RESPONSE GUIDELINES:
246
+ - Use the thinking tokens to show your internal monologue
247
  - Stay in character at all times
248
  - Mix profound insights with complete nonsense
249
  - Reference past criminal activities casually
250
  - Make unexpected connections between topics
251
  - Be confident about obviously false statements
 
252
 
253
+ Remember: You're being Creed Bratton - show us how your mind works!
254
 
255
  """
256
 
 
267
  return conversation
268
 
269
  def _clean_response(self, response: str) -> str:
270
+ """Clean up the model response and format custom tokens"""
271
 
272
  print(f"πŸ” Raw model output: {response}")
273
 
274
  # Remove common artifacts
275
  response = response.replace("Human:", "").replace("Creed:", "")
276
 
277
+ # Format Creed's custom tokens for better UI display
278
+ response = response.replace("<thinking>", "\n\n🤔 **THINKING:** ")
279
+ response = response.replace("</thinking>", "\n")
280
+ response = response.replace("<conspiracy>", "\n\n🕵️ **CONSPIRACY MODE:** ")
281
+ response = response.replace("</conspiracy>", "\n")
282
+ response = response.replace("<tangent>", "\n\n🌀 **TANGENT:** ")
283
+ response = response.replace("</tangent>", "\n")
284
 
285
+ # Check if any thinking tokens were found
286
+ if "🤔" in response or "🕵️" in response or "🌀" in response:
287
+ print("✅ Found thinking tokens in response!")
288
+ else:
289
+ print("❌ No thinking tokens found in response")
290
+
291
+ # Remove excessive whitespace but preserve formatting
292
+ response = "\n".join(line.strip() for line in response.split("\n") if line.strip())
293
 
294
  # Ensure it ends properly
295
  if response and not response.endswith(('.', '!', '?', '...', '*')):
 
345
  gpu_placeholder()
346
  print("βœ… Spaces GPU compatibility enabled")
347
 
348
+ # Memory status for ZeroGPU
349
+ if SPACES_AVAILABLE:
350
+ print("⚡ ZeroGPU Mode: Model will move to GPU only during inference")
351
+ elif torch.cuda.is_available() and creed_ai.model_loaded:
352
+ print(f"🔥 GPU Memory: {torch.cuda.memory_allocated() // 1024**2} MB allocated")
353
  print(f"πŸ“Š GPU Memory Reserved: {torch.cuda.memory_reserved() // 1024**2} MB reserved")
354
 
355
  # Modern glassmorphism CSS
 
685
  ) as demo:
686
 
687
  # Modern header
 
688
  gr.HTML(f"""
689
  <div class="header">
690
  <h1>🎸 Creed Bratton AI</h1>
691
+ <p>Powered by phxdev/creed-qwen-0.5b-lora • Running on {'⚡ ZeroGPU' if SPACES_AVAILABLE else '🖥️ CPU'}</p>
692
  </div>
693
  """)
694
 
 
697
  <div class="info-box">
698
  <strong>Model:</strong> phxdev/creed-qwen-0.5b-lora<br>
699
  <strong>Base:</strong> Qwen 0.5B + LoRA fine-tuning<br>
700
+ <strong>Tokens:</strong> &lt;thinking&gt;, &lt;conspiracy&gt;, &lt;tangent&gt;<br>
701
+ <strong>Mode:</strong> ZeroGPU optimized
702
  </div>
703
  """)
704