akhaliq HF Staff committed on
Commit
1182537
·
verified ·
1 Parent(s): 0482642

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -56
app.py CHANGED
@@ -5,6 +5,7 @@ from huggingface_hub import login
5
  import os
6
  from typing import List, Dict, Any
7
  import time
 
8
 
9
  # Configuration
10
  MODEL_ID = "facebook/MobileLLM-Pro"
@@ -27,9 +28,11 @@ class MobileLLMChat:
27
  self.tokenizer = None
28
  self.device = None
29
  self.model_loaded = False
 
 
30
 
31
  def load_model(self, version="instruct"):
32
- """Load the MobileLLM-Pro model and tokenizer"""
33
  try:
34
  print(f"Loading MobileLLM-Pro ({version})...")
35
 
@@ -40,23 +43,19 @@ class MobileLLMChat:
40
  subfolder=version
41
  )
42
 
43
- # Load model
44
  self.model = AutoModelForCausalLM.from_pretrained(
45
  MODEL_ID,
46
  trust_remote_code=True,
47
  subfolder=version,
48
- torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
49
- device_map="auto" if torch.cuda.is_available() else None
50
  )
51
 
52
- # Set device
53
- self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
54
- if not torch.cuda.is_available():
55
- self.model.to(self.device)
56
-
57
  self.model.eval()
58
  self.model_loaded = True
59
- print(f"Model loaded successfully on {self.device}")
60
  return True
61
 
62
  except Exception as e:
@@ -73,14 +72,19 @@ class MobileLLMChat:
73
 
74
  return messages
75
 
 
76
  def generate_response(self, user_input: str, history: List[Dict[str, str]],
77
  system_prompt: str, temperature: float = 0.7,
78
  max_new_tokens: int = MAX_NEW_TOKENS) -> str:
79
- """Generate a response from the model"""
80
  if not self.model_loaded:
81
- return "Model not loaded. Please try loading the model first."
82
 
83
  try:
 
 
 
 
84
  # Add user message to history
85
  history.append({"role": "user", "content": user_input})
86
 
@@ -125,19 +129,28 @@ class MobileLLMChat:
125
  # Add assistant response to history
126
  history.append({"role": "assistant", "content": response})
127
 
 
 
 
 
128
  return response
129
 
130
  except Exception as e:
131
  return f"Error generating response: {str(e)}"
132
 
 
133
  def generate_stream(self, user_input: str, history: List[Dict[str, str]],
134
  system_prompt: str, temperature: float = 0.7):
135
- """Generate a streaming response from the model"""
136
  if not self.model_loaded:
137
- yield "Model not loaded. Please try loading the model first."
138
  return
139
 
140
  try:
 
 
 
 
141
  # Add user message to history
142
  history.append({"role": "user", "content": user_input})
143
 
@@ -189,28 +202,25 @@ class MobileLLMChat:
189
  # Add final response to history
190
  history.append({"role": "assistant", "content": response})
191
 
 
 
 
 
192
  except Exception as e:
193
  yield f"Error generating response: {str(e)}"
194
 
195
- # Initialize chat model
 
196
  chat_model = MobileLLMChat()
197
 
198
- def load_model_button(version):
199
- """Load the model when button is clicked"""
200
- success = chat_model.load_model(version)
201
- if success:
202
- return gr.update(visible=False), gr.update(visible=True), gr.update(value="Model loaded successfully!")
203
- else:
204
- return gr.update(visible=True), gr.update(visible=False), gr.update(value="Failed to load model. Please check the logs.")
205
-
206
  def clear_chat():
207
  """Clear the chat history"""
208
  return [], []
209
 
210
- def chat_fn(message, history, system_prompt, temperature, model_version):
211
  """Main chat function"""
212
  if not chat_model.model_loaded:
213
- return "Please load the model first using the button above."
214
 
215
  # Convert history format
216
  formatted_history = []
@@ -224,10 +234,10 @@ def chat_fn(message, history, system_prompt, temperature, model_version):
224
 
225
  return response
226
 
227
- def chat_stream_fn(message, history, system_prompt, temperature, model_version):
228
  """Streaming chat function"""
229
  if not chat_model.model_loaded:
230
- yield "Please load the model first using the button above."
231
  return
232
 
233
  # Convert history format
@@ -275,23 +285,14 @@ with gr.Blocks(
275
  </div>
276
  """)
277
 
278
- # Model loading section
279
  with gr.Row():
280
- with gr.Column(scale=1):
281
- model_version = gr.Dropdown(
282
- choices=["instruct", "base"],
283
- value="instruct",
284
- label="Model Version",
285
- info="Choose between instruct (chat) or base model"
286
- )
287
- load_btn = gr.Button("🚀 Load Model", variant="primary", size="lg")
288
-
289
- with gr.Column(scale=2):
290
- model_status = gr.Textbox(
291
- label="Model Status",
292
- value="Model not loaded",
293
- interactive=False
294
- )
295
 
296
  # Configuration section
297
  with gr.Accordion("⚙️ Configuration", open=False):
@@ -337,29 +338,22 @@ with gr.Blocks(
337
  submit_btn = gr.Button("Send", variant="primary", scale=1)
338
  clear_btn = gr.Button("Clear", scale=0)
339
 
340
- # Event handlers
341
- load_btn.click(
342
- load_model_button,
343
- inputs=[model_version],
344
- outputs=[load_btn, model_status, model_status]
345
- )
346
-
347
  # Handle chat submission
348
- def handle_chat(message, history, system_prompt, temperature, model_version, streaming):
349
  if streaming:
350
- return chat_stream_fn(message, history, system_prompt, temperature, model_version)
351
  else:
352
- return chat_fn(message, history, system_prompt, temperature, model_version)
353
 
354
  msg.submit(
355
  handle_chat,
356
- inputs=[msg, chatbot, system_prompt, temperature, model_version, streaming],
357
  outputs=[chatbot]
358
  )
359
 
360
  submit_btn.click(
361
  handle_chat,
362
- inputs=[msg, chatbot, system_prompt, temperature, model_version, streaming],
363
  outputs=[chatbot]
364
  )
365
 
@@ -384,7 +378,7 @@ with gr.Blocks(
384
  # Footer
385
  gr.HTML("""
386
  <div style="text-align: center; margin-top: 20px; color: #666;">
387
- <p>⚠️ Note: This model requires significant computational resources. Loading may take a few minutes.</p>
388
  <p>Model: <a href="https://huggingface.co/facebook/MobileLLM-Pro" target="_blank">facebook/MobileLLM-Pro</a></p>
389
  </div>
390
  """)
 
5
  import os
6
  from typing import List, Dict, Any
7
  import time
8
+ import spaces
9
 
10
  # Configuration
11
  MODEL_ID = "facebook/MobileLLM-Pro"
 
28
  self.tokenizer = None
29
  self.device = None
30
  self.model_loaded = False
31
+ # Load model on initialization for shared app
32
+ self.load_model()
33
 
34
  def load_model(self, version="instruct"):
35
+ """Load the MobileLLM-Pro model and tokenizer - runs once on CPU/system memory"""
36
  try:
37
  print(f"Loading MobileLLM-Pro ({version})...")
38
 
 
43
  subfolder=version
44
  )
45
 
46
+ # Load model to CPU first for shared app
47
  self.model = AutoModelForCausalLM.from_pretrained(
48
  MODEL_ID,
49
  trust_remote_code=True,
50
  subfolder=version,
51
+ torch_dtype=torch.float16,
52
+ low_cpu_mem_usage=True
53
  )
54
 
55
+ # Model will be moved to GPU during inference
 
 
 
 
56
  self.model.eval()
57
  self.model_loaded = True
58
+ print(f"Model loaded successfully in system memory")
59
  return True
60
 
61
  except Exception as e:
 
72
 
73
  return messages
74
 
75
+ @spaces.GPU(duration=120)
76
  def generate_response(self, user_input: str, history: List[Dict[str, str]],
77
  system_prompt: str, temperature: float = 0.7,
78
  max_new_tokens: int = MAX_NEW_TOKENS) -> str:
79
+ """Generate a response from the model - GPU allocated only during inference"""
80
  if not self.model_loaded:
81
+ return "Model not loaded. Please try reloading the space."
82
 
83
  try:
84
+ # Move model to GPU for inference
85
+ self.device = torch.device("cuda")
86
+ self.model.to(self.device)
87
+
88
  # Add user message to history
89
  history.append({"role": "user", "content": user_input})
90
 
 
129
  # Add assistant response to history
130
  history.append({"role": "assistant", "content": response})
131
 
132
+ # Move model back to CPU after inference to free GPU
133
+ self.model.to("cpu")
134
+ torch.cuda.empty_cache()
135
+
136
  return response
137
 
138
  except Exception as e:
139
  return f"Error generating response: {str(e)}"
140
 
141
+ @spaces.GPU(duration=120)
142
  def generate_stream(self, user_input: str, history: List[Dict[str, str]],
143
  system_prompt: str, temperature: float = 0.7):
144
+ """Generate a streaming response from the model - GPU allocated only during inference"""
145
  if not self.model_loaded:
146
+ yield "Model not loaded. Please try reloading the space."
147
  return
148
 
149
  try:
150
+ # Move model to GPU for inference
151
+ self.device = torch.device("cuda")
152
+ self.model.to(self.device)
153
+
154
  # Add user message to history
155
  history.append({"role": "user", "content": user_input})
156
 
 
202
  # Add final response to history
203
  history.append({"role": "assistant", "content": response})
204
 
205
+ # Move model back to CPU after inference to free GPU
206
+ self.model.to("cpu")
207
+ torch.cuda.empty_cache()
208
+
209
  except Exception as e:
210
  yield f"Error generating response: {str(e)}"
211
 
212
+ # Initialize chat model (loads model once on startup)
213
+ print("Initializing MobileLLM-Pro model...")
214
  chat_model = MobileLLMChat()
215
 
 
 
 
 
 
 
 
 
216
  def clear_chat():
217
  """Clear the chat history"""
218
  return [], []
219
 
220
+ def chat_fn(message, history, system_prompt, temperature):
221
  """Main chat function"""
222
  if not chat_model.model_loaded:
223
+ return "Please wait for the model to load or reload the space."
224
 
225
  # Convert history format
226
  formatted_history = []
 
234
 
235
  return response
236
 
237
+ def chat_stream_fn(message, history, system_prompt, temperature):
238
  """Streaming chat function"""
239
  if not chat_model.model_loaded:
240
+ yield "Please wait for the model to load or reload the space."
241
  return
242
 
243
  # Convert history format
 
285
  </div>
286
  """)
287
 
288
+ # Model status indicator
289
  with gr.Row():
290
+ model_status = gr.Textbox(
291
+ label="Model Status",
292
+ value="Model loaded and ready!" if chat_model.model_loaded else "Model loading...",
293
+ interactive=False,
294
+ container=True
295
+ )
 
 
 
 
 
 
 
 
 
296
 
297
  # Configuration section
298
  with gr.Accordion("⚙️ Configuration", open=False):
 
338
  submit_btn = gr.Button("Send", variant="primary", scale=1)
339
  clear_btn = gr.Button("Clear", scale=0)
340
 
 
 
 
 
 
 
 
341
  # Handle chat submission
342
+ def handle_chat(message, history, system_prompt, temperature, streaming):
343
  if streaming:
344
+ return chat_stream_fn(message, history, system_prompt, temperature)
345
  else:
346
+ return chat_fn(message, history, system_prompt, temperature)
347
 
348
  msg.submit(
349
  handle_chat,
350
+ inputs=[msg, chatbot, system_prompt, temperature, streaming],
351
  outputs=[chatbot]
352
  )
353
 
354
  submit_btn.click(
355
  handle_chat,
356
+ inputs=[msg, chatbot, system_prompt, temperature, streaming],
357
  outputs=[chatbot]
358
  )
359
 
 
378
  # Footer
379
  gr.HTML("""
380
  <div style="text-align: center; margin-top: 20px; color: #666;">
381
+ <p>⚠️ Note: Model is pre-loaded for faster inference. GPU is allocated only during generation.</p>
382
  <p>Model: <a href="https://huggingface.co/facebook/MobileLLM-Pro" target="_blank">facebook/MobileLLM-Pro</a></p>
383
  </div>
384
  """)