Spaces:

Joe6636564
/

coderpilot

Sleeping

App Files Files Community

Joe6636564 committed on Nov 6, 2025

Commit

0d19a07

verified ·

1 Parent(s): 7c7e1f2

Update app.py

Browse files

Files changed (1) hide show

app.py +94 -41

app.py CHANGED Viewed

@@ -40,18 +40,21 @@ model = AutoModelForCausalLM.from_pretrained(
     low_cpu_mem_usage=True  # Optimize for CPU memory
 )
-# Vision model setup
 print("Loading vision models...")
 models = {}
 processors = {}
 try:
     models["microsoft/Phi-3.5-vision-instruct"] = AutoModelForCausalLM.from_pretrained(
         "microsoft/Phi-3.5-vision-instruct",
         trust_remote_code=True,
         torch_dtype=torch.float32,  # Use float32 for CPU
         device_map="cpu",
-        low_cpu_mem_usage=True  # Optimize for CPU memory
     ).eval()
     processors["microsoft/Phi-3.5-vision-instruct"] = AutoProcessor.from_pretrained(
@@ -61,6 +64,23 @@ try:
     print("Vision model loaded successfully on CPU")
 except Exception as e:
     print(f"Error loading vision model: {e}")
 # Chatbot function
 def stream_chat(
@@ -137,39 +157,43 @@ def stream_vision(image, text_input=None, model_id="microsoft/Phi-3.5-vision-ins
     # Process the inputs with the processor
     inputs = processor(prompt, images, return_tensors="pt").to(device)
-    # Generation parameters
     generation_args = {
-        "max_new_tokens": 500,  # Reduced for CPU
         "temperature": 0.0,
         "do_sample": False,
     }
     # Generate the response
-    generate_ids = model_vision.generate(
-        **inputs,
-        eos_token_id=processor.tokenizer.eos_token_id,
-        **generation_args
-    )
-    # Remove input tokens from the generated response
-    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
-    # Decode the generated output
-    response = processor.batch_decode(
-        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
-    )[0]
-    return response
 # Flask API Routes
 @flask_app.route('/health', methods=['GET'])
 def health_check():
     return jsonify({
         "status": "healthy",
         "device": device,
         "models_loaded": {
             "chatbot": MODEL_ID1 in globals() and 'model' in globals(),
-            "vision": len(models) > 0
         }
     })
@@ -249,10 +273,12 @@ def api_vision():
 @flask_app.route('/api/models', methods=['GET'])
 def get_models():
     return jsonify({
         "chat_model": MODEL_ID1,
-        "vision_models": list(models.keys()),
-        "device": device
     })
 def run_flask():
@@ -262,12 +288,18 @@ def run_gradio():
     # CSS for the interface
     CSS = """.duplicate-button { margin: auto !important; color: white !important; background: black !important; border-radius: 100vh !important;}h3 { text-align: center;}"""
     PLACEHOLDER = """<center><p>Hi! I'm your assistant. Feel free to ask your questions</p></center>"""
-    TITLE = "<h1><center>Phi-3.5 Chatbot & Phi-3.5 Vision (CPU Version)</center></h1>"
-    EXPLANATION = """<div style="text-align: center; margin-top: 20px;">
         <p><strong>CPU-Only Version</strong> - This instance is running on CPU. Responses may be slower than GPU-accelerated versions.</p>
         <p>This app supports both the microsoft/Phi-3.5-mini-instruct model for chat bot and the microsoft/Phi-3.5-vision-instruct model for multimodal model.</p>
-        <p>Phi-3.5-vision is a lightweight, state-of-the-art open multimodal model built upon datasets which include - synthetic data and filtered publicly available websites - with a focus on very high-quality, reasoning dense data both on text and vision. The model belongs to the Phi-3 model family, and the multimodal version comes with 128K context length (in tokens) it can support.</p>
-        <p>Phi-3.5-mini is a lightweight, state-of-the-art open model built upon datasets used for Phi-3 - synthetic data and filtered publicly available websites - with a focus on very high-quality, reasoning dense data. The model belongs to the Phi-3 model family and supports 128K token context length.</p>
     </div>"""
     footer = """<div style="text-align: center; margin-top: 20px;">
         <a href="https://www.linkedin.com/in/pejman-ebrahimi-4a60151a7/" target="_blank">LinkedIn</a> |
@@ -278,13 +310,13 @@ def run_gradio():
     </div>"""
     # Gradio app with two tabs
-    with gr.Blocks(css=CSS, theme="small_and_pretty") as demo:
         gr.HTML(TITLE)
         gr.HTML(EXPLANATION)
         gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")
         with gr.Tab("Chatbot"):
-            chatbot = gr.Chatbot(height=600, placeholder=PLACEHOLDER)
             gr.ChatInterface(
                 fn=stream_chat,
                 chatbot=chatbot,
@@ -346,19 +378,33 @@ def run_gradio():
                 cache_examples=False,
             )
-        with gr.Tab("Vision"):
-            with gr.Row():
-                input_img = gr.Image(label="Input Picture")
-            with gr.Row():
-                model_selector = gr.Dropdown(choices=list(models.keys()), label="Model", value="microsoft/Phi-3.5-vision-instruct")
-            with gr.Row():
-                text_input = gr.Textbox(label="Question", value="What's in this image?")
-            with gr.Row():
-                submit_btn = gr.Button(value="Submit")
-            with gr.Row():
-                output_text = gr.Textbox(label="Output Text")
-            submit_btn.click(stream_vision, [input_img, text_input, model_selector], [output_text])
         gr.HTML(footer)
@@ -366,6 +412,13 @@ def run_gradio():
     demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
 if __name__ == "__main__":
     # Start Flask server in a separate thread
     flask_thread = threading.Thread(target=run_flask, daemon=True)
     flask_thread.start()

     low_cpu_mem_usage=True  # Optimize for CPU memory
 )
+# Vision model setup - FIXED for CPU
 print("Loading vision models...")
 models = {}
 processors = {}
 try:
+    # Load vision model without flash_attention_2 for CPU
     models["microsoft/Phi-3.5-vision-instruct"] = AutoModelForCausalLM.from_pretrained(
         "microsoft/Phi-3.5-vision-instruct",
         trust_remote_code=True,
         torch_dtype=torch.float32,  # Use float32 for CPU
         device_map="cpu",
+        low_cpu_mem_usage=True,  # Optimize for CPU memory
+        # Remove flash_attention_2 for CPU compatibility
+        _attn_implementation=None
     ).eval()
     processors["microsoft/Phi-3.5-vision-instruct"] = AutoProcessor.from_pretrained(
     print("Vision model loaded successfully on CPU")
 except Exception as e:
     print(f"Error loading vision model: {e}")
+    # Try alternative loading method
+    try:
+        print("Trying alternative loading method for vision model...")
+        models["microsoft/Phi-3.5-vision-instruct"] = AutoModelForCausalLM.from_pretrained(
+            "microsoft/Phi-3.5-vision-instruct",
+            trust_remote_code=True,
+            torch_dtype=torch.float32,
+            device_map="cpu"
+        ).eval()
+        processors["microsoft/Phi-3.5-vision-instruct"] = AutoProcessor.from_pretrained(
+            "microsoft/Phi-3.5-vision-instruct",
+            trust_remote_code=True
+        )
+        print("Vision model loaded successfully with alternative method")
+    except Exception as e2:
+        print(f"Failed to load vision model with alternative method: {e2}")
 # Chatbot function
 def stream_chat(
     # Process the inputs with the processor
     inputs = processor(prompt, images, return_tensors="pt").to(device)
+    # Generation parameters - reduced for CPU
     generation_args = {
+        "max_new_tokens": 300,  # Further reduced for CPU
         "temperature": 0.0,
         "do_sample": False,
     }
     # Generate the response
+    try:
+        generate_ids = model_vision.generate(
+            **inputs,
+            eos_token_id=processor.tokenizer.eos_token_id,
+            **generation_args
+        )
+        # Remove input tokens from the generated response
+        generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
+        # Decode the generated output
+        response = processor.batch_decode(
+            generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )[0]
+        return response
+    except Exception as e:
+        return f"Error generating vision response: {str(e)}"
 # Flask API Routes
 @flask_app.route('/health', methods=['GET'])
 def health_check():
+    vision_loaded = len(models) > 0 and "microsoft/Phi-3.5-vision-instruct" in models
     return jsonify({
         "status": "healthy",
         "device": device,
         "models_loaded": {
             "chatbot": MODEL_ID1 in globals() and 'model' in globals(),
+            "vision": vision_loaded
         }
     })
 @flask_app.route('/api/models', methods=['GET'])
 def get_models():
+    vision_loaded = len(models) > 0 and "microsoft/Phi-3.5-vision-instruct" in models
     return jsonify({
         "chat_model": MODEL_ID1,
+        "vision_models": list(models.keys()) if vision_loaded else [],
+        "device": device,
+        "vision_available": vision_loaded
     })
 def run_flask():
     # CSS for the interface
     CSS = """.duplicate-button { margin: auto !important; color: white !important; background: black !important; border-radius: 100vh !important;}h3 { text-align: center;}"""
     PLACEHOLDER = """<center><p>Hi! I'm your assistant. Feel free to ask your questions</p></center>"""
+    # Check if vision model is available
+    vision_available = len(models) > 0 and "microsoft/Phi-3.5-vision-instruct" in models
+    vision_status = "Available" if vision_available else "Not Available"
+    TITLE = f"<h1><center>Phi-3.5 Chatbot & Phi-3.5 Vision (CPU Version)</center></h1>"
+    EXPLANATION = f"""<div style="text-align: center; margin-top: 20px;">
         <p><strong>CPU-Only Version</strong> - This instance is running on CPU. Responses may be slower than GPU-accelerated versions.</p>
+        <p><strong>Vision Model Status:</strong> {vision_status}</p>
         <p>This app supports both the microsoft/Phi-3.5-mini-instruct model for chat bot and the microsoft/Phi-3.5-vision-instruct model for multimodal model.</p>
+        <p>Phi-3.5-vision is a lightweight, state-of-the-art open multimodal model built upon datasets which include - synthetic data and filtered publicly available websites - with a focus on very high-quality, reasoning dense data both on text and vision.</p>
+        <p>Phi-3.5-mini is a lightweight, state-of-the-art open model built upon datasets used for Phi-3 - synthetic data and filtered publicly available websites - with a focus on very high-quality, reasoning dense data.</p>
     </div>"""
     footer = """<div style="text-align: center; margin-top: 20px;">
         <a href="https://www.linkedin.com/in/pejman-ebrahimi-4a60151a7/" target="_blank">LinkedIn</a> |
     </div>"""
     # Gradio app with two tabs
+    with gr.Blocks(css=CSS, theme=gr.themes.Default()) as demo:  # Changed to default theme
         gr.HTML(TITLE)
         gr.HTML(EXPLANATION)
         gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")
         with gr.Tab("Chatbot"):
+            chatbot = gr.Chatbot(height=600, placeholder=PLACEHOLDER, type="messages")  # Fixed deprecated type
             gr.ChatInterface(
                 fn=stream_chat,
                 chatbot=chatbot,
                 cache_examples=False,
             )
+        # Only show vision tab if model is available
+        if vision_available:
+            with gr.Tab("Vision"):
+                with gr.Row():
+                    input_img = gr.Image(label="Input Picture")
+                with gr.Row():
+                    model_selector = gr.Dropdown(
+                        choices=list(models.keys()),
+                        label="Model",
+                        value="microsoft/Phi-3.5-vision-instruct",
+                        allow_custom_value=False  # Fixed warning
+                    )
+                with gr.Row():
+                    text_input = gr.Textbox(label="Question", value="What's in this image?")
+                with gr.Row():
+                    submit_btn = gr.Button(value="Submit")
+                with gr.Row():
+                    output_text = gr.Textbox(label="Output Text")
+                submit_btn.click(stream_vision, [input_img, text_input, model_selector], [output_text])
+        else:
+            with gr.Tab("Vision"):
+                gr.HTML("""<div style="text-align: center; padding: 40px;">
+                    <h3>Vision Model Not Available</h3>
+                    <p>The vision model failed to load. This is likely due to memory constraints on CPU.</p>
+                    <p>Try using the chat model instead, or run this on a system with more RAM.</p>
+                </div>""")
         gr.HTML(footer)
     demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
 if __name__ == "__main__":
+    print("=" * 50)
+    print("Application Starting Up...")
+    print(f"Device: {device}")
+    print(f"Chat model loaded: {MODEL_ID1}")
+    print(f"Vision model loaded: {len(models) > 0}")
+    print("=" * 50)
     # Start Flask server in a separate thread
     flask_thread = threading.Thread(target=run_flask, daemon=True)
     flask_thread.start()