Spaces:
Sleeping
Sleeping
adding max time for VLM to generate and using SmolVLM for now
Browse files- logic/handlers.py +41 -8
- ui/main_page.py +1 -1
logic/handlers.py
CHANGED
|
@@ -22,6 +22,7 @@ import cv2
|
|
| 22 |
import time
|
| 23 |
import re
|
| 24 |
import os
|
|
|
|
| 25 |
import glob
|
| 26 |
from pyuca import Collator
|
| 27 |
from pillow_heif import register_heif_opener
|
|
@@ -130,25 +131,57 @@ def validate_inputs(image, ori_img, concept): # is_blurred
|
|
| 130 |
|
| 131 |
return gr.Button("Submit", variant="primary", interactive=True), result_image, ori_img # is_blurred
|
| 132 |
|
| 133 |
-
def generate_vlm_caption(image, model_name="SmolVLM-500M"): # processor, model
|
| 134 |
"""
|
| 135 |
Generate a caption for the given image using a Vision-Language Model.
|
| 136 |
Uses the global VLMManager for efficient model loading and caching.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
"""
|
| 138 |
if image is None:
|
| 139 |
gr.Warning("⚠️ Please upload an image first.", duration=5)
|
| 140 |
return None, gr.update(visible=False), gr.update(visible=False), gr.update(interactive=True), gr.update(interactive=True)
|
| 141 |
|
| 142 |
-
|
| 143 |
-
|
| 144 |
vlm_manager.load_model(model_name)
|
| 145 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
except Exception as e:
|
| 147 |
-
print(f"Error generating caption: {e}. Try again later.")
|
| 148 |
-
gr.Warning(f"⚠️ Error generating caption: {e}. Please try again.", duration=5)
|
| 149 |
-
# vlm_manager.cleanup_memory()
|
| 150 |
return None, gr.update(visible=False), gr.update(visible=False), gr.update(interactive=True), gr.update(interactive=True)
|
| 151 |
-
finally:
|
|
|
|
| 152 |
vlm_manager.cleanup_memory()
|
| 153 |
|
| 154 |
# print(caption)
|
|
|
|
| 22 |
import time
|
| 23 |
import re
|
| 24 |
import os
|
| 25 |
+
import concurrent.futures
|
| 26 |
import glob
|
| 27 |
from pyuca import Collator
|
| 28 |
from pillow_heif import register_heif_opener
|
|
|
|
| 131 |
|
| 132 |
return gr.Button("Submit", variant="primary", interactive=True), result_image, ori_img # is_blurred
|
| 133 |
|
| 134 |
+
def generate_vlm_caption(image, model_name="SmolVLM-500M", timeout_seconds=120): # processor, model
|
| 135 |
"""
|
| 136 |
Generate a caption for the given image using a Vision-Language Model.
|
| 137 |
Uses the global VLMManager for efficient model loading and caching.
|
| 138 |
+
|
| 139 |
+
Args:
|
| 140 |
+
image: The input image
|
| 141 |
+
model_name: Name of the VLM model to use
|
| 142 |
+
timeout_seconds: Maximum time to wait for caption generation (default: 120 seconds)
|
| 143 |
"""
|
| 144 |
if image is None:
|
| 145 |
gr.Warning("⚠️ Please upload an image first.", duration=5)
|
| 146 |
return None, gr.update(visible=False), gr.update(visible=False), gr.update(interactive=True), gr.update(interactive=True)
|
| 147 |
|
| 148 |
+
def _generate_caption_with_model():
|
| 149 |
+
"""Helper function to run caption generation in a separate thread."""
|
| 150 |
vlm_manager.load_model(model_name)
|
| 151 |
+
return vlm_manager.generate_caption(image)
|
| 152 |
+
|
| 153 |
+
# Notify user that generation is starting
|
| 154 |
+
gr.Info(f"🚀 Generating caption with {model_name}... This may take up to {timeout_seconds} seconds.", duration=3)
|
| 155 |
+
|
| 156 |
+
try:
|
| 157 |
+
# Use ThreadPoolExecutor with timeout for caption generation
|
| 158 |
+
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
|
| 159 |
+
# Submit the caption generation task
|
| 160 |
+
future = executor.submit(_generate_caption_with_model)
|
| 161 |
+
|
| 162 |
+
# Track timing
|
| 163 |
+
start_time = time.time()
|
| 164 |
+
|
| 165 |
+
try:
|
| 166 |
+
# Wait for the result with timeout
|
| 167 |
+
caption = future.result(timeout=timeout_seconds)
|
| 168 |
+
elapsed_time = time.time() - start_time
|
| 169 |
+
print(f"Caption generated successfully in {elapsed_time:.1f} seconds")
|
| 170 |
+
gr.Info(f"✅ Caption generated successfully in {elapsed_time:.1f} seconds!", duration=3)
|
| 171 |
+
|
| 172 |
+
except concurrent.futures.TimeoutError:
|
| 173 |
+
# Handle timeout case
|
| 174 |
+
elapsed_time = time.time() - start_time
|
| 175 |
+
print(f"Caption generation timed out after {elapsed_time:.1f} seconds")
|
| 176 |
+
gr.Warning(f"⚠️ Caption generation timed out after {timeout_seconds} seconds. Please try again with a different model or smaller image.", duration=8)
|
| 177 |
+
return None, gr.update(visible=False), gr.update(visible=False), gr.update(interactive=True), gr.update(interactive=True)
|
| 178 |
+
|
| 179 |
except Exception as e:
|
| 180 |
+
print(f"Error generating caption: {e}. Try again later.")
|
| 181 |
+
gr.Warning(f"⚠️ Error generating caption: {e}. Please try again.", duration=5)
|
|
|
|
| 182 |
return None, gr.update(visible=False), gr.update(visible=False), gr.update(interactive=True), gr.update(interactive=True)
|
| 183 |
+
finally:
|
| 184 |
+
# For now, let's cleanup memory after each generation
|
| 185 |
vlm_manager.cleanup_memory()
|
| 186 |
|
| 187 |
# print(caption)
|
ui/main_page.py
CHANGED
|
@@ -117,7 +117,7 @@ def build_main_page(concepts_dict, metadata_dict, local_storage):
|
|
| 117 |
vlm_cancel_btn = gr.Button("Cancel")
|
| 118 |
# ["SmolVLM-500M", "Qwen2.5-VL-7B", "InternVL3_5-8B", "Gemma3-4B"]
|
| 119 |
vlm_model_dropdown = gr.Dropdown(
|
| 120 |
-
["
|
| 121 |
)
|
| 122 |
vlm_output = gr.Textbox(lines=6, label="Generated description", elem_id="vlm_output", interactive=False)
|
| 123 |
vlm_feedback = gr.Radio(["Yes 👍", "No 👎"], label="Do you think the generated description is accurate within the cultural context of your country?", visible=False, elem_id="vlm_feedback", interactive=True)
|
|
|
|
| 117 |
vlm_cancel_btn = gr.Button("Cancel")
|
| 118 |
# ["SmolVLM-500M", "Qwen2.5-VL-7B", "InternVL3_5-8B", "Gemma3-4B"]
|
| 119 |
vlm_model_dropdown = gr.Dropdown(
|
| 120 |
+
["SmolVLM-500M"], value="SmolVLM-500M", multiselect=False, label="VLM Model", info="Select the VLM model to use for generating the description."
|
| 121 |
)
|
| 122 |
vlm_output = gr.Textbox(lines=6, label="Generated description", elem_id="vlm_output", interactive=False)
|
| 123 |
vlm_feedback = gr.Radio(["Yes 👍", "No 👎"], label="Do you think the generated description is accurate within the cultural context of your country?", visible=False, elem_id="vlm_feedback", interactive=True)
|