Spaces:
Sleeping
Sleeping
adding max time for VLM to generate and using SmolVLM for now
Browse files- logic/handlers.py +41 -8
- ui/main_page.py +1 -1
logic/handlers.py
CHANGED
|
@@ -22,6 +22,7 @@ import cv2
|
|
| 22 |
import time
|
| 23 |
import re
|
| 24 |
import os
|
|
|
|
| 25 |
import glob
|
| 26 |
from pyuca import Collator
|
| 27 |
from pillow_heif import register_heif_opener
|
|
@@ -130,25 +131,57 @@ def validate_inputs(image, ori_img, concept): # is_blurred
|
|
| 130 |
|
| 131 |
return gr.Button("Submit", variant="primary", interactive=True), result_image, ori_img # is_blurred
|
| 132 |
|
| 133 |
-
def generate_vlm_caption(image, model_name="SmolVLM-500M"): # processor, model
|
| 134 |
"""
|
| 135 |
Generate a caption for the given image using a Vision-Language Model.
|
| 136 |
Uses the global VLMManager for efficient model loading and caching.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
"""
|
| 138 |
if image is None:
|
| 139 |
gr.Warning("⚠️ Please upload an image first.", duration=5)
|
| 140 |
return None, gr.update(visible=False), gr.update(visible=False), gr.update(interactive=True), gr.update(interactive=True)
|
| 141 |
|
| 142 |
-
|
| 143 |
-
|
| 144 |
vlm_manager.load_model(model_name)
|
| 145 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
except Exception as e:
|
| 147 |
-
print(f"Error generating caption: {e}. Try again later.")
|
| 148 |
-
gr.Warning(f"⚠️ Error generating caption: {e}. Please try again.", duration=5)
|
| 149 |
-
# vlm_manager.cleanup_memory()
|
| 150 |
return None, gr.update(visible=False), gr.update(visible=False), gr.update(interactive=True), gr.update(interactive=True)
|
| 151 |
-
finally:
|
|
|
|
| 152 |
vlm_manager.cleanup_memory()
|
| 153 |
|
| 154 |
# print(caption)
|
|
|
|
| 22 |
import time
|
| 23 |
import re
|
| 24 |
import os
|
| 25 |
+
import concurrent.futures
|
| 26 |
import glob
|
| 27 |
from pyuca import Collator
|
| 28 |
from pillow_heif import register_heif_opener
|
|
|
|
| 131 |
|
| 132 |
return gr.Button("Submit", variant="primary", interactive=True), result_image, ori_img # is_blurred
|
| 133 |
|
| 134 |
+
def generate_vlm_caption(image, model_name="SmolVLM-500M", timeout_seconds=120): # processor, model
|
| 135 |
"""
|
| 136 |
Generate a caption for the given image using a Vision-Language Model.
|
| 137 |
Uses the global VLMManager for efficient model loading and caching.
|
| 138 |
+
|
| 139 |
+
Args:
|
| 140 |
+
image: The input image
|
| 141 |
+
model_name: Name of the VLM model to use
|
| 142 |
+
timeout_seconds: Maximum time to wait for caption generation (default: 120 seconds)
|
| 143 |
"""
|
| 144 |
if image is None:
|
| 145 |
gr.Warning("⚠️ Please upload an image first.", duration=5)
|
| 146 |
return None, gr.update(visible=False), gr.update(visible=False), gr.update(interactive=True), gr.update(interactive=True)
|
| 147 |
|
| 148 |
+
def _generate_caption_with_model():
|
| 149 |
+
"""Helper function to run caption generation in a separate thread."""
|
| 150 |
vlm_manager.load_model(model_name)
|
| 151 |
+
return vlm_manager.generate_caption(image)
|
| 152 |
+
|
| 153 |
+
# Notify user that generation is starting
|
| 154 |
+
gr.Info(f"🚀 Generating caption with {model_name}... This may take up to {timeout_seconds} seconds.", duration=3)
|
| 155 |
+
|
| 156 |
+
try:
|
| 157 |
+
# Use ThreadPoolExecutor with timeout for caption generation
|
| 158 |
+
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
|
| 159 |
+
# Submit the caption generation task
|
| 160 |
+
future = executor.submit(_generate_caption_with_model)
|
| 161 |
+
|
| 162 |
+
# Track timing
|
| 163 |
+
start_time = time.time()
|
| 164 |
+
|
| 165 |
+
try:
|
| 166 |
+
# Wait for the result with timeout
|
| 167 |
+
caption = future.result(timeout=timeout_seconds)
|
| 168 |
+
elapsed_time = time.time() - start_time
|
| 169 |
+
print(f"Caption generated successfully in {elapsed_time:.1f} seconds")
|
| 170 |
+
gr.Info(f"✅ Caption generated successfully in {elapsed_time:.1f} seconds!", duration=3)
|
| 171 |
+
|
| 172 |
+
except concurrent.futures.TimeoutError:
|
| 173 |
+
# Handle timeout case
|
| 174 |
+
elapsed_time = time.time() - start_time
|
| 175 |
+
print(f"Caption generation timed out after {elapsed_time:.1f} seconds")
|
| 176 |
+
gr.Warning(f"⚠️ Caption generation timed out after {timeout_seconds} seconds. Please try again with a different model or smaller image.", duration=8)
|
| 177 |
+
return None, gr.update(visible=False), gr.update(visible=False), gr.update(interactive=True), gr.update(interactive=True)
|
| 178 |
+
|
| 179 |
except Exception as e:
|
| 180 |
+
print(f"Error generating caption: {e}. Try again later.")
|
| 181 |
+
gr.Warning(f"⚠️ Error generating caption: {e}. Please try again.", duration=5)
|
|
|
|
| 182 |
return None, gr.update(visible=False), gr.update(visible=False), gr.update(interactive=True), gr.update(interactive=True)
|
| 183 |
+
finally:
|
| 184 |
+
# For now, let's cleanup memory after each generation
|
| 185 |
vlm_manager.cleanup_memory()
|
| 186 |
|
| 187 |
# print(caption)
|
ui/main_page.py
CHANGED
|
@@ -117,7 +117,7 @@ def build_main_page(concepts_dict, metadata_dict, local_storage):
|
|
| 117 |
vlm_cancel_btn = gr.Button("Cancel")
|
| 118 |
# ["SmolVLM-500M", "Qwen2.5-VL-7B", "InternVL3_5-8B", "Gemma3-4B"]
|
| 119 |
vlm_model_dropdown = gr.Dropdown(
|
| 120 |
-
["
|
| 121 |
)
|
| 122 |
vlm_output = gr.Textbox(lines=6, label="Generated description", elem_id="vlm_output", interactive=False)
|
| 123 |
vlm_feedback = gr.Radio(["Yes 👍", "No 👎"], label="Do you think the generated description is accurate within the cultural context of your country?", visible=False, elem_id="vlm_feedback", interactive=True)
|
|
|
|
| 117 |
vlm_cancel_btn = gr.Button("Cancel")
|
| 118 |
# ["SmolVLM-500M", "Qwen2.5-VL-7B", "InternVL3_5-8B", "Gemma3-4B"]
|
| 119 |
vlm_model_dropdown = gr.Dropdown(
|
| 120 |
+
["SmolVLM-500M"], value="SmolVLM-500M", multiselect=False, label="VLM Model", info="Select the VLM model to use for generating the description."
|
| 121 |
)
|
| 122 |
vlm_output = gr.Textbox(lines=6, label="Generated description", elem_id="vlm_output", interactive=False)
|
| 123 |
vlm_feedback = gr.Radio(["Yes 👍", "No 👎"], label="Do you think the generated description is accurate within the cultural context of your country?", visible=False, elem_id="vlm_feedback", interactive=True)
|