carlosh93 committed
Commit 01c257b · 1 Parent(s): 3965228

Add a max generation time for the VLM and use SmolVLM for now

Files changed (2)
  1. logic/handlers.py +41 -8
  2. ui/main_page.py +1 -1
logic/handlers.py CHANGED
@@ -22,6 +22,7 @@ import cv2
 import time
 import re
 import os
+import concurrent.futures
 import glob
 from pyuca import Collator
 from pillow_heif import register_heif_opener
@@ -130,25 +131,57 @@ def validate_inputs(image, ori_img, concept): # is_blurred
 
     return gr.Button("Submit", variant="primary", interactive=True), result_image, ori_img # is_blurred
 
-def generate_vlm_caption(image, model_name="SmolVLM-500M"): # processor, model
+def generate_vlm_caption(image, model_name="SmolVLM-500M", timeout_seconds=120): # processor, model
     """
     Generate a caption for the given image using a Vision-Language Model.
     Uses the global VLMManager for efficient model loading and caching.
+
+    Args:
+        image: The input image
+        model_name: Name of the VLM model to use
+        timeout_seconds: Maximum time to wait for caption generation (default: 120 seconds)
     """
     if image is None:
         gr.Warning("⚠️ Please upload an image first.", duration=5)
         return None, gr.update(visible=False), gr.update(visible=False), gr.update(interactive=True), gr.update(interactive=True)
 
-    try:
-        # Use the global VLMManager to load/get the model
+    def _generate_caption_with_model():
+        """Helper function to run caption generation in a separate thread."""
         vlm_manager.load_model(model_name)
-        caption = vlm_manager.generate_caption(image)
+        return vlm_manager.generate_caption(image)
+
+    # Notify the user that generation is starting
+    gr.Info(f"πŸ”„ Generating caption with {model_name}... This may take up to {timeout_seconds} seconds.", duration=3)
+
+    try:
+        # Use ThreadPoolExecutor with a timeout for caption generation
+        with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+            # Submit the caption generation task
+            future = executor.submit(_generate_caption_with_model)
+
+            # Track timing
+            start_time = time.time()
+
+            try:
+                # Wait for the result with a timeout
+                caption = future.result(timeout=timeout_seconds)
+                elapsed_time = time.time() - start_time
+                print(f"Caption generated successfully in {elapsed_time:.1f} seconds")
+                gr.Info(f"βœ… Caption generated successfully in {elapsed_time:.1f} seconds!", duration=3)
+
+            except concurrent.futures.TimeoutError:
+                # Handle the timeout case
+                elapsed_time = time.time() - start_time
+                print(f"Caption generation timed out after {elapsed_time:.1f} seconds")
+                gr.Warning(f"⚠️ Caption generation timed out after {timeout_seconds} seconds. Please try again with a different model or a smaller image.", duration=8)
+                return None, gr.update(visible=False), gr.update(visible=False), gr.update(interactive=True), gr.update(interactive=True)
+
     except Exception as e:
-        print(f"Error generating caption: {e}. Cleaning up memory and try again.")
-        gr.Warning(f"⚠️ Error generating caption: {e} due to memory issues. Please try again.", duration=5)
-        # vlm_manager.cleanup_memory()
+        print(f"Error generating caption: {e}. Try again later.")
+        gr.Warning(f"⚠️ Error generating caption: {e}. Please try again.", duration=5)
         return None, gr.update(visible=False), gr.update(visible=False), gr.update(interactive=True), gr.update(interactive=True)
-    finally: # For now, let's cleanup memory after each generation
+    finally:
+        # For now, let's clean up memory after each generation
        vlm_manager.cleanup_memory()
 
    # print(caption)
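
One behavior of this hunk worth flagging: `ThreadPoolExecutor` cannot interrupt a task that is already running, and leaving the `with` block calls `shutdown(wait=True)`, so after a `TimeoutError` the handler still blocks until `generate_caption` actually returns. Below is a minimal standalone sketch, not part of this commit (`slow_task` stands in for the VLM call), of a variant that returns control to the caller immediately after the timeout:

```python
import concurrent.futures
import time

def slow_task():
    time.sleep(10)  # stand-in for vlm_manager.generate_caption(image)
    return "caption"

# Manage the executor manually instead of using a `with` block, so we can
# choose not to wait for the still-running worker after a timeout.
executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
future = executor.submit(slow_task)
try:
    caption = future.result(timeout=2)
except concurrent.futures.TimeoutError:
    caption = None  # report the timeout to the user right away
finally:
    # wait=False: do not block on the still-running worker;
    # cancel_futures=True (Python 3.9+) drops queued-but-unstarted tasks.
    executor.shutdown(wait=False, cancel_futures=True)
print(caption)  # printed immediately; the orphaned thread finishes in the background
```

Note the trade-off: the orphaned worker keeps holding model memory until it finishes, and the interpreter still joins executor threads at exit, so this only makes the UI responsive; it does not actually cancel the generation.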
ui/main_page.py CHANGED
@@ -117,7 +117,7 @@ def build_main_page(concepts_dict, metadata_dict, local_storage):
     vlm_cancel_btn = gr.Button("Cancel")
     # ["SmolVLM-500M", "Qwen2.5-VL-7B", "InternVL3_5-8B", "Gemma3-4B"]
     vlm_model_dropdown = gr.Dropdown(
-        ["InternVL3_5-8B", "Gemma3-4B"], value="Gemma3-4B", multiselect=False, label="VLM Model", info="Select the VLM model to use for generating the description."
+        ["SmolVLM-500M"], value="SmolVLM-500M", multiselect=False, label="VLM Model", info="Select the VLM model to use for generating the description."
     )
     vlm_output = gr.Textbox(lines=6, label="Generated description", elem_id="vlm_output", interactive=False)
     vlm_feedback = gr.Radio(["Yes πŸ‘", "No πŸ‘Ž"], label="Do you think the generated description is accurate within the cultural context of your country?", visible=False, elem_id="vlm_feedback", interactive=True)
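
For context on how the narrowed dropdown reaches the handler: presumably the dropdown is passed as an input to the generate event, so its value becomes the `model_name` argument of `generate_vlm_caption`. A hypothetical sketch of that wiring (the button and component names here are assumptions, not the repo's actual identifiers):

```python
# Hypothetical wiring; generate_vlm_caption returns five values
# (caption plus four gr.update()s), so five output components are expected.
vlm_generate_btn.click(
    fn=generate_vlm_caption,
    inputs=[input_image, vlm_model_dropdown],  # dropdown value -> model_name
    outputs=[vlm_output, vlm_feedback, vlm_feedback_note, vlm_generate_btn, vlm_cancel_btn],
)
```

Under that assumption, restricting the choices to `["SmolVLM-500M"]` also pins which model the backend loads, which matches the commit message's "using SmolVLM for now".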