Spaces:

banao-tech
/

omniapi

Sleeping

App Files Community

banao-tech committed on Feb 4

Commit

e15d2e5

verified ·

1 Parent(s): 712f1db

Update utils.py

Browse files

Files changed (1) hide show

utils.py +5 -17

utils.py CHANGED Viewed

@@ -91,32 +91,19 @@ def get_yolo_model(model_path):
 @torch.inference_mode()
 def get_parsed_content_icon(filtered_boxes, starting_idx, image_source, caption_model_processor, prompt=None, batch_size=32):
-    """
-    Generates parsed textual content for detected icons from the image.
-    Args:
-        filtered_boxes: Tensor of bounding boxes.
-        starting_idx: Starting index for non-OCR boxes.
-        image_source: Original image as a NumPy array.
-        caption_model_processor: Dictionary with keys 'model' and 'processor'.
-        prompt: Optional prompt text.
-        batch_size: Batch size for processing.
-    Returns:
-        List of generated texts.
-    """
     to_pil = ToPILImage()
     if starting_idx:
         non_ocr_boxes = filtered_boxes[starting_idx:]
     else:
         non_ocr_boxes = filtered_boxes
     cropped_pil_images = []
-    for coord in non_ocr_boxes:
         xmin, xmax = int(coord[0] * image_source.shape[1]), int(coord[2] * image_source.shape[1])
         ymin, ymax = int(coord[1] * image_source.shape[0]), int(coord[3] * image_source.shape[0])
         cropped_image = image_source[ymin:ymax, xmin:xmax, :]
         cropped_pil_images.append(to_pil(cropped_image))
     model, processor = caption_model_processor['model'], caption_model_processor['processor']
     if not prompt:
         if 'florence' in model.config.name_or_path:
@@ -127,7 +114,7 @@ def get_parsed_content_icon(filtered_boxes, starting_idx, image_source, caption_
     generated_texts = []
     device = model.device
     for i in range(0, len(cropped_pil_images), batch_size):
-        batch = cropped_pil_images[i:i+batch_size]
         if model.device.type == 'cuda':
             inputs = processor(images=batch, text=[prompt] * len(batch), return_tensors="pt").to(device=device, dtype=torch.float16)
         else:
@@ -156,6 +143,7 @@ def get_parsed_content_icon(filtered_boxes, starting_idx, image_source, caption_
     return generated_texts
 def get_parsed_content_icon_phi3v(filtered_boxes, ocr_bbox, image_source, caption_model_processor):
     """
     Generates parsed textual content for detected icons using the phi3_v model variant.

 @torch.inference_mode()
 def get_parsed_content_icon(filtered_boxes, starting_idx, image_source, caption_model_processor, prompt=None, batch_size=32):
+    # Now batch_size defaults to 32 if not provided
     to_pil = ToPILImage()
     if starting_idx:
         non_ocr_boxes = filtered_boxes[starting_idx:]
     else:
         non_ocr_boxes = filtered_boxes
     cropped_pil_images = []
+    for i, coord in enumerate(non_ocr_boxes):
         xmin, xmax = int(coord[0] * image_source.shape[1]), int(coord[2] * image_source.shape[1])
         ymin, ymax = int(coord[1] * image_source.shape[0]), int(coord[3] * image_source.shape[0])
         cropped_image = image_source[ymin:ymax, xmin:xmax, :]
         cropped_pil_images.append(to_pil(cropped_image))
     model, processor = caption_model_processor['model'], caption_model_processor['processor']
     if not prompt:
         if 'florence' in model.config.name_or_path:
     generated_texts = []
     device = model.device
     for i in range(0, len(cropped_pil_images), batch_size):
+        batch = cropped_pil_images[i:i + batch_size]
         if model.device.type == 'cuda':
             inputs = processor(images=batch, text=[prompt] * len(batch), return_tensors="pt").to(device=device, dtype=torch.float16)
         else:
     return generated_texts
 def get_parsed_content_icon_phi3v(filtered_boxes, ocr_bbox, image_source, caption_model_processor):
     """
     Generates parsed textual content for detected icons using the phi3_v model variant.