Spaces:

banao-tech
/

omniapi

Sleeping

banao-tech committed on Feb 4

Commit

27aaaa5

verified ·

1 Parent(s): e15d2e5

Update utils.py

Files changed (1) hide show

utils.py CHANGED Viewed

@@ -91,14 +91,17 @@ def get_yolo_model(model_path):
 @torch.inference_mode()
 def get_parsed_content_icon(filtered_boxes, starting_idx, image_source, caption_model_processor, prompt=None, batch_size=32):
-    # Now batch_size defaults to 32 if not provided
     to_pil = ToPILImage()
     if starting_idx:
         non_ocr_boxes = filtered_boxes[starting_idx:]
     else:
         non_ocr_boxes = filtered_boxes
     cropped_pil_images = []
-    for i, coord in enumerate(non_ocr_boxes):
         xmin, xmax = int(coord[0] * image_source.shape[1]), int(coord[2] * image_source.shape[1])
         ymin, ymax = int(coord[1] * image_source.shape[0]), int(coord[3] * image_source.shape[0])
         cropped_image = image_source[ymin:ymax, xmin:xmax, :]
@@ -144,6 +147,7 @@ def get_parsed_content_icon(filtered_boxes, starting_idx, image_source, caption_
 def get_parsed_content_icon_phi3v(filtered_boxes, ocr_bbox, image_source, caption_model_processor):
     """
     Generates parsed textual content for detected icons using the phi3_v model variant.
@@ -507,6 +511,9 @@ def get_som_labeled_img(img_path, model=None, BOX_TRESHOLD=0.01, output_coord_in
     starting_idx = next((i for i, box in enumerate(filtered_boxes_elem) if box['content'] is None), -1)
     filtered_boxes_tensor = torch.tensor([box['bbox'] for box in filtered_boxes_elem])
     # Generate parsed icon semantics if required
     if use_local_semantics:
         caption_model = caption_model_processor['model']

 @torch.inference_mode()
 def get_parsed_content_icon(filtered_boxes, starting_idx, image_source, caption_model_processor, prompt=None, batch_size=32):
+    # Ensure batch_size is an integer
+    if batch_size is None:
+        batch_size = 32
     to_pil = ToPILImage()
     if starting_idx:
         non_ocr_boxes = filtered_boxes[starting_idx:]
     else:
         non_ocr_boxes = filtered_boxes
     cropped_pil_images = []
+    for coord in non_ocr_boxes:
         xmin, xmax = int(coord[0] * image_source.shape[1]), int(coord[2] * image_source.shape[1])
         ymin, ymax = int(coord[1] * image_source.shape[0]), int(coord[3] * image_source.shape[0])
         cropped_image = image_source[ymin:ymax, xmin:xmax, :]
 def get_parsed_content_icon_phi3v(filtered_boxes, ocr_bbox, image_source, caption_model_processor):
     """
     Generates parsed textual content for detected icons using the phi3_v model variant.
     starting_idx = next((i for i, box in enumerate(filtered_boxes_elem) if box['content'] is None), -1)
     filtered_boxes_tensor = torch.tensor([box['bbox'] for box in filtered_boxes_elem])
+    if batch_size is None:
+        batch_size = 32
     # Generate parsed icon semantics if required
     if use_local_semantics:
         caption_model = caption_model_processor['model']