Spaces:
Sleeping
Sleeping
Update utils.py
Browse files
utils.py
CHANGED
|
@@ -91,14 +91,17 @@ def get_yolo_model(model_path):
|
|
| 91 |
|
| 92 |
@torch.inference_mode()
|
| 93 |
def get_parsed_content_icon(filtered_boxes, starting_idx, image_source, caption_model_processor, prompt=None, batch_size=32):
|
| 94 |
-
#
|
|
|
|
|
|
|
|
|
|
| 95 |
to_pil = ToPILImage()
|
| 96 |
if starting_idx:
|
| 97 |
non_ocr_boxes = filtered_boxes[starting_idx:]
|
| 98 |
else:
|
| 99 |
non_ocr_boxes = filtered_boxes
|
| 100 |
cropped_pil_images = []
|
| 101 |
-
for
|
| 102 |
xmin, xmax = int(coord[0] * image_source.shape[1]), int(coord[2] * image_source.shape[1])
|
| 103 |
ymin, ymax = int(coord[1] * image_source.shape[0]), int(coord[3] * image_source.shape[0])
|
| 104 |
cropped_image = image_source[ymin:ymax, xmin:xmax, :]
|
|
@@ -144,6 +147,7 @@ def get_parsed_content_icon(filtered_boxes, starting_idx, image_source, caption_
|
|
| 144 |
|
| 145 |
|
| 146 |
|
|
|
|
| 147 |
def get_parsed_content_icon_phi3v(filtered_boxes, ocr_bbox, image_source, caption_model_processor):
|
| 148 |
"""
|
| 149 |
Generates parsed textual content for detected icons using the phi3_v model variant.
|
|
@@ -507,6 +511,9 @@ def get_som_labeled_img(img_path, model=None, BOX_TRESHOLD=0.01, output_coord_in
|
|
| 507 |
starting_idx = next((i for i, box in enumerate(filtered_boxes_elem) if box['content'] is None), -1)
|
| 508 |
filtered_boxes_tensor = torch.tensor([box['bbox'] for box in filtered_boxes_elem])
|
| 509 |
|
|
|
|
|
|
|
|
|
|
| 510 |
# Generate parsed icon semantics if required
|
| 511 |
if use_local_semantics:
|
| 512 |
caption_model = caption_model_processor['model']
|
|
|
|
| 91 |
|
| 92 |
@torch.inference_mode()
|
| 93 |
def get_parsed_content_icon(filtered_boxes, starting_idx, image_source, caption_model_processor, prompt=None, batch_size=32):
|
| 94 |
+
# Ensure batch_size is an integer
|
| 95 |
+
if batch_size is None:
|
| 96 |
+
batch_size = 32
|
| 97 |
+
|
| 98 |
to_pil = ToPILImage()
|
| 99 |
if starting_idx:
|
| 100 |
non_ocr_boxes = filtered_boxes[starting_idx:]
|
| 101 |
else:
|
| 102 |
non_ocr_boxes = filtered_boxes
|
| 103 |
cropped_pil_images = []
|
| 104 |
+
for coord in non_ocr_boxes:
|
| 105 |
xmin, xmax = int(coord[0] * image_source.shape[1]), int(coord[2] * image_source.shape[1])
|
| 106 |
ymin, ymax = int(coord[1] * image_source.shape[0]), int(coord[3] * image_source.shape[0])
|
| 107 |
cropped_image = image_source[ymin:ymax, xmin:xmax, :]
|
|
|
|
| 147 |
|
| 148 |
|
| 149 |
|
| 150 |
+
|
| 151 |
def get_parsed_content_icon_phi3v(filtered_boxes, ocr_bbox, image_source, caption_model_processor):
|
| 152 |
"""
|
| 153 |
Generates parsed textual content for detected icons using the phi3_v model variant.
|
|
|
|
| 511 |
starting_idx = next((i for i, box in enumerate(filtered_boxes_elem) if box['content'] is None), -1)
|
| 512 |
filtered_boxes_tensor = torch.tensor([box['bbox'] for box in filtered_boxes_elem])
|
| 513 |
|
| 514 |
+
if batch_size is None:
|
| 515 |
+
batch_size = 32
|
| 516 |
+
|
| 517 |
# Generate parsed icon semantics if required
|
| 518 |
if use_local_semantics:
|
| 519 |
caption_model = caption_model_processor['model']
|