Spaces:
Sleeping
Sleeping
Update working_yolo_pipeline.py
Browse files
- working_yolo_pipeline.py +16 -3
working_yolo_pipeline.py
CHANGED
|
@@ -1390,12 +1390,25 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
|
|
| 1390 |
# ====================================================================
|
| 1391 |
items_to_sort = []
|
| 1392 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1393 |
for ocr_word in raw_ocr_output:
|
| 1394 |
is_suppressed = False
|
| 1395 |
for component in component_metadata:
|
| 1396 |
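-
# Do not include words that are inside figure/equation boxes
|
| 1397 |
-
ioa = calculate_ioa(ocr_word['bbox'], component['bbox'])
|
| 1398 |
-
if ioa > IOA_SUPPRESSION_THRESHOLD: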
|
|
|
|
|
|
|
| 1399 |
is_suppressed = True
|
| 1400 |
break
|
| 1401 |
if not is_suppressed:
|
|
|
|
| 1390 |
# ====================================================================
|
| 1391 |
items_to_sort = []
|
| 1392 |
|
| 1393 |
+
# for ocr_word in raw_ocr_output:
|
| 1394 |
+
# is_suppressed = False
|
| 1395 |
+
# for component in component_metadata:
|
| 1396 |
+
# # Do not include words that are inside figure/equation boxes
|
| 1397 |
+
# ioa = calculate_ioa(ocr_word['bbox'], component['bbox'])
|
| 1398 |
+
# if ioa > IOA_SUPPRESSION_THRESHOLD:
|
| 1399 |
+
# is_suppressed = True
|
| 1400 |
+
# break
|
| 1401 |
+
# if not is_suppressed:
|
| 1402 |
+
# items_to_sort.append(ocr_word)
|
| 1403 |
+
|
| 1404 |
for ocr_word in raw_ocr_output:
|
| 1405 |
is_suppressed = False
|
| 1406 |
for component in component_metadata:
|
| 1407 |
+
ioa_ocr_in_eq = calculate_ioa(ocr_word['bbox'], component['bbox'])
|
| 1408 |
+
ioa_eq_in_ocr = calculate_ioa(component['bbox'], ocr_word['bbox'])
|
| 1409 |
+
# Suppress if either: OCR word is mostly inside equation box,
|
| 1410 |
+
# OR equation box is mostly inside OCR word box
|
| 1411 |
+
if ioa_ocr_in_eq > IOA_SUPPRESSION_THRESHOLD or ioa_eq_in_ocr > IOA_SUPPRESSION_THRESHOLD:
|
| 1412 |
is_suppressed = True
|
| 1413 |
break
|
| 1414 |
if not is_suppressed:
|