heerjtdev committed on
Commit
0414041
·
verified ·
1 Parent(s): 1970d0f

Update working_yolo_pipeline.py

Browse files
Files changed (1) hide show
  1. working_yolo_pipeline.py +16 -3
working_yolo_pipeline.py CHANGED
@@ -1390,12 +1390,25 @@ def preprocess_and_ocr_page(original_img: np.ndarray, model, pdf_path: str,
1390
  # ====================================================================
1391
  items_to_sort = []
1392
 
 
 
 
 
 
 
 
 
 
 
 
1393
  for ocr_word in raw_ocr_output:
1394
  is_suppressed = False
1395
  for component in component_metadata:
1396
- # Do not include words that are inside figure/equation boxes
1397
- ioa = calculate_ioa(ocr_word['bbox'], component['bbox'])
1398
- if ioa > IOA_SUPPRESSION_THRESHOLD:
 
 
1399
  is_suppressed = True
1400
  break
1401
  if not is_suppressed:
 
1390
  # ====================================================================
1391
  items_to_sort = []
1392
 
1393
+ # for ocr_word in raw_ocr_output:
1394
+ # is_suppressed = False
1395
+ # for component in component_metadata:
1396
+ # # Do not include words that are inside figure/equation boxes
1397
+ # ioa = calculate_ioa(ocr_word['bbox'], component['bbox'])
1398
+ # if ioa > IOA_SUPPRESSION_THRESHOLD:
1399
+ # is_suppressed = True
1400
+ # break
1401
+ # if not is_suppressed:
1402
+ # items_to_sort.append(ocr_word)
1403
+
1404
  for ocr_word in raw_ocr_output:
1405
  is_suppressed = False
1406
  for component in component_metadata:
1407
+ ioa_ocr_in_eq = calculate_ioa(ocr_word['bbox'], component['bbox'])
1408
+ ioa_eq_in_ocr = calculate_ioa(component['bbox'], ocr_word['bbox'])
1409
+ # Suppress if either: OCR word is mostly inside equation box,
1410
+ # OR equation box is mostly inside OCR word box
1411
+ if ioa_ocr_in_eq > IOA_SUPPRESSION_THRESHOLD or ioa_eq_in_ocr > IOA_SUPPRESSION_THRESHOLD:
1412
  is_suppressed = True
1413
  break
1414
  if not is_suppressed: