Update mdr_pdf_parser.py
Browse files - mdr_pdf_parser.py +40 -10
mdr_pdf_parser.py
CHANGED
|
@@ -1013,7 +1013,8 @@ class _MDR_TextDetector(_MDR_PredictBase):
|
|
| 1013 |
new_boxes.append(box)
|
| 1014 |
return np.array(new_boxes)
|
| 1015 |
|
| 1016 |
-
|
|
|
|
| 1017 |
def __call__(self, img):
|
| 1018 |
ori_im = img.copy()
|
| 1019 |
data = {"image": img}
|
|
@@ -1026,14 +1027,14 @@ class _MDR_TextDetector(_MDR_PredictBase):
|
|
| 1026 |
print(f" DEBUG OCR: _MDR_TextDetector: Error during preprocessing (mdr_ocr_transform): {e_preproc}")
|
| 1027 |
import traceback
|
| 1028 |
traceback.print_exc()
|
| 1029 |
-
return np.array([])
|
| 1030 |
|
| 1031 |
if data is None:
|
| 1032 |
print(
|
| 1033 |
" DEBUG OCR: _MDR_TextDetector: Preprocessing (mdr_ocr_transform) returned None. No text will be detected.")
|
| 1034 |
return np.array([])
|
| 1035 |
|
| 1036 |
-
processed_img, shape_list = data
|
| 1037 |
if processed_img is None:
|
| 1038 |
print(" DEBUG OCR: _MDR_TextDetector: Processed image after transform is None. No text will be detected.")
|
| 1039 |
return np.array([])
|
|
@@ -1052,7 +1053,7 @@ class _MDR_TextDetector(_MDR_PredictBase):
|
|
| 1052 |
print(f" DEBUG OCR: _MDR_TextDetector: ONNX inference for detection failed: {e_infer}")
|
| 1053 |
import traceback
|
| 1054 |
traceback.print_exc()
|
| 1055 |
-
return np.array([])
|
| 1056 |
print(f" DEBUG OCR: _MDR_TextDetector: ONNX inference done. Output map shape: {outputs[0].shape}")
|
| 1057 |
|
| 1058 |
preds = {"maps": outputs[0]}
|
|
@@ -1064,17 +1065,46 @@ class _MDR_TextDetector(_MDR_PredictBase):
|
|
| 1064 |
traceback.print_exc()
|
| 1065 |
return np.array([])
|
| 1066 |
|
| 1067 |
-
|
| 1068 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1069 |
return np.array([])
|
|
|
|
| 1070 |
|
| 1071 |
-
boxes_from_post = post_res[0]['points']
|
| 1072 |
print(
|
| 1073 |
f" DEBUG OCR: _MDR_TextDetector: Boxes from DBPostProcess before final filtering: {len(boxes_from_post)}")
|
| 1074 |
|
| 1075 |
-
|
| 1076 |
-
|
| 1077 |
-
|
|
|
|
|
|
|
|
|
|
| 1078 |
return np.array([])
|
| 1079 |
|
| 1080 |
if self.args.det_box_type == 'poly':
|
|
|
|
| 1013 |
new_boxes.append(box)
|
| 1014 |
return np.array(new_boxes)
|
| 1015 |
|
| 1016 |
+
# In class _MDR_TextDetector:
|
| 1017 |
+
|
| 1018 |
def __call__(self, img):
|
| 1019 |
ori_im = img.copy()
|
| 1020 |
data = {"image": img}
|
|
|
|
| 1027 |
print(f" DEBUG OCR: _MDR_TextDetector: Error during preprocessing (mdr_ocr_transform): {e_preproc}")
|
| 1028 |
import traceback
|
| 1029 |
traceback.print_exc()
|
| 1030 |
+
return np.array([])
|
| 1031 |
|
| 1032 |
if data is None:
|
| 1033 |
print(
|
| 1034 |
" DEBUG OCR: _MDR_TextDetector: Preprocessing (mdr_ocr_transform) returned None. No text will be detected.")
|
| 1035 |
return np.array([])
|
| 1036 |
|
| 1037 |
+
processed_img, shape_list = data
|
| 1038 |
if processed_img is None:
|
| 1039 |
print(" DEBUG OCR: _MDR_TextDetector: Processed image after transform is None. No text will be detected.")
|
| 1040 |
return np.array([])
|
|
|
|
| 1053 |
print(f" DEBUG OCR: _MDR_TextDetector: ONNX inference for detection failed: {e_infer}")
|
| 1054 |
import traceback
|
| 1055 |
traceback.print_exc()
|
| 1056 |
+
return np.array([])
|
| 1057 |
print(f" DEBUG OCR: _MDR_TextDetector: ONNX inference done. Output map shape: {outputs[0].shape}")
|
| 1058 |
|
| 1059 |
preds = {"maps": outputs[0]}
|
|
|
|
| 1065 |
traceback.print_exc()
|
| 1066 |
return np.array([])
|
| 1067 |
|
| 1068 |
+
# --- START: REFINED CHECK ---
|
| 1069 |
+
# 1. Check if post_res itself is valid and contains the expected structure.
|
| 1070 |
+
if not post_res or not isinstance(post_res, list) or len(post_res) == 0 or \
|
| 1071 |
+
not isinstance(post_res[0], dict) or 'points' not in post_res[0]:
|
| 1072 |
+
print(" DEBUG OCR: _MDR_TextDetector: DBPostProcess returned invalid or empty structure for points.")
|
| 1073 |
+
return np.array([])
|
| 1074 |
+
|
| 1075 |
+
boxes_from_post = post_res[0]['points'] # This is expected to be a np.ndarray or a list of boxes
|
| 1076 |
+
|
| 1077 |
+
# 2. Check if boxes_from_post is actually empty.
|
| 1078 |
+
# For a NumPy array, check its size. For a list, check if it's empty.
|
| 1079 |
+
no_boxes_found = False
|
| 1080 |
+
if isinstance(boxes_from_post, np.ndarray):
|
| 1081 |
+
if boxes_from_post.size == 0:
|
| 1082 |
+
no_boxes_found = True
|
| 1083 |
+
elif isinstance(boxes_from_post, list):
|
| 1084 |
+
if not boxes_from_post: # Empty list
|
| 1085 |
+
no_boxes_found = True
|
| 1086 |
+
elif boxes_from_post is None: # Explicitly check for None
|
| 1087 |
+
no_boxes_found = True
|
| 1088 |
+
else:
|
| 1089 |
+
# Should not happen if _MDR_DBPostProcess behaves as expected, but good to log
|
| 1090 |
+
print(
|
| 1091 |
+
f" DEBUG OCR: _MDR_TextDetector: 'points' from DBPostProcess is of unexpected type: {type(boxes_from_post)}")
|
| 1092 |
+
return np.array([])
|
| 1093 |
+
|
| 1094 |
+
if no_boxes_found:
|
| 1095 |
+
print(" DEBUG OCR: _MDR_TextDetector: DBPostProcess returned no actual point data.")
|
| 1096 |
return np.array([])
|
| 1097 |
+
# --- END: REFINED CHECK ---
|
| 1098 |
|
|
|
|
| 1099 |
print(
|
| 1100 |
f" DEBUG OCR: _MDR_TextDetector: Boxes from DBPostProcess before final filtering: {len(boxes_from_post)}")
|
| 1101 |
|
| 1102 |
+
# The refined check above has already returned for None, empty, and unexpected-type values,
|
| 1103 |
+
# so this secondary check cannot trigger at this point; it is kept purely as a defensive guard.
|
| 1104 |
+
if not isinstance(boxes_from_post, (list, np.ndarray)) or \
|
| 1105 |
+
(isinstance(boxes_from_post, np.ndarray) and boxes_from_post.size == 0) or \
|
| 1106 |
+
(isinstance(boxes_from_post, list) and not boxes_from_post):
|
| 1107 |
+
print(" DEBUG OCR: _MDR_TextDetector: No boxes from DBPostProcess to filter (secondary check).")
|
| 1108 |
return np.array([])
|
| 1109 |
|
| 1110 |
if self.args.det_box_type == 'poly':
|