Update mdr_pdf_parser.py
Browse files- mdr_pdf_parser.py +26 -9
mdr_pdf_parser.py
CHANGED
|
@@ -1417,6 +1417,7 @@ class _MDR_TextSystem:
|
|
| 1417 |
return [], []
|
| 1418 |
|
| 1419 |
img_crop_list: list[np.ndarray] = []
|
|
|
|
| 1420 |
for i in range(len(dt_boxes_sorted)):
|
| 1421 |
crop_im = mdr_get_rotated_crop(ori_im, dt_boxes_sorted[i])
|
| 1422 |
# Ensure crop_im is not empty or too small before adding
|
|
@@ -1807,8 +1808,8 @@ class _MDR_ONNXParams:
|
|
| 1807 |
|
| 1808 |
# Attributes with default values (Group 2 - Detection)
|
| 1809 |
det_algorithm: str = "DB"
|
| 1810 |
-
det_limit_side_len: int =
|
| 1811 |
-
det_limit_type: str = '
|
| 1812 |
det_db_thresh: float = 0.3
|
| 1813 |
det_db_box_thresh: float = 0.6
|
| 1814 |
det_db_unclip_ratio: float = 1.5
|
|
@@ -1857,9 +1858,10 @@ class MDROcrEngine:
|
|
| 1857 |
rec_model_dir=paths["rec"],
|
| 1858 |
rec_char_dict_path=paths["keys"],
|
| 1859 |
# much lower thresholds so we actually get some candidate masks:
|
| 1860 |
-
det_db_thresh=0.
|
| 1861 |
-
det_db_box_thresh=0.
|
| 1862 |
-
|
|
|
|
| 1863 |
use_angle_cls=False,
|
| 1864 |
)
|
| 1865 |
try:
|
|
@@ -2094,7 +2096,7 @@ class MDRLayoutReader:
|
|
| 2094 |
layoutreader_cache_dir = Path(self._model_path) # self._model_path is like "./mdr_models/layoutreader"
|
| 2095 |
mdr_ensure_directory(str(layoutreader_cache_dir)) # Ensure this specific directory exists
|
| 2096 |
|
| 2097 |
-
name = "
|
| 2098 |
|
| 2099 |
print(f"MDRLayoutReader: Attempting to load LayoutLMv3 model '{name}'. Cache dir: {layoutreader_cache_dir}")
|
| 2100 |
try:
|
|
@@ -2711,7 +2713,24 @@ class MDRExtractionEngine:
|
|
| 2711 |
def analyze_image(self, image: Image, adjust_points: bool = False) -> MDRExtractionResult:
|
| 2712 |
"""Analyzes a single page image to extract layout and content."""
|
| 2713 |
print(" Engine: Analyzing image...")
|
| 2714 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2715 |
print(" Engine: Initial OCR...")
|
| 2716 |
frags = list(self._ocr_engine.find_text_fragments(optimizer.image_np))
|
| 2717 |
print(f" Engine: {len(frags)} fragments found.")
|
|
@@ -2760,8 +2779,6 @@ class MDRExtractionEngine:
|
|
| 2760 |
return MDRExtractionResult(rotation=optimizer.rotation, layouts=layouts, extracted_image=image,
|
| 2761 |
adjusted_image=optimizer.adjusted_image)
|
| 2762 |
|
| 2763 |
-
# In class MDRExtractionEngine:
|
| 2764 |
-
|
| 2765 |
# In class MDRExtractionEngine
|
| 2766 |
def _run_yolo_detection(self, img: Image, yolo: Any): # yolo is an ultralytics.YOLO instance
|
| 2767 |
img_rgb = img.convert("RGB")
|
|
|
|
| 1417 |
return [], []
|
| 1418 |
|
| 1419 |
img_crop_list: list[np.ndarray] = []
|
| 1420 |
+
|
| 1421 |
for i in range(len(dt_boxes_sorted)):
|
| 1422 |
crop_im = mdr_get_rotated_crop(ori_im, dt_boxes_sorted[i])
|
| 1423 |
# Ensure crop_im is not empty or too small before adding
|
|
|
|
| 1808 |
|
| 1809 |
# Attributes with default values (Group 2 - Detection)
|
| 1810 |
det_algorithm: str = "DB"
|
| 1811 |
+
det_limit_side_len: int = 1280
|
| 1812 |
+
det_limit_type: str = 'min'
|
| 1813 |
det_db_thresh: float = 0.3
|
| 1814 |
det_db_box_thresh: float = 0.6
|
| 1815 |
det_db_unclip_ratio: float = 1.5
|
|
|
|
| 1858 |
rec_model_dir=paths["rec"],
|
| 1859 |
rec_char_dict_path=paths["keys"],
|
| 1860 |
# much lower thresholds so we actually get some candidate masks:
|
| 1861 |
+
det_db_thresh=0.15,
|
| 1862 |
+
det_db_box_thresh=0.15,
|
| 1863 |
+
unclip_ratio=2.0,
|
| 1864 |
+
drop_score=0.01,
|
| 1865 |
use_angle_cls=False,
|
| 1866 |
)
|
| 1867 |
try:
|
|
|
|
| 2096 |
layoutreader_cache_dir = Path(self._model_path) # self._model_path is like "./mdr_models/layoutreader"
|
| 2097 |
mdr_ensure_directory(str(layoutreader_cache_dir)) # Ensure this specific directory exists
|
| 2098 |
|
| 2099 |
+
name = "Cahya/layoutlmv3-base-finetuned-doclaynet"
|
| 2100 |
|
| 2101 |
print(f"MDRLayoutReader: Attempting to load LayoutLMv3 model '{name}'. Cache dir: {layoutreader_cache_dir}")
|
| 2102 |
try:
|
|
|
|
| 2713 |
def analyze_image(self, image: Image, adjust_points: bool = False) -> MDRExtractionResult:
|
| 2714 |
"""Analyzes a single page image to extract layout and content."""
|
| 2715 |
print(" Engine: Analyzing image...")
|
| 2716 |
+
# --- START: ADDED CLAHE PREPROCESSING ---
|
| 2717 |
+
# Convert PIL Image to OpenCV BGR format
|
| 2718 |
+
ori_im_cv = cv2.cvtColor(np.array(image.convert("RGB")), cv2.COLOR_RGB2BGR)
|
| 2719 |
+
|
| 2720 |
+
gray_cv = cv2.cvtColor(ori_im_cv, cv2.COLOR_BGR2GRAY)
|
| 2721 |
+
clahe_obj = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
|
| 2722 |
+
enhanced_gray_cv = clahe_obj.apply(gray_cv)
|
| 2723 |
+
# Convert back to BGR for downstream components that might expect 3 channels
|
| 2724 |
+
# (even if they only use one, like the detector)
|
| 2725 |
+
# And then back to PIL Image for the optimizer
|
| 2726 |
+
processed_cv_bgr = cv2.cvtColor(enhanced_gray_cv, cv2.COLOR_GRAY2BGR)
|
| 2727 |
+
|
| 2728 |
+
# Convert the processed OpenCV image back to PIL Image for the optimizer
|
| 2729 |
+
# The optimizer expects a PIL Image.
|
| 2730 |
+
# The image passed to optimizer will now be the CLAHE'd version.
|
| 2731 |
+
processed_pil_image = Image.fromarray(cv2.cvtColor(processed_cv_bgr, cv2.COLOR_BGR2RGB))
|
| 2732 |
+
print(" Engine: CLAHE preprocessing applied to input image.")
|
| 2733 |
+
optimizer = MDRImageOptimizer(processed_pil_image, adjust_points)
|
| 2734 |
print(" Engine: Initial OCR...")
|
| 2735 |
frags = list(self._ocr_engine.find_text_fragments(optimizer.image_np))
|
| 2736 |
print(f" Engine: {len(frags)} fragments found.")
|
|
|
|
| 2779 |
return MDRExtractionResult(rotation=optimizer.rotation, layouts=layouts, extracted_image=image,
|
| 2780 |
adjusted_image=optimizer.adjusted_image)
|
| 2781 |
|
|
|
|
|
|
|
| 2782 |
# In class MDRExtractionEngine
|
| 2783 |
def _run_yolo_detection(self, img: Image, yolo: Any): # yolo is an ultralytics.YOLO instance
|
| 2784 |
img_rgb = img.convert("RGB")
|