Spaces:

ResearchMAGIC
/

teammrag-parser-moreai

Sleeping

App Files Files Community

rodrigomasini committed on May 30

Commit

88bb726

verified ·

1 Parent(s): 8bb89d1

Update mdr_pdf_parser.py

Browse files

Files changed (1) hide show

mdr_pdf_parser.py +26 -9

mdr_pdf_parser.py CHANGED Viewed

@@ -1417,6 +1417,7 @@ class _MDR_TextSystem:
             return [], []
         img_crop_list: list[np.ndarray] = []
         for i in range(len(dt_boxes_sorted)):
             crop_im = mdr_get_rotated_crop(ori_im, dt_boxes_sorted[i])
             # Ensure crop_im is not empty or too small before adding
@@ -1807,8 +1808,8 @@ class _MDR_ONNXParams:
     # Attributes with default values (Group 2 - Detection)
     det_algorithm: str = "DB"
-    det_limit_side_len: int = 960
-    det_limit_type: str = 'max'
     det_db_thresh: float = 0.3
     det_db_box_thresh: float = 0.6
     det_db_unclip_ratio: float = 1.5
@@ -1857,9 +1858,10 @@ class MDROcrEngine:
                 rec_model_dir=paths["rec"],
                 rec_char_dict_path=paths["keys"],
                 # much lower thresholds so we actually get some candidate masks:
-                det_db_thresh=0.1,
-                det_db_box_thresh=0.3,
-                drop_score=0.0,
                 use_angle_cls=False,
             )
             try:
@@ -2094,7 +2096,7 @@ class MDRLayoutReader:
             layoutreader_cache_dir = Path(self._model_path)  # self._model_path is like "./mdr_models/layoutreader"
             mdr_ensure_directory(str(layoutreader_cache_dir))  # Ensure this specific directory exists
-            name = "microsoft/layoutlmv3-base"
             print(f"MDRLayoutReader: Attempting to load LayoutLMv3 model '{name}'. Cache dir: {layoutreader_cache_dir}")
             try:
@@ -2711,7 +2713,24 @@ class MDRExtractionEngine:
     def analyze_image(self, image: Image, adjust_points: bool = False) -> MDRExtractionResult:
         """Analyzes a single page image to extract layout and content."""
         print("  Engine: Analyzing image...")
-        optimizer = MDRImageOptimizer(image, adjust_points)
         print("  Engine: Initial OCR...")
         frags = list(self._ocr_engine.find_text_fragments(optimizer.image_np))
         print(f"  Engine: {len(frags)} fragments found.")
@@ -2760,8 +2779,6 @@ class MDRExtractionEngine:
         return MDRExtractionResult(rotation=optimizer.rotation, layouts=layouts, extracted_image=image,
                                    adjusted_image=optimizer.adjusted_image)
-        # In class MDRExtractionEngine:
     # In class MDRExtractionEngine
     def _run_yolo_detection(self, img: Image, yolo: Any):  # yolo is an ultralytics.YOLO instance
         img_rgb = img.convert("RGB")

             return [], []
         img_crop_list: list[np.ndarray] = []
         for i in range(len(dt_boxes_sorted)):
             crop_im = mdr_get_rotated_crop(ori_im, dt_boxes_sorted[i])
             # Ensure crop_im is not empty or too small before adding
     # Attributes with default values (Group 2 - Detection)
     det_algorithm: str = "DB"
+    det_limit_side_len: int = 1280
+    det_limit_type: str = 'min'
     det_db_thresh: float = 0.3
     det_db_box_thresh: float = 0.6
     det_db_unclip_ratio: float = 1.5
                 rec_model_dir=paths["rec"],
                 rec_char_dict_path=paths["keys"],
                 # much lower thresholds so we actually get some candidate masks:
+                det_db_thresh=0.15,
+                det_db_box_thresh=0.15,
+                unclip_ratio=2.0,
+                drop_score=0.01,
                 use_angle_cls=False,
             )
             try:
             layoutreader_cache_dir = Path(self._model_path)  # self._model_path is like "./mdr_models/layoutreader"
             mdr_ensure_directory(str(layoutreader_cache_dir))  # Ensure this specific directory exists
+            name = "Cahya/layoutlmv3-base-finetuned-doclaynet"
             print(f"MDRLayoutReader: Attempting to load LayoutLMv3 model '{name}'. Cache dir: {layoutreader_cache_dir}")
             try:
     def analyze_image(self, image: Image, adjust_points: bool = False) -> MDRExtractionResult:
         """Analyzes a single page image to extract layout and content."""
         print("  Engine: Analyzing image...")
+        # --- START: ADDED CLAHE PREPROCESSING ---
+        # Convert PIL Image to OpenCV BGR format
+        ori_im_cv = cv2.cvtColor(np.array(image.convert("RGB")), cv2.COLOR_RGB2BGR)
+        gray_cv = cv2.cvtColor(ori_im_cv, cv2.COLOR_BGR2GRAY)
+        clahe_obj = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
+        enhanced_gray_cv = clahe_obj.apply(gray_cv)
+        # Convert back to BGR for downstream components that might expect 3 channels
+        # (even if they only use one, like the detector)
+        # And then back to PIL Image for the optimizer
+        processed_cv_bgr = cv2.cvtColor(enhanced_gray_cv, cv2.COLOR_GRAY2BGR)
+        # Convert the processed OpenCV image back to PIL Image for the optimizer
+        # The optimizer expects a PIL Image.
+        # The image passed to optimizer will now be the CLAHE'd version.
+        processed_pil_image = Image.fromarray(cv2.cvtColor(processed_cv_bgr, cv2.COLOR_BGR2RGB))
+        print("  Engine: CLAHE preprocessing applied to input image.")
+        optimizer = MDRImageOptimizer(processed_pil_image, adjust_points)
         print("  Engine: Initial OCR...")
         frags = list(self._ocr_engine.find_text_fragments(optimizer.image_np))
         print(f"  Engine: {len(frags)} fragments found.")
         return MDRExtractionResult(rotation=optimizer.rotation, layouts=layouts, extracted_image=image,
                                    adjusted_image=optimizer.adjusted_image)
     # In class MDRExtractionEngine
     def _run_yolo_detection(self, img: Image, yolo: Any):  # yolo is an ultralytics.YOLO instance
         img_rgb = img.convert("RGB")