rodrigomasini committed on
Commit
88bb726
·
verified ·
1 Parent(s): 8bb89d1

Update mdr_pdf_parser.py

Browse files
Files changed (1) hide show
  1. mdr_pdf_parser.py +26 -9
mdr_pdf_parser.py CHANGED
@@ -1417,6 +1417,7 @@ class _MDR_TextSystem:
1417
  return [], []
1418
 
1419
  img_crop_list: list[np.ndarray] = []
 
1420
  for i in range(len(dt_boxes_sorted)):
1421
  crop_im = mdr_get_rotated_crop(ori_im, dt_boxes_sorted[i])
1422
  # Ensure crop_im is not empty or too small before adding
@@ -1807,8 +1808,8 @@ class _MDR_ONNXParams:
1807
 
1808
  # Attributes with default values (Group 2 - Detection)
1809
  det_algorithm: str = "DB"
1810
- det_limit_side_len: int = 960
1811
- det_limit_type: str = 'max'
1812
  det_db_thresh: float = 0.3
1813
  det_db_box_thresh: float = 0.6
1814
  det_db_unclip_ratio: float = 1.5
@@ -1857,9 +1858,10 @@ class MDROcrEngine:
1857
  rec_model_dir=paths["rec"],
1858
  rec_char_dict_path=paths["keys"],
1859
  # much lower thresholds so we actually get some candidate masks:
1860
- det_db_thresh=0.1,
1861
- det_db_box_thresh=0.3,
1862
- drop_score=0.0,
 
1863
  use_angle_cls=False,
1864
  )
1865
  try:
@@ -2094,7 +2096,7 @@ class MDRLayoutReader:
2094
  layoutreader_cache_dir = Path(self._model_path) # self._model_path is like "./mdr_models/layoutreader"
2095
  mdr_ensure_directory(str(layoutreader_cache_dir)) # Ensure this specific directory exists
2096
 
2097
- name = "microsoft/layoutlmv3-base"
2098
 
2099
  print(f"MDRLayoutReader: Attempting to load LayoutLMv3 model '{name}'. Cache dir: {layoutreader_cache_dir}")
2100
  try:
@@ -2711,7 +2713,24 @@ class MDRExtractionEngine:
2711
  def analyze_image(self, image: Image, adjust_points: bool = False) -> MDRExtractionResult:
2712
  """Analyzes a single page image to extract layout and content."""
2713
  print(" Engine: Analyzing image...")
2714
- optimizer = MDRImageOptimizer(image, adjust_points)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2715
  print(" Engine: Initial OCR...")
2716
  frags = list(self._ocr_engine.find_text_fragments(optimizer.image_np))
2717
  print(f" Engine: {len(frags)} fragments found.")
@@ -2760,8 +2779,6 @@ class MDRExtractionEngine:
2760
  return MDRExtractionResult(rotation=optimizer.rotation, layouts=layouts, extracted_image=image,
2761
  adjusted_image=optimizer.adjusted_image)
2762
 
2763
- # In class MDRExtractionEngine:
2764
-
2765
  # In class MDRExtractionEngine
2766
  def _run_yolo_detection(self, img: Image, yolo: Any): # yolo is an ultralytics.YOLO instance
2767
  img_rgb = img.convert("RGB")
 
1417
  return [], []
1418
 
1419
  img_crop_list: list[np.ndarray] = []
1420
+
1421
  for i in range(len(dt_boxes_sorted)):
1422
  crop_im = mdr_get_rotated_crop(ori_im, dt_boxes_sorted[i])
1423
  # Ensure crop_im is not empty or too small before adding
 
1808
 
1809
  # Attributes with default values (Group 2 - Detection)
1810
  det_algorithm: str = "DB"
1811
+ det_limit_side_len: int = 1280
1812
+ det_limit_type: str = 'min'
1813
  det_db_thresh: float = 0.3
1814
  det_db_box_thresh: float = 0.6
1815
  det_db_unclip_ratio: float = 1.5
 
1858
  rec_model_dir=paths["rec"],
1859
  rec_char_dict_path=paths["keys"],
1860
  # much lower thresholds so we actually get some candidate masks:
1861
+ det_db_thresh=0.15,
1862
+ det_db_box_thresh=0.15,
1863
+ unclip_ratio=2.0,
1864
+ drop_score=0.01,
1865
  use_angle_cls=False,
1866
  )
1867
  try:
 
2096
  layoutreader_cache_dir = Path(self._model_path) # self._model_path is like "./mdr_models/layoutreader"
2097
  mdr_ensure_directory(str(layoutreader_cache_dir)) # Ensure this specific directory exists
2098
 
2099
+ name = "Cahya/layoutlmv3-base-finetuned-doclaynet"
2100
 
2101
  print(f"MDRLayoutReader: Attempting to load LayoutLMv3 model '{name}'. Cache dir: {layoutreader_cache_dir}")
2102
  try:
 
2713
  def analyze_image(self, image: Image, adjust_points: bool = False) -> MDRExtractionResult:
2714
  """Analyzes a single page image to extract layout and content."""
2715
  print(" Engine: Analyzing image...")
2716
+ # --- START: ADDED CLAHE PREPROCESSING ---
2717
+ # Convert PIL Image to OpenCV BGR format
2718
+ ori_im_cv = cv2.cvtColor(np.array(image.convert("RGB")), cv2.COLOR_RGB2BGR)
2719
+
2720
+ gray_cv = cv2.cvtColor(ori_im_cv, cv2.COLOR_BGR2GRAY)
2721
+ clahe_obj = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
2722
+ enhanced_gray_cv = clahe_obj.apply(gray_cv)
2723
+ # Convert back to BGR for downstream components that might expect 3 channels
2724
+ # (even if they only use one, like the detector)
2725
+ # And then back to PIL Image for the optimizer
2726
+ processed_cv_bgr = cv2.cvtColor(enhanced_gray_cv, cv2.COLOR_GRAY2BGR)
2727
+
2728
+ # Convert the processed OpenCV image back to PIL Image for the optimizer
2729
+ # The optimizer expects a PIL Image.
2730
+ # The image passed to optimizer will now be the CLAHE'd version.
2731
+ processed_pil_image = Image.fromarray(cv2.cvtColor(processed_cv_bgr, cv2.COLOR_BGR2RGB))
2732
+ print(" Engine: CLAHE preprocessing applied to input image.")
2733
+ optimizer = MDRImageOptimizer(processed_pil_image, adjust_points)
2734
  print(" Engine: Initial OCR...")
2735
  frags = list(self._ocr_engine.find_text_fragments(optimizer.image_np))
2736
  print(f" Engine: {len(frags)} fragments found.")
 
2779
  return MDRExtractionResult(rotation=optimizer.rotation, layouts=layouts, extracted_image=image,
2780
  adjusted_image=optimizer.adjusted_image)
2781
 
 
 
2782
  # In class MDRExtractionEngine
2783
  def _run_yolo_detection(self, img: Image, yolo: Any): # yolo is an ultralytics.YOLO instance
2784
  img_rgb = img.convert("RGB")