Update mdr_pdf_parser.py
Browse files- mdr_pdf_parser.py +135 -42
mdr_pdf_parser.py
CHANGED
|
@@ -2221,27 +2221,98 @@ class MDRLayoutReader:
|
|
| 2221 |
bbox_list.sort(key=lambda b: (b.value[1], b.value[0]))
|
| 2222 |
return bbox_list
|
| 2223 |
|
| 2224 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2225 |
layout_map = defaultdict(list)
|
| 2226 |
-
|
| 2227 |
-
|
| 2228 |
-
|
| 2229 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2230 |
nfo = 0
|
| 2231 |
-
for
|
| 2232 |
-
|
| 2233 |
-
if not frags:
|
| 2234 |
continue
|
| 2235 |
-
|
| 2236 |
-
|
| 2237 |
-
|
| 2238 |
-
|
| 2239 |
-
|
| 2240 |
-
|
| 2241 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2242 |
frag.order = nfo
|
| 2243 |
nfo += 1
|
| 2244 |
-
|
|
|
|
| 2245 |
|
| 2246 |
def _estimate_line_h(self, layouts: list[MDRLayoutElement]) -> float:
|
| 2247 |
heights = [f.rect.size[1] for l in layouts for f in l.fragments if f.rect.size[1] > 0]
|
|
@@ -2586,52 +2657,74 @@ class MDRExtractionEngine:
|
|
| 2586 |
print(f"MDR Extraction Engine initialized on device: {self._device}")
|
| 2587 |
|
| 2588 |
# --- MODIFIED _get_yolo_model METHOD for HF ---
|
| 2589 |
-
|
| 2590 |
-
|
| 2591 |
-
|
|
|
|
|
|
|
| 2592 |
repo_id = "hantian/yolo-doclaynet"
|
| 2593 |
filename = "yolov10b-doclaynet.pt"
|
| 2594 |
-
|
| 2595 |
-
yolo_cache_dir = Path(self._model_dir) / "
|
| 2596 |
-
mdr_ensure_directory(str(yolo_cache_dir))
|
| 2597 |
|
| 2598 |
print(f"Attempting to load YOLO model '{filename}' from repo '{repo_id}'...")
|
| 2599 |
print(f"Hugging Face Hub cache directory for YOLO: {yolo_cache_dir}")
|
| 2600 |
|
| 2601 |
try:
|
| 2602 |
-
# Download the model file using huggingface_hub, caching it
|
| 2603 |
yolo_model_filepath = hf_hub_download(
|
| 2604 |
repo_id=repo_id,
|
| 2605 |
filename=filename,
|
| 2606 |
-
cache_dir=yolo_cache_dir,
|
| 2607 |
-
local_files_only=False,
|
| 2608 |
-
force_download=False,
|
| 2609 |
)
|
| 2610 |
print(f"YOLO model file path: {yolo_model_filepath}")
|
| 2611 |
|
| 2612 |
-
#
|
| 2613 |
-
|
| 2614 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2615 |
|
| 2616 |
-
|
| 2617 |
-
|
| 2618 |
-
print(
|
| 2619 |
-
f"ERROR: Failed to download/access YOLO model via Hugging Face Hub: {e}") # Slightly updated message
|
| 2620 |
self._yolo = None
|
| 2621 |
-
except FileNotFoundError as e: #
|
| 2622 |
-
print(f"ERROR: YOLO model file not found
|
| 2623 |
self._yolo = None
|
| 2624 |
-
except Exception as e:
|
| 2625 |
-
|
| 2626 |
-
|
| 2627 |
-
|
| 2628 |
self._yolo = None
|
| 2629 |
|
| 2630 |
-
elif YOLOv10 is None:
|
| 2631 |
-
print("MDR YOLOv10 class not available. Layout detection skipped.")
|
| 2632 |
-
|
| 2633 |
return self._yolo
|
| 2634 |
|
|
|
|
| 2635 |
def analyze_image(self, image: Image, adjust_points: bool = False) -> MDRExtractionResult:
|
| 2636 |
"""Analyzes a single page image to extract layout and content."""
|
| 2637 |
print(" Engine: Analyzing image...")
|
|
|
|
| 2221 |
bbox_list.sort(key=lambda b: (b.value[1], b.value[0]))
|
| 2222 |
return bbox_list
|
| 2223 |
|
| 2224 |
+
# In class MDRLayoutReader
|
| 2225 |
+
|
| 2226 |
+
def _apply_order(self, original_layouts_list: list["MDRLayoutElement"],
                 ordered_bbox_list_with_final_orders: list["_MDR_ReaderBBox"]) -> list["MDRLayoutElement"]:
    """Reorder layouts, and the fragments inside them, by predicted reading order.

    Args:
        original_layouts_list: Layouts in their original order. Each reader
            box refers to one of them through ``layout_index``.
        ordered_bbox_list_with_final_orders: Reader boxes whose ``order`` is
            expected to already hold the final reading order (presumably set
            by the reader-logit parsing step -- confirm against the caller).

    Returns:
        A new list holding the layouts that own at least one reader box,
        sorted by the median ``order`` of their boxes. Layouts that no reader
        box refers to are not part of the result. ``fragments`` of every
        returned layout is re-sorted, and each fragment receives a running
        ``order`` that continues across layouts.
    """
    # Group reader boxes by the index of the layout they were created from.
    layout_map = defaultdict(list)
    for bbox_item in ordered_bbox_list_with_final_orders:
        layout_map[bbox_item.layout_index].append(bbox_item)

    # Rank layouts by the median reading order of all their boxes (virtual
    # boxes included). sorted() is stable, so ties keep first-seen order.
    layout_median_orders = [
        (layout_idx, self._median([b.order for b in layout_bboxes]))
        for layout_idx, layout_bboxes in layout_map.items()
    ]
    layout_median_orders.sort(key=lambda pair: pair[1])
    ordered_layout_indices = [layout_idx for layout_idx, _ in layout_median_orders]

    final_sorted_layouts = [original_layouts_list[idx] for idx in ordered_layout_indices]

    # nfo (next fragment order) is the running, page-wide fragment counter.
    nfo = 0
    # The original index is carried alongside each layout instead of being
    # recovered with list.index(): that lookup is O(n) and equality-based, so
    # two layouts that compare equal would both resolve to the first one.
    for original_idx, layout_obj in zip(ordered_layout_indices, final_sorted_layouts):
        if not layout_obj.fragments:  # Nothing to order in this layout
            continue

        # Only non-virtual boxes map back to real fragments.
        reader_bboxes_for_this_layout = [
            b for b in layout_map[original_idx] if not b.virtual
        ]

        if reader_bboxes_for_this_layout:
            frag_idx_to_new_order_map = {
                b.fragment_index: b.order for b in reader_bboxes_for_this_layout
            }
            # Fragments without a reader box sort last (inf); the stable sort
            # keeps them in their existing relative order.
            indexed_fragments = sorted(
                enumerate(layout_obj.fragments),
                key=lambda item: frag_idx_to_new_order_map.get(item[0], float('inf')),
            )
            layout_obj.fragments = [frag for _, frag in indexed_fragments]
        else:
            # Every box of this layout is virtual: fall back to top-to-bottom,
            # then left-to-right ordering.
            print(
                f" LayoutReader ApplyOrder: No reader_bboxes for layout (orig_idx {original_idx}). Sorting frags geometrically.")
            layout_obj.fragments.sort(key=lambda f: (f.rect.lt[1], f.rect.lt[0]))

        # Assign the final absolute order.
        for frag in layout_obj.fragments:
            frag.order = nfo
            nfo += 1

    return final_sorted_layouts
|
| 2316 |
|
| 2317 |
def _estimate_line_h(self, layouts: list[MDRLayoutElement]) -> float:
|
| 2318 |
heights = [f.rect.size[1] for l in layouts for f in l.fragments if f.rect.size[1] > 0]
|
|
|
|
| 2657 |
print(f"MDR Extraction Engine initialized on device: {self._device}")
|
| 2658 |
|
| 2659 |
# --- MODIFIED _get_yolo_model METHOD for HF ---
|
| 2660 |
+
# In class MDRExtractionEngine:
|
| 2661 |
+
|
| 2662 |
+
def _get_yolo_model(self) -> Any | None:  # Return type can be ultralytics.YOLO
    """Return the YOLOv10b-DocLayNet layout detection model, loading it on first use.

    The weights file is fetched through ``hf_hub_download`` into a cache
    directory below ``self._model_dir`` and loaded with ``ultralytics.YOLO``.
    If that load fails and the module-level ``YOLOv10`` name is not None, the
    same file is tried with that wrapper instead.

    Returns:
        The loaded model, also stored in ``self._yolo`` for later calls, or
        None when the download or every load attempt failed. Failures are
        reported on stdout rather than raised.
    """
    if self._yolo is not None:
        return self._yolo

    repo_id = "hantian/yolo-doclaynet"
    filename = "yolov10b-doclaynet.pt"

    yolo_cache_dir = Path(self._model_dir) / "yolo_hf_cache_doclaynet"
    mdr_ensure_directory(str(yolo_cache_dir))

    print(f"Attempting to load YOLO model '{filename}' from repo '{repo_id}'...")
    print(f"Hugging Face Hub cache directory for YOLO: {yolo_cache_dir}")

    # Step 1: obtain the weights file (downloaded or served from the cache).
    try:
        yolo_model_filepath = hf_hub_download(
            repo_id=repo_id,
            filename=filename,
            cache_dir=yolo_cache_dir,
            local_files_only=False,
            force_download=False,
        )
    except HfHubHTTPError as e:
        print(f"ERROR: Failed to download/access YOLO model '{filename}' via Hugging Face Hub: {e}")
        return None
    except FileNotFoundError as e:  # This might be redundant if hf_hub_download raises its own error
        print(f"ERROR: YOLO model file '{filename}' not found after download attempt: {e}")
        return None
    except Exception as e:  # General catch-all for unexpected issues during hf_hub_download or path ops
        print(f"ERROR: An unexpected issue occurred related to YOLO model file handling for {filename}: {e}")
        import traceback
        traceback.print_exc()
        return None
    print(f"YOLO model file path: {yolo_model_filepath}")

    # Step 2: import ultralytics. The import has its own try so that an
    # ImportError raised while *loading* the weights is not misreported as
    # "ultralytics library not found".
    try:
        from ultralytics import YOLO as UltralyticsYOLO
    except ImportError:
        print("ERROR: ultralytics library not found. Cannot load YOLOv10b-DocLayNet.")
        print("Please install it: pip install ultralytics")
        return None  # Critical failure

    # Step 3: load the weights, falling back to the YOLOv10 wrapper if present.
    try:
        self._yolo = UltralyticsYOLO(yolo_model_filepath)
        print("MDR YOLOv10b-DocLayNet model loaded successfully using ultralytics.YOLO.")
    except Exception as e_ultra:  # Any ultralytics loading error
        print(f"ERROR: Failed to load YOLO model with ultralytics.YOLO: {e_ultra}")
        if YOLOv10 is None:
            return None
        # The fallback is likely to fail too if ultralytics rejected the model
        # structure, but it is cheap to try.
        print("Attempting fallback to doclayout_yolo.YOLOv10 wrapper...")
        try:
            self._yolo = YOLOv10(yolo_model_filepath)
            print("MDR YOLOv10b-DocLayNet model loaded with doclayout_yolo.YOLOv10 wrapper (fallback).")
        except Exception as e_wrapper:
            print(f"ERROR: Fallback to doclayout_yolo.YOLOv10 also failed: {e_wrapper}")
            self._yolo = None
            return None

    return self._yolo
|
| 2726 |
|
| 2727 |
+
|
| 2728 |
def analyze_image(self, image: Image, adjust_points: bool = False) -> MDRExtractionResult:
|
| 2729 |
"""Analyzes a single page image to extract layout and content."""
|
| 2730 |
print(" Engine: Analyzing image...")
|