Update mdr_pdf_parser.py
Browse files- mdr_pdf_parser.py +179 -23
mdr_pdf_parser.py
CHANGED
|
@@ -2589,8 +2589,8 @@ class MDRExtractionEngine:
|
|
| 2589 |
def _get_yolo_model(self) -> YOLOv10 | None:
|
| 2590 |
"""Loads the YOLOv10 layout detection model using hf_hub_download."""
|
| 2591 |
if self._yolo is None and YOLOv10 is not None:
|
| 2592 |
-
repo_id = "
|
| 2593 |
-
filename = "
|
| 2594 |
# Use a subdirectory within the main model dir for YOLO cache via HF Hub
|
| 2595 |
yolo_cache_dir = Path(self._model_dir) / "yolo_hf_cache"
|
| 2596 |
mdr_ensure_directory(str(yolo_cache_dir)) # Ensure cache dir exists
|
|
@@ -2684,37 +2684,193 @@ class MDRExtractionEngine:
|
|
| 2684 |
return MDRExtractionResult(rotation=optimizer.rotation, layouts=layouts, extracted_image=image,
|
| 2685 |
adjusted_image=optimizer.adjusted_image)
|
| 2686 |
|
| 2687 |
-
|
|
|
|
|
|
|
| 2688 |
img_rgb = img.convert("RGB")
|
| 2689 |
-
res = yolo.predict(source=img_rgb, imgsz=1024, conf=0.20,
|
| 2690 |
-
device=self._device, verbose=False)
|
| 2691 |
|
| 2692 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2693 |
return
|
| 2694 |
|
| 2695 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2696 |
MDRLayoutClass.TITLE,
|
| 2697 |
MDRLayoutClass.PLAIN_TEXT,
|
| 2698 |
-
MDRLayoutClass.ABANDON,
|
| 2699 |
-
MDRLayoutClass.FIGURE_CAPTION,
|
| 2700 |
-
MDRLayoutClass.TABLE_CAPTION,
|
| 2701 |
-
MDRLayoutClass.TABLE_FOOTNOTE,
|
| 2702 |
-
MDRLayoutClass.FORMULA_CAPTION,
|
| 2703 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2704 |
|
| 2705 |
-
|
| 2706 |
-
|
| 2707 |
-
|
| 2708 |
-
rect = MDRRectangle((x1, y1), (x2, y1), (x1, y2), (x2, y2))
|
| 2709 |
-
if rect.area < 10:
|
| 2710 |
continue
|
| 2711 |
|
| 2712 |
-
if
|
| 2713 |
-
yield MDRTableLayoutElement(rect=rect, fragments=[], parsed=None)
|
| 2714 |
-
elif
|
| 2715 |
-
yield MDRFormulaLayoutElement(rect=rect, fragments=[], latex=None)
|
| 2716 |
-
elif
|
| 2717 |
-
yield MDRPlainLayoutElement(cls=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2718 |
|
| 2719 |
def _match_fragments_to_layouts(self, frags: list[MDROcrFragment], layouts: list[MDRLayoutElement]) -> list[
|
| 2720 |
MDRLayoutElement]:
|
|
|
|
| 2589 |
def _get_yolo_model(self) -> YOLOv10 | None:
|
| 2590 |
"""Loads the YOLOv10 layout detection model using hf_hub_download."""
|
| 2591 |
if self._yolo is None and YOLOv10 is not None:
|
| 2592 |
+
repo_id = "hantian/yolo-doclaynet"
|
| 2593 |
+
filename = "yolov10b-doclaynet.pt"
|
| 2594 |
# Use a subdirectory within the main model dir for YOLO cache via HF Hub
|
| 2595 |
yolo_cache_dir = Path(self._model_dir) / "yolo_hf_cache"
|
| 2596 |
mdr_ensure_directory(str(yolo_cache_dir)) # Ensure cache dir exists
|
|
|
|
| 2684 |
return MDRExtractionResult(rotation=optimizer.rotation, layouts=layouts, extracted_image=image,
|
| 2685 |
adjusted_image=optimizer.adjusted_image)
|
| 2686 |
|
| 2687 |
+
# In class MDRExtractionEngine:
|
| 2688 |
+
|
| 2689 |
+
def _run_yolo_detection(self, img: Image, yolo: Any):  # yolo can be doclayout_yolo.YOLOv10 or ultralytics.YOLO
    """Run YOLO layout detection on one page image and yield layout elements.

    The image is converted to RGB and passed to ``yolo.predict``. Each
    detected box is translated from the model's class id to an
    ``MDRLayoutClass`` via the class *name* reported by the model, then
    wrapped in the matching layout element type.

    Args:
        img: Page image; only ``img.convert("RGB")`` is used.
        yolo: Detection model exposing ``predict(source=..., imgsz=...,
            conf=..., device=..., verbose=...)`` and returning a sequence
            whose first item has ``boxes`` (with ``cls`` and ``xyxy``) and,
            optionally, a ``names`` dict of ``{class_id: class_name}``.

    Yields:
        ``MDRTableLayoutElement`` for tables, ``MDRFormulaLayoutElement`` for
        isolated formulas, and ``MDRPlainLayoutElement`` for every other
        mapped class (text, titles, captions, footnotes, figures and
        abandoned page headers/footers). Boxes whose class name is not in
        the mapping, or whose area is below 10, are skipped.
    """
    img_rgb = img.convert("RGB")

    # conf=0.25 was chosen for the DocLayNet model; it may need tuning if the
    # model is swapped again.
    res_list = yolo.predict(source=img_rgb, imgsz=1024, conf=0.25,
                            device=self._device, verbose=False)

    if not res_list or getattr(res_list[0], 'boxes', None) is None:
        print("  Engine: YOLO detection returned no results or no boxes.")
        return

    results = res_list[0]  # Only one source image was submitted.

    # Prefer the id -> name table shipped with the model output; it is the
    # only authoritative source for the class order.
    names_attr = getattr(results, 'names', None)
    if isinstance(names_attr, dict):
        model_class_names = names_attr
        print(f"  Engine: YOLO model class names: {model_class_names}")
    else:
        print(
            "  Engine: Warning - Could not automatically get class names from YOLO model. Using predefined fallback mapping.")
        # NOTE(review): this order is a guess based on the common DocLayNet
        # label order and has not been verified against hantian/yolo-doclaynet.
        _doclaynet_names_fallback = ['Caption', 'Footnote', 'Formula', 'List-item', 'Page-footer', 'Page-header',
                                     'Picture', 'Section-header', 'Table', 'Text', 'Title']
        model_class_names = dict(enumerate(_doclaynet_names_fallback))

    # Single source of truth: DocLayNet class name -> MDRLayoutClass.
    name_to_mdr_class = {
        'Text': MDRLayoutClass.PLAIN_TEXT,
        'Title': MDRLayoutClass.TITLE,
        'Section-header': MDRLayoutClass.TITLE,
        'List-item': MDRLayoutClass.PLAIN_TEXT,  # list items are treated as plain text
        'Table': MDRLayoutClass.TABLE,
        'Picture': MDRLayoutClass.FIGURE,
        'Formula': MDRLayoutClass.ISOLATE_FORMULA,
        # DocLayNet has one generic caption/footnote class; default to the
        # figure-caption / table-footnote variants.
        'Caption': MDRLayoutClass.FIGURE_CAPTION,
        'Footnote': MDRLayoutClass.TABLE_FOOTNOTE,
        'Page-header': MDRLayoutClass.ABANDON,
        'Page-footer': MDRLayoutClass.ABANDON,
    }

    # Resolve once per call: YOLO class id -> MDRLayoutClass. Ids whose name
    # is not in the table above are left out and their boxes are skipped.
    yolo_id_to_mdr_class = {
        class_id: name_to_mdr_class[class_name]
        for class_id, class_name in model_class_names.items()
        if class_name in name_to_mdr_class
    }
    print(f"  Engine: Mapping YOLO classes to MDR classes. Effective map used for generation: {yolo_id_to_mdr_class}")

    for cls_id_tensor, xyxy_tensor in zip(results.boxes.cls, results.boxes.xyxy):
        yolo_cls_id = int(cls_id_tensor.item())  # tensor scalar -> plain int

        mdr_cls = yolo_id_to_mdr_class.get(yolo_cls_id)
        if mdr_cls is None:
            continue  # class not mapped to any MDRLayoutClass

        x1, y1, x2, y2 = map(float, xyxy_tensor)
        rect = MDRRectangle(lt=(x1, y1), rt=(x2, y1), lb=(x1, y2), rb=(x2, y2))
        if rect.area < 10:  # Filter tiny boxes
            continue

        if mdr_cls == MDRLayoutClass.TABLE:
            yield MDRTableLayoutElement(rect=rect, fragments=[], parsed=None, cls=mdr_cls)
        elif mdr_cls == MDRLayoutClass.ISOLATE_FORMULA:
            yield MDRFormulaLayoutElement(rect=rect, fragments=[], latex=None, cls=mdr_cls)
        else:
            # Every remaining mapped class (text, title, captions, footnote,
            # figure, abandon) is emitted as a plain element.
            yield MDRPlainLayoutElement(cls=mdr_cls, rect=rect, fragments=[])
|
| 2874 |
|
| 2875 |
def _match_fragments_to_layouts(self, frags: list[MDROcrFragment], layouts: list[MDRLayoutElement]) -> list[
|
| 2876 |
MDRLayoutElement]:
|