Update mdr_pdf_parser.py
Browse files- mdr_pdf_parser.py +47 -27
mdr_pdf_parser.py
CHANGED
|
@@ -47,6 +47,7 @@ from alphabet_detector import AlphabetDetector
|
|
| 47 |
from munch import Munch
|
| 48 |
from transformers import LayoutLMv3ForTokenClassification
|
| 49 |
import onnxruntime
|
|
|
|
| 50 |
# --- HUGGING FACE HUB IMPORT ONLY BECAUSE RUNNING IN SPACES NOT NECESSARY IN PROD ---
|
| 51 |
from huggingface_hub import hf_hub_download
|
| 52 |
from huggingface_hub.errors import HfHubHTTPError
|
|
@@ -91,7 +92,6 @@ def mdr_download_model(url: str, file_path: Path):
|
|
| 91 |
if file_path.exists(): os.remove(file_path)
|
| 92 |
raise e
|
| 93 |
|
| 94 |
-
# --- MDR Utilities ---
|
| 95 |
def mdr_ensure_directory(path: str) -> str:
|
| 96 |
"""Ensures a directory exists, creating it if necessary."""
|
| 97 |
path = os.path.abspath(path)
|
|
@@ -113,7 +113,7 @@ def mdr_expand_image(image: Image, percent: float) -> Image:
|
|
| 113 |
else: fill = (255, 255, 255)
|
| 114 |
return pil_expand(image=image, border=(bw, bh), fill=fill)
|
| 115 |
|
| 116 |
-
# --- MDR Geometry
|
| 117 |
MDRPoint: TypeAlias = tuple[float, float]
|
| 118 |
@dataclass
|
| 119 |
class MDRRectangle:
|
|
@@ -181,17 +181,22 @@ class MDRBaseLayoutElement:
|
|
| 181 |
@dataclass
|
| 182 |
class MDRPlainLayoutElement(MDRBaseLayoutElement):
|
| 183 |
"""Layout element for plain text, titles, captions, figures, etc."""
|
| 184 |
-
|
|
|
|
| 185 |
|
| 186 |
@dataclass
|
| 187 |
class MDRTableLayoutElement(MDRBaseLayoutElement):
|
| 188 |
"""Layout element specifically for tables."""
|
| 189 |
-
parsed: tuple[str, MDRTableLayoutParsedFormat] | None
|
|
|
|
|
|
|
| 190 |
|
| 191 |
@dataclass
|
| 192 |
class MDRFormulaLayoutElement(MDRBaseLayoutElement):
|
| 193 |
"""Layout element specifically for formulas."""
|
| 194 |
-
latex: str | None
|
|
|
|
|
|
|
| 195 |
|
| 196 |
MDRLayoutElement = MDRPlainLayoutElement | MDRTableLayoutElement | MDRFormulaLayoutElement # Type alias
|
| 197 |
|
|
@@ -218,24 +223,35 @@ class MDRTextSpan:
|
|
| 218 |
@dataclass
|
| 219 |
class MDRBasicBlock:
|
| 220 |
"""Base class for structured blocks extracted from the document."""
|
| 221 |
-
rect: MDRRectangle
|
|
|
|
|
|
|
| 222 |
|
| 223 |
@dataclass
|
| 224 |
class MDRTextBlock(MDRBasicBlock):
|
| 225 |
"""A structured block containing text content."""
|
| 226 |
-
kind: MDRTextKind
|
|
|
|
|
|
|
| 227 |
|
| 228 |
-
class MDRTableFormat(Enum):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 229 |
|
| 230 |
@dataclass
|
| 231 |
class MDRTableBlock(MDRBasicBlock):
|
| 232 |
"""A structured block representing a table."""
|
| 233 |
-
content: str
|
|
|
|
|
|
|
| 234 |
|
| 235 |
@dataclass
|
| 236 |
class MDRFormulaBlock(MDRBasicBlock):
|
| 237 |
"""A structured block representing a formula."""
|
| 238 |
-
content: str | None
|
|
|
|
| 239 |
|
| 240 |
@dataclass
|
| 241 |
class MDRFigureBlock(MDRBasicBlock):
|
|
@@ -278,13 +294,20 @@ def mdr_contains_cjka(text: str):
|
|
| 278 |
return bool(_MDR_CJKA_PATTERN.search(text)) if text else False
|
| 279 |
|
| 280 |
# --- MDR Text Processing ---
|
| 281 |
-
class _MDR_TokenPhase(Enum):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 282 |
|
| 283 |
_mdr_alphabet_detector = AlphabetDetector()
|
| 284 |
|
| 285 |
def _mdr_is_letter(char: str):
|
| 286 |
-
if not category(char).startswith("L"):
|
| 287 |
-
|
|
|
|
|
|
|
| 288 |
except: return False
|
| 289 |
|
| 290 |
def mdr_split_into_words(text: str):
|
|
@@ -373,8 +396,10 @@ class MDRRotationAdjuster:
|
|
| 373 |
return x + self._n_off[0], y + self._n_off[1]
|
| 374 |
|
| 375 |
def mdr_normalize_vertical_rotation(rot: float) -> float:
|
| 376 |
-
while rot >= pi:
|
| 377 |
-
|
|
|
|
|
|
|
| 378 |
return rot
|
| 379 |
|
| 380 |
def _mdr_get_rectangle_angles(rect: MDRRectangle) -> tuple[list[float], list[float]] | None:
|
|
@@ -452,11 +477,14 @@ class _MDR_PredictBase:
|
|
| 452 |
print(" CUDAExecutionProvider not available. Check ONNXRuntime-GPU installation and CUDA setup.")
|
| 453 |
raise e
|
| 454 |
|
| 455 |
-
def get_output_name(self, sess: onnxruntime.InferenceSession) -> List[str]:
|
|
|
|
| 456 |
|
| 457 |
-
def get_input_name(self, sess: onnxruntime.InferenceSession) -> List[str]:
|
|
|
|
| 458 |
|
| 459 |
-
def get_input_feed(self, names: List[str], img_np: np.ndarray) -> Dict[str, np.ndarray]:
|
|
|
|
| 460 |
|
| 461 |
# --- MDR ONNX OCR Internals ---
|
| 462 |
class _MDR_NormalizeImage:
|
|
@@ -590,11 +618,9 @@ def mdr_ocr_transform(
|
|
| 590 |
) -> Optional[Any]:
|
| 591 |
"""
|
| 592 |
Applies a sequence of transformation operations to the input data.
|
| 593 |
-
|
| 594 |
This function iterates through a list of operations (callables) and
|
| 595 |
applies each one sequentially to the data. If any operation
|
| 596 |
returns None, the processing stops immediately, and None is returned.
|
| 597 |
-
|
| 598 |
Args:
|
| 599 |
data: The initial data to be transformed. Can be of any type
|
| 600 |
compatible with the operations.
|
|
@@ -603,7 +629,6 @@ def mdr_ocr_transform(
|
|
| 603 |
the transformed data or None to signal an early exit.
|
| 604 |
If None or an empty list is provided, the original data
|
| 605 |
is returned unchanged.
|
| 606 |
-
|
| 607 |
Returns:
|
| 608 |
The transformed data after applying all operations successfully,
|
| 609 |
or None if any operation in the sequence returned None.
|
|
@@ -2261,7 +2286,6 @@ class MagicPDFProcessor:
|
|
| 2261 |
def __init__(self, device: Literal["cpu", "cuda"]="cuda", model_dir_path: str="./mdr_models", ocr_level: MDROcrLevel=MDROcrLevel.Once, extract_formula: bool=True, extract_table_format: MDRExtractedTableFormat|None=None, debug_dir_path: str|None=None):
|
| 2262 |
"""
|
| 2263 |
Initializes the MagicPDFProcessor.
|
| 2264 |
-
|
| 2265 |
Args:
|
| 2266 |
device: Computation device ('cpu' or 'cuda'). Defaults to 'cuda'. Fallbacks to 'cpu' if CUDA not available.
|
| 2267 |
model_dir_path: Path to directory for storing/caching downloaded models. Defaults to './mdr_models'.
|
|
@@ -2283,11 +2307,9 @@ class MagicPDFProcessor:
|
|
| 2283 |
def process_document(self, pdf_input: str|FitzDocument, report_progress: MDRProgressReportCallback|None=None) -> Generator[MDRStructuredBlock, None, None]:
|
| 2284 |
"""
|
| 2285 |
Processes the entire PDF document and yields all extracted structured blocks.
|
| 2286 |
-
|
| 2287 |
Args:
|
| 2288 |
pdf_input: Path to the PDF file or a loaded fitz.Document object.
|
| 2289 |
report_progress: Optional callback function for progress updates (receives completed_scan_pages, total_scan_pages).
|
| 2290 |
-
|
| 2291 |
Yields:
|
| 2292 |
MDRStructuredBlock: An extracted block (MDRTextBlock, MDRTableBlock, etc.).
|
| 2293 |
"""
|
|
@@ -2300,12 +2322,10 @@ class MagicPDFProcessor:
|
|
| 2300 |
"""
|
| 2301 |
Processes specific pages (or all if page_indexes is None) of the PDF document.
|
| 2302 |
Yields results page by page, including the page index, extracted blocks, and the original page image.
|
| 2303 |
-
|
| 2304 |
Args:
|
| 2305 |
pdf_input: Path to the PDF file or a loaded fitz.Document object.
|
| 2306 |
page_indexes: An iterable of 0-based page indices to process. If None, processes all pages.
|
| 2307 |
report_progress: Optional callback function for progress updates.
|
| 2308 |
-
|
| 2309 |
Yields:
|
| 2310 |
tuple[int, list[MDRStructuredBlock], Image]:
|
| 2311 |
- page_index (0-based)
|
|
@@ -2617,4 +2637,4 @@ if __name__ == '__main__':
|
|
| 2617 |
print(f"\nFATAL ERROR during processing: {e}")
|
| 2618 |
import traceback
|
| 2619 |
traceback.print_exc()
|
| 2620 |
-
exit(1)
|
|
|
|
| 47 |
from munch import Munch
|
| 48 |
from transformers import LayoutLMv3ForTokenClassification
|
| 49 |
import onnxruntime
|
| 50 |
+
from enum import auto, Enum
|
| 51 |
# --- Hugging Face Hub import: needed only because this runs in Spaces; not necessary in prod ---
|
| 52 |
from huggingface_hub import hf_hub_download
|
| 53 |
from huggingface_hub.errors import HfHubHTTPError
|
|
|
|
| 92 |
if file_path.exists(): os.remove(file_path)
|
| 93 |
raise e
|
| 94 |
|
|
|
|
| 95 |
def mdr_ensure_directory(path: str) -> str:
|
| 96 |
"""Ensures a directory exists, creating it if necessary."""
|
| 97 |
path = os.path.abspath(path)
|
|
|
|
| 113 |
else: fill = (255, 255, 255)
|
| 114 |
return pil_expand(image=image, border=(bw, bh), fill=fill)
|
| 115 |
|
| 116 |
+
# --- MDR Geometry ---
|
| 117 |
MDRPoint: TypeAlias = tuple[float, float]
|
| 118 |
@dataclass
|
| 119 |
class MDRRectangle:
|
|
|
|
| 181 |
@dataclass
class MDRPlainLayoutElement(MDRBaseLayoutElement):
    """Layout element for plain text, titles, captions, figures, etc."""

    # Layout category of this element. Annotated with the MDRLayoutClass enum
    # itself (not a Literal of its members); no default, so callers must
    # supply it.
    cls: MDRLayoutClass
|
| 186 |
|
| 187 |
@dataclass
class MDRTableLayoutElement(MDRBaseLayoutElement):
    """Layout element specifically for tables."""

    # Pair of (text, parsed-format tag), or None.
    # NOTE(review): presumably None means the table was not parsed - confirm
    # against the producer of this element.
    parsed: tuple[str, MDRTableLayoutParsedFormat] | None

    # Layout category; annotated with the enum class and defaulting to the
    # TABLE member.
    cls: MDRLayoutClass = MDRLayoutClass.TABLE
|
| 193 |
|
| 194 |
@dataclass
class MDRFormulaLayoutElement(MDRBaseLayoutElement):
    """Layout element specifically for formulas."""

    # LaTeX source for the formula, or None.
    # NOTE(review): presumably None means no LaTeX was extracted - confirm.
    latex: str | None

    # Layout category; annotated with the enum class and defaulting to the
    # ISOLATE_FORMULA member.
    cls: MDRLayoutClass = MDRLayoutClass.ISOLATE_FORMULA
|
| 200 |
|
| 201 |
MDRLayoutElement = MDRPlainLayoutElement | MDRTableLayoutElement | MDRFormulaLayoutElement # Type alias
|
| 202 |
|
|
|
|
| 223 |
@dataclass
class MDRBasicBlock:
    """Base class for structured blocks extracted from the document."""

    # Rectangle associated with the block (presumably its bounds on the
    # page - confirm against the code that builds blocks).
    rect: MDRRectangle

    # Text spans attached to the block.
    texts: list[MDRTextSpan]

    # Relative font size (0-1).
    font_size: float
|
| 229 |
|
| 230 |
@dataclass
class MDRTextBlock(MDRBasicBlock):
    """A structured block containing text content."""

    # Kind of text this block holds; required, no default.
    kind: MDRTextKind

    # Both flags default to False.
    # NOTE(review): judging by the names, these describe whether the first
    # line is indented and whether the last line reaches the end of the
    # block - confirm with the code that sets them.
    has_paragraph_indentation: bool = False
    last_line_touch_end: bool = False
|
| 236 |
|
| 237 |
+
class MDRTableFormat(Enum):
    """Format tag carried by a table block's ``format`` field.

    Member values are assigned by ``auto()`` in declaration order (1..4).
    """

    LATEX = auto()
    MARKDOWN = auto()
    HTML = auto()
    UNRECOGNIZABLE = auto()
|
| 242 |
|
| 243 |
@dataclass
class MDRTableBlock(MDRBasicBlock):
    """A structured block representing a table."""

    # Table content as a string.
    content: str

    # Format tag for ``content`` (a MDRTableFormat member).
    format: MDRTableFormat

    # Image clip of the table.
    image: Image
|
| 249 |
|
| 250 |
@dataclass
class MDRFormulaBlock(MDRBasicBlock):
    """A structured block representing a formula."""

    # Formula content as a string, or None.
    # NOTE(review): presumably None when no content was extracted - confirm.
    content: str | None

    # Image clip of the formula.
    image: Image
|
| 255 |
|
| 256 |
@dataclass
|
| 257 |
class MDRFigureBlock(MDRBasicBlock):
|
|
|
|
| 294 |
return bool(_MDR_CJKA_PATTERN.search(text)) if text else False
|
| 295 |
|
| 296 |
# --- MDR Text Processing ---
|
| 297 |
+
class _MDR_TokenPhase(Enum):
    """Phases used by the text-processing code in this section.

    NOTE(review): presumably the states of the word-splitting scanner
    (``mdr_split_into_words``) - confirm against its implementation.
    Values are explicit integers 0..4.
    """

    Init = 0
    Letter = 1
    Character = 2
    Number = 3
    Space = 4
|
| 303 |
|
| 304 |
_mdr_alphabet_detector = AlphabetDetector()
|
| 305 |
|
| 306 |
def _mdr_is_letter(char: str) -> bool:
    """Return True if ``char`` is a Latin, Cyrillic, Greek or Hebrew letter.

    Args:
        char: The character to classify. It is passed straight to
            ``category`` (presumably ``unicodedata.category`` - confirm the
            import), which is called outside the ``try`` so any error it
            raises propagates to the caller.

    Returns:
        False when the character's category does not start with "L", or when
        the alphabet detector raises; otherwise whether the detector reports
        the character as Latin, Cyrillic, Greek or Hebrew.
    """
    # Cheap rejection first: only "L*" (letter) categories reach the detector.
    if not category(char).startswith("L"):
        return False

    detector = _mdr_alphabet_detector
    try:
        return (
            detector.is_latin(char)
            or detector.is_cyrillic(char)
            or detector.is_greek(char)
            or detector.is_hebrew(char)
        )
    except Exception:
        # Best effort: a detector failure is treated as "not a letter".
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt and SystemExit are not silently swallowed.
        return False
|
| 312 |
|
| 313 |
def mdr_split_into_words(text: str):
|
|
|
|
| 396 |
return x + self._n_off[0], y + self._n_off[1]
|
| 397 |
|
| 398 |
def mdr_normalize_vertical_rotation(rot: float) -> float:
    """Wrap an angle in radians into the half-open interval [0, pi).

    Angles that differ by a whole multiple of pi map to the same result.

    Args:
        rot: Angle in radians; may be negative or larger than pi.

    Returns:
        The equivalent angle in [0, pi). Non-finite input (inf or nan)
        yields nan instead of looping forever.
    """
    # Float modulo with a positive divisor gives a non-negative result in a
    # single step, replacing repeated subtraction/addition of pi, which never
    # terminates for +/-inf and needs O(|rot| / pi) iterations for large input.
    wrapped = rot % pi
    # Rounding can make a tiny negative input wrap to exactly pi; fold that
    # onto 0.0 so the upper bound stays exclusive.
    return 0.0 if wrapped >= pi else wrapped
|
| 404 |
|
| 405 |
def _mdr_get_rectangle_angles(rect: MDRRectangle) -> tuple[list[float], list[float]] | None:
|
|
|
|
| 477 |
print(" CUDAExecutionProvider not available. Check ONNXRuntime-GPU installation and CUDA setup.")
|
| 478 |
raise e
|
| 479 |
|
| 480 |
+
def get_output_name(self, sess: onnxruntime.InferenceSession) -> List[str]:
    """Return the ``name`` of every entry in ``sess.get_outputs()``, in order."""
    output_names: List[str] = []
    for node in sess.get_outputs():
        output_names.append(node.name)
    return output_names
|
| 482 |
|
| 483 |
+
def get_input_name(self, sess: onnxruntime.InferenceSession) -> List[str]:
    """Return the ``name`` of every entry in ``sess.get_inputs()``, in order."""
    input_names: List[str] = []
    for node in sess.get_inputs():
        input_names.append(node.name)
    return input_names
|
| 485 |
|
| 486 |
+
def get_input_feed(self, names: List[str], img_np: np.ndarray) -> Dict[str, np.ndarray]:
    """Build a feed dict that maps every name in ``names`` to ``img_np``.

    The same ``img_np`` object is shared by all keys (no copy is made).
    """
    return dict.fromkeys(names, img_np)
|
| 488 |
|
| 489 |
# --- MDR ONNX OCR Internals ---
|
| 490 |
class _MDR_NormalizeImage:
|
|
|
|
| 618 |
) -> Optional[Any]:
|
| 619 |
"""
|
| 620 |
Applies a sequence of transformation operations to the input data.
|
|
|
|
| 621 |
This function iterates through a list of operations (callables) and
|
| 622 |
applies each one sequentially to the data. If any operation
|
| 623 |
returns None, the processing stops immediately, and None is returned.
|
|
|
|
| 624 |
Args:
|
| 625 |
data: The initial data to be transformed. Can be of any type
|
| 626 |
compatible with the operations.
|
|
|
|
| 629 |
the transformed data or None to signal an early exit.
|
| 630 |
If None or an empty list is provided, the original data
|
| 631 |
is returned unchanged.
|
|
|
|
| 632 |
Returns:
|
| 633 |
The transformed data after applying all operations successfully,
|
| 634 |
or None if any operation in the sequence returned None.
|
|
|
|
| 2286 |
def __init__(self, device: Literal["cpu", "cuda"]="cuda", model_dir_path: str="./mdr_models", ocr_level: MDROcrLevel=MDROcrLevel.Once, extract_formula: bool=True, extract_table_format: MDRExtractedTableFormat|None=None, debug_dir_path: str|None=None):
|
| 2287 |
"""
|
| 2288 |
Initializes the MagicPDFProcessor.
|
|
|
|
| 2289 |
Args:
|
| 2290 |
device: Computation device ('cpu' or 'cuda'). Defaults to 'cuda'. Falls back to 'cpu' if CUDA is not available.
|
| 2291 |
model_dir_path: Path to directory for storing/caching downloaded models. Defaults to './mdr_models'.
|
|
|
|
| 2307 |
def process_document(self, pdf_input: str|FitzDocument, report_progress: MDRProgressReportCallback|None=None) -> Generator[MDRStructuredBlock, None, None]:
|
| 2308 |
"""
|
| 2309 |
Processes the entire PDF document and yields all extracted structured blocks.
|
|
|
|
| 2310 |
Args:
|
| 2311 |
pdf_input: Path to the PDF file or a loaded fitz.Document object.
|
| 2312 |
report_progress: Optional callback function for progress updates (receives completed_scan_pages, total_scan_pages).
|
|
|
|
| 2313 |
Yields:
|
| 2314 |
MDRStructuredBlock: An extracted block (MDRTextBlock, MDRTableBlock, etc.).
|
| 2315 |
"""
|
|
|
|
| 2322 |
"""
|
| 2323 |
Processes specific pages (or all if page_indexes is None) of the PDF document.
|
| 2324 |
Yields results page by page, including the page index, extracted blocks, and the original page image.
|
|
|
|
| 2325 |
Args:
|
| 2326 |
pdf_input: Path to the PDF file or a loaded fitz.Document object.
|
| 2327 |
page_indexes: An iterable of 0-based page indices to process. If None, processes all pages.
|
| 2328 |
report_progress: Optional callback function for progress updates.
|
|
|
|
| 2329 |
Yields:
|
| 2330 |
tuple[int, list[MDRStructuredBlock], Image]:
|
| 2331 |
- page_index (0-based)
|
|
|
|
| 2637 |
print(f"\nFATAL ERROR during processing: {e}")
|
| 2638 |
import traceback
|
| 2639 |
traceback.print_exc()
|
| 2640 |
+
exit(1)
|