Spaces:

ResearchMAGIC
/

teammrag-parser-moreai

Sleeping

App Files Files Community

rodrigomasini committed on Apr 29

Commit

a65e05e

verified ·

1 Parent(s): c79a3f8

Update mdr_pdf_parser.py

Browse files

Files changed (1) hide show

mdr_pdf_parser.py +47 -27

mdr_pdf_parser.py CHANGED Viewed

@@ -47,6 +47,7 @@ from alphabet_detector import AlphabetDetector
 from munch import Munch
 from transformers import LayoutLMv3ForTokenClassification
 import onnxruntime
 # --- HUGGING FACE HUB IMPORT: ONLY NEEDED BECAUSE THIS RUNS IN SPACES; NOT NECESSARY IN PROD ---
 from huggingface_hub import hf_hub_download
 from huggingface_hub.errors import HfHubHTTPError
@@ -91,7 +92,6 @@ def mdr_download_model(url: str, file_path: Path):
     if file_path.exists(): os.remove(file_path)
     raise e
-# --- MDR Utilities ---
 def mdr_ensure_directory(path: str) -> str:
   """Ensures a directory exists, creating it if necessary."""
   path = os.path.abspath(path)
@@ -113,7 +113,7 @@ def mdr_expand_image(image: Image, percent: float) -> Image:
   else: fill = (255, 255, 255)
   return pil_expand(image=image, border=(bw, bh), fill=fill)
-# --- MDR Geometry (rectangle.py) ---
 MDRPoint: TypeAlias = tuple[float, float]
 @dataclass
 class MDRRectangle:
@@ -181,17 +181,22 @@ class MDRBaseLayoutElement:
 @dataclass
 class MDRPlainLayoutElement(MDRBaseLayoutElement):
   """Layout element for plain text, titles, captions, figures, etc."""
-  cls: Literal[MDRLayoutClass.TITLE, MDRLayoutClass.PLAIN_TEXT, MDRLayoutClass.ABANDON, MDRLayoutClass.FIGURE, MDRLayoutClass.FIGURE_CAPTION, MDRLayoutClass.TABLE_CAPTION, MDRLayoutClass.TABLE_FOOTNOTE, MDRLayoutClass.FORMULA_CAPTION]
 @dataclass
 class MDRTableLayoutElement(MDRBaseLayoutElement):
   """Layout element specifically for tables."""
-  parsed: tuple[str, MDRTableLayoutParsedFormat] | None; cls: Literal[MDRLayoutClass.TABLE] = MDRLayoutClass.TABLE
 @dataclass
 class MDRFormulaLayoutElement(MDRBaseLayoutElement):
   """Layout element specifically for formulas."""
-  latex: str | None; cls: Literal[MDRLayoutClass.ISOLATE_FORMULA] = MDRLayoutClass.ISOLATE_FORMULA
 MDRLayoutElement = MDRPlainLayoutElement | MDRTableLayoutElement | MDRFormulaLayoutElement # Type alias
@@ -218,24 +223,35 @@ class MDRTextSpan:
 @dataclass
 class MDRBasicBlock:
   """Base class for structured blocks extracted from the document."""
-  rect: MDRRectangle; texts: list[MDRTextSpan]; font_size: float # Relative font size (0-1)
 @dataclass
 class MDRTextBlock(MDRBasicBlock):
   """A structured block containing text content."""
-  kind: MDRTextKind; has_paragraph_indentation: bool = False; last_line_touch_end: bool = False
-class MDRTableFormat(Enum): LATEX=auto(); MARKDOWN=auto(); HTML=auto(); UNRECOGNIZABLE=auto()
 @dataclass
 class MDRTableBlock(MDRBasicBlock):
   """A structured block representing a table."""
-  content: str; format: MDRTableFormat; image: Image # Image clip of the table
 @dataclass
 class MDRFormulaBlock(MDRBasicBlock):
   """A structured block representing a formula."""
-  content: str | None; image: Image # Image clip of the formula
 @dataclass
 class MDRFigureBlock(MDRBasicBlock):
@@ -278,13 +294,20 @@ def mdr_contains_cjka(text: str):
   return bool(_MDR_CJKA_PATTERN.search(text)) if text else False
 # --- MDR Text Processing ---
-class _MDR_TokenPhase(Enum): Init=0; Letter=1; Character=2; Number=3; Space=4
 _mdr_alphabet_detector = AlphabetDetector()
 def _mdr_is_letter(char: str):
-  if not category(char).startswith("L"): return False
-  try: return _mdr_alphabet_detector.is_latin(char) or _mdr_alphabet_detector.is_cyrillic(char) or _mdr_alphabet_detector.is_greek(char) or _mdr_alphabet_detector.is_hebrew(char)
   except: return False
 def mdr_split_into_words(text: str):
@@ -373,8 +396,10 @@ class MDRRotationAdjuster:
     return x + self._n_off[0], y + self._n_off[1]
 def mdr_normalize_vertical_rotation(rot: float) -> float:
-  while rot >= pi: rot -= pi;
-  while rot < 0: rot += pi;
   return rot
 def _mdr_get_rectangle_angles(rect: MDRRectangle) -> tuple[list[float], list[float]] | None:
@@ -452,11 +477,14 @@ class _MDR_PredictBase:
              print("  CUDAExecutionProvider not available. Check ONNXRuntime-GPU installation and CUDA setup.")
         raise e
-  def get_output_name(self, sess: onnxruntime.InferenceSession) -> List[str]: return [n.name for n in sess.get_outputs()]
-  def get_input_name(self, sess: onnxruntime.InferenceSession) -> List[str]: return [n.name for n in sess.get_inputs()]
-  def get_input_feed(self, names: List[str], img_np: np.ndarray) -> Dict[str, np.ndarray]: return {name: img_np for name in names}
 # --- MDR ONNX OCR Internals ---
 class _MDR_NormalizeImage:
@@ -590,11 +618,9 @@ def mdr_ocr_transform(
 ) -> Optional[Any]:
     """
     Applies a sequence of transformation operations to the input data.
     This function iterates through a list of operations (callables) and
     applies each one sequentially to the data. If any operation
     returns None, the processing stops immediately, and None is returned.
     Args:
         data: The initial data to be transformed. Can be of any type
               compatible with the operations.
@@ -603,7 +629,6 @@ def mdr_ocr_transform(
              the transformed data or None to signal an early exit.
              If None or an empty list is provided, the original data
              is returned unchanged.
     Returns:
         The transformed data after applying all operations successfully,
         or None if any operation in the sequence returned None.
@@ -2261,7 +2286,6 @@ class MagicPDFProcessor:
   def __init__(self, device: Literal["cpu", "cuda"]="cuda", model_dir_path: str="./mdr_models", ocr_level: MDROcrLevel=MDROcrLevel.Once, extract_formula: bool=True, extract_table_format: MDRExtractedTableFormat|None=None, debug_dir_path: str|None=None):
     """
     Initializes the MagicPDFProcessor.
     Args:
         device: Computation device ('cpu' or 'cuda'). Defaults to 'cuda'. Falls back to 'cpu' if CUDA is not available.
         model_dir_path: Path to directory for storing/caching downloaded models. Defaults to './mdr_models'.
@@ -2283,11 +2307,9 @@ class MagicPDFProcessor:
   def process_document(self, pdf_input: str|FitzDocument, report_progress: MDRProgressReportCallback|None=None) -> Generator[MDRStructuredBlock, None, None]:
     """
     Processes the entire PDF document and yields all extracted structured blocks.
     Args:
         pdf_input: Path to the PDF file or a loaded fitz.Document object.
         report_progress: Optional callback function for progress updates (receives completed_scan_pages, total_scan_pages).
     Yields:
         MDRStructuredBlock: An extracted block (MDRTextBlock, MDRTableBlock, etc.).
     """
@@ -2300,12 +2322,10 @@ class MagicPDFProcessor:
     """
     Processes specific pages (or all if page_indexes is None) of the PDF document.
     Yields results page by page, including the page index, extracted blocks, and the original page image.
     Args:
         pdf_input: Path to the PDF file or a loaded fitz.Document object.
         page_indexes: An iterable of 0-based page indices to process. If None, processes all pages.
         report_progress: Optional callback function for progress updates.
     Yields:
         tuple[int, list[MDRStructuredBlock], Image]:
             - page_index (0-based)
@@ -2617,4 +2637,4 @@ if __name__ == '__main__':
         print(f"\nFATAL ERROR during processing: {e}")
         import traceback
         traceback.print_exc()
-        exit(1)

 from munch import Munch
 from transformers import LayoutLMv3ForTokenClassification
 import onnxruntime
+from enum import auto, Enum
 # --- HUGGING FACE HUB IMPORT: ONLY NEEDED BECAUSE THIS RUNS IN SPACES; NOT NECESSARY IN PROD ---
 from huggingface_hub import hf_hub_download
 from huggingface_hub.errors import HfHubHTTPError
     if file_path.exists(): os.remove(file_path)
     raise e
 def mdr_ensure_directory(path: str) -> str:
   """Ensures a directory exists, creating it if necessary."""
   path = os.path.abspath(path)
   else: fill = (255, 255, 255)
   return pil_expand(image=image, border=(bw, bh), fill=fill)
+# --- MDR Geometry ---
 MDRPoint: TypeAlias = tuple[float, float]
 @dataclass
 class MDRRectangle:
 @dataclass
 class MDRPlainLayoutElement(MDRBaseLayoutElement):
   """Layout element for plain text, titles, captions, figures, etc."""
+  # MODIFIED: Replaced Literal[...] with the Enum class name
+  cls: MDRLayoutClass # The type hint is now the Enum class itself
 @dataclass
 class MDRTableLayoutElement(MDRBaseLayoutElement):
   """Layout element specifically for tables."""
+  parsed: tuple[str, MDRTableLayoutParsedFormat] | None
+  # MODIFIED: Replaced Literal[EnumMember] with the Enum class name
+  cls: MDRLayoutClass = MDRLayoutClass.TABLE # Hint with Enum, assign default member
 @dataclass
 class MDRFormulaLayoutElement(MDRBaseLayoutElement):
   """Layout element specifically for formulas."""
+  latex: str | None
+  # MODIFIED: Replaced Literal[EnumMember] with the Enum class name
+  cls: MDRLayoutClass = MDRLayoutClass.ISOLATE_FORMULA # Hint with Enum, assign default member
 MDRLayoutElement = MDRPlainLayoutElement | MDRTableLayoutElement | MDRFormulaLayoutElement # Type alias
 @dataclass
 class MDRBasicBlock:
   """Base class for structured blocks extracted from the document."""
+  rect: MDRRectangle
+  texts: list[MDRTextSpan]
+  font_size: float # Relative font size (0-1)
 @dataclass
 class MDRTextBlock(MDRBasicBlock):
   """A structured block containing text content."""
+  kind: MDRTextKind
+  has_paragraph_indentation: bool = False
+  last_line_touch_end: bool = False
+class MDRTableFormat(Enum):
+    LATEX=auto()
+    MARKDOWN=auto()
+    HTML=auto()
+    UNRECOGNIZABLE=auto()
 @dataclass
 class MDRTableBlock(MDRBasicBlock):
   """A structured block representing a table."""
+  content: str
+  format: MDRTableFormat
+  image: Image # Image clip of the table
 @dataclass
 class MDRFormulaBlock(MDRBasicBlock):
   """A structured block representing a formula."""
+  content: str | None
+  image: Image # Image clip of the formula
 @dataclass
 class MDRFigureBlock(MDRBasicBlock):
   return bool(_MDR_CJKA_PATTERN.search(text)) if text else False
 # --- MDR Text Processing ---
+class _MDR_TokenPhase(Enum):
+    Init=0
+    Letter=1
+    Character=2
+    Number=3
+    Space=4
 _mdr_alphabet_detector = AlphabetDetector()
 def _mdr_is_letter(char: str):
+  if not category(char).startswith("L"):
+      return False
+  try:
+      return _mdr_alphabet_detector.is_latin(char) or _mdr_alphabet_detector.is_cyrillic(char) or _mdr_alphabet_detector.is_greek(char) or _mdr_alphabet_detector.is_hebrew(char)
   except: return False
 def mdr_split_into_words(text: str):
     return x + self._n_off[0], y + self._n_off[1]
 def mdr_normalize_vertical_rotation(rot: float) -> float:
+  while rot >= pi:
+      rot -= pi
+  while rot < 0:
+      rot += pi
   return rot
 def _mdr_get_rectangle_angles(rect: MDRRectangle) -> tuple[list[float], list[float]] | None:
              print("  CUDAExecutionProvider not available. Check ONNXRuntime-GPU installation and CUDA setup.")
         raise e
+  def get_output_name(self, sess: onnxruntime.InferenceSession) -> List[str]:
+      return [n.name for n in sess.get_outputs()]
+  def get_input_name(self, sess: onnxruntime.InferenceSession) -> List[str]:
+      return [n.name for n in sess.get_inputs()]
+  def get_input_feed(self, names: List[str], img_np: np.ndarray) -> Dict[str, np.ndarray]:
+      return {name: img_np for name in names}
 # --- MDR ONNX OCR Internals ---
 class _MDR_NormalizeImage:
 ) -> Optional[Any]:
     """
     Applies a sequence of transformation operations to the input data.
     This function iterates through a list of operations (callables) and
     applies each one sequentially to the data. If any operation
     returns None, the processing stops immediately, and None is returned.
     Args:
         data: The initial data to be transformed. Can be of any type
               compatible with the operations.
              the transformed data or None to signal an early exit.
              If None or an empty list is provided, the original data
              is returned unchanged.
     Returns:
         The transformed data after applying all operations successfully,
         or None if any operation in the sequence returned None.
   def __init__(self, device: Literal["cpu", "cuda"]="cuda", model_dir_path: str="./mdr_models", ocr_level: MDROcrLevel=MDROcrLevel.Once, extract_formula: bool=True, extract_table_format: MDRExtractedTableFormat|None=None, debug_dir_path: str|None=None):
     """
     Initializes the MagicPDFProcessor.
     Args:
         device: Computation device ('cpu' or 'cuda'). Defaults to 'cuda'. Falls back to 'cpu' if CUDA is not available.
         model_dir_path: Path to directory for storing/caching downloaded models. Defaults to './mdr_models'.
   def process_document(self, pdf_input: str|FitzDocument, report_progress: MDRProgressReportCallback|None=None) -> Generator[MDRStructuredBlock, None, None]:
     """
     Processes the entire PDF document and yields all extracted structured blocks.
     Args:
         pdf_input: Path to the PDF file or a loaded fitz.Document object.
         report_progress: Optional callback function for progress updates (receives completed_scan_pages, total_scan_pages).
     Yields:
         MDRStructuredBlock: An extracted block (MDRTextBlock, MDRTableBlock, etc.).
     """
     """
     Processes specific pages (or all if page_indexes is None) of the PDF document.
     Yields results page by page, including the page index, extracted blocks, and the original page image.
     Args:
         pdf_input: Path to the PDF file or a loaded fitz.Document object.
         page_indexes: An iterable of 0-based page indices to process. If None, processes all pages.
         report_progress: Optional callback function for progress updates.
     Yields:
         tuple[int, list[MDRStructuredBlock], Image]:
             - page_index (0-based)
         print(f"\nFATAL ERROR during processing: {e}")
         import traceback
         traceback.print_exc()
+        exit(1)