Spaces:

rt4u
/

marker

Sleeping

peppermenta committed on Mar 12

Commit

5729ffa

1 Parent(s): 8b0bdf4

Clean up and refactor old texify tests to the new model

Files changed (6) hide show

marker/builders/line.py CHANGED Viewed

@@ -93,10 +93,6 @@ class LineBuilder(BaseBuilder):
         bool,
         "Whether to use the LLM model for advanced processing."
     ] = False
-    texify_inline_spans: Annotated[
-        bool,
-        "Whether to run texify on inline math spans."
-    ] = False
     ocr_remove_blocks: Tuple[BlockTypes, ...] = (BlockTypes.Table, BlockTypes.Form, BlockTypes.TableOfContents, BlockTypes.Equation)
     disable_tqdm: Annotated[
         bool,
@@ -112,10 +108,10 @@ class LineBuilder(BaseBuilder):
     def __call__(self, document: Document, provider: PdfProvider):
         # Disable inline detection for documents where layout model doesn't detect any equations
-        # Also disable if we won't use the inline detections (if we aren't using the LLM or texify)
         do_inline_math_detection = all([
             len(document.contained_blocks([BlockTypes.Equation, BlockTypes.TextInlineMath])) > 0,
-            (self.texify_inline_spans or self.use_llm)
         ])
         provider_lines, ocr_lines = self.get_all_lines(document, provider, do_inline_math_detection)

         bool,
         "Whether to use the LLM model for advanced processing."
     ] = False
     ocr_remove_blocks: Tuple[BlockTypes, ...] = (BlockTypes.Table, BlockTypes.Form, BlockTypes.TableOfContents, BlockTypes.Equation)
     disable_tqdm: Annotated[
         bool,
     def __call__(self, document: Document, provider: PdfProvider):
         # Disable inline detection for documents where layout model doesn't detect any equations
+        # Also disable if we won't use the inline detections (if we aren't using the LLM)
         do_inline_math_detection = all([
             len(document.contained_blocks([BlockTypes.Equation, BlockTypes.TextInlineMath])) > 0,
+            (self.use_llm)
         ])
         provider_lines, ocr_lines = self.get_all_lines(document, provider, do_inline_math_detection)

marker/models.py CHANGED Viewed

@@ -6,8 +6,6 @@ from surya.layout import LayoutPredictor
 from surya.ocr_error import OCRErrorPredictor
 from surya.recognition import RecognitionPredictor
 from surya.table_rec import TableRecPredictor
-from surya.texify import TexifyPredictor
 def create_model_dict(device=None, dtype=None) -> dict:
     return {

 from surya.ocr_error import OCRErrorPredictor
 from surya.recognition import RecognitionPredictor
 from surya.table_rec import TableRecPredictor
 def create_model_dict(device=None, dtype=None) -> dict:
     return {

marker/processors/equation.py CHANGED Viewed

@@ -24,7 +24,7 @@ class EquationProcessor(BaseProcessor):
     ] = 1024
     equation_batch_size: Annotated[
         Optional[int],
-        "The batch size to use for the Texify model.",
         "Default is None, which will use the default batch size for the model."
     ] = None
     disable_tqdm: Annotated[
@@ -40,7 +40,7 @@ class EquationProcessor(BaseProcessor):
     # TODO Find optimal values
     def get_batch_size(self):
         if self.equation_batch_size is not None:
-            return self.texify_batch_size
         elif settings.TORCH_DEVICE_MODEL == "cuda":
             return 8
         elif settings.TORCH_DEVICE_MODEL == "mps":

     ] = 1024
     equation_batch_size: Annotated[
         Optional[int],
+        "The batch size to use for the recognition model while processing equations.",
         "Default is None, which will use the default batch size for the model."
     ] = None
     disable_tqdm: Annotated[
     # TODO Find optimal values
     def get_batch_size(self):
         if self.equation_batch_size is not None:
+            return self.equation_batch_size
         elif settings.TORCH_DEVICE_MODEL == "cuda":
             return 8
         elif settings.TORCH_DEVICE_MODEL == "mps":

tests/conftest.py CHANGED Viewed

@@ -40,11 +40,6 @@ def detection_model(model_dict):
     yield model_dict["detection_model"]
-@pytest.fixture(scope="session")
-def texify_model(model_dict):
-    yield model_dict["texify_model"]
 @pytest.fixture(scope="session")
 def recognition_model(model_dict):
     yield model_dict["recognition_model"]

     yield model_dict["detection_model"]
 @pytest.fixture(scope="session")
 def recognition_model(model_dict):
     yield model_dict["recognition_model"]

tests/processors/test_equation_processor.py CHANGED Viewed

@@ -5,10 +5,10 @@ from marker.processors.equation import EquationProcessor
 @pytest.mark.config({"page_range": [0]})
-def test_equation_processor(pdf_document, texify_model):
-    processor = EquationProcessor(texify_model)
     processor(pdf_document)
     for block in pdf_document.pages[0].children:
         if block.block_type == BlockTypes.Equation:
-            assert block.html is not None

 @pytest.mark.config({"page_range": [0]})
+def test_equation_processor(pdf_document, recognition_model):
+    processor = EquationProcessor(recognition_model)
     processor(pdf_document)
     for block in pdf_document.pages[0].children:
         if block.block_type == BlockTypes.Equation:
+            assert block.html is not None

tests/processors/test_inline_math.py CHANGED Viewed

@@ -38,6 +38,7 @@ def test_llm_text_processor_disabled(pdf_document):
     assert len(text_lines) == 0
 @pytest.mark.filename("adversarial.pdf")
 @pytest.mark.config({"page_range": [0], "texify_inline_spans": True})
 def test_llm_text_processor_texify(pdf_document):

     assert len(text_lines) == 0
+@pytest.mark.skip("We do not support this method of inline math anymore")
 @pytest.mark.filename("adversarial.pdf")
 @pytest.mark.config({"page_range": [0], "texify_inline_spans": True})
 def test_llm_text_processor_texify(pdf_document):