Commit
·
5729ffa
1
Parent(s):
8b0bdf4
Cleanup and refactor old texify tests to the new model
Browse files
- marker/builders/line.py +2 -6
- marker/models.py +0 -2
- marker/processors/equation.py +2 -2
- tests/conftest.py +0 -5
- tests/processors/test_equation_processor.py +3 -3
- tests/processors/test_inline_math.py +1 -0
marker/builders/line.py
CHANGED
|
@@ -93,10 +93,6 @@ class LineBuilder(BaseBuilder):
|
|
| 93 |
bool,
|
| 94 |
"Whether to use the LLM model for advanced processing."
|
| 95 |
] = False
|
| 96 |
-
texify_inline_spans: Annotated[
|
| 97 |
-
bool,
|
| 98 |
-
"Whether to run texify on inline math spans."
|
| 99 |
-
] = False
|
| 100 |
ocr_remove_blocks: Tuple[BlockTypes, ...] = (BlockTypes.Table, BlockTypes.Form, BlockTypes.TableOfContents, BlockTypes.Equation)
|
| 101 |
disable_tqdm: Annotated[
|
| 102 |
bool,
|
|
@@ -112,10 +108,10 @@ class LineBuilder(BaseBuilder):
|
|
| 112 |
|
| 113 |
def __call__(self, document: Document, provider: PdfProvider):
|
| 114 |
# Disable inline detection for documents where layout model doesn't detect any equations
|
| 115 |
-
# Also disable if we won't use the inline detections (if we aren't using the LLM
|
| 116 |
do_inline_math_detection = all([
|
| 117 |
len(document.contained_blocks([BlockTypes.Equation, BlockTypes.TextInlineMath])) > 0,
|
| 118 |
-
(self.texify_inline_spans or self.use_llm)
|
| 119 |
])
|
| 120 |
|
| 121 |
provider_lines, ocr_lines = self.get_all_lines(document, provider, do_inline_math_detection)
|
|
|
|
| 93 |
bool,
|
| 94 |
"Whether to use the LLM model for advanced processing."
|
| 95 |
] = False
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
ocr_remove_blocks: Tuple[BlockTypes, ...] = (BlockTypes.Table, BlockTypes.Form, BlockTypes.TableOfContents, BlockTypes.Equation)
|
| 97 |
disable_tqdm: Annotated[
|
| 98 |
bool,
|
|
|
|
| 108 |
|
| 109 |
def __call__(self, document: Document, provider: PdfProvider):
|
| 110 |
# Disable inline detection for documents where layout model doesn't detect any equations
|
| 111 |
+
# Also disable if we won't use the inline detections (if we aren't using the LLM)
|
| 112 |
do_inline_math_detection = all([
|
| 113 |
len(document.contained_blocks([BlockTypes.Equation, BlockTypes.TextInlineMath])) > 0,
|
| 114 |
+
(self.use_llm)
|
| 115 |
])
|
| 116 |
|
| 117 |
provider_lines, ocr_lines = self.get_all_lines(document, provider, do_inline_math_detection)
|
marker/models.py
CHANGED
|
@@ -6,8 +6,6 @@ from surya.layout import LayoutPredictor
|
|
| 6 |
from surya.ocr_error import OCRErrorPredictor
|
| 7 |
from surya.recognition import RecognitionPredictor
|
| 8 |
from surya.table_rec import TableRecPredictor
|
| 9 |
-
from surya.texify import TexifyPredictor
|
| 10 |
-
|
| 11 |
|
| 12 |
def create_model_dict(device=None, dtype=None) -> dict:
|
| 13 |
return {
|
|
|
|
| 6 |
from surya.ocr_error import OCRErrorPredictor
|
| 7 |
from surya.recognition import RecognitionPredictor
|
| 8 |
from surya.table_rec import TableRecPredictor
|
|
|
|
|
|
|
| 9 |
|
| 10 |
def create_model_dict(device=None, dtype=None) -> dict:
|
| 11 |
return {
|
marker/processors/equation.py
CHANGED
|
@@ -24,7 +24,7 @@ class EquationProcessor(BaseProcessor):
|
|
| 24 |
] = 1024
|
| 25 |
equation_batch_size: Annotated[
|
| 26 |
Optional[int],
|
| 27 |
-
"The batch size to use for the
|
| 28 |
"Default is None, which will use the default batch size for the model."
|
| 29 |
] = None
|
| 30 |
disable_tqdm: Annotated[
|
|
@@ -40,7 +40,7 @@ class EquationProcessor(BaseProcessor):
|
|
| 40 |
# TODO Find optimal values
|
| 41 |
def get_batch_size(self):
|
| 42 |
if self.equation_batch_size is not None:
|
| 43 |
-
return self.
|
| 44 |
elif settings.TORCH_DEVICE_MODEL == "cuda":
|
| 45 |
return 8
|
| 46 |
elif settings.TORCH_DEVICE_MODEL == "mps":
|
|
|
|
| 24 |
] = 1024
|
| 25 |
equation_batch_size: Annotated[
|
| 26 |
Optional[int],
|
| 27 |
+
"The batch size to use for the recognition model while processing equations.",
|
| 28 |
"Default is None, which will use the default batch size for the model."
|
| 29 |
] = None
|
| 30 |
disable_tqdm: Annotated[
|
|
|
|
| 40 |
# TODO Find optimal values
|
| 41 |
def get_batch_size(self):
|
| 42 |
if self.equation_batch_size is not None:
|
| 43 |
+
return self.equation_batch_size
|
| 44 |
elif settings.TORCH_DEVICE_MODEL == "cuda":
|
| 45 |
return 8
|
| 46 |
elif settings.TORCH_DEVICE_MODEL == "mps":
|
tests/conftest.py
CHANGED
|
@@ -40,11 +40,6 @@ def detection_model(model_dict):
|
|
| 40 |
yield model_dict["detection_model"]
|
| 41 |
|
| 42 |
|
| 43 |
-
@pytest.fixture(scope="session")
|
| 44 |
-
def texify_model(model_dict):
|
| 45 |
-
yield model_dict["texify_model"]
|
| 46 |
-
|
| 47 |
-
|
| 48 |
@pytest.fixture(scope="session")
|
| 49 |
def recognition_model(model_dict):
|
| 50 |
yield model_dict["recognition_model"]
|
|
|
|
| 40 |
yield model_dict["detection_model"]
|
| 41 |
|
| 42 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
@pytest.fixture(scope="session")
|
| 44 |
def recognition_model(model_dict):
|
| 45 |
yield model_dict["recognition_model"]
|
tests/processors/test_equation_processor.py
CHANGED
|
@@ -5,10 +5,10 @@ from marker.processors.equation import EquationProcessor
|
|
| 5 |
|
| 6 |
|
| 7 |
@pytest.mark.config({"page_range": [0]})
|
| 8 |
-
def test_equation_processor(pdf_document, texify_model):
|
| 9 |
-
processor = EquationProcessor(texify_model)
|
| 10 |
processor(pdf_document)
|
| 11 |
|
| 12 |
for block in pdf_document.pages[0].children:
|
| 13 |
if block.block_type == BlockTypes.Equation:
|
| 14 |
-
assert block.html is not None
|
|
|
|
| 5 |
|
| 6 |
|
| 7 |
@pytest.mark.config({"page_range": [0]})
|
| 8 |
+
def test_equation_processor(pdf_document, recognition_model):
|
| 9 |
+
processor = EquationProcessor(recognition_model)
|
| 10 |
processor(pdf_document)
|
| 11 |
|
| 12 |
for block in pdf_document.pages[0].children:
|
| 13 |
if block.block_type == BlockTypes.Equation:
|
| 14 |
+
assert block.html is not None
|
tests/processors/test_inline_math.py
CHANGED
|
@@ -38,6 +38,7 @@ def test_llm_text_processor_disabled(pdf_document):
|
|
| 38 |
assert len(text_lines) == 0
|
| 39 |
|
| 40 |
|
|
|
|
| 41 |
@pytest.mark.filename("adversarial.pdf")
|
| 42 |
@pytest.mark.config({"page_range": [0], "texify_inline_spans": True})
|
| 43 |
def test_llm_text_processor_texify(pdf_document):
|
|
|
|
| 38 |
assert len(text_lines) == 0
|
| 39 |
|
| 40 |
|
| 41 |
+
@pytest.mark.skip("We do not support this method of inline math anymore")
|
| 42 |
@pytest.mark.filename("adversarial.pdf")
|
| 43 |
@pytest.mark.config({"page_range": [0], "texify_inline_spans": True})
|
| 44 |
def test_llm_text_processor_texify(pdf_document):
|