peppermenta committed on
Commit
5729ffa
·
1 Parent(s): 8b0bdf4

Cleanup and refactor old texify tests to the new model

Browse files
marker/builders/line.py CHANGED
@@ -93,10 +93,6 @@ class LineBuilder(BaseBuilder):
93
  bool,
94
  "Whether to use the LLM model for advanced processing."
95
  ] = False
96
- texify_inline_spans: Annotated[
97
- bool,
98
- "Whether to run texify on inline math spans."
99
- ] = False
100
  ocr_remove_blocks: Tuple[BlockTypes, ...] = (BlockTypes.Table, BlockTypes.Form, BlockTypes.TableOfContents, BlockTypes.Equation)
101
  disable_tqdm: Annotated[
102
  bool,
@@ -112,10 +108,10 @@ class LineBuilder(BaseBuilder):
112
 
113
  def __call__(self, document: Document, provider: PdfProvider):
114
  # Disable inline detection for documents where layout model doesn't detect any equations
115
- # Also disable if we won't use the inline detections (if we aren't using the LLM or texify)
116
  do_inline_math_detection = all([
117
  len(document.contained_blocks([BlockTypes.Equation, BlockTypes.TextInlineMath])) > 0,
118
- (self.texify_inline_spans or self.use_llm)
119
  ])
120
 
121
  provider_lines, ocr_lines = self.get_all_lines(document, provider, do_inline_math_detection)
 
93
  bool,
94
  "Whether to use the LLM model for advanced processing."
95
  ] = False
 
 
 
 
96
  ocr_remove_blocks: Tuple[BlockTypes, ...] = (BlockTypes.Table, BlockTypes.Form, BlockTypes.TableOfContents, BlockTypes.Equation)
97
  disable_tqdm: Annotated[
98
  bool,
 
108
 
109
  def __call__(self, document: Document, provider: PdfProvider):
110
  # Disable inline detection for documents where layout model doesn't detect any equations
111
+ # Also disable if we won't use the inline detections (if we aren't using the LLM)
112
  do_inline_math_detection = all([
113
  len(document.contained_blocks([BlockTypes.Equation, BlockTypes.TextInlineMath])) > 0,
114
+ (self.use_llm)
115
  ])
116
 
117
  provider_lines, ocr_lines = self.get_all_lines(document, provider, do_inline_math_detection)
marker/models.py CHANGED
@@ -6,8 +6,6 @@ from surya.layout import LayoutPredictor
6
  from surya.ocr_error import OCRErrorPredictor
7
  from surya.recognition import RecognitionPredictor
8
  from surya.table_rec import TableRecPredictor
9
- from surya.texify import TexifyPredictor
10
-
11
 
12
  def create_model_dict(device=None, dtype=None) -> dict:
13
  return {
 
6
  from surya.ocr_error import OCRErrorPredictor
7
  from surya.recognition import RecognitionPredictor
8
  from surya.table_rec import TableRecPredictor
 
 
9
 
10
  def create_model_dict(device=None, dtype=None) -> dict:
11
  return {
marker/processors/equation.py CHANGED
@@ -24,7 +24,7 @@ class EquationProcessor(BaseProcessor):
24
  ] = 1024
25
  equation_batch_size: Annotated[
26
  Optional[int],
27
- "The batch size to use for the Texify model.",
28
  "Default is None, which will use the default batch size for the model."
29
  ] = None
30
  disable_tqdm: Annotated[
@@ -40,7 +40,7 @@ class EquationProcessor(BaseProcessor):
40
  # TODO Find optimal values
41
  def get_batch_size(self):
42
  if self.equation_batch_size is not None:
43
- return self.texify_batch_size
44
  elif settings.TORCH_DEVICE_MODEL == "cuda":
45
  return 8
46
  elif settings.TORCH_DEVICE_MODEL == "mps":
 
24
  ] = 1024
25
  equation_batch_size: Annotated[
26
  Optional[int],
27
+ "The batch size to use for the recognition model while processing equations.",
28
  "Default is None, which will use the default batch size for the model."
29
  ] = None
30
  disable_tqdm: Annotated[
 
40
  # TODO Find optimal values
41
  def get_batch_size(self):
42
  if self.equation_batch_size is not None:
43
+ return self.equation_batch_size
44
  elif settings.TORCH_DEVICE_MODEL == "cuda":
45
  return 8
46
  elif settings.TORCH_DEVICE_MODEL == "mps":
tests/conftest.py CHANGED
@@ -40,11 +40,6 @@ def detection_model(model_dict):
40
  yield model_dict["detection_model"]
41
 
42
 
43
- @pytest.fixture(scope="session")
44
- def texify_model(model_dict):
45
- yield model_dict["texify_model"]
46
-
47
-
48
  @pytest.fixture(scope="session")
49
  def recognition_model(model_dict):
50
  yield model_dict["recognition_model"]
 
40
  yield model_dict["detection_model"]
41
 
42
 
 
 
 
 
 
43
  @pytest.fixture(scope="session")
44
  def recognition_model(model_dict):
45
  yield model_dict["recognition_model"]
tests/processors/test_equation_processor.py CHANGED
@@ -5,10 +5,10 @@ from marker.processors.equation import EquationProcessor
5
 
6
 
7
  @pytest.mark.config({"page_range": [0]})
8
- def test_equation_processor(pdf_document, texify_model):
9
- processor = EquationProcessor(texify_model)
10
  processor(pdf_document)
11
 
12
  for block in pdf_document.pages[0].children:
13
  if block.block_type == BlockTypes.Equation:
14
- assert block.html is not None
 
5
 
6
 
7
  @pytest.mark.config({"page_range": [0]})
8
+ def test_equation_processor(pdf_document, recognition_model):
9
+ processor = EquationProcessor(recognition_model)
10
  processor(pdf_document)
11
 
12
  for block in pdf_document.pages[0].children:
13
  if block.block_type == BlockTypes.Equation:
14
+ assert block.html is not None
tests/processors/test_inline_math.py CHANGED
@@ -38,6 +38,7 @@ def test_llm_text_processor_disabled(pdf_document):
38
  assert len(text_lines) == 0
39
 
40
 
 
41
  @pytest.mark.filename("adversarial.pdf")
42
  @pytest.mark.config({"page_range": [0], "texify_inline_spans": True})
43
  def test_llm_text_processor_texify(pdf_document):
 
38
  assert len(text_lines) == 0
39
 
40
 
41
+ @pytest.mark.skip("We do not support this method of inline math anymore")
42
  @pytest.mark.filename("adversarial.pdf")
43
  @pytest.mark.config({"page_range": [0], "texify_inline_spans": True})
44
  def test_llm_text_processor_texify(pdf_document):