Spaces:

rt4u
/

marker

Sleeping

Vik Paruchuri committed on Jan 22

Commit

6bd5629

1 Parent(s): 457f524

Fix tests

Files changed (4) hide show

marker/builders/ocr.py CHANGED Viewed

@@ -35,6 +35,10 @@ class OcrBuilder(BaseBuilder):
         "A list of languages to use for OCR.",
         "Default is None."
     ] = None
     def __init__(self, detection_model: DetectionPredictor, recognition_model: RecognitionPredictor, config=None):
         super().__init__(config)
@@ -67,12 +71,12 @@ class OcrBuilder(BaseBuilder):
         # Remove tables because we re-OCR them later with the table processor
         recognition_results = self.recognition_model(
-            images=[page.get_image(highres=False, remove_tables=True) for page in page_list],
             langs=[self.languages] * len(page_list),
             det_predictor=self.detection_model,
             detection_batch_size=int(self.get_detection_batch_size()),
             recognition_batch_size=int(self.get_recognition_batch_size()),
-            highres_images=[page.get_image(highres=True, remove_tables=True) for page in page_list]
         )
         page_lines = {}

         "A list of languages to use for OCR.",
         "Default is None."
     ] = None
+    enable_table_ocr: Annotated[
+        bool,
+        "Whether to skip OCR on tables.  The TableProcessor will re-OCR them.  Only enable if the TableProcessor is not running.",
+    ] = False
     def __init__(self, detection_model: DetectionPredictor, recognition_model: RecognitionPredictor, config=None):
         super().__init__(config)
         # Remove tables because we re-OCR them later with the table processor
         recognition_results = self.recognition_model(
+            images=[page.get_image(highres=False, remove_tables=not self.enable_table_ocr) for page in page_list],
             langs=[self.languages] * len(page_list),
             det_predictor=self.detection_model,
             detection_batch_size=int(self.get_detection_batch_size()),
             recognition_batch_size=int(self.get_recognition_batch_size()),
+            highres_images=[page.get_image(highres=True, remove_tables=not self.enable_table_ocr) for page in page_list]
         )
         page_lines = {}

marker/schema/blocks/base.py CHANGED Viewed

@@ -167,9 +167,10 @@ class Block(BaseModel):
     def raw_text(self, document: Document) -> str:
         from marker.schema.text.line import Line
         from marker.schema.text.span import Span
         if self.structure is None:
-            if isinstance(self, Span):
                 return self.text
             else:
                 return ""

     def raw_text(self, document: Document) -> str:
         from marker.schema.text.line import Line
         from marker.schema.text.span import Span
+        from marker.schema.blocks.tablecell import TableCell
         if self.structure is None:
+            if isinstance(self, (Span, TableCell)):
                 return self.text
             else:
                 return ""

marker/scripts/convert.py CHANGED Viewed

@@ -100,7 +100,7 @@ def convert_cli(in_folder: str, **kwargs):
     else:
         model_dict = create_model_dict()
         for k, v in model_dict.items():
-            v.share_memory()
     print(f"Converting {len(files_to_convert)} pdfs in chunk {kwargs['chunk_idx'] + 1}/{kwargs['num_chunks']} with {total_processes} processes and saving to {kwargs['output_dir']}")
     task_args = [(f, kwargs) for f in files_to_convert]

     else:
         model_dict = create_model_dict()
         for k, v in model_dict.items():
+            v.model.share_memory()
     print(f"Converting {len(files_to_convert)} pdfs in chunk {kwargs['chunk_idx'] + 1}/{kwargs['num_chunks']} with {total_processes} processes and saving to {kwargs['output_dir']}")
     task_args = [(f, kwargs) for f in files_to_convert]

tests/builders/test_garbled_pdf.py CHANGED Viewed

@@ -2,10 +2,11 @@ import pytest
 from marker.builders.document import DocumentBuilder
 from marker.builders.layout import LayoutBuilder
 from marker.schema import BlockTypes
 @pytest.mark.filename("water_damage.pdf")
-def test_garbled_pdf(pdf_document):
     assert pdf_document.pages[0].structure[0] == '/page/0/Table/0'
     table_block = pdf_document.pages[0].get_block(pdf_document.pages[0].structure[0])
@@ -16,9 +17,16 @@ def test_garbled_pdf(pdf_document):
     assert table_cell.block_type == BlockTypes.Line
     assert table_cell.structure[0] == "/page/0/Span/2"
-    span = pdf_document.pages[0].get_block(table_cell.structure[0])
     assert span.block_type == BlockTypes.Span
-    assert "комплекс" in span.text
 @pytest.mark.filename("hindi_judgement.pdf")
@@ -30,7 +38,7 @@ def test_garbled_builder(config, pdf_provider, layout_model, ocr_error_model):
     bad_ocr_results = layout_builder.surya_ocr_error_detection(document.pages, pdf_provider.page_lines)
     assert len(bad_ocr_results.labels) == 2
-    assert all([l == "bad" for l in bad_ocr_results.labels])
 @pytest.mark.filename("adversarial.pdf")

 from marker.builders.document import DocumentBuilder
 from marker.builders.layout import LayoutBuilder
+from marker.processors.table import TableProcessor
 from marker.schema import BlockTypes
 @pytest.mark.filename("water_damage.pdf")
+def test_garbled_pdf(pdf_document, detection_model, recognition_model, table_rec_model):
     assert pdf_document.pages[0].structure[0] == '/page/0/Table/0'
     table_block = pdf_document.pages[0].get_block(pdf_document.pages[0].structure[0])
     assert table_cell.block_type == BlockTypes.Line
     assert table_cell.structure[0] == "/page/0/Span/2"
+    span = pdf_document.pages[0].contained_blocks(pdf_document, (BlockTypes.Span,))[0]
     assert span.block_type == BlockTypes.Span
+    assert len(span.text.strip()) == 0
+    # We don't OCR in the initial pass, only with the TableProcessor
+    processor = TableProcessor(detection_model, recognition_model, table_rec_model)
+    processor(pdf_document)
+    table = pdf_document.pages[0].contained_blocks(pdf_document, (BlockTypes.Table,))[0]
+    assert "варіант" in table.raw_text(pdf_document)
 @pytest.mark.filename("hindi_judgement.pdf")
     bad_ocr_results = layout_builder.surya_ocr_error_detection(document.pages, pdf_provider.page_lines)
     assert len(bad_ocr_results.labels) == 2
+    assert any([l == "bad" for l in bad_ocr_results.labels])
 @pytest.mark.filename("adversarial.pdf")