Vik Paruchuri
committed on
Commit
·
f2f1a27
1
Parent(s):
65d4b3a
Test services
Browse files
- README.md +2 -1
- marker/builders/layout.py +0 -2
- marker/builders/llm_layout.py +1 -1
- marker/builders/ocr.py +0 -1
- marker/config/parser.py +1 -1
- marker/converters/pdf.py +2 -0
- marker/util.py +1 -1
- tests/conftest.py +13 -10
- tests/processors/test_llm_processors.py +7 -7
- tests/processors/test_table_merge.py +1 -1
- tests/services/test_service_init.py +41 -0
README.md
CHANGED
|
@@ -165,7 +165,8 @@ converter = PdfConverter(
|
|
| 165 |
config=config_parser.generate_config_dict(),
|
| 166 |
artifact_dict=create_model_dict(),
|
| 167 |
processor_list=config_parser.get_processors(),
|
| 168 |
-
renderer=config_parser.get_renderer()
|
|
|
|
| 169 |
)
|
| 170 |
rendered = converter("FILEPATH")
|
| 171 |
```
|
|
|
|
| 165 |
config=config_parser.generate_config_dict(),
|
| 166 |
artifact_dict=create_model_dict(),
|
| 167 |
processor_list=config_parser.get_processors(),
|
| 168 |
+
renderer=config_parser.get_renderer(),
|
| 169 |
+
llm_service=config_parser.get_llm_service()
|
| 170 |
)
|
| 171 |
rendered = converter("FILEPATH")
|
| 172 |
```
|
marker/builders/layout.py
CHANGED
|
@@ -2,10 +2,8 @@ from typing import Annotated, List, Optional
|
|
| 2 |
|
| 3 |
from surya.layout import LayoutPredictor
|
| 4 |
from surya.layout.schema import LayoutResult, LayoutBox
|
| 5 |
-
from surya.ocr_error.schema import OCRErrorDetectionResult
|
| 6 |
|
| 7 |
from marker.builders import BaseBuilder
|
| 8 |
-
from marker.providers import ProviderPageLines
|
| 9 |
from marker.providers.pdf import PdfProvider
|
| 10 |
from marker.schema import BlockTypes
|
| 11 |
from marker.schema.document import Document
|
|
|
|
| 2 |
|
| 3 |
from surya.layout import LayoutPredictor
|
| 4 |
from surya.layout.schema import LayoutResult, LayoutBox
|
|
|
|
| 5 |
|
| 6 |
from marker.builders import BaseBuilder
|
|
|
|
| 7 |
from marker.providers.pdf import PdfProvider
|
| 8 |
from marker.schema import BlockTypes
|
| 9 |
from marker.schema.document import Document
|
marker/builders/llm_layout.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 2 |
-
from typing import Annotated
|
| 3 |
|
| 4 |
from surya.layout import LayoutPredictor
|
| 5 |
from tqdm import tqdm
|
|
|
|
| 1 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 2 |
+
from typing import Annotated
|
| 3 |
|
| 4 |
from surya.layout import LayoutPredictor
|
| 5 |
from tqdm import tqdm
|
marker/builders/ocr.py
CHANGED
|
@@ -5,7 +5,6 @@ from ftfy import fix_text
|
|
| 5 |
from surya.recognition import RecognitionPredictor
|
| 6 |
|
| 7 |
from marker.builders import BaseBuilder
|
| 8 |
-
from marker.providers import ProviderPageLines
|
| 9 |
from marker.providers.pdf import PdfProvider
|
| 10 |
from marker.schema import BlockTypes
|
| 11 |
from marker.schema.blocks import BlockId
|
|
|
|
| 5 |
from surya.recognition import RecognitionPredictor
|
| 6 |
|
| 7 |
from marker.builders import BaseBuilder
|
|
|
|
| 8 |
from marker.providers.pdf import PdfProvider
|
| 9 |
from marker.schema import BlockTypes
|
| 10 |
from marker.schema.blocks import BlockId
|
marker/config/parser.py
CHANGED
|
@@ -83,7 +83,7 @@ class ConfigParser:
|
|
| 83 |
|
| 84 |
def get_llm_service(self):
|
| 85 |
# Only return an LLM service when use_llm is enabled
|
| 86 |
-
if not self.cli_options
|
| 87 |
return None
|
| 88 |
|
| 89 |
service_cls = self.cli_options["llm_service"]
|
|
|
|
| 83 |
|
| 84 |
def get_llm_service(self):
|
| 85 |
# Only return an LLM service when use_llm is enabled
|
| 86 |
+
if not self.cli_options.get("use_llm", False):
|
| 87 |
return None
|
| 88 |
|
| 89 |
service_cls = self.cli_options["llm_service"]
|
marker/converters/pdf.py
CHANGED
|
@@ -115,6 +115,8 @@ class PdfConverter(BaseConverter):
|
|
| 115 |
if llm_service:
|
| 116 |
llm_service_cls = strings_to_classes([llm_service])[0]
|
| 117 |
llm_service = self.resolve_dependencies(llm_service_cls)
|
|
|
|
|
|
|
| 118 |
|
| 119 |
# Inject llm service into artifact_dict so it can be picked up by processors, etc.
|
| 120 |
artifact_dict["llm_service"] = llm_service
|
|
|
|
| 115 |
if llm_service:
|
| 116 |
llm_service_cls = strings_to_classes([llm_service])[0]
|
| 117 |
llm_service = self.resolve_dependencies(llm_service_cls)
|
| 118 |
+
elif config.get("use_llm", False):
|
| 119 |
+
llm_service = self.resolve_dependencies(GoogleGeminiService)
|
| 120 |
|
| 121 |
# Inject llm service into artifact_dict so it can be picked up by processors, etc.
|
| 122 |
artifact_dict["llm_service"] = llm_service
|
marker/util.py
CHANGED
|
@@ -33,7 +33,7 @@ def verify_config_keys(obj):
|
|
| 33 |
if value is None:
|
| 34 |
none_vals += f"{attr_name}, "
|
| 35 |
|
| 36 |
-
assert len(none_vals) == 0, f"
|
| 37 |
|
| 38 |
|
| 39 |
def assign_config(cls, config: BaseModel | dict | None):
|
|
|
|
| 33 |
if value is None:
|
| 34 |
none_vals += f"{attr_name}, "
|
| 35 |
|
| 36 |
+
assert len(none_vals) == 0, f"In order to use {obj.__class__.__name__}, you must set the configuration values `{none_vals}`."
|
| 37 |
|
| 38 |
|
| 39 |
def assign_config(cls, config: BaseModel | dict | None):
|
tests/conftest.py
CHANGED
|
@@ -19,7 +19,8 @@ from marker.renderers.markdown import MarkdownRenderer
|
|
| 19 |
from marker.renderers.json import JSONRenderer
|
| 20 |
from marker.schema.registry import register_block_class
|
| 21 |
from marker.services.gemini import GoogleGeminiService
|
| 22 |
-
from marker.util import classes_to_strings
|
|
|
|
| 23 |
|
| 24 |
@pytest.fixture(scope="session")
|
| 25 |
def model_dict():
|
|
@@ -105,12 +106,15 @@ def pdf_document(request, config, pdf_provider, layout_model, ocr_error_model, r
|
|
| 105 |
|
| 106 |
|
| 107 |
@pytest.fixture(scope="function")
|
| 108 |
-
def pdf_converter(request, config, model_dict, renderer):
|
|
|
|
|
|
|
| 109 |
yield PdfConverter(
|
| 110 |
artifact_dict=model_dict,
|
| 111 |
processor_list=None,
|
| 112 |
renderer=classes_to_strings([renderer])[0],
|
| 113 |
-
config=config
|
|
|
|
| 114 |
)
|
| 115 |
|
| 116 |
|
|
@@ -129,13 +133,12 @@ def renderer(request, config):
|
|
| 129 |
|
| 130 |
|
| 131 |
@pytest.fixture(scope="function")
|
| 132 |
-
def llm_service(request):
|
| 133 |
-
llm_service =
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
yield llm_service
|
| 139 |
|
| 140 |
|
| 141 |
@pytest.fixture(scope="function")
|
|
|
|
| 19 |
from marker.renderers.json import JSONRenderer
|
| 20 |
from marker.schema.registry import register_block_class
|
| 21 |
from marker.services.gemini import GoogleGeminiService
|
| 22 |
+
from marker.util import classes_to_strings, strings_to_classes
|
| 23 |
+
|
| 24 |
|
| 25 |
@pytest.fixture(scope="session")
|
| 26 |
def model_dict():
|
|
|
|
| 106 |
|
| 107 |
|
| 108 |
@pytest.fixture(scope="function")
|
| 109 |
+
def pdf_converter(request, config, model_dict, renderer, llm_service):
|
| 110 |
+
if llm_service:
|
| 111 |
+
llm_service = classes_to_strings([llm_service])[0]
|
| 112 |
yield PdfConverter(
|
| 113 |
artifact_dict=model_dict,
|
| 114 |
processor_list=None,
|
| 115 |
renderer=classes_to_strings([renderer])[0],
|
| 116 |
+
config=config,
|
| 117 |
+
llm_service=llm_service
|
| 118 |
)
|
| 119 |
|
| 120 |
|
|
|
|
| 133 |
|
| 134 |
|
| 135 |
@pytest.fixture(scope="function")
|
| 136 |
+
def llm_service(request, config):
|
| 137 |
+
llm_service = config.get("llm_service")
|
| 138 |
+
if not llm_service:
|
| 139 |
+
yield None
|
| 140 |
+
else:
|
| 141 |
+
yield strings_to_classes([llm_service])[0]
|
|
|
|
| 142 |
|
| 143 |
|
| 144 |
@pytest.fixture(scope="function")
|
tests/processors/test_llm_processors.py
CHANGED
|
@@ -29,7 +29,7 @@ def test_llm_form_processor_no_config(pdf_document, llm_service):
|
|
| 29 |
@pytest.mark.filename("form_1040.pdf")
|
| 30 |
@pytest.mark.config({"page_range": [0]})
|
| 31 |
def test_llm_form_processor_no_cells(pdf_document, llm_service):
|
| 32 |
-
config = {"use_llm": True, "
|
| 33 |
processor_lst = [LLMFormProcessor(config)]
|
| 34 |
processor = LLMSimpleBlockMetaProcessor(processor_lst, llm_service, config)
|
| 35 |
processor(pdf_document)
|
|
@@ -50,7 +50,7 @@ def test_llm_form_processor(pdf_document, detection_model, table_rec_model, reco
|
|
| 50 |
cell_processor = TableProcessor(detection_model, recognition_model, table_rec_model)
|
| 51 |
cell_processor(pdf_document)
|
| 52 |
|
| 53 |
-
config = {"use_llm": True, "
|
| 54 |
processor_lst = [LLMFormProcessor(config)]
|
| 55 |
processor = LLMSimpleBlockMetaProcessor(processor_lst, mock_cls, config)
|
| 56 |
processor(pdf_document)
|
|
@@ -92,7 +92,7 @@ def test_llm_table_processor(pdf_document, detection_model, table_rec_model, rec
|
|
| 92 |
cell_processor = TableProcessor(detection_model, recognition_model, table_rec_model)
|
| 93 |
cell_processor(pdf_document)
|
| 94 |
|
| 95 |
-
processor = LLMTableProcessor(mock_cls, {"use_llm": True, "
|
| 96 |
processor(pdf_document)
|
| 97 |
|
| 98 |
tables = pdf_document.contained_blocks((BlockTypes.Table,))
|
|
@@ -106,7 +106,7 @@ def test_llm_table_processor(pdf_document, detection_model, table_rec_model, rec
|
|
| 106 |
@pytest.mark.filename("A17_FlightPlan.pdf")
|
| 107 |
@pytest.mark.config({"page_range": [0]})
|
| 108 |
def test_llm_caption_processor_disabled(pdf_document):
|
| 109 |
-
config = {"use_llm": True, "
|
| 110 |
mock_cls = MagicMock()
|
| 111 |
processor_lst = [LLMImageDescriptionProcessor(config)]
|
| 112 |
processor = LLMSimpleBlockMetaProcessor(processor_lst, mock_cls, config)
|
|
@@ -122,7 +122,7 @@ def test_llm_caption_processor(pdf_document):
|
|
| 122 |
mock_cls = Mock()
|
| 123 |
mock_cls.return_value = {"image_description": description}
|
| 124 |
|
| 125 |
-
config = {"use_llm": True, "
|
| 126 |
processor_lst = [LLMImageDescriptionProcessor(config)]
|
| 127 |
processor = LLMSimpleBlockMetaProcessor(processor_lst, mock_cls, config)
|
| 128 |
processor(pdf_document)
|
|
@@ -152,7 +152,7 @@ def test_llm_complex_region_processor(pdf_document):
|
|
| 152 |
pdf_document.pages[0].replace_block(old_block, new_block)
|
| 153 |
|
| 154 |
# Test processor
|
| 155 |
-
config = {"use_llm": True, "
|
| 156 |
processor_lst = [LLMComplexRegionProcessor(config)]
|
| 157 |
processor = LLMSimpleBlockMetaProcessor(processor_lst, mock_cls, config)
|
| 158 |
processor(pdf_document)
|
|
@@ -170,7 +170,7 @@ def test_multi_llm_processors(pdf_document):
|
|
| 170 |
mock_cls = Mock()
|
| 171 |
mock_cls.return_value = {"image_description": description, "html_equation": description}
|
| 172 |
|
| 173 |
-
config = {"use_llm": True, "
|
| 174 |
processor_lst = [LLMImageDescriptionProcessor(config), LLMEquationProcessor(config)]
|
| 175 |
processor = LLMSimpleBlockMetaProcessor(processor_lst, mock_cls, config)
|
| 176 |
processor(pdf_document)
|
|
|
|
| 29 |
@pytest.mark.filename("form_1040.pdf")
|
| 30 |
@pytest.mark.config({"page_range": [0]})
|
| 31 |
def test_llm_form_processor_no_cells(pdf_document, llm_service):
|
| 32 |
+
config = {"use_llm": True, "gemini_api_key": "test"}
|
| 33 |
processor_lst = [LLMFormProcessor(config)]
|
| 34 |
processor = LLMSimpleBlockMetaProcessor(processor_lst, llm_service, config)
|
| 35 |
processor(pdf_document)
|
|
|
|
| 50 |
cell_processor = TableProcessor(detection_model, recognition_model, table_rec_model)
|
| 51 |
cell_processor(pdf_document)
|
| 52 |
|
| 53 |
+
config = {"use_llm": True, "gemini_api_key": "test"}
|
| 54 |
processor_lst = [LLMFormProcessor(config)]
|
| 55 |
processor = LLMSimpleBlockMetaProcessor(processor_lst, mock_cls, config)
|
| 56 |
processor(pdf_document)
|
|
|
|
| 92 |
cell_processor = TableProcessor(detection_model, recognition_model, table_rec_model)
|
| 93 |
cell_processor(pdf_document)
|
| 94 |
|
| 95 |
+
processor = LLMTableProcessor(mock_cls, {"use_llm": True, "gemini_api_key": "test"})
|
| 96 |
processor(pdf_document)
|
| 97 |
|
| 98 |
tables = pdf_document.contained_blocks((BlockTypes.Table,))
|
|
|
|
| 106 |
@pytest.mark.filename("A17_FlightPlan.pdf")
|
| 107 |
@pytest.mark.config({"page_range": [0]})
|
| 108 |
def test_llm_caption_processor_disabled(pdf_document):
|
| 109 |
+
config = {"use_llm": True, "gemini_api_key": "test"}
|
| 110 |
mock_cls = MagicMock()
|
| 111 |
processor_lst = [LLMImageDescriptionProcessor(config)]
|
| 112 |
processor = LLMSimpleBlockMetaProcessor(processor_lst, mock_cls, config)
|
|
|
|
| 122 |
mock_cls = Mock()
|
| 123 |
mock_cls.return_value = {"image_description": description}
|
| 124 |
|
| 125 |
+
config = {"use_llm": True, "gemini_api_key": "test", "extract_images": False}
|
| 126 |
processor_lst = [LLMImageDescriptionProcessor(config)]
|
| 127 |
processor = LLMSimpleBlockMetaProcessor(processor_lst, mock_cls, config)
|
| 128 |
processor(pdf_document)
|
|
|
|
| 152 |
pdf_document.pages[0].replace_block(old_block, new_block)
|
| 153 |
|
| 154 |
# Test processor
|
| 155 |
+
config = {"use_llm": True, "gemini_api_key": "test"}
|
| 156 |
processor_lst = [LLMComplexRegionProcessor(config)]
|
| 157 |
processor = LLMSimpleBlockMetaProcessor(processor_lst, mock_cls, config)
|
| 158 |
processor(pdf_document)
|
|
|
|
| 170 |
mock_cls = Mock()
|
| 171 |
mock_cls.return_value = {"image_description": description, "html_equation": description}
|
| 172 |
|
| 173 |
+
config = {"use_llm": True, "gemini_api_key": "test", "extract_images": False, "min_equation_height": .001}
|
| 174 |
processor_lst = [LLMImageDescriptionProcessor(config), LLMEquationProcessor(config)]
|
| 175 |
processor = LLMSimpleBlockMetaProcessor(processor_lst, mock_cls, config)
|
| 176 |
processor(pdf_document)
|
tests/processors/test_table_merge.py
CHANGED
|
@@ -21,7 +21,7 @@ def test_llm_table_processor_nomerge(pdf_document, detection_model, table_rec_mo
|
|
| 21 |
tables = pdf_document.contained_blocks((BlockTypes.Table,))
|
| 22 |
assert len(tables) == 3
|
| 23 |
|
| 24 |
-
processor = LLMTableMergeProcessor(mock_cls, {"use_llm": True, "
|
| 25 |
processor(pdf_document)
|
| 26 |
|
| 27 |
tables = pdf_document.contained_blocks((BlockTypes.Table,))
|
|
|
|
| 21 |
tables = pdf_document.contained_blocks((BlockTypes.Table,))
|
| 22 |
assert len(tables) == 3
|
| 23 |
|
| 24 |
+
processor = LLMTableMergeProcessor(mock_cls, {"use_llm": True, "gemini_api_key": "test"})
|
| 25 |
processor(pdf_document)
|
| 26 |
|
| 27 |
tables = pdf_document.contained_blocks((BlockTypes.Table,))
|
tests/services/test_service_init.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pytest
|
| 2 |
+
|
| 3 |
+
from marker.converters.pdf import PdfConverter
|
| 4 |
+
from marker.services.gemini import GoogleGeminiService
|
| 5 |
+
from marker.services.ollama import OllamaService
|
| 6 |
+
from marker.services.vertex import GoogleVertexService
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
@pytest.mark.output_format("markdown")
|
| 10 |
+
@pytest.mark.config({"page_range": [0]})
|
| 11 |
+
def test_empty_llm(pdf_converter: PdfConverter, temp_pdf):
|
| 12 |
+
assert pdf_converter.artifact_dict["llm_service"] is None
|
| 13 |
+
assert pdf_converter.llm_service is None
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def test_llm_no_keys(model_dict, config):
|
| 17 |
+
with pytest.raises(AssertionError):
|
| 18 |
+
PdfConverter(
|
| 19 |
+
artifact_dict=model_dict,
|
| 20 |
+
config={"use_llm": True}
|
| 21 |
+
)
|
| 22 |
+
|
| 23 |
+
@pytest.mark.output_format("markdown")
|
| 24 |
+
@pytest.mark.config({"page_range": [0], "use_llm": True, "gemini_api_key": "test"})
|
| 25 |
+
def test_llm_gemini(pdf_converter: PdfConverter, temp_pdf):
|
| 26 |
+
assert pdf_converter.artifact_dict["llm_service"] is not None
|
| 27 |
+
assert isinstance(pdf_converter.llm_service, GoogleGeminiService)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
@pytest.mark.output_format("markdown")
|
| 31 |
+
@pytest.mark.config({"page_range": [0], "use_llm": True, "vertex_project_id": "test", "llm_service": "marker.services.vertex.GoogleVertexService"})
|
| 32 |
+
def test_llm_vertex(pdf_converter: PdfConverter, temp_pdf):
|
| 33 |
+
assert pdf_converter.artifact_dict["llm_service"] is not None
|
| 34 |
+
assert isinstance(pdf_converter.llm_service, GoogleVertexService)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
@pytest.mark.output_format("markdown")
|
| 38 |
+
@pytest.mark.config({"page_range": [0], "use_llm": True, "llm_service": "marker.services.ollama.OllamaService"})
|
| 39 |
+
def test_llm_ollama(pdf_converter: PdfConverter, temp_pdf):
|
| 40 |
+
assert pdf_converter.artifact_dict["llm_service"] is not None
|
| 41 |
+
assert isinstance(pdf_converter.llm_service, OllamaService)
|