Vik Paruchuri committed

Commit 14b3a02 · Parent(s): 4b7098a

Improve structured extraction
Files changed:

- README.md +4 -2
- marker/converters/extraction.py +15 -20
- marker/extractors/document.py +1 -5
- marker/extractors/page.py +7 -20
- marker/extractors/util.py +0 -213
- marker/renderers/extraction.py +7 -2
- marker/scripts/extraction_app.py +35 -6
- marker/services/__init__.py +1 -1
- marker/services/claude.py +1 -1
- marker/services/gemini.py +5 -2
- marker/services/ollama.py +4 -2
- marker/services/openai.py +5 -2
- tests/extractors/test_basemodel_gen.py +0 -44
README.md CHANGED

@@ -1,6 +1,6 @@
 # Marker
 
-Marker converts documents to markdown, JSON, and HTML quickly and accurately.
+Marker converts documents to markdown, JSON, chunks, and HTML quickly and accurately.
 
 - Converts PDF, image, PPTX, DOCX, XLSX, HTML, EPUB files in all languages
 - Formats tables, forms, equations, inline math, links, references, and code blocks
@@ -276,6 +276,8 @@ converter = ExtractionConverter(
 rendered = converter("FILEPATH")
 ```
 
+The rendered output will have an `original_markdown` field. If you pass this back in as the `existing_markdown` config key the next time you run the converter, you can skip re-parsing the document.
+
 # Output Formats
 
 ## Markdown
@@ -348,7 +350,7 @@ Note that child blocks of pages can have their own children as well (a tree structure)
 
 ## Chunks
 
-Chunks format is similar to JSON, but flattens everything into a single list instead of a tree. Only the top level blocks from each page show up. It also has the full HTML of each block inside, so you don't need to crawl the tree to reconstruct it.
+Chunks format is similar to JSON, but flattens everything into a single list instead of a tree. Only the top level blocks from each page show up. It also has the full HTML of each block inside, so you don't need to crawl the tree to reconstruct it. This enables flexible and easy chunking for RAG.
 
 ## Metadata
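To make the new `existing_markdown` flow concrete, here is a minimal sketch of the round trip. The `create_model_dict` setup mirrors the converter examples elsewhere in the README; `page_schema` and the LLM service configuration are assumed to be set up as those examples show.

```python
from marker.converters.extraction import ExtractionConverter
from marker.models import create_model_dict

config = {"page_schema": page_schema}  # page_schema: your extraction schema (assumed defined)
converter = ExtractionConverter(artifact_dict=create_model_dict(), config=config)
rendered = converter("document.pdf")  # full parse + extraction

# Second run: feed the markdown back in to skip re-parsing the document
config["existing_markdown"] = rendered.original_markdown
converter = ExtractionConverter(artifact_dict=create_model_dict(), config=config)
rendered = converter("document.pdf")  # only the extraction step runs
```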
marker/converters/extraction.py CHANGED

@@ -1,5 +1,5 @@
-import json
 import re
+from typing import Annotated
 
 from marker.builders.document import DocumentBuilder
 from marker.builders.line import LineBuilder
@@ -8,7 +8,6 @@ from marker.builders.structure import StructureBuilder
 from marker.converters.pdf import PdfConverter
 from marker.extractors.document import DocumentExtractor
 from marker.extractors.page import PageExtractor
-from marker.extractors.util import json_schema_to_base_model
 from marker.providers.registry import provider_from_filepath
 
 from marker.renderers.extraction import ExtractionRenderer, ExtractionOutput
@@ -21,6 +20,9 @@ logger = get_logger()
 
 class ExtractionConverter(PdfConverter):
     pattern: str = r"{\d+\}-{48}\n\n"
+    existing_markdown: Annotated[
+        str, "Markdown that was already converted for extraction."
+    ] = None
 
     def build_document(self, filepath: str):
         provider_cls = provider_from_filepath(filepath)
@@ -44,23 +46,16 @@ class ExtractionConverter(PdfConverter):
         self.config["output_format"] = (
             "markdown"  # Output must be markdown for extraction
         )
+        markdown = self.existing_markdown
 
-        ...
-        )
-
-        document, provider = self.build_document(filepath)
-        self.page_count = len(document.pages)
-        renderer = self.resolve_dependencies(MarkdownRenderer)
-        output = renderer(document)
+        if not markdown:
+            document, provider = self.build_document(filepath)
+            self.page_count = len(document.pages)
+            renderer = self.resolve_dependencies(MarkdownRenderer)
+            output = renderer(document)
+            markdown = output.markdown
 
-        output_pages = re.split(self.pattern, output.markdown)[
-            1:
-        ]  # Split output into pages
+        output_pages = re.split(self.pattern, markdown)[1:]  # Split output into pages
 
         # This needs an LLM service for extraction, this sets it in the extractor
         if not self.artifact_dict["llm_service"]:
@@ -73,8 +68,8 @@ class ExtractionConverter(PdfConverter):
         renderer = self.resolve_dependencies(ExtractionRenderer)
 
         # Inference in parallel
-        notes = page_extractor(...)
-        document_output = document_extractor(...)
+        notes = page_extractor(output_pages)
+        document_output = document_extractor(notes)
 
-        merged = renderer(document_output)
+        merged = renderer(document_output, markdown)
         return merged
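For context on the `pattern` field the converter keeps: with `paginate_output` enabled, the markdown renderer separates pages with a `{page_number}` marker followed by 48 dashes, which is what the `re.split` call keys on. A small standalone illustration (the page text is invented):

```python
import re

pattern = r"{\d+\}-{48}\n\n"  # same regex as ExtractionConverter.pattern
markdown = (
    "{0}" + "-" * 48 + "\n\n" + "Page one text\n"
    + "{1}" + "-" * 48 + "\n\n" + "Page two text\n"
)

# The chunk before the first marker is empty, hence the [1:]
output_pages = re.split(pattern, markdown)[1:]
print(output_pages)  # ['Page one text\n', 'Page two text\n']
```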
marker/extractors/document.py CHANGED

@@ -6,7 +6,6 @@ from typing import Annotated, Optional, List
 from marker.extractors import BaseExtractor
 from marker.extractors.page import PageExtractionSchema
 from marker.logger import get_logger
-from marker.schema.document import Document
 
 logger = get_logger()
 
@@ -114,7 +113,6 @@ Schema
     def __call__(
         self,
-        document: Document,
         page_notes: List[PageExtractionSchema],
         **kwargs,
     ) -> Optional[DocumentExtractionSchema]:
@@ -123,11 +121,10 @@ Schema
             "Page schema must be defined for structured extraction to work."
         )
 
-        page = document.pages[0]
         prompt = self.page_extraction_prompt.replace(
             "{{document_notes}}", self.assemble_document_notes(page_notes)
         ).replace("{{schema}}", json.dumps(self.page_schema))
-        response = self.llm_service(prompt, None, page, DocumentExtractionSchema)
+        response = self.llm_service(prompt, None, None, DocumentExtractionSchema)
 
         logger.debug(f"Document extraction response: {response}")
 
@@ -140,7 +137,6 @@ Schema
             ]
         ]
         ):
-            page.update_metadata(llm_error_count=1)
             return None
 
         json_data = response["document_json"].strip().lstrip("```json").rstrip("```")
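One unchanged context line above is worth a note: the extractor strips markdown fences off the model's JSON reply with `lstrip`/`rstrip`. Those methods remove a *set of characters*, not a literal prefix, which is safe here only because JSON payloads start with `{` or `[`. A quick illustration with an invented response:

```python
import json

raw = '```json\n{"title": "Example"}\n```'
json_data = raw.strip().lstrip("```json").rstrip("```")
# lstrip stops at the first character not in the set {`, j, s, o, n} -- the
# newline -- so the JSON body itself is left intact
assert json.loads(json_data)["title"] == "Example"
```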
marker/extractors/page.py CHANGED

@@ -8,8 +8,6 @@ from tqdm import tqdm
 
 from marker.extractors import BaseExtractor
 from marker.logger import get_logger
-from marker.schema.document import Document
-from marker.schema.groups.page import PageGroup
 
 logger = get_logger()
 
@@ -100,29 +98,25 @@ Schema
 ```
 """
 
-    def chunk_page_markdown(
-        self, pages: List[PageGroup], page_markdown: List[str]
-    ) -> List[tuple]:
+    def chunk_page_markdown(self, page_markdown: List[str]) -> List[str]:
         """
         Chunk the page markdown into smaller pieces for processing.
         """
-        if len(pages) == 0:
-            return []
 
         chunks = []
-        for i in range(0, len(...), self.extraction_page_chunk_size):
+        for i in range(0, len(page_markdown), self.extraction_page_chunk_size):
             chunk = page_markdown[i : i + self.extraction_page_chunk_size]
-            chunks.append(...)
+            chunks.append("\n\n".join(chunk))
 
         return chunks
 
     def inference_single_chunk(
-        self, ...
+        self, page_markdown: str
     ) -> Optional[PageExtractionSchema]:
         prompt = self.page_extraction_prompt.replace(
             "{{page_md}}", page_markdown
         ).replace("{{schema}}", json.dumps(self.page_schema))
-        response = self.llm_service(prompt, None, ...)
+        response = self.llm_service(prompt, None, None, PageExtractionSchema)
         logger.debug(f"Page extraction response: {response}")
 
         if not response or any(
@@ -134,7 +128,6 @@ Schema
             ]
         ]
         ):
-            page.update_metadata(llm_error_count=1)
             return None
 
         return PageExtractionSchema(
@@ -144,20 +137,15 @@ Schema
 
     def __call__(
         self,
-        document: Document,
-        pages: List[PageGroup],
         page_markdown: List[str],
         **kwargs,
     ) -> List[PageExtractionSchema]:
-        assert len(page_markdown) == len(pages), (
-            f"Mismatch in page markdown and pages length: {len(page_markdown)} vs {len(pages)}"
-        )
         if not self.page_schema:
             raise ValueError(
                 "Page schema must be defined for structured extraction to work."
             )
 
-        chunks = self.chunk_page_markdown(pages, page_markdown)
+        chunks = self.chunk_page_markdown(page_markdown)
         results = []
         pbar = tqdm(
             desc="Running page extraction",
@@ -167,8 +155,7 @@ Schema
 
         with ThreadPoolExecutor(max_workers=self.max_concurrency) as executor:
             for future in [
-                executor.submit(self.inference_single_chunk, chunk, ...)
-                for chunk in chunks
+                executor.submit(self.inference_single_chunk, chunk) for chunk in chunks
            ]:
                results.append(future.result())  # Raise exceptions if any occurred
                pbar.update(1)
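The reworked `chunk_page_markdown` operates on markdown strings alone: it joins `extraction_page_chunk_size` consecutive pages into one prompt-sized string, and each chunk becomes a single call to `inference_single_chunk`. A standalone sketch of the same loop (sample pages invented):

```python
page_markdown = ["page 1 md", "page 2 md", "page 3 md", "page 4 md", "page 5 md"]
chunk_size = 2  # stands in for self.extraction_page_chunk_size

chunks = []
for i in range(0, len(page_markdown), chunk_size):
    chunks.append("\n\n".join(page_markdown[i : i + chunk_size]))

print(len(chunks))  # 3 chunks -> 3 LLM calls instead of 5
```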
marker/extractors/util.py DELETED

@@ -1,213 +0,0 @@
-from typing import Any, Type, Union, Optional
-from pydantic import BaseModel, Field, create_model, validator
-from enum import Enum
-import re
-from datetime import datetime
-from uuid import UUID
-
-
-def json_schema_to_base_model(
-    schema: dict[str, Any], model_name: str = None
-) -> Type[BaseModel]:
-    """Convert a JSON schema to a Pydantic BaseModel dynamically."""
-
-    # Enhanced type mapping with format support
-    def get_type_from_schema(field_props: dict[str, Any]) -> type:
-        json_type = field_props.get("type", "string")
-        format_type = field_props.get("format")
-
-        # Handle format-specific types
-        if json_type == "string":
-            if format_type == "date-time":
-                return datetime
-            elif format_type == "uuid":
-                return UUID
-            else:
-                return str
-        elif json_type == "integer":
-            return int
-        elif json_type == "number":
-            return float
-        elif json_type == "boolean":
-            return bool
-        elif json_type == "array":
-            return list
-        elif json_type == "object":
-            return dict
-        else:
-            return str  # fallback
-
-    def handle_union_types(field_props: dict[str, Any]) -> type:
-        """Handle anyOf, oneOf, and type arrays."""
-        any_of = field_props.get("anyOf", [])
-        one_of = field_props.get("oneOf", [])
-        type_list = field_props.get("type", [])
-
-        if any_of:
-            types = [get_type_from_schema(schema) for schema in any_of]
-            return Union[tuple(types)]
-        elif one_of:
-            types = [get_type_from_schema(schema) for schema in one_of]
-            return Union[tuple(types)]
-        elif isinstance(type_list, list):
-            types = [get_type_from_schema({"type": t}) for t in type_list]
-            return Union[tuple(types)]
-
-        return None
-
-    def create_validator_from_constraints(field_name: str, field_props: dict[str, Any]):
-        """Create Pydantic validators from JSON schema constraints."""
-        validators = {}
-
-        # String constraints
-        if "minLength" in field_props:
-            min_len = field_props["minLength"]
-
-            def min_length_validator(cls, v):
-                if isinstance(v, str) and len(v) < min_len:
-                    raise ValueError(
-                        f"{field_name} must be at least {min_len} characters"
-                    )
-                return v
-
-            validators[f"{field_name}_min_length"] = validator(
-                field_name, allow_reuse=True
-            )(min_length_validator)
-
-        if "maxLength" in field_props:
-            max_len = field_props["maxLength"]
-
-            def max_length_validator(cls, v):
-                if isinstance(v, str) and len(v) > max_len:
-                    raise ValueError(
-                        f"{field_name} must be at most {max_len} characters"
-                    )
-                return v
-
-            validators[f"{field_name}_max_length"] = validator(
-                field_name, allow_reuse=True
-            )(max_length_validator)
-
-        if "pattern" in field_props:
-            pattern = field_props["pattern"]
-
-            def pattern_validator(cls, v):
-                if isinstance(v, str) and not re.match(pattern, v):
-                    raise ValueError(f"{field_name} must match pattern {pattern}")
-                return v
-
-            validators[f"{field_name}_pattern"] = validator(
-                field_name, allow_reuse=True
-            )(pattern_validator)
-
-        # Numeric constraints
-        if "minimum" in field_props:
-            min_val = field_props["minimum"]
-
-            def min_validator(cls, v):
-                if isinstance(v, (int, float)) and v < min_val:
-                    raise ValueError(f"{field_name} must be at least {min_val}")
-                return v
-
-            validators[f"{field_name}_minimum"] = validator(
-                field_name, allow_reuse=True
-            )(min_validator)
-
-        if "maximum" in field_props:
-            max_val = field_props["maximum"]
-
-            def max_validator(cls, v):
-                if isinstance(v, (int, float)) and v > max_val:
-                    raise ValueError(f"{field_name} must be at most {max_val}")
-                return v
-
-            validators[f"{field_name}_maximum"] = validator(
-                field_name, allow_reuse=True
-            )(max_validator)
-
-        return validators
-
-    def process_field(field_name: str, field_props: dict[str, Any]) -> tuple:
-        """Process a single field from the schema."""
-
-        # Handle const values
-        if "const" in field_props:
-            const_value = field_props["const"]
-            return type(const_value), Field(default=const_value, const=True)
-
-        # Handle enums
-        enum_values = field_props.get("enum")
-        if enum_values:
-            enum_name = f"{field_name.capitalize()}Enum"
-            field_type = Enum(enum_name, {str(v): v for v in enum_values})
-
-        # Handle union types (anyOf, oneOf, type arrays)
-        elif union_type := handle_union_types(field_props):
-            field_type = union_type
-
-        # Handle nested objects
-        elif field_props.get("type") == "object" and "properties" in field_props:
-            nested_model_name = f"{field_name.capitalize()}Model"
-            field_type = json_schema_to_base_model(field_props, nested_model_name)
-
-        # Handle arrays
-        elif field_props.get("type") == "array" and "items" in field_props:
-            item_props = field_props["items"]
-
-            # Handle array of objects
-            if item_props.get("type") == "object" and "properties" in item_props:
-                item_model_name = f"{field_name.capitalize()}ItemModel"
-                item_type = json_schema_to_base_model(item_props, item_model_name)
-            else:
-                item_type = get_type_from_schema(item_props)
-
-            field_type = list[item_type]
-
-        # Handle primitive types
-        else:
-            field_type = get_type_from_schema(field_props)
-
-        # Handle nullable
-        if field_props.get("nullable", False):
-            field_type = Optional[field_type]
-
-        # Determine default value
-        if "default" in field_props:
-            default_value = field_props["default"]
-        elif field_name not in schema.get("required", []):
-            default_value = None
-            if not field_props.get("nullable", False):
-                field_type = Optional[field_type]
-        else:
-            default_value = ...
-
-        # Create field with metadata
-        field_info = Field(
-            default=default_value,
-            description=field_props.get("description", field_props.get("title", "")),
-            title=field_props.get("title"),
-            examples=field_props.get("examples"),
-        )
-
-        return field_type, field_info
-
-    # Process schema
-    properties = schema.get("properties", {})
-    model_fields = {}
-    validators = {}
-
-    # Process each field
-    for field_name, field_props in properties.items():
-        model_fields[field_name] = process_field(field_name, field_props)
-
-        # Add validators for constraints
-        field_validators = create_validator_from_constraints(field_name, field_props)
-        validators.update(field_validators)
-
-    # Create the model
-    model_name = model_name or schema.get("title", "DynamicModel")
-
-    # Create model with validators
-    model_class = create_model(model_name, **model_fields, __validators__=validators)
-
-    return model_class
marker/renderers/extraction.py CHANGED

@@ -7,11 +7,16 @@ from marker.renderers import BaseRenderer
 class ExtractionOutput(BaseModel):
     analysis: str
     document_json: str
+    original_markdown: str
 
 
 class ExtractionRenderer(BaseRenderer):
-    def __call__(self, output: DocumentExtractionSchema) -> ExtractionOutput:
+    def __call__(
+        self, output: DocumentExtractionSchema, markdown: str
+    ) -> ExtractionOutput:
         # We definitely want to do more complex stuff here soon, so leave it in
         return ExtractionOutput(
-            analysis=output.analysis, document_json=output.document_json
+            analysis=output.analysis,
+            document_json=output.document_json,
+            original_markdown=markdown,
         )
marker/scripts/extraction_app.py CHANGED

@@ -26,9 +26,12 @@ import streamlit as st
 from marker.config.parser import ConfigParser
 
 
-def extract_data(fname: str, config: dict, schema: str) -> (str, Dict[str, Any], dict):
+def extract_data(
+    fname: str, config: dict, schema: str, markdown: str | None = None
+) -> (str, Dict[str, Any], dict):
     config["pdftext_workers"] = 1
     config["page_schema"] = schema
+    config["existing_markdown"] = markdown
     config_parser = ConfigParser(config)
     config_dict = config_parser.generate_config_dict()
 
@@ -62,12 +65,35 @@ in_file: UploadedFile = st.sidebar.file_uploader(
     type=["pdf", "png", "jpg", "jpeg", "gif", "pptx", "docx", "xlsx", "html", "epub"],
 )
 
-if in_file is None:
-    st.stop()
-
+# Initialize session state variables
 if "rendered_pydantic_schema" not in st.session_state:
     st.session_state.rendered_pydantic_schema = ""
 
+if "markdown" not in st.session_state:
+    st.session_state.markdown = ""
+
+if "current_file_id" not in st.session_state:
+    st.session_state.current_file_id = None
+
+# Detect file changes and clear markdown when new file is uploaded
+if in_file is not None:
+    # Create a unique identifier for the current file
+    current_file_id = f"{in_file.name}_{in_file.size}_{hash(in_file.getvalue())}"
+
+    # Check if this is a new file
+    if st.session_state.current_file_id != current_file_id:
+        st.session_state.current_file_id = current_file_id
+        st.session_state.markdown = ""  # Clear markdown for new file
+else:
+    # No file uploaded, clear the current file ID
+    if st.session_state.current_file_id is not None:
+        st.session_state.current_file_id = None
+        st.session_state.markdown = ""  # Clear markdown when no file
+        st.session_state.rendered_pydantic_schema = ""
+
+if in_file is None:
+    st.stop()
+
 filetype = in_file.type
 
 with col1:
@@ -196,11 +222,14 @@ if run_marker:
     )
 
     try:
-        rendered = extract_data(temp_pdf, cli_options, schema)
+        rendered = extract_data(
+            temp_pdf, cli_options, schema, st.session_state.markdown
+        )
 
         with col2:
             st.write("## Output JSON")
-            st.json(rendered.model_dump())
+            st.json(rendered.model_dump(exclude=["original_markdown"]))
+            st.session_state.markdown = rendered.original_markdown
 
     except Exception as e:
         st.error(f"❌ Extraction failed: {e}")
marker/services/__init__.py CHANGED

@@ -37,7 +37,7 @@ class BaseService:
         self,
         prompt: str,
         image: PIL.Image.Image | List[PIL.Image.Image] | None,
-        block: Block,
+        block: Block | None,
         response_schema: type[BaseModel],
         max_retries: int | None = None,
         timeout: int | None = None,
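Relaxing `block` to `Block | None` lets the extractors call a service without a block to attach metadata to; the per-service guards in the diffs below simply skip the token accounting in that case. A hedged sketch of such a call, mirroring the extractor call sites in this commit (the configured service instance and imported schema are assumed to exist):

```python
# llm_service: any configured BaseService subclass (e.g. a Gemini or OpenAI service)
response = llm_service(
    prompt,                # fully rendered prompt string
    None,                  # no image input
    None,                  # no Block -> services skip block.update_metadata(...)
    PageExtractionSchema,  # pydantic response schema
)
```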
marker/services/claude.py CHANGED

@@ -74,7 +74,7 @@ class ClaudeService(BaseService):
         self,
         prompt: str,
         image: PIL.Image.Image | List[PIL.Image.Image] | None,
-        block: Block,
+        block: Block | None,
         response_schema: type[BaseModel],
         max_retries: int | None = None,
         timeout: int | None = None,
marker/services/gemini.py CHANGED

@@ -41,7 +41,7 @@ class BaseGeminiService(BaseService):
         self,
         prompt: str,
         image: PIL.Image.Image | List[PIL.Image.Image] | None,
-        block: Block,
+        block: Block | None,
         response_schema: type[BaseModel],
         max_retries: int | None = None,
         timeout: int | None = None,
@@ -72,7 +72,10 @@ class BaseGeminiService(BaseService):
                 )
                 output = responses.candidates[0].content.parts[0].text
                 total_tokens = responses.usage_metadata.total_token_count
-                block.update_metadata(llm_tokens_used=total_tokens, llm_request_count=1)
+                if block:
+                    block.update_metadata(
+                        llm_tokens_used=total_tokens, llm_request_count=1
+                    )
                 return json.loads(output)
             except APIError as e:
                 if e.code in [429, 443, 503]:
marker/services/ollama.py CHANGED

@@ -35,7 +35,7 @@ class OllamaService(BaseService):
         self,
         prompt: str,
         image: PIL.Image.Image | List[PIL.Image.Image] | None,
-        block: Block,
+        block: Block | None,
         response_schema: type[BaseModel],
         max_retries: int | None = None,
         timeout: int | None = None,
@@ -68,7 +68,9 @@ class OllamaService(BaseService):
             total_tokens = (
                 response_data["prompt_eval_count"] + response_data["eval_count"]
             )
-            block.update_metadata(llm_request_count=1, llm_tokens_used=total_tokens)
+
+            if block:
+                block.update_metadata(llm_request_count=1, llm_tokens_used=total_tokens)
 
             data = response_data["response"]
             return json.loads(data)
marker/services/openai.py CHANGED

@@ -78,7 +78,7 @@ class OpenAIService(BaseService):
         self,
         prompt: str,
         image: PIL.Image.Image | List[PIL.Image.Image] | None,
-        block: Block,
+        block: Block | None,
         response_schema: type[BaseModel],
         max_retries: int | None = None,
         timeout: int | None = None,
@@ -117,7 +117,10 @@ class OpenAIService(BaseService):
                 )
                 response_text = response.choices[0].message.content
                 total_tokens = response.usage.total_tokens
-                block.update_metadata(llm_tokens_used=total_tokens, llm_request_count=1)
+                if block:
+                    block.update_metadata(
+                        llm_tokens_used=total_tokens, llm_request_count=1
+                    )
                 return json.loads(response_text)
             except (APITimeoutError, RateLimitError) as e:
                 # Rate limit exceeded
tests/extractors/test_basemodel_gen.py DELETED

@@ -1,44 +0,0 @@
-from marker.extractors.util import json_schema_to_base_model
-
-
-def test_model_generator():
-    test_schema = {
-        "title": "UserModel",
-        "type": "object",
-        "properties": {
-            "email": {
-                "type": "string",
-                "format": "email",
-                "description": "User's email address",
-            },
-            "age": {"type": "integer", "minimum": 0, "maximum": 150},
-            "name": {"type": "string", "minLength": 1, "maxLength": 100},
-            "status": {"anyOf": [{"type": "string"}, {"type": "null"}]},
-            "tags": {"type": "array", "items": {"type": "string"}},
-            "preferences": {
-                "type": "object",
-                "properties": {
-                    "theme": {"type": "string", "enum": ["dark", "light"]},
-                    "notifications": {"type": "boolean", "default": True},
-                },
-            },
-            "role": {
-                "type": "string",
-                "enum": ["admin", "user", "guest"],
-                "default": "user",
-            },
-        },
-        "required": ["email", "name"],
-    }
-
-    # Create the model
-    UserModel = json_schema_to_base_model(test_schema)
-    user = UserModel(
-        email="test@example.com",
-        name="John Doe",
-        age=30,
-        tags=["python", "pydantic"],
-        preferences={"theme": "dark"},
-        role="admin",
-    )
-    assert user is not None