Vik Paruchuri
committed on
Commit
·
94b8583
1
Parent(s):
5783857
Improve structured extraction alpha
Browse files- README.md +27 -1
- marker/converters/extraction.py +24 -10
- marker/extractors/__init__.py +13 -2
- marker/extractors/page.py +26 -10
- marker/renderers/extraction.py +59 -1
- marker/scripts/common.py +3 -1
- marker/scripts/extraction_app.py +10 -12
README.md
CHANGED
|
@@ -86,7 +86,7 @@ First, some configuration:
|
|
| 86 |
I've included a streamlit app that lets you interactively try marker with some basic options. Run it with:
|
| 87 |
|
| 88 |
```shell
|
| 89 |
-
pip install streamlit
|
| 90 |
marker_gui
|
| 91 |
```
|
| 92 |
|
|
@@ -249,6 +249,32 @@ You can also run this via the CLI with
|
|
| 249 |
marker_single FILENAME --converter_cls marker.converters.ocr.OCRConverter
|
| 250 |
```
|
| 251 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 252 |
# Output Formats
|
| 253 |
|
| 254 |
## Markdown
|
|
|
|
| 86 |
I've included a streamlit app that lets you interactively try marker with some basic options. Run it with:
|
| 87 |
|
| 88 |
```shell
|
| 89 |
+
pip install streamlit streamlit-ace
|
| 90 |
marker_gui
|
| 91 |
```
|
| 92 |
|
|
|
|
| 249 |
marker_single FILENAME --converter_cls marker.converters.ocr.OCRConverter
|
| 250 |
```
|
| 251 |
|
| 252 |
+
### Structured Extraction (alpha)
|
| 253 |
+
|
| 254 |
+
You can run structured extraction via the `ExtractionConverter`. This requires an LLM service to be set up first (see [here](#llm-services) for details). You'll get JSON output with the extracted values.
|
| 255 |
+
|
| 256 |
+
```python
|
| 257 |
+
from marker.converters.extraction import ExtractionConverter
|
| 258 |
+
from marker.models import create_model_dict
|
| 259 |
+
from marker.config.parser import ConfigParser
|
| 260 |
+
from pydantic import BaseModel
|
| 261 |
+
|
| 262 |
+
class Links(BaseModel):
|
| 263 |
+
links: list[str]
|
| 264 |
+
|
| 265 |
+
schema = Links.model_json_schema()
|
| 266 |
+
config_parser = ConfigParser({
|
| 267 |
+
"page_schema": schema
|
| 268 |
+
})
|
| 269 |
+
|
| 270 |
+
converter = ExtractionConverter(
|
| 271 |
+
artifact_dict=create_model_dict(),
|
| 272 |
+
config=config_parser.generate_config_dict(),
|
| 273 |
+
llm_service=config_parser.get_llm_service(),
|
| 274 |
+
)
|
| 275 |
+
rendered = converter("FILEPATH")
|
| 276 |
+
```
|
| 277 |
+
|
| 278 |
# Output Formats
|
| 279 |
|
| 280 |
## Markdown
|
marker/converters/extraction.py
CHANGED
|
@@ -1,4 +1,3 @@
|
|
| 1 |
-
import json
|
| 2 |
import re
|
| 3 |
|
| 4 |
from marker.builders.document import DocumentBuilder
|
|
@@ -6,12 +5,16 @@ from marker.builders.line import LineBuilder
|
|
| 6 |
from marker.builders.ocr import OcrBuilder
|
| 7 |
from marker.builders.structure import StructureBuilder
|
| 8 |
from marker.converters.pdf import PdfConverter
|
| 9 |
-
from marker.extractors.page import PageExtractor
|
| 10 |
from marker.providers.registry import provider_from_filepath
|
| 11 |
|
| 12 |
-
from marker.renderers.extraction import
|
| 13 |
from marker.renderers.markdown import MarkdownRenderer
|
| 14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
class ExtractionConverter(PdfConverter):
|
| 17 |
pattern: str = r"{\d+\}-{48}\n\n"
|
|
@@ -33,11 +36,19 @@ class ExtractionConverter(PdfConverter):
|
|
| 33 |
|
| 34 |
return document, provider
|
| 35 |
|
| 36 |
-
def __call__(self, filepath: str):
|
| 37 |
self.config["paginate_output"] = True # Ensure we can split the output properly
|
| 38 |
self.config["output_format"] = (
|
| 39 |
"markdown" # Output must be markdown for extraction
|
| 40 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
document, provider = self.build_document(filepath)
|
| 42 |
renderer = self.resolve_dependencies(MarkdownRenderer)
|
| 43 |
output = renderer(document)
|
|
@@ -53,10 +64,13 @@ class ExtractionConverter(PdfConverter):
|
|
| 53 |
)
|
| 54 |
|
| 55 |
extractor = self.resolve_dependencies(PageExtractor)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
extracted_model = extractor(document, page, page_md.strip())
|
| 60 |
-
extracted_json = extracted_model.model_dump_json()
|
| 61 |
-
all_json.append(extracted_json)
|
| 62 |
-
return ExtractionOutput(json=json.dumps(all_json, indent=4, ensure_ascii=False))
|
|
|
|
|
|
|
| 1 |
import re
|
| 2 |
|
| 3 |
from marker.builders.document import DocumentBuilder
|
|
|
|
| 5 |
from marker.builders.ocr import OcrBuilder
|
| 6 |
from marker.builders.structure import StructureBuilder
|
| 7 |
from marker.converters.pdf import PdfConverter
|
| 8 |
+
from marker.extractors.page import PageExtractor, json_schema_to_base_model
|
| 9 |
from marker.providers.registry import provider_from_filepath
|
| 10 |
|
| 11 |
+
from marker.renderers.extraction import ExtractionMerger
|
| 12 |
from marker.renderers.markdown import MarkdownRenderer
|
| 13 |
|
| 14 |
+
from marker.logger import get_logger
|
| 15 |
+
|
| 16 |
+
logger = get_logger()
|
| 17 |
+
|
| 18 |
|
| 19 |
class ExtractionConverter(PdfConverter):
|
| 20 |
pattern: str = r"{\d+\}-{48}\n\n"
|
|
|
|
| 36 |
|
| 37 |
return document, provider
|
| 38 |
|
| 39 |
+
def __call__(self, filepath: str) -> str:
|
| 40 |
self.config["paginate_output"] = True # Ensure we can split the output properly
|
| 41 |
self.config["output_format"] = (
|
| 42 |
"markdown" # Output must be markdown for extraction
|
| 43 |
)
|
| 44 |
+
try:
|
| 45 |
+
json_schema_to_base_model(self.config["page_schema"])
|
| 46 |
+
except Exception as e:
|
| 47 |
+
logger.error(f"Could not parse page schema: {e}")
|
| 48 |
+
raise ValueError(
|
| 49 |
+
"Could not parse your page schema. Please check the schema format."
|
| 50 |
+
)
|
| 51 |
+
|
| 52 |
document, provider = self.build_document(filepath)
|
| 53 |
renderer = self.resolve_dependencies(MarkdownRenderer)
|
| 54 |
output = renderer(document)
|
|
|
|
| 64 |
)
|
| 65 |
|
| 66 |
extractor = self.resolve_dependencies(PageExtractor)
|
| 67 |
+
merger = self.resolve_dependencies(ExtractionMerger)
|
| 68 |
+
|
| 69 |
+
pnums = provider.page_range
|
| 70 |
+
all_json = {}
|
| 71 |
+
for page, page_md, pnum in zip(document.pages, output_pages, pnums):
|
| 72 |
+
extracted_json = extractor(document, page, page_md.strip())
|
| 73 |
+
all_json[pnum] = extracted_json
|
| 74 |
|
| 75 |
+
merged = merger(all_json)
|
| 76 |
+
return merged
|
|
|
|
|
|
|
|
|
|
|
|
marker/extractors/__init__.py
CHANGED
|
@@ -1,4 +1,7 @@
|
|
| 1 |
-
from typing import Annotated, Sequence
|
|
|
|
|
|
|
|
|
|
| 2 |
from marker.schema import BlockTypes
|
| 3 |
from marker.schema.document import Document
|
| 4 |
from marker.schema.groups import PageGroup
|
|
@@ -8,6 +11,12 @@ from marker.services import BaseService
|
|
| 8 |
from marker.util import assign_config
|
| 9 |
|
| 10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
class BaseExtractor:
|
| 12 |
"""
|
| 13 |
An extractor that uses a provided service to extract structured data from documents.
|
|
@@ -38,5 +47,7 @@ class BaseExtractor:
|
|
| 38 |
remove_blocks=remove_blocks,
|
| 39 |
)
|
| 40 |
|
| 41 |
-
def __call__(
|
|
|
|
|
|
|
| 42 |
raise NotImplementedError
|
|
|
|
| 1 |
+
from typing import Annotated, Sequence, Optional
|
| 2 |
+
|
| 3 |
+
from pydantic import BaseModel
|
| 4 |
+
|
| 5 |
from marker.schema import BlockTypes
|
| 6 |
from marker.schema.document import Document
|
| 7 |
from marker.schema.groups import PageGroup
|
|
|
|
| 11 |
from marker.util import assign_config
|
| 12 |
|
| 13 |
|
| 14 |
+
class ExtractionResult(BaseModel):
|
| 15 |
+
extracted_data: dict | list
|
| 16 |
+
value_confidence: int
|
| 17 |
+
existence_confidence: int
|
| 18 |
+
|
| 19 |
+
|
| 20 |
class BaseExtractor:
|
| 21 |
"""
|
| 22 |
An extractor that uses a provided service to extract structured data from documents.
|
|
|
|
| 47 |
remove_blocks=remove_blocks,
|
| 48 |
)
|
| 49 |
|
| 50 |
+
def __call__(
|
| 51 |
+
self, document: Document, *args, **kwargs
|
| 52 |
+
) -> Optional[ExtractionResult]:
|
| 53 |
raise NotImplementedError
|
marker/extractors/page.py
CHANGED
|
@@ -4,7 +4,7 @@ from pydantic import create_model, BaseModel, Field, ValidationError
|
|
| 4 |
from typing import Annotated, Type, Optional, Any, Dict
|
| 5 |
from enum import Enum
|
| 6 |
|
| 7 |
-
from marker.extractors import BaseExtractor
|
| 8 |
from marker.schema.document import Document
|
| 9 |
from marker.schema.groups.page import PageGroup
|
| 10 |
|
|
@@ -118,7 +118,8 @@ Some guidelines:
|
|
| 118 |
3. Analyze the JSON schema.
|
| 119 |
4. Write a short description of the fields in the schema, and the associated values in the image.
|
| 120 |
5. Extract the data in the schema that can be found in the image and output the data in JSON format.
|
| 121 |
-
6. Output
|
|
|
|
| 122 |
|
| 123 |
**Example:**
|
| 124 |
Input:
|
|
@@ -158,7 +159,8 @@ Description: The schema has a list of cars, each with a make, sales, and color.
|
|
| 158 |
}
|
| 159 |
```
|
| 160 |
|
| 161 |
-
|
|
|
|
| 162 |
|
| 163 |
**Input:**
|
| 164 |
|
|
@@ -175,7 +177,7 @@ Schema
|
|
| 175 |
|
| 176 |
def __call__(
|
| 177 |
self, document: Document, page: PageGroup, page_markdown: str, **kwargs
|
| 178 |
-
) -> Optional[
|
| 179 |
page_image = self.extract_image(document, page)
|
| 180 |
if not self.page_schema:
|
| 181 |
raise ValueError(
|
|
@@ -189,23 +191,37 @@ Schema
|
|
| 189 |
).replace("{schema}", json.dumps(optional_schema))
|
| 190 |
response = self.llm_service(prompt, page_image, page, PageExtractionSchema)
|
| 191 |
|
| 192 |
-
if not response or
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 193 |
page.update_metadata(llm_error_count=1)
|
| 194 |
-
return
|
| 195 |
|
| 196 |
extracted_json = response["extracted_json"]
|
| 197 |
|
| 198 |
OptionalPageModel = json_schema_to_base_model(optional_schema)
|
| 199 |
try:
|
| 200 |
-
|
| 201 |
except ValidationError as e:
|
| 202 |
print(f"Validation error with extracted data: {e}")
|
| 203 |
-
return
|
| 204 |
|
| 205 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
|
| 207 |
|
| 208 |
class PageExtractionSchema(BaseModel):
|
| 209 |
description: str
|
| 210 |
extracted_json: str
|
| 211 |
-
|
|
|
|
|
|
| 4 |
from typing import Annotated, Type, Optional, Any, Dict
|
| 5 |
from enum import Enum
|
| 6 |
|
| 7 |
+
from marker.extractors import BaseExtractor, ExtractionResult
|
| 8 |
from marker.schema.document import Document
|
| 9 |
from marker.schema.groups.page import PageGroup
|
| 10 |
|
|
|
|
| 118 |
3. Analyze the JSON schema.
|
| 119 |
4. Write a short description of the fields in the schema, and the associated values in the image.
|
| 120 |
5. Extract the data in the schema that can be found in the image and output the data in JSON format.
|
| 121 |
+
6. Output an existence confidence score 1 to 5, where 1 is very low confidence that the values exist on the page, and 5 is very high confidence that the values exist on the page.
|
| 122 |
+
7. Output a value confidence score from 1 to 5, where 1 is very low confidence that the values are correct, and 5 is very high confidence that the values are correct.
|
| 123 |
|
| 124 |
**Example:**
|
| 125 |
Input:
|
|
|
|
| 159 |
}
|
| 160 |
```
|
| 161 |
|
| 162 |
+
Existence confidence: 5
|
| 163 |
+
Value confidence: 5
|
| 164 |
|
| 165 |
**Input:**
|
| 166 |
|
|
|
|
| 177 |
|
| 178 |
def __call__(
|
| 179 |
self, document: Document, page: PageGroup, page_markdown: str, **kwargs
|
| 180 |
+
) -> Optional[ExtractionResult]:
|
| 181 |
page_image = self.extract_image(document, page)
|
| 182 |
if not self.page_schema:
|
| 183 |
raise ValueError(
|
|
|
|
| 191 |
).replace("{schema}", json.dumps(optional_schema))
|
| 192 |
response = self.llm_service(prompt, page_image, page, PageExtractionSchema)
|
| 193 |
|
| 194 |
+
if not response or any(
|
| 195 |
+
[
|
| 196 |
+
key not in response
|
| 197 |
+
for key in [
|
| 198 |
+
"extracted_json",
|
| 199 |
+
"existence_confidence",
|
| 200 |
+
"value_confidence",
|
| 201 |
+
]
|
| 202 |
+
]
|
| 203 |
+
):
|
| 204 |
page.update_metadata(llm_error_count=1)
|
| 205 |
+
return None
|
| 206 |
|
| 207 |
extracted_json = response["extracted_json"]
|
| 208 |
|
| 209 |
OptionalPageModel = json_schema_to_base_model(optional_schema)
|
| 210 |
try:
|
| 211 |
+
OptionalPageModel.model_validate_json(extracted_json)
|
| 212 |
except ValidationError as e:
|
| 213 |
print(f"Validation error with extracted data: {e}")
|
| 214 |
+
return None
|
| 215 |
|
| 216 |
+
return ExtractionResult(
|
| 217 |
+
extracted_data=json.loads(extracted_json),
|
| 218 |
+
existence_confidence=response["existence_confidence"],
|
| 219 |
+
value_confidence=response["value_confidence"],
|
| 220 |
+
)
|
| 221 |
|
| 222 |
|
| 223 |
class PageExtractionSchema(BaseModel):
|
| 224 |
description: str
|
| 225 |
extracted_json: str
|
| 226 |
+
existence_confidence: int
|
| 227 |
+
value_confidence: int
|
marker/renderers/extraction.py
CHANGED
|
@@ -1,5 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from pydantic import BaseModel
|
| 2 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
class ExtractionOutput(BaseModel):
|
| 5 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dataclasses import dataclass
|
| 2 |
+
from typing import Dict
|
| 3 |
+
|
| 4 |
from pydantic import BaseModel
|
| 5 |
|
| 6 |
+
from marker.extractors import ExtractionResult
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
@dataclass
|
| 10 |
+
class MergeData:
|
| 11 |
+
confidence_exists_1: float
|
| 12 |
+
confidence_exists_2: float
|
| 13 |
+
confidence_value_1: float
|
| 14 |
+
confidence_value_2: float
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def merge_keys(json: dict | list, json2: dict, merge_data: MergeData):
|
| 18 |
+
if isinstance(json, list):
|
| 19 |
+
json.extend(json2)
|
| 20 |
+
|
| 21 |
+
elif isinstance(json, dict):
|
| 22 |
+
for key in json:
|
| 23 |
+
if isinstance(json[key], dict):
|
| 24 |
+
merge_keys(json[key], json2[key], merge_data)
|
| 25 |
+
elif isinstance(json[key], list):
|
| 26 |
+
json[key] = json[key] + json2[key]
|
| 27 |
+
else:
|
| 28 |
+
if (
|
| 29 |
+
merge_data.confidence_exists_2 > 3
|
| 30 |
+
and merge_data.confidence_value_2 > 3
|
| 31 |
+
and json2[key]
|
| 32 |
+
):
|
| 33 |
+
json[key] = json2[key]
|
| 34 |
+
|
| 35 |
+
if not json[key] and json2[key]:
|
| 36 |
+
json[key] = json2[key]
|
| 37 |
+
|
| 38 |
|
| 39 |
class ExtractionOutput(BaseModel):
    """Final result of a structured extraction run."""

    # Per-page extraction results, keyed by page number.
    pages: Dict[int, ExtractionResult]
    # Merged data across all pages. This may be a list as well as a dict,
    # because ExtractionResult.extracted_data allows both.
    json: dict | list
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
class ExtractionMerger:
    """Combine per-page extraction results into a single ``ExtractionOutput``."""

    def __init__(self):
        pass

    def __call__(self, outputs: Dict[int, ExtractionResult]):
        """Merge page results in ascending page order.

        Pages whose result is ``None`` are skipped. If no page produced a
        result, an empty output is returned instead of raising.

        Args:
            outputs: Extraction result for each page, keyed by page number.

        Returns:
            An ``ExtractionOutput`` holding the usable per-page results and
            the merged data.
        """
        from copy import deepcopy

        usable = {
            pnum: result for pnum, result in outputs.items() if result is not None
        }
        if not usable:
            return ExtractionOutput(pages={}, json={})

        pnums = sorted(usable)
        first = usable[pnums[0]]

        # Deep copies keep merge_keys from mutating the per-page results,
        # which are returned untouched in ``pages``.
        merged_result = deepcopy(first.extracted_data)
        confidence_exists = first.existence_confidence
        confidence_value = first.value_confidence

        for pnum in pnums[1:]:
            page_result = usable[pnum]
            merge_data = MergeData(
                confidence_exists_1=confidence_exists,
                confidence_exists_2=page_result.existence_confidence,
                confidence_value_1=confidence_value,
                confidence_value_2=page_result.value_confidence,
            )
            merge_keys(merged_result, deepcopy(page_result.extracted_data), merge_data)

        return ExtractionOutput(pages=usable, json=merged_result)
|
marker/scripts/common.py
CHANGED
|
@@ -165,8 +165,10 @@ def get_root_class(schema_code: str) -> Optional[BaseModel]:
|
|
| 165 |
return None
|
| 166 |
|
| 167 |
if "from pydantic" not in schema_code:
|
|
|
|
|
|
|
| 168 |
schema_code = (
|
| 169 |
-
"from
|
| 170 |
+ schema_code
|
| 171 |
)
|
| 172 |
|
|
|
|
| 165 |
return None
|
| 166 |
|
| 167 |
if "from pydantic" not in schema_code:
|
| 168 |
+
schema_code = "from pydantic import BaseModel\n" + schema_code
|
| 169 |
+
if "from typing" not in schema_code:
|
| 170 |
schema_code = (
|
| 171 |
+
"from typing import List, Dict, Optional, Set, Tuple, Union, Any\n\n"
|
| 172 |
+ schema_code
|
| 173 |
)
|
| 174 |
|
marker/scripts/extraction_app.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
import json
|
| 2 |
import os
|
| 3 |
|
|
|
|
| 4 |
from pydantic import BaseModel
|
| 5 |
|
| 6 |
from marker.converters.extraction import ExtractionConverter
|
|
@@ -23,7 +24,6 @@ from typing import Any, Dict
|
|
| 23 |
import streamlit as st
|
| 24 |
|
| 25 |
from marker.config.parser import ConfigParser
|
| 26 |
-
from marker.output import text_from_rendered
|
| 27 |
|
| 28 |
|
| 29 |
def extract_data(fname: str, config: dict, schema: str) -> (str, Dict[str, Any], dict):
|
|
@@ -55,9 +55,9 @@ cli_options = parse_args()
|
|
| 55 |
st.markdown("""
|
| 56 |
# Marker Extraction Demo
|
| 57 |
|
| 58 |
-
This app will let you
|
| 59 |
|
| 60 |
-
|
| 61 |
""")
|
| 62 |
|
| 63 |
in_file: UploadedFile = st.sidebar.file_uploader(
|
|
@@ -79,18 +79,18 @@ with col1:
|
|
| 79 |
st.image(pil_image, use_container_width=True)
|
| 80 |
|
| 81 |
with col2:
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
value="""
|
| 85 |
class Schema(BaseModel):
|
| 86 |
-
pass
|
| 87 |
-
""
|
| 88 |
)
|
| 89 |
|
| 90 |
run_marker = st.sidebar.button("Run Extraction")
|
| 91 |
|
| 92 |
use_llm = st.sidebar.checkbox(
|
| 93 |
-
"Use LLM", help="Use LLM for higher quality
|
| 94 |
)
|
| 95 |
force_ocr = st.sidebar.checkbox("Force OCR", help="Force OCR on all pages", value=False)
|
| 96 |
strip_existing_ocr = st.sidebar.checkbox(
|
|
@@ -123,8 +123,6 @@ with tempfile.TemporaryDirectory() as tmp_dir:
|
|
| 123 |
)
|
| 124 |
rendered = extract_data(temp_pdf, cli_options, schema)
|
| 125 |
|
| 126 |
-
text, ext, images = text_from_rendered(rendered)
|
| 127 |
-
|
| 128 |
with col2:
|
| 129 |
st.write("Output JSON")
|
| 130 |
-
st.json(
|
|
|
|
| 1 |
import json
|
| 2 |
import os
|
| 3 |
|
| 4 |
+
from streamlit_ace import st_ace
|
| 5 |
from pydantic import BaseModel
|
| 6 |
|
| 7 |
from marker.converters.extraction import ExtractionConverter
|
|
|
|
| 24 |
import streamlit as st
|
| 25 |
|
| 26 |
from marker.config.parser import ConfigParser
|
|
|
|
| 27 |
|
| 28 |
|
| 29 |
def extract_data(fname: str, config: dict, schema: str) -> (str, Dict[str, Any], dict):
|
|
|
|
| 55 |
st.markdown("""
|
| 56 |
# Marker Extraction Demo
|
| 57 |
|
| 58 |
+
This app will let you use marker to do structured extraction.
|
| 59 |
|
| 60 |
+
Warning: This can execute untrusted code entered into the schema panel.
|
| 61 |
""")
|
| 62 |
|
| 63 |
in_file: UploadedFile = st.sidebar.file_uploader(
|
|
|
|
| 79 |
st.image(pil_image, use_container_width=True)
|
| 80 |
|
| 81 |
with col2:
|
| 82 |
+
st.write("Enter pydantic schema here")
|
| 83 |
+
schema = st_ace(
|
| 84 |
+
value="""from pydantic import BaseModel
|
| 85 |
class Schema(BaseModel):
|
| 86 |
+
pass""",
|
| 87 |
+
language="python",
|
| 88 |
)
|
| 89 |
|
| 90 |
run_marker = st.sidebar.button("Run Extraction")
|
| 91 |
|
| 92 |
use_llm = st.sidebar.checkbox(
|
| 93 |
+
"Use LLM", help="Use LLM for higher quality text", value=False
|
| 94 |
)
|
| 95 |
force_ocr = st.sidebar.checkbox("Force OCR", help="Force OCR on all pages", value=False)
|
| 96 |
strip_existing_ocr = st.sidebar.checkbox(
|
|
|
|
| 123 |
)
|
| 124 |
rendered = extract_data(temp_pdf, cli_options, schema)
|
| 125 |
|
|
|
|
|
|
|
| 126 |
with col2:
|
| 127 |
st.write("Output JSON")
|
| 128 |
+
st.json(rendered.model_dump())
|