Vik Paruchuri committed on
Commit
94b8583
·
1 Parent(s): 5783857

Improve structured extraction alpha

Browse files
README.md CHANGED
@@ -86,7 +86,7 @@ First, some configuration:
86
  I've included a streamlit app that lets you interactively try marker with some basic options. Run it with:
87
 
88
  ```shell
89
- pip install streamlit
90
  marker_gui
91
  ```
92
 
@@ -249,6 +249,32 @@ You can also run this via the CLI with
249
  marker_single FILENAME --converter_cls marker.converters.ocr.OCRConverter
250
  ```
251
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
252
  # Output Formats
253
 
254
  ## Markdown
 
86
  I've included a streamlit app that lets you interactively try marker with some basic options. Run it with:
87
 
88
  ```shell
89
+ pip install streamlit streamlit-ace
90
  marker_gui
91
  ```
92
 
 
249
  marker_single FILENAME --converter_cls marker.converters.ocr.OCRConverter
250
  ```
251
 
252
+ ### Structured Extraction (alpha)
253
+
254
+ You can run structured extraction via the `ExtractionConverter`. This requires an LLM service to be set up first (see [here](#llm-services) for details). You'll get a JSON output with the extracted values.
255
+
256
+ ```python
257
+ from marker.converters.extraction import ExtractionConverter
258
+ from marker.models import create_model_dict
259
+ from marker.config.parser import ConfigParser
260
+ from pydantic import BaseModel
261
+
262
+ class Links(BaseModel):
263
+ links: list[str]
264
+
265
+ schema = Links.model_json_schema()
266
+ config_parser = ConfigParser({
267
+ "page_schema": schema
268
+ })
269
+
270
+ converter = ExtractionConverter(
271
+ artifact_dict=create_model_dict(),
272
+ config=config_parser.generate_config_dict(),
273
+ llm_service=config_parser.get_llm_service(),
274
+ )
275
+ rendered = converter("FILEPATH")
276
+ ```
277
+
278
  # Output Formats
279
 
280
  ## Markdown
marker/converters/extraction.py CHANGED
@@ -1,4 +1,3 @@
1
- import json
2
  import re
3
 
4
  from marker.builders.document import DocumentBuilder
@@ -6,12 +5,16 @@ from marker.builders.line import LineBuilder
6
  from marker.builders.ocr import OcrBuilder
7
  from marker.builders.structure import StructureBuilder
8
  from marker.converters.pdf import PdfConverter
9
- from marker.extractors.page import PageExtractor
10
  from marker.providers.registry import provider_from_filepath
11
 
12
- from marker.renderers.extraction import ExtractionOutput
13
  from marker.renderers.markdown import MarkdownRenderer
14
 
 
 
 
 
15
 
16
  class ExtractionConverter(PdfConverter):
17
  pattern: str = r"{\d+\}-{48}\n\n"
@@ -33,11 +36,19 @@ class ExtractionConverter(PdfConverter):
33
 
34
  return document, provider
35
 
36
- def __call__(self, filepath: str):
37
  self.config["paginate_output"] = True # Ensure we can split the output properly
38
  self.config["output_format"] = (
39
  "markdown" # Output must be markdown for extraction
40
  )
 
 
 
 
 
 
 
 
41
  document, provider = self.build_document(filepath)
42
  renderer = self.resolve_dependencies(MarkdownRenderer)
43
  output = renderer(document)
@@ -53,10 +64,13 @@ class ExtractionConverter(PdfConverter):
53
  )
54
 
55
  extractor = self.resolve_dependencies(PageExtractor)
 
 
 
 
 
 
 
56
 
57
- all_json = []
58
- for page, page_md in zip(document.pages, output_pages):
59
- extracted_model = extractor(document, page, page_md.strip())
60
- extracted_json = extracted_model.model_dump_json()
61
- all_json.append(extracted_json)
62
- return ExtractionOutput(json=json.dumps(all_json, indent=4, ensure_ascii=False))
 
 
1
  import re
2
 
3
  from marker.builders.document import DocumentBuilder
 
5
  from marker.builders.ocr import OcrBuilder
6
  from marker.builders.structure import StructureBuilder
7
  from marker.converters.pdf import PdfConverter
8
+ from marker.extractors.page import PageExtractor, json_schema_to_base_model
9
  from marker.providers.registry import provider_from_filepath
10
 
11
+ from marker.renderers.extraction import ExtractionMerger
12
  from marker.renderers.markdown import MarkdownRenderer
13
 
14
+ from marker.logger import get_logger
15
+
16
+ logger = get_logger()
17
+
18
 
19
  class ExtractionConverter(PdfConverter):
20
  pattern: str = r"{\d+\}-{48}\n\n"
 
36
 
37
  return document, provider
38
 
39
+ def __call__(self, filepath: str) -> str:
40
  self.config["paginate_output"] = True # Ensure we can split the output properly
41
  self.config["output_format"] = (
42
  "markdown" # Output must be markdown for extraction
43
  )
44
+ try:
45
+ json_schema_to_base_model(self.config["page_schema"])
46
+ except Exception as e:
47
+ logger.error(f"Could not parse page schema: {e}")
48
+ raise ValueError(
49
+ "Could not parse your page schema. Please check the schema format."
50
+ )
51
+
52
  document, provider = self.build_document(filepath)
53
  renderer = self.resolve_dependencies(MarkdownRenderer)
54
  output = renderer(document)
 
64
  )
65
 
66
  extractor = self.resolve_dependencies(PageExtractor)
67
+ merger = self.resolve_dependencies(ExtractionMerger)
68
+
69
+ pnums = provider.page_range
70
+ all_json = {}
71
+ for page, page_md, pnum in zip(document.pages, output_pages, pnums):
72
+ extracted_json = extractor(document, page, page_md.strip())
73
+ all_json[pnum] = extracted_json
74
 
75
+ merged = merger(all_json)
76
+ return merged
 
 
 
 
marker/extractors/__init__.py CHANGED
@@ -1,4 +1,7 @@
1
- from typing import Annotated, Sequence
 
 
 
2
  from marker.schema import BlockTypes
3
  from marker.schema.document import Document
4
  from marker.schema.groups import PageGroup
@@ -8,6 +11,12 @@ from marker.services import BaseService
8
  from marker.util import assign_config
9
 
10
 
 
 
 
 
 
 
11
  class BaseExtractor:
12
  """
13
  An extractor that uses a provided service to extract structured data from documents.
@@ -38,5 +47,7 @@ class BaseExtractor:
38
  remove_blocks=remove_blocks,
39
  )
40
 
41
- def __call__(self, document: Document, *args, **kwargs):
 
 
42
  raise NotImplementedError
 
1
+ from typing import Annotated, Sequence, Optional
2
+
3
+ from pydantic import BaseModel
4
+
5
  from marker.schema import BlockTypes
6
  from marker.schema.document import Document
7
  from marker.schema.groups import PageGroup
 
11
  from marker.util import assign_config
12
 
13
 
14
+ class ExtractionResult(BaseModel):
15
+ extracted_data: dict | list
16
+ value_confidence: int
17
+ existence_confidence: int
18
+
19
+
20
  class BaseExtractor:
21
  """
22
  An extractor that uses a provided service to extract structured data from documents.
 
47
  remove_blocks=remove_blocks,
48
  )
49
 
50
+ def __call__(
51
+ self, document: Document, *args, **kwargs
52
+ ) -> Optional[ExtractionResult]:
53
  raise NotImplementedError
marker/extractors/page.py CHANGED
@@ -4,7 +4,7 @@ from pydantic import create_model, BaseModel, Field, ValidationError
4
  from typing import Annotated, Type, Optional, Any, Dict
5
  from enum import Enum
6
 
7
- from marker.extractors import BaseExtractor
8
  from marker.schema.document import Document
9
  from marker.schema.groups.page import PageGroup
10
 
@@ -118,7 +118,8 @@ Some guidelines:
118
  3. Analyze the JSON schema.
119
  4. Write a short description of the fields in the schema, and the associated values in the image.
120
  5. Extract the data in the schema that can be found in the image and output the data in JSON format.
121
- 6. Output a confidence score from 1 to 5, where 1 is very low confidence in your extracted values, and 5 is very high confidence in your extracted values.
 
122
 
123
  **Example:**
124
  Input:
@@ -158,7 +159,8 @@ Description: The schema has a list of cars, each with a make, sales, and color.
158
  }
159
  ```
160
 
161
- Confidence: 5
 
162
 
163
  **Input:**
164
 
@@ -175,7 +177,7 @@ Schema
175
 
176
  def __call__(
177
  self, document: Document, page: PageGroup, page_markdown: str, **kwargs
178
- ) -> Optional[BaseModel]:
179
  page_image = self.extract_image(document, page)
180
  if not self.page_schema:
181
  raise ValueError(
@@ -189,23 +191,37 @@ Schema
189
  ).replace("{schema}", json.dumps(optional_schema))
190
  response = self.llm_service(prompt, page_image, page, PageExtractionSchema)
191
 
192
- if not response or "extracted_json" not in response:
 
 
 
 
 
 
 
 
 
193
  page.update_metadata(llm_error_count=1)
194
- return
195
 
196
  extracted_json = response["extracted_json"]
197
 
198
  OptionalPageModel = json_schema_to_base_model(optional_schema)
199
  try:
200
- parsed_json = OptionalPageModel.model_validate_json(extracted_json)
201
  except ValidationError as e:
202
  print(f"Validation error with extracted data: {e}")
203
- return
204
 
205
- return parsed_json
 
 
 
 
206
 
207
 
208
  class PageExtractionSchema(BaseModel):
209
  description: str
210
  extracted_json: str
211
- confidence: int
 
 
4
  from typing import Annotated, Type, Optional, Any, Dict
5
  from enum import Enum
6
 
7
+ from marker.extractors import BaseExtractor, ExtractionResult
8
  from marker.schema.document import Document
9
  from marker.schema.groups.page import PageGroup
10
 
 
118
  3. Analyze the JSON schema.
119
  4. Write a short description of the fields in the schema, and the associated values in the image.
120
  5. Extract the data in the schema that can be found in the image and output the data in JSON format.
121
+ 6. Output an existence confidence score 1 to 5, where 1 is very low confidence that the values exist on the page, and 5 is very high confidence that the values exist on the page.
122
+ 7. Output a value confidence score from 1 to 5, where 1 is very low confidence that the values are correct, and 5 is very high confidence that the values are correct.
123
 
124
  **Example:**
125
  Input:
 
159
  }
160
  ```
161
 
162
+ Existence confidence: 5
163
+ Value confidence: 5
164
 
165
  **Input:**
166
 
 
177
 
178
  def __call__(
179
  self, document: Document, page: PageGroup, page_markdown: str, **kwargs
180
+ ) -> Optional[ExtractionResult]:
181
  page_image = self.extract_image(document, page)
182
  if not self.page_schema:
183
  raise ValueError(
 
191
  ).replace("{schema}", json.dumps(optional_schema))
192
  response = self.llm_service(prompt, page_image, page, PageExtractionSchema)
193
 
194
+ if not response or any(
195
+ [
196
+ key not in response
197
+ for key in [
198
+ "extracted_json",
199
+ "existence_confidence",
200
+ "value_confidence",
201
+ ]
202
+ ]
203
+ ):
204
  page.update_metadata(llm_error_count=1)
205
+ return None
206
 
207
  extracted_json = response["extracted_json"]
208
 
209
  OptionalPageModel = json_schema_to_base_model(optional_schema)
210
  try:
211
+ OptionalPageModel.model_validate_json(extracted_json)
212
  except ValidationError as e:
213
  print(f"Validation error with extracted data: {e}")
214
+ return None
215
 
216
+ return ExtractionResult(
217
+ extracted_data=json.loads(extracted_json),
218
+ existence_confidence=response["existence_confidence"],
219
+ value_confidence=response["value_confidence"],
220
+ )
221
 
222
 
223
  class PageExtractionSchema(BaseModel):
224
  description: str
225
  extracted_json: str
226
+ existence_confidence: int
227
+ value_confidence: int
marker/renderers/extraction.py CHANGED
@@ -1,5 +1,63 @@
 
 
 
1
  from pydantic import BaseModel
2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
  class ExtractionOutput(BaseModel):
5
- json: str
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from typing import Dict
3
+
4
  from pydantic import BaseModel
5
 
6
+ from marker.extractors import ExtractionResult
7
+
8
+
9
+ @dataclass
10
+ class MergeData:
11
+ confidence_exists_1: float
12
+ confidence_exists_2: float
13
+ confidence_value_1: float
14
+ confidence_value_2: float
15
+
16
+
17
+ def merge_keys(json: dict | list, json2: dict, merge_data: MergeData):
18
+ if isinstance(json, list):
19
+ json.extend(json2)
20
+
21
+ elif isinstance(json, dict):
22
+ for key in json:
23
+ if isinstance(json[key], dict):
24
+ merge_keys(json[key], json2[key], merge_data)
25
+ elif isinstance(json[key], list):
26
+ json[key] = json[key] + json2[key]
27
+ else:
28
+ if (
29
+ merge_data.confidence_exists_2 > 3
30
+ and merge_data.confidence_value_2 > 3
31
+ and json2[key]
32
+ ):
33
+ json[key] = json2[key]
34
+
35
+ if not json[key] and json2[key]:
36
+ json[key] = json2[key]
37
+
38
 
39
  class ExtractionOutput(BaseModel):
40
+ pages: Dict[int, ExtractionResult]
41
+ json: dict
42
+
43
+
44
class ExtractionMerger:
    """Combine per-page extraction results into one merged result."""

    def __init__(self):
        pass

    def __call__(self, outputs: Dict[int, ExtractionResult]):
        """Merge the results of all pages in ascending page-number order.

        Args:
            outputs: Mapping of page number to that page's extraction result.
                Entries whose value is ``None`` (a page whose extraction
                failed) are ignored.

        Returns:
            An ``ExtractionOutput`` holding the usable per-page results and the
            merged data. When no page produced a result, both are empty.
        """
        # Local import keeps this block self-contained.
        from copy import deepcopy

        # Failed pages come through as None; drop them so they neither crash
        # the merge nor end up in the returned per-page mapping.
        valid = {
            pnum: result for pnum, result in outputs.items() if result is not None
        }
        if not valid:
            return ExtractionOutput(pages={}, json={})

        pnums = sorted(valid)
        first = valid[pnums[0]]
        # Deep copy: merge_keys mutates nested containers in place, and the
        # first page's own result is also returned in `pages`.
        merged_result = deepcopy(first.extracted_data)
        confidence_exists = first.existence_confidence
        confidence_value = first.value_confidence

        for pnum in pnums[1:]:
            result = valid[pnum]
            merge_data = MergeData(
                confidence_exists_1=confidence_exists,
                confidence_exists_2=result.existence_confidence,
                confidence_value_1=confidence_value,
                confidence_value_2=result.value_confidence,
            )
            merge_keys(merged_result, result.extracted_data, merge_data)

        # NOTE(review): extracted_data may be a list while ExtractionOutput.json
        # is declared as dict -- confirm whether list-shaped schemas are expected.
        return ExtractionOutput(pages=valid, json=merged_result)
marker/scripts/common.py CHANGED
@@ -165,8 +165,10 @@ def get_root_class(schema_code: str) -> Optional[BaseModel]:
165
  return None
166
 
167
  if "from pydantic" not in schema_code:
 
 
168
  schema_code = (
169
- "from pydantic import BaseModel\nfrom typing import List, Dict, Optional, Set, Tuple, Union, Any\n\n"
170
  + schema_code
171
  )
172
 
 
165
  return None
166
 
167
  if "from pydantic" not in schema_code:
168
+ schema_code = "from pydantic import BaseModel\n" + schema_code
169
+ if "from typing" not in schema_code:
170
  schema_code = (
171
+ "from typing import List, Dict, Optional, Set, Tuple, Union, Any\n\n"
172
  + schema_code
173
  )
174
 
marker/scripts/extraction_app.py CHANGED
@@ -1,6 +1,7 @@
1
  import json
2
  import os
3
 
 
4
  from pydantic import BaseModel
5
 
6
  from marker.converters.extraction import ExtractionConverter
@@ -23,7 +24,6 @@ from typing import Any, Dict
23
  import streamlit as st
24
 
25
  from marker.config.parser import ConfigParser
26
- from marker.output import text_from_rendered
27
 
28
 
29
  def extract_data(fname: str, config: dict, schema: str) -> (str, Dict[str, Any], dict):
@@ -55,9 +55,9 @@ cli_options = parse_args()
55
  st.markdown("""
56
  # Marker Extraction Demo
57
 
58
- This app will let you try marker, a PDF or image -> Markdown, HTML, JSON converter. It works with any language, and extracts images, tables, equations, etc.
59
 
60
- Find the project [here](https://github.com/VikParuchuri/marker).
61
  """)
62
 
63
  in_file: UploadedFile = st.sidebar.file_uploader(
@@ -79,18 +79,18 @@ with col1:
79
  st.image(pil_image, use_container_width=True)
80
 
81
  with col2:
82
- schema = st.text_area(
83
- "Pyantic schema for extraction",
84
- value="""
85
  class Schema(BaseModel):
86
- pass
87
- """,
88
  )
89
 
90
  run_marker = st.sidebar.button("Run Extraction")
91
 
92
  use_llm = st.sidebar.checkbox(
93
- "Use LLM", help="Use LLM for higher quality processing", value=False
94
  )
95
  force_ocr = st.sidebar.checkbox("Force OCR", help="Force OCR on all pages", value=False)
96
  strip_existing_ocr = st.sidebar.checkbox(
@@ -123,8 +123,6 @@ with tempfile.TemporaryDirectory() as tmp_dir:
123
  )
124
  rendered = extract_data(temp_pdf, cli_options, schema)
125
 
126
- text, ext, images = text_from_rendered(rendered)
127
-
128
  with col2:
129
  st.write("Output JSON")
130
- st.json(text)
 
1
  import json
2
  import os
3
 
4
+ from streamlit_ace import st_ace
5
  from pydantic import BaseModel
6
 
7
  from marker.converters.extraction import ExtractionConverter
 
24
  import streamlit as st
25
 
26
  from marker.config.parser import ConfigParser
 
27
 
28
 
29
  def extract_data(fname: str, config: dict, schema: str) -> (str, Dict[str, Any], dict):
 
55
  st.markdown("""
56
  # Marker Extraction Demo
57
 
58
+ This app will let you use marker to do structured extraction.
59
 
60
+ Warning: This can execute untrusted code entered into the schema panel.
61
  """)
62
 
63
  in_file: UploadedFile = st.sidebar.file_uploader(
 
79
  st.image(pil_image, use_container_width=True)
80
 
81
  with col2:
82
+ st.write("Enter pydantic schema here")
83
+ schema = st_ace(
84
+ value="""from pydantic import BaseModel
85
  class Schema(BaseModel):
86
+ pass""",
87
+ language="python",
88
  )
89
 
90
  run_marker = st.sidebar.button("Run Extraction")
91
 
92
  use_llm = st.sidebar.checkbox(
93
+ "Use LLM", help="Use LLM for higher quality text", value=False
94
  )
95
  force_ocr = st.sidebar.checkbox("Force OCR", help="Force OCR on all pages", value=False)
96
  strip_existing_ocr = st.sidebar.checkbox(
 
123
  )
124
  rendered = extract_data(temp_pdf, cli_options, schema)
125
 
 
 
126
  with col2:
127
  st.write("Output JSON")
128
+ st.json(rendered.model_dump())