Vik Paruchuri committed on
Commit 14b3a02 · 1 Parent(s): 4b7098a

Improve structured extraction

README.md CHANGED
@@ -1,6 +1,6 @@
 # Marker
 
-Marker converts documents to markdown, JSON, and HTML quickly and accurately.
+Marker converts documents to markdown, JSON, chunks, and HTML quickly and accurately.
 
 - Converts PDF, image, PPTX, DOCX, XLSX, HTML, EPUB files in all languages
 - Formats tables, forms, equations, inline math, links, references, and code blocks
@@ -276,6 +276,8 @@ converter = ExtractionConverter(
 rendered = converter("FILEPATH")
 ```
 
+The rendered output will have an `original_markdown` field. If you pass this back in as the `existing_markdown` config key the next time you run the converter, you can skip re-parsing the document.
+
 # Output Formats
 
 ## Markdown
@@ -348,7 +350,7 @@ Note that child blocks of pages can have their own children as well (a tree structure)
 
 ## Chunks
 
-Chunks format is similar to JSON, but flattens everything into a single list instead of a tree. Only the top level blocks from each page show up. It also has the full HTML of each block inside, so you don't need to crawl the tree to reconstruct it.
+Chunks format is similar to JSON, but flattens everything into a single list instead of a tree. Only the top level blocks from each page show up. It also has the full HTML of each block inside, so you don't need to crawl the tree to reconstruct it. This enables flexible and easy chunking for RAG.
 
 ## Metadata
 
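A minimal sketch of the round trip this README change describes, assuming the `ExtractionConverter` setup already shown in the README's extraction section; the schema string is a placeholder:

```python
from marker.converters.extraction import ExtractionConverter
from marker.models import create_model_dict

schema = '{"type": "object", "properties": {"total": {"type": "string"}}}'

# First run: parse the document, run extraction, and keep the markdown.
converter = ExtractionConverter(
    artifact_dict=create_model_dict(),
    config={"page_schema": schema},
)
rendered = converter("FILEPATH")

# Later runs: pass the saved markdown back in to skip re-parsing.
converter = ExtractionConverter(
    artifact_dict=create_model_dict(),
    config={
        "page_schema": schema,
        "existing_markdown": rendered.original_markdown,
    },
)
rendered_again = converter("FILEPATH")
```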
marker/converters/extraction.py CHANGED
@@ -1,5 +1,5 @@
-import json
 import re
+from typing import Annotated
 
 from marker.builders.document import DocumentBuilder
 from marker.builders.line import LineBuilder
@@ -8,7 +8,6 @@ from marker.builders.structure import StructureBuilder
 from marker.converters.pdf import PdfConverter
 from marker.extractors.document import DocumentExtractor
 from marker.extractors.page import PageExtractor
-from marker.extractors.util import json_schema_to_base_model
 from marker.providers.registry import provider_from_filepath
 
 from marker.renderers.extraction import ExtractionRenderer, ExtractionOutput
@@ -21,6 +20,9 @@ logger = get_logger()
 
 class ExtractionConverter(PdfConverter):
     pattern: str = r"{\d+\}-{48}\n\n"
+    existing_markdown: Annotated[
+        str, "Markdown that was already converted for extraction."
+    ] = None
 
     def build_document(self, filepath: str):
         provider_cls = provider_from_filepath(filepath)
@@ -44,23 +46,16 @@ class ExtractionConverter(PdfConverter):
         self.config["output_format"] = (
             "markdown"  # Output must be markdown for extraction
         )
+        markdown = self.existing_markdown
 
-        try:
-            json_schema_to_base_model(json.loads(self.config["page_schema"]))
-        except Exception as e:
-            logger.error(f"Could not parse page schema: {e}")
-            raise ValueError(
-                "Could not parse your page schema. Please check the schema format."
-            )
-
-        document, provider = self.build_document(filepath)
-        self.page_count = len(document.pages)
-        renderer = self.resolve_dependencies(MarkdownRenderer)
-        output = renderer(document)
+        if not markdown:
+            document, provider = self.build_document(filepath)
+            self.page_count = len(document.pages)
+            renderer = self.resolve_dependencies(MarkdownRenderer)
+            output = renderer(document)
+            markdown = output.markdown
 
-        output_pages = re.split(self.pattern, output.markdown)[
-            1:
-        ]  # Split output into pages
+        output_pages = re.split(self.pattern, markdown)[1:]  # Split output into pages
 
         # This needs an LLM service for extraction, this sets it in the extractor
         if not self.artifact_dict["llm_service"]:
@@ -73,8 +68,8 @@ class ExtractionConverter(PdfConverter):
         renderer = self.resolve_dependencies(ExtractionRenderer)
 
         # Inference in parallel
-        notes = page_extractor(document, document.pages, output_pages)
-        document_output = document_extractor(document, notes)
+        notes = page_extractor(output_pages)
+        document_output = document_extractor(notes)
 
-        merged = renderer(document_output)
+        merged = renderer(document_output, markdown)
        return merged
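For intuition, the `pattern` above splits the combined markdown on Marker's page separators: a `{page_number}` marker followed by 48 dashes and a blank line. A small self-contained sketch of that split (the sample text is illustrative):

```python
import re

# Same pattern as ExtractionConverter.pattern
pattern = r"{\d+\}-{48}\n\n"

# Two fake pages joined with Marker-style separators ({n} plus 48 dashes).
sample = (
    "{0}" + "-" * 48 + "\n\n"
    "# Page one\n\n"
    "{1}" + "-" * 48 + "\n\n"
    "# Page two\n"
)

output_pages = re.split(pattern, sample)[1:]  # drop the empty prefix
print(output_pages)  # ['# Page one\n\n', '# Page two\n']
```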
marker/extractors/document.py CHANGED
@@ -6,7 +6,6 @@ from typing import Annotated, Optional, List
 from marker.extractors import BaseExtractor
 from marker.extractors.page import PageExtractionSchema
 from marker.logger import get_logger
-from marker.schema.document import Document
 
 logger = get_logger()
 
@@ -114,7 +113,6 @@ Schema
 
     def __call__(
         self,
-        document: Document,
         page_notes: List[PageExtractionSchema],
         **kwargs,
     ) -> Optional[DocumentExtractionSchema]:
@@ -123,11 +121,10 @@ Schema
                 "Page schema must be defined for structured extraction to work."
             )
 
-        page = document.pages[0]
         prompt = self.page_extraction_prompt.replace(
             "{{document_notes}}", self.assemble_document_notes(page_notes)
         ).replace("{{schema}}", json.dumps(self.page_schema))
-        response = self.llm_service(prompt, None, page, DocumentExtractionSchema)
+        response = self.llm_service(prompt, None, None, DocumentExtractionSchema)
 
         logger.debug(f"Document extraction response: {response}")
 
@@ -140,7 +137,6 @@ Schema
                 ]
             ]
         ):
-            page.update_metadata(llm_error_count=1)
             return None
 
         json_data = response["document_json"].strip().lstrip("```json").rstrip("```")
marker/extractors/page.py CHANGED
@@ -8,8 +8,6 @@ from tqdm import tqdm
 
 from marker.extractors import BaseExtractor
 from marker.logger import get_logger
-from marker.schema.document import Document
-from marker.schema.groups.page import PageGroup
 
 logger = get_logger()
 
@@ -100,29 +98,25 @@ Schema
 ```
 """
 
-    def chunk_page_markdown(
-        self, pages: List[PageGroup], page_markdown: List[str]
-    ) -> List[tuple]:
+    def chunk_page_markdown(self, page_markdown: List[str]) -> List[str]:
         """
         Chunk the page markdown into smaller pieces for processing.
         """
-        if len(pages) == 0:
-            return []
 
         chunks = []
-        for i in range(0, len(pages), self.extraction_page_chunk_size):
+        for i in range(0, len(page_markdown), self.extraction_page_chunk_size):
             chunk = page_markdown[i : i + self.extraction_page_chunk_size]
-            chunks.append((pages[i], "\n\n".join(chunk)))
+            chunks.append("\n\n".join(chunk))
 
         return chunks
 
     def inference_single_chunk(
-        self, page: PageGroup, page_markdown: str
+        self, page_markdown: str
     ) -> Optional[PageExtractionSchema]:
         prompt = self.page_extraction_prompt.replace(
             "{{page_md}}", page_markdown
         ).replace("{{schema}}", json.dumps(self.page_schema))
-        response = self.llm_service(prompt, None, page, PageExtractionSchema)
+        response = self.llm_service(prompt, None, None, PageExtractionSchema)
         logger.debug(f"Page extraction response: {response}")
 
         if not response or any(
@@ -134,7 +128,6 @@ Schema
                 ]
             ]
         ):
-            page.update_metadata(llm_error_count=1)
             return None
 
         return PageExtractionSchema(
@@ -144,20 +137,15 @@ Schema
 
     def __call__(
         self,
-        document: Document,
-        pages: List[PageGroup],
         page_markdown: List[str],
         **kwargs,
     ) -> List[PageExtractionSchema]:
-        assert len(page_markdown) == len(pages), (
-            f"Mismatch in page markdown and pages length: {len(page_markdown)} vs {len(pages)}"
-        )
         if not self.page_schema:
             raise ValueError(
                 "Page schema must be defined for structured extraction to work."
             )
 
-        chunks = self.chunk_page_markdown(pages, page_markdown)
+        chunks = self.chunk_page_markdown(page_markdown)
         results = []
         pbar = tqdm(
             desc="Running page extraction",
@@ -167,8 +155,7 @@ Schema
 
         with ThreadPoolExecutor(max_workers=self.max_concurrency) as executor:
             for future in [
-                executor.submit(self.inference_single_chunk, chunk[0], chunk[1])
-                for chunk in chunks
+                executor.submit(self.inference_single_chunk, chunk) for chunk in chunks
             ]:
                 results.append(future.result())  # Raise exceptions if any occurred
                 pbar.update(1)
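A standalone sketch of what the simplified `chunk_page_markdown` produces; the chunk size of 2 is an assumption for illustration (the real value comes from `extraction_page_chunk_size`):

```python
from typing import List


def chunk_page_markdown(page_markdown: List[str], chunk_size: int = 2) -> List[str]:
    # Group consecutive per-page markdown strings into chunks of chunk_size pages.
    chunks = []
    for i in range(0, len(page_markdown), chunk_size):
        chunks.append("\n\n".join(page_markdown[i : i + chunk_size]))
    return chunks


print(chunk_page_markdown(["# Page 1", "# Page 2", "# Page 3"]))
# ['# Page 1\n\n# Page 2', '# Page 3']
```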
marker/extractors/util.py DELETED
@@ -1,213 +0,0 @@
-from typing import Any, Type, Union, Optional
-from pydantic import BaseModel, Field, create_model, validator
-from enum import Enum
-import re
-from datetime import datetime
-from uuid import UUID
-
-
-def json_schema_to_base_model(
-    schema: dict[str, Any], model_name: str = None
-) -> Type[BaseModel]:
-    """Convert a JSON schema to a Pydantic BaseModel dynamically."""
-
-    # Enhanced type mapping with format support
-    def get_type_from_schema(field_props: dict[str, Any]) -> type:
-        json_type = field_props.get("type", "string")
-        format_type = field_props.get("format")
-
-        # Handle format-specific types
-        if json_type == "string":
-            if format_type == "date-time":
-                return datetime
-            elif format_type == "uuid":
-                return UUID
-            else:
-                return str
-        elif json_type == "integer":
-            return int
-        elif json_type == "number":
-            return float
-        elif json_type == "boolean":
-            return bool
-        elif json_type == "array":
-            return list
-        elif json_type == "object":
-            return dict
-        else:
-            return str  # fallback
-
-    def handle_union_types(field_props: dict[str, Any]) -> type:
-        """Handle anyOf, oneOf, and type arrays."""
-        any_of = field_props.get("anyOf", [])
-        one_of = field_props.get("oneOf", [])
-        type_list = field_props.get("type", [])
-
-        if any_of:
-            types = [get_type_from_schema(schema) for schema in any_of]
-            return Union[tuple(types)]
-        elif one_of:
-            types = [get_type_from_schema(schema) for schema in one_of]
-            return Union[tuple(types)]
-        elif isinstance(type_list, list):
-            types = [get_type_from_schema({"type": t}) for t in type_list]
-            return Union[tuple(types)]
-
-        return None
-
-    def create_validator_from_constraints(field_name: str, field_props: dict[str, Any]):
-        """Create Pydantic validators from JSON schema constraints."""
-        validators = {}
-
-        # String constraints
-        if "minLength" in field_props:
-            min_len = field_props["minLength"]
-
-            def min_length_validator(cls, v):
-                if isinstance(v, str) and len(v) < min_len:
-                    raise ValueError(
-                        f"{field_name} must be at least {min_len} characters"
-                    )
-                return v
-
-            validators[f"{field_name}_min_length"] = validator(
-                field_name, allow_reuse=True
-            )(min_length_validator)
-
-        if "maxLength" in field_props:
-            max_len = field_props["maxLength"]
-
-            def max_length_validator(cls, v):
-                if isinstance(v, str) and len(v) > max_len:
-                    raise ValueError(
-                        f"{field_name} must be at most {max_len} characters"
-                    )
-                return v
-
-            validators[f"{field_name}_max_length"] = validator(
-                field_name, allow_reuse=True
-            )(max_length_validator)
-
-        if "pattern" in field_props:
-            pattern = field_props["pattern"]
-
-            def pattern_validator(cls, v):
-                if isinstance(v, str) and not re.match(pattern, v):
-                    raise ValueError(f"{field_name} must match pattern {pattern}")
-                return v
-
-            validators[f"{field_name}_pattern"] = validator(
-                field_name, allow_reuse=True
-            )(pattern_validator)
-
-        # Numeric constraints
-        if "minimum" in field_props:
-            min_val = field_props["minimum"]
-
-            def min_validator(cls, v):
-                if isinstance(v, (int, float)) and v < min_val:
-                    raise ValueError(f"{field_name} must be at least {min_val}")
-                return v
-
-            validators[f"{field_name}_minimum"] = validator(
-                field_name, allow_reuse=True
-            )(min_validator)
-
-        if "maximum" in field_props:
-            max_val = field_props["maximum"]
-
-            def max_validator(cls, v):
-                if isinstance(v, (int, float)) and v > max_val:
-                    raise ValueError(f"{field_name} must be at most {max_val}")
-                return v
-
-            validators[f"{field_name}_maximum"] = validator(
-                field_name, allow_reuse=True
-            )(max_validator)
-
-        return validators
-
-    def process_field(field_name: str, field_props: dict[str, Any]) -> tuple:
-        """Process a single field from the schema."""
-
-        # Handle const values
-        if "const" in field_props:
-            const_value = field_props["const"]
-            return type(const_value), Field(default=const_value, const=True)
-
-        # Handle enums
-        enum_values = field_props.get("enum")
-        if enum_values:
-            enum_name = f"{field_name.capitalize()}Enum"
-            field_type = Enum(enum_name, {str(v): v for v in enum_values})
-
-        # Handle union types (anyOf, oneOf, type arrays)
-        elif union_type := handle_union_types(field_props):
-            field_type = union_type
-
-        # Handle nested objects
-        elif field_props.get("type") == "object" and "properties" in field_props:
-            nested_model_name = f"{field_name.capitalize()}Model"
-            field_type = json_schema_to_base_model(field_props, nested_model_name)
-
-        # Handle arrays
-        elif field_props.get("type") == "array" and "items" in field_props:
-            item_props = field_props["items"]
-
-            # Handle array of objects
-            if item_props.get("type") == "object" and "properties" in item_props:
-                item_model_name = f"{field_name.capitalize()}ItemModel"
-                item_type = json_schema_to_base_model(item_props, item_model_name)
-            else:
-                item_type = get_type_from_schema(item_props)
-
-            field_type = list[item_type]
-
-        # Handle primitive types
-        else:
-            field_type = get_type_from_schema(field_props)
-
-        # Handle nullable
-        if field_props.get("nullable", False):
-            field_type = Optional[field_type]
-
-        # Determine default value
-        if "default" in field_props:
-            default_value = field_props["default"]
-        elif field_name not in schema.get("required", []):
-            default_value = None
-            if not field_props.get("nullable", False):
-                field_type = Optional[field_type]
-        else:
-            default_value = ...
-
-        # Create field with metadata
-        field_info = Field(
-            default=default_value,
-            description=field_props.get("description", field_props.get("title", "")),
-            title=field_props.get("title"),
-            examples=field_props.get("examples"),
-        )
-
-        return field_type, field_info
-
-    # Process schema
-    properties = schema.get("properties", {})
-    model_fields = {}
-    validators = {}
-
-    # Process each field
-    for field_name, field_props in properties.items():
-        model_fields[field_name] = process_field(field_name, field_props)
-
-        # Add validators for constraints
-        field_validators = create_validator_from_constraints(field_name, field_props)
-        validators.update(field_validators)
-
-    # Create the model
-    model_name = model_name or schema.get("title", "DynamicModel")
-
-    # Create model with validators
-    model_class = create_model(model_name, **model_fields, __validators__=validators)
-
-    return model_class
marker/renderers/extraction.py CHANGED
@@ -7,11 +7,16 @@ from marker.renderers import BaseRenderer
 class ExtractionOutput(BaseModel):
     analysis: str
     document_json: str
+    original_markdown: str
 
 
 class ExtractionRenderer(BaseRenderer):
-    def __call__(self, output: DocumentExtractionSchema) -> ExtractionOutput:
+    def __call__(
+        self, output: DocumentExtractionSchema, markdown: str
+    ) -> ExtractionOutput:
         # We definitely want to do more complex stuff here soon, so leave it in
         return ExtractionOutput(
-            analysis=output.analysis, document_json=output.document_json
+            analysis=output.analysis,
+            document_json=output.document_json,
+            original_markdown=markdown,
         )
marker/scripts/extraction_app.py CHANGED
@@ -26,9 +26,12 @@ import streamlit as st
 from marker.config.parser import ConfigParser
 
 
-def extract_data(fname: str, config: dict, schema: str) -> (str, Dict[str, Any], dict):
+def extract_data(
+    fname: str, config: dict, schema: str, markdown: str | None = None
+) -> (str, Dict[str, Any], dict):
     config["pdftext_workers"] = 1
     config["page_schema"] = schema
+    config["existing_markdown"] = markdown
     config_parser = ConfigParser(config)
     config_dict = config_parser.generate_config_dict()
 
@@ -62,12 +65,35 @@ in_file: UploadedFile = st.sidebar.file_uploader(
     type=["pdf", "png", "jpg", "jpeg", "gif", "pptx", "docx", "xlsx", "html", "epub"],
 )
 
-if in_file is None:
-    st.stop()
-
+# Initialize session state variables
 if "rendered_pydantic_schema" not in st.session_state:
     st.session_state.rendered_pydantic_schema = ""
 
+if "markdown" not in st.session_state:
+    st.session_state.markdown = ""
+
+if "current_file_id" not in st.session_state:
+    st.session_state.current_file_id = None
+
+# Detect file changes and clear markdown when a new file is uploaded
+if in_file is not None:
+    # Create a unique identifier for the current file
+    current_file_id = f"{in_file.name}_{in_file.size}_{hash(in_file.getvalue())}"
+
+    # Check if this is a new file
+    if st.session_state.current_file_id != current_file_id:
+        st.session_state.current_file_id = current_file_id
+        st.session_state.markdown = ""  # Clear markdown for new file
+else:
+    # No file uploaded, clear the current file ID
+    if st.session_state.current_file_id is not None:
+        st.session_state.current_file_id = None
+        st.session_state.markdown = ""  # Clear markdown when no file
+        st.session_state.rendered_pydantic_schema = ""
+
+if in_file is None:
+    st.stop()
+
 filetype = in_file.type
 
 with col1:
@@ -196,11 +222,14 @@ if run_marker:
     )
 
     try:
-        rendered = extract_data(temp_pdf, cli_options, schema)
+        rendered = extract_data(
+            temp_pdf, cli_options, schema, st.session_state.markdown
+        )
 
         with col2:
             st.write("## Output JSON")
-            st.json(rendered.model_dump())
+            st.json(rendered.model_dump(exclude=["original_markdown"]))
+            st.session_state.markdown = rendered.original_markdown
 
     except Exception as e:
         st.error(f"❌ Extraction failed: {e}")
marker/services/__init__.py CHANGED
@@ -37,7 +37,7 @@ class BaseService:
         self,
         prompt: str,
         image: PIL.Image.Image | List[PIL.Image.Image] | None,
-        block: Block,
+        block: Block | None,
         response_schema: type[BaseModel],
         max_retries: int | None = None,
         timeout: int | None = None,
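The effect of the relaxed signature is that services can now be called without a source block, which is exactly what the reworked extractors do. A minimal sketch of the calling convention with illustrative stand-ins (`FakeBlock` and `FakeService` are not part of the commit):

```python
from pydantic import BaseModel


class DemoSchema(BaseModel):
    analysis: str


class FakeBlock:
    """Illustrative stand-in for marker's Block."""

    def __init__(self):
        self.metadata = {}

    def update_metadata(self, **kwargs):
        self.metadata.update(kwargs)


class FakeService:
    """Illustrative stand-in for a BaseService subclass."""

    def __call__(self, prompt, image, block, response_schema, **kwargs) -> dict:
        total_tokens = 123  # would come from the provider response
        if block:  # the new None-guard: only record usage when a block is given
            block.update_metadata(llm_tokens_used=total_tokens, llm_request_count=1)
        return {"analysis": "stub"}


service = FakeService()

# Block-level processors still pass a block and get usage recorded on it:
block = FakeBlock()
service("prompt", None, block, DemoSchema)

# The extractors in this commit now call with block=None:
service("prompt", None, None, DemoSchema)
```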
marker/services/claude.py CHANGED
@@ -74,7 +74,7 @@ class ClaudeService(BaseService):
         self,
         prompt: str,
         image: PIL.Image.Image | List[PIL.Image.Image] | None,
-        block: Block,
+        block: Block | None,
         response_schema: type[BaseModel],
         max_retries: int | None = None,
         timeout: int | None = None,
marker/services/gemini.py CHANGED
@@ -41,7 +41,7 @@ class BaseGeminiService(BaseService):
         self,
         prompt: str,
         image: PIL.Image.Image | List[PIL.Image.Image] | None,
-        block: Block,
+        block: Block | None,
         response_schema: type[BaseModel],
         max_retries: int | None = None,
         timeout: int | None = None,
@@ -72,7 +72,10 @@ class BaseGeminiService(BaseService):
             )
             output = responses.candidates[0].content.parts[0].text
             total_tokens = responses.usage_metadata.total_token_count
-            block.update_metadata(llm_tokens_used=total_tokens, llm_request_count=1)
+            if block:
+                block.update_metadata(
+                    llm_tokens_used=total_tokens, llm_request_count=1
+                )
             return json.loads(output)
         except APIError as e:
             if e.code in [429, 443, 503]:
marker/services/ollama.py CHANGED
@@ -35,7 +35,7 @@ class OllamaService(BaseService):
         self,
         prompt: str,
         image: PIL.Image.Image | List[PIL.Image.Image] | None,
-        block: Block,
+        block: Block | None,
         response_schema: type[BaseModel],
         max_retries: int | None = None,
         timeout: int | None = None,
@@ -68,7 +68,9 @@ class OllamaService(BaseService):
             total_tokens = (
                 response_data["prompt_eval_count"] + response_data["eval_count"]
             )
-            block.update_metadata(llm_request_count=1, llm_tokens_used=total_tokens)
+
+            if block:
+                block.update_metadata(llm_request_count=1, llm_tokens_used=total_tokens)
 
             data = response_data["response"]
             return json.loads(data)
marker/services/openai.py CHANGED
@@ -78,7 +78,7 @@ class OpenAIService(BaseService):
         self,
         prompt: str,
         image: PIL.Image.Image | List[PIL.Image.Image] | None,
-        block: Block,
+        block: Block | None,
         response_schema: type[BaseModel],
         max_retries: int | None = None,
         timeout: int | None = None,
@@ -117,7 +117,10 @@ class OpenAIService(BaseService):
             )
             response_text = response.choices[0].message.content
             total_tokens = response.usage.total_tokens
-            block.update_metadata(llm_tokens_used=total_tokens, llm_request_count=1)
+            if block:
+                block.update_metadata(
+                    llm_tokens_used=total_tokens, llm_request_count=1
+                )
             return json.loads(response_text)
         except (APITimeoutError, RateLimitError) as e:
             # Rate limit exceeded
tests/extractors/test_basemodel_gen.py DELETED
@@ -1,44 +0,0 @@
-from marker.extractors.util import json_schema_to_base_model
-
-
-def test_model_generator():
-    test_schema = {
-        "title": "UserModel",
-        "type": "object",
-        "properties": {
-            "email": {
-                "type": "string",
-                "format": "email",
-                "description": "User's email address",
-            },
-            "age": {"type": "integer", "minimum": 0, "maximum": 150},
-            "name": {"type": "string", "minLength": 1, "maxLength": 100},
-            "status": {"anyOf": [{"type": "string"}, {"type": "null"}]},
-            "tags": {"type": "array", "items": {"type": "string"}},
-            "preferences": {
-                "type": "object",
-                "properties": {
-                    "theme": {"type": "string", "enum": ["dark", "light"]},
-                    "notifications": {"type": "boolean", "default": True},
-                },
-            },
-            "role": {
-                "type": "string",
-                "enum": ["admin", "user", "guest"],
-                "default": "user",
-            },
-        },
-        "required": ["email", "name"],
-    }
-
-    # Create the model
-    UserModel = json_schema_to_base_model(test_schema)
-    user = UserModel(
-        email="test@example.com",
-        name="John Doe",
-        age=30,
-        tags=["python", "pydantic"],
-        preferences={"theme": "dark"},
-        role="admin",
-    )
-    assert user is not None