Victor Yang committed on
Commit
4381ae5
·
1 Parent(s): d836ba5

Add initial JSON and Markdown files for multicolumn CNN and switch transformers, including metadata and example images

Browse files
Files changed (1) hide show
  1. marker/scripts/server.py +93 -3
marker/scripts/server.py CHANGED
@@ -103,9 +103,75 @@ class CommonParams(BaseModel):
103
  output_format: Annotated[
104
  str,
105
  Field(
106
- description="The format to output the text in. Can be 'markdown', 'json', or 'html'. Defaults to 'markdown'."
107
  ),
108
  ] = "markdown"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
 
110
 
111
  async def _convert_pdf(params: CommonParams):
@@ -113,11 +179,13 @@ async def _convert_pdf(params: CommonParams):
113
  "Invalid output format"
114
  )
115
  try:
116
- options = params.model_dump()
 
 
117
  config_parser = ConfigParser(options)
118
  config_dict = config_parser.generate_config_dict()
119
  config_dict["pdftext_workers"] = 1
120
- converter_cls = PdfConverter
121
  converter = converter_cls(
122
  config=config_dict,
123
  artifact_dict=app_data["models"],
@@ -163,6 +231,17 @@ async def convert_pdf_upload(
163
  force_ocr: Optional[bool] = Form(default=False),
164
  paginate_output: Optional[bool] = Form(default=False),
165
  output_format: Optional[str] = Form(default="markdown"),
 
 
 
 
 
 
 
 
 
 
 
166
  file: UploadFile = File(
167
  ..., description="The PDF file to convert.", media_type="application/pdf"
168
  ),
@@ -179,6 +258,17 @@ async def convert_pdf_upload(
179
  force_ocr=force_ocr,
180
  paginate_output=paginate_output,
181
  output_format=output_format,
 
 
 
 
 
 
 
 
 
 
 
182
  )
183
  results = await _convert_pdf(params)
184
  os.remove(upload_path)
 
103
  output_format: Annotated[
104
  str,
105
  Field(
106
+ description="The format to output the text in. Can be 'markdown', 'json', 'html', or 'chunks'. Defaults to 'markdown'."
107
  ),
108
  ] = "markdown"
109
+ output_dir: Annotated[
110
+ Optional[str],
111
+ Field(
112
+ description="Directory where output files will be saved. Defaults to the value specified in settings.OUTPUT_DIR."
113
+ ),
114
+ ] = None
115
+ use_llm: Annotated[
116
+ bool,
117
+ Field(
118
+ description="Uses an LLM to improve accuracy. You will need to configure the LLM backend."
119
+ ),
120
+ ] = False
121
+ block_correction_prompt: Annotated[
122
+ Optional[str],
123
+ Field(
124
+ description="If LLM mode is active, an optional prompt that will be used to correct the output of marker. This is useful for custom formatting or logic that you want to apply to the output."
125
+ ),
126
+ ] = None
127
+ strip_existing_ocr: Annotated[
128
+ bool,
129
+ Field(
130
+ description="Remove all existing OCR text in the document and re-OCR with surya."
131
+ ),
132
+ ] = False
133
+ redo_inline_math: Annotated[
134
+ bool,
135
+ Field(
136
+ description="If you want the absolute highest quality inline math conversion, use this along with --use_llm."
137
+ ),
138
+ ] = False
139
+ disable_image_extraction: Annotated[
140
+ bool,
141
+ Field(
142
+ description="Don't extract images from the PDF. If you also specify --use_llm, then images will be replaced with a description."
143
+ ),
144
+ ] = False
145
+ debug: Annotated[
146
+ bool,
147
+ Field(
148
+ description="Enable debug mode for additional logging and diagnostic information."
149
+ ),
150
+ ] = False
151
+ processors: Annotated[
152
+ Optional[str],
153
+ Field(
154
+ description="Override the default processors by providing their full module paths, separated by commas. Example: 'module1.processor1,module2.processor2'"
155
+ ),
156
+ ] = None
157
+ config_json: Annotated[
158
+ Optional[str],
159
+ Field(
160
+ description="Path to a JSON configuration file containing additional settings."
161
+ ),
162
+ ] = None
163
+ converter_cls: Annotated[
164
+ Optional[str],
165
+ Field(
166
+ description="Converter class to use. One of 'marker.converters.pdf.PdfConverter' (default) or 'marker.converters.table.TableConverter'. The PdfConverter will convert the whole PDF, the TableConverter will only extract and convert tables."
167
+ ),
168
+ ] = None
169
+ llm_service: Annotated[
170
+ Optional[str],
171
+ Field(
172
+ description="Which LLM service to use if --use_llm is passed. This defaults to 'marker.services.gemini.GoogleGeminiService'. Should be full import path, like 'marker.services.gemini.GoogleGeminiService'."
173
+ ),
174
+ ] = None
175
 
176
 
177
  async def _convert_pdf(params: CommonParams):
 
179
  "Invalid output format"
180
  )
181
  try:
182
+ options = params.model_dump(exclude_none=True)
183
+ # Remove None values to avoid passing them to ConfigParser
184
+ options = {k: v for k, v in options.items() if v is not None}
185
  config_parser = ConfigParser(options)
186
  config_dict = config_parser.generate_config_dict()
187
  config_dict["pdftext_workers"] = 1
188
+ converter_cls = config_parser.get_converter_cls()
189
  converter = converter_cls(
190
  config=config_dict,
191
  artifact_dict=app_data["models"],
 
231
  force_ocr: Optional[bool] = Form(default=False),
232
  paginate_output: Optional[bool] = Form(default=False),
233
  output_format: Optional[str] = Form(default="markdown"),
234
+ output_dir: Optional[str] = Form(default=None),
235
+ use_llm: Optional[bool] = Form(default=False),
236
+ block_correction_prompt: Optional[str] = Form(default=None),
237
+ strip_existing_ocr: Optional[bool] = Form(default=False),
238
+ redo_inline_math: Optional[bool] = Form(default=False),
239
+ disable_image_extraction: Optional[bool] = Form(default=False),
240
+ debug: Optional[bool] = Form(default=False),
241
+ processors: Optional[str] = Form(default=None),
242
+ config_json: Optional[str] = Form(default=None),
243
+ converter_cls: Optional[str] = Form(default=None),
244
+ llm_service: Optional[str] = Form(default=None),
245
  file: UploadFile = File(
246
  ..., description="The PDF file to convert.", media_type="application/pdf"
247
  ),
 
258
  force_ocr=force_ocr,
259
  paginate_output=paginate_output,
260
  output_format=output_format,
261
+ output_dir=output_dir,
262
+ use_llm=use_llm,
263
+ block_correction_prompt=block_correction_prompt,
264
+ strip_existing_ocr=strip_existing_ocr,
265
+ redo_inline_math=redo_inline_math,
266
+ disable_image_extraction=disable_image_extraction,
267
+ debug=debug,
268
+ processors=processors,
269
+ config_json=config_json,
270
+ converter_cls=converter_cls,
271
+ llm_service=llm_service,
272
  )
273
  results = await _convert_pdf(params)
274
  os.remove(upload_path)