Victor Yang
committed on
Commit
·
4381ae5
1
Parent(s):
d836ba5
Add initial JSON and Markdown files for multicolumn CNN and switch transformers, including metadata and example images
Browse files - marker/scripts/server.py +93 -3
marker/scripts/server.py
CHANGED
|
@@ -103,9 +103,75 @@ class CommonParams(BaseModel):
|
|
| 103 |
output_format: Annotated[
|
| 104 |
str,
|
| 105 |
Field(
|
| 106 |
-
description="The format to output the text in. Can be 'markdown', 'json', or '
|
| 107 |
),
|
| 108 |
] = "markdown"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
|
| 110 |
|
| 111 |
async def _convert_pdf(params: CommonParams):
|
|
@@ -113,11 +179,13 @@ async def _convert_pdf(params: CommonParams):
|
|
| 113 |
"Invalid output format"
|
| 114 |
)
|
| 115 |
try:
|
| 116 |
-
options = params.model_dump()
|
|
|
|
|
|
|
| 117 |
config_parser = ConfigParser(options)
|
| 118 |
config_dict = config_parser.generate_config_dict()
|
| 119 |
config_dict["pdftext_workers"] = 1
|
| 120 |
-
converter_cls =
|
| 121 |
converter = converter_cls(
|
| 122 |
config=config_dict,
|
| 123 |
artifact_dict=app_data["models"],
|
|
@@ -163,6 +231,17 @@ async def convert_pdf_upload(
|
|
| 163 |
force_ocr: Optional[bool] = Form(default=False),
|
| 164 |
paginate_output: Optional[bool] = Form(default=False),
|
| 165 |
output_format: Optional[str] = Form(default="markdown"),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
file: UploadFile = File(
|
| 167 |
..., description="The PDF file to convert.", media_type="application/pdf"
|
| 168 |
),
|
|
@@ -179,6 +258,17 @@ async def convert_pdf_upload(
|
|
| 179 |
force_ocr=force_ocr,
|
| 180 |
paginate_output=paginate_output,
|
| 181 |
output_format=output_format,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 182 |
)
|
| 183 |
results = await _convert_pdf(params)
|
| 184 |
os.remove(upload_path)
|
|
|
|
| 103 |
output_format: Annotated[
|
| 104 |
str,
|
| 105 |
Field(
|
| 106 |
+
description="The format to output the text in. Can be 'markdown', 'json', 'html', or 'chunks'. Defaults to 'markdown'."
|
| 107 |
),
|
| 108 |
] = "markdown"
|
| 109 |
+
output_dir: Annotated[
|
| 110 |
+
Optional[str],
|
| 111 |
+
Field(
|
| 112 |
+
description="Directory where output files will be saved. Defaults to the value specified in settings.OUTPUT_DIR."
|
| 113 |
+
),
|
| 114 |
+
] = None
|
| 115 |
+
use_llm: Annotated[
|
| 116 |
+
bool,
|
| 117 |
+
Field(
|
| 118 |
+
description="Uses an LLM to improve accuracy. You will need to configure the LLM backend."
|
| 119 |
+
),
|
| 120 |
+
] = False
|
| 121 |
+
block_correction_prompt: Annotated[
|
| 122 |
+
Optional[str],
|
| 123 |
+
Field(
|
| 124 |
+
description="If LLM mode is active, an optional prompt that will be used to correct the output of marker. This is useful for custom formatting or logic that you want to apply to the output."
|
| 125 |
+
),
|
| 126 |
+
] = None
|
| 127 |
+
strip_existing_ocr: Annotated[
|
| 128 |
+
bool,
|
| 129 |
+
Field(
|
| 130 |
+
description="Remove all existing OCR text in the document and re-OCR with surya."
|
| 131 |
+
),
|
| 132 |
+
] = False
|
| 133 |
+
redo_inline_math: Annotated[
|
| 134 |
+
bool,
|
| 135 |
+
Field(
|
| 136 |
+
description="If you want the absolute highest quality inline math conversion, use this along with --use_llm."
|
| 137 |
+
),
|
| 138 |
+
] = False
|
| 139 |
+
disable_image_extraction: Annotated[
|
| 140 |
+
bool,
|
| 141 |
+
Field(
|
| 142 |
+
description="Don't extract images from the PDF. If you also specify --use_llm, then images will be replaced with a description."
|
| 143 |
+
),
|
| 144 |
+
] = False
|
| 145 |
+
debug: Annotated[
|
| 146 |
+
bool,
|
| 147 |
+
Field(
|
| 148 |
+
description="Enable debug mode for additional logging and diagnostic information."
|
| 149 |
+
),
|
| 150 |
+
] = False
|
| 151 |
+
processors: Annotated[
|
| 152 |
+
Optional[str],
|
| 153 |
+
Field(
|
| 154 |
+
description="Override the default processors by providing their full module paths, separated by commas. Example: 'module1.processor1,module2.processor2'"
|
| 155 |
+
),
|
| 156 |
+
] = None
|
| 157 |
+
config_json: Annotated[
|
| 158 |
+
Optional[str],
|
| 159 |
+
Field(
|
| 160 |
+
description="Path to a JSON configuration file containing additional settings."
|
| 161 |
+
),
|
| 162 |
+
] = None
|
| 163 |
+
converter_cls: Annotated[
|
| 164 |
+
Optional[str],
|
| 165 |
+
Field(
|
| 166 |
+
description="Converter class to use. One of 'marker.converters.pdf.PdfConverter' (default) or 'marker.converters.table.TableConverter'. The PdfConverter will convert the whole PDF, the TableConverter will only extract and convert tables."
|
| 167 |
+
),
|
| 168 |
+
] = None
|
| 169 |
+
llm_service: Annotated[
|
| 170 |
+
Optional[str],
|
| 171 |
+
Field(
|
| 172 |
+
description="Which LLM service to use if --use_llm is passed. This defaults to 'marker.services.gemini.GoogleGeminiService'. Should be full import path, like 'marker.services.gemini.GoogleGeminiService'."
|
| 173 |
+
),
|
| 174 |
+
] = None
|
| 175 |
|
| 176 |
|
| 177 |
async def _convert_pdf(params: CommonParams):
|
|
|
|
| 179 |
"Invalid output format"
|
| 180 |
)
|
| 181 |
try:
|
| 182 |
+
options = params.model_dump(exclude_none=True)
|
| 183 |
+
# Remove None values to avoid passing them to ConfigParser
|
| 184 |
+
options = {k: v for k, v in options.items() if v is not None}
|
| 185 |
config_parser = ConfigParser(options)
|
| 186 |
config_dict = config_parser.generate_config_dict()
|
| 187 |
config_dict["pdftext_workers"] = 1
|
| 188 |
+
converter_cls = config_parser.get_converter_cls()
|
| 189 |
converter = converter_cls(
|
| 190 |
config=config_dict,
|
| 191 |
artifact_dict=app_data["models"],
|
|
|
|
| 231 |
force_ocr: Optional[bool] = Form(default=False),
|
| 232 |
paginate_output: Optional[bool] = Form(default=False),
|
| 233 |
output_format: Optional[str] = Form(default="markdown"),
|
| 234 |
+
output_dir: Optional[str] = Form(default=None),
|
| 235 |
+
use_llm: Optional[bool] = Form(default=False),
|
| 236 |
+
block_correction_prompt: Optional[str] = Form(default=None),
|
| 237 |
+
strip_existing_ocr: Optional[bool] = Form(default=False),
|
| 238 |
+
redo_inline_math: Optional[bool] = Form(default=False),
|
| 239 |
+
disable_image_extraction: Optional[bool] = Form(default=False),
|
| 240 |
+
debug: Optional[bool] = Form(default=False),
|
| 241 |
+
processors: Optional[str] = Form(default=None),
|
| 242 |
+
config_json: Optional[str] = Form(default=None),
|
| 243 |
+
converter_cls: Optional[str] = Form(default=None),
|
| 244 |
+
llm_service: Optional[str] = Form(default=None),
|
| 245 |
file: UploadFile = File(
|
| 246 |
..., description="The PDF file to convert.", media_type="application/pdf"
|
| 247 |
),
|
|
|
|
| 258 |
force_ocr=force_ocr,
|
| 259 |
paginate_output=paginate_output,
|
| 260 |
output_format=output_format,
|
| 261 |
+
output_dir=output_dir,
|
| 262 |
+
use_llm=use_llm,
|
| 263 |
+
block_correction_prompt=block_correction_prompt,
|
| 264 |
+
strip_existing_ocr=strip_existing_ocr,
|
| 265 |
+
redo_inline_math=redo_inline_math,
|
| 266 |
+
disable_image_extraction=disable_image_extraction,
|
| 267 |
+
debug=debug,
|
| 268 |
+
processors=processors,
|
| 269 |
+
config_json=config_json,
|
| 270 |
+
converter_cls=converter_cls,
|
| 271 |
+
llm_service=llm_service,
|
| 272 |
)
|
| 273 |
results = await _convert_pdf(params)
|
| 274 |
os.remove(upload_path)
|