Victor Yang commited on
Commit
c408da0
·
1 Parent(s): 4381ae5

Update .gitignore to include Ruby and PDF files, enhance README with synchronous and asynchronous API endpoint details, and implement task management for PDF conversions in server.py with logging and error handling.

Browse files
Files changed (3) hide show
  1. .gitignore +5 -1
  2. README.md +49 -3
  3. marker/scripts/server.py +241 -20
.gitignore CHANGED
@@ -176,4 +176,8 @@ cython_debug/
176
  # option (not recommended) you can uncomment the following to ignore the entire idea folder.
177
  .idea/
178
 
179
- .vscode/
 
 
 
 
 
176
  # option (not recommended) you can uncomment the following to ignore the entire idea folder.
177
  .idea/
178
 
179
+ .vscode/
180
+
181
+ # Test scripts
182
+ *.rb
183
+ *.pdf
README.md CHANGED
@@ -505,12 +505,58 @@ result = response.json()
505
 
506
  ### API Endpoints
507
 
508
- - `GET /` - API information page (no authentication required)
509
- - `GET /docs` - Interactive API documentation (no authentication required)
510
  - `POST /marker` - Convert PDF using file path (requires authentication)
511
  - `POST /marker/upload` - Convert PDF via file upload (requires authentication)
512
 
513
- All POST endpoints require the `Authorization: Bearer <token>` header with a valid token.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
514
 
515
  # Troubleshooting
516
 
 
505
 
506
  ### API Endpoints
507
 
508
+ **Synchronous Endpoints** (wait for result, may take a long time):
 
509
  - `POST /marker` - Convert PDF using file path (requires authentication)
510
  - `POST /marker/upload` - Convert PDF via file upload (requires authentication)
511
 
512
+ **Asynchronous Endpoints** (return immediately with task ID):
513
+ - `POST /marker/async` - Submit PDF conversion task using file path (returns task ID)
514
+ - `POST /marker/upload/async` - Submit PDF conversion task via file upload (returns task ID)
515
+ - `GET /marker/task/{task_id}` - Check task status and get results
516
+ - `DELETE /marker/task/{task_id}` - Delete a completed or failed task
517
+
518
+ **Other Endpoints**:
519
+ - `GET /` - API information page (no authentication required)
520
+ - `GET /docs` - Interactive API documentation (no authentication required)
521
+
522
+ All POST/GET/DELETE endpoints (except `/` and `/docs`) require the `Authorization: Bearer <token>` header with a valid token.
523
+
524
+ ### Using Asynchronous Endpoints
525
+
526
+ For long-running conversions, use the async endpoints:
527
+
528
+ ```python
529
+ import requests
530
+ import time
531
+
532
+ # 1. Submit task
533
+ url = "https://your-space.hf.space/marker/upload/async"
534
+ headers = {"Authorization": f"Bearer {token}"}
535
+ files = {"file": ("document.pdf", open("document.pdf", "rb"), "application/pdf")}
536
+ data = {"output_format": "markdown"}
537
+
538
+ response = requests.post(url, headers=headers, files=files, data=data)
539
+ task = response.json()
540
+ task_id = task["task_id"]
541
+ print(f"Task created: {task_id}")
542
+
543
+ # 2. Poll for status
544
+ status_url = f"https://your-space.hf.space/marker/task/{task_id}"
545
+ while True:
546
+ response = requests.get(status_url, headers=headers)
547
+ status = response.json()
548
+
549
+ if status["status"] == "completed":
550
+ result = status["result"]
551
+ print(f"Success! Output: {result['output'][:100]}...")
552
+ break
553
+ elif status["status"] == "failed":
554
+ print(f"Failed: {status.get('error', 'Unknown error')}")
555
+ break
556
+ else:
557
+ print(f"Status: {status['status']}, elapsed: {status.get('elapsed_time', 0)}s")
558
+ time.sleep(2) # Wait 2 seconds before checking again
559
+ ```
560
 
561
  # Troubleshooting
562
 
marker/scripts/server.py CHANGED
@@ -1,10 +1,17 @@
1
  import traceback
 
 
 
 
 
 
 
2
 
3
  import click
4
  import os
5
 
6
  from pydantic import BaseModel, Field
7
- from starlette.responses import HTMLResponse
8
  from fastapi import FastAPI, Form, File, UploadFile, Depends, HTTPException, status
9
  from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
10
 
@@ -13,13 +20,29 @@ from marker.output import text_from_rendered
13
 
14
  import base64
15
  from contextlib import asynccontextmanager
16
- from typing import Optional, Annotated
17
  import io
18
 
19
  from marker.converters.pdf import PdfConverter
20
  from marker.models import create_model_dict
21
  from marker.settings import settings
22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  app_data = {}
24
 
25
 
@@ -174,18 +197,35 @@ class CommonParams(BaseModel):
174
  ] = None
175
 
176
 
177
- async def _convert_pdf(params: CommonParams):
 
178
  assert params.output_format in ["markdown", "json", "html", "chunks"], (
179
  "Invalid output format"
180
  )
 
 
 
 
 
 
 
 
181
  try:
 
 
 
 
182
  options = params.model_dump(exclude_none=True)
183
  # Remove None values to avoid passing them to ConfigParser
184
  options = {k: v for k, v in options.items() if v is not None}
 
 
185
  config_parser = ConfigParser(options)
186
  config_dict = config_parser.generate_config_dict()
187
  config_dict["pdftext_workers"] = 1
188
  converter_cls = config_parser.get_converter_cls()
 
 
189
  converter = converter_cls(
190
  config=config_dict,
191
  artifact_dict=app_data["models"],
@@ -193,36 +233,76 @@ async def _convert_pdf(params: CommonParams):
193
  renderer=config_parser.get_renderer(),
194
  llm_service=config_parser.get_llm_service(),
195
  )
 
 
 
196
  rendered = converter(params.filepath)
 
 
 
 
197
  text, _, images = text_from_rendered(rendered)
198
  metadata = rendered.metadata
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
  except Exception as e:
 
200
  traceback.print_exc()
201
- return {
202
  "success": False,
203
  "error": str(e),
204
  }
 
 
 
 
 
 
 
 
 
205
 
206
- encoded = {}
207
- for k, v in images.items():
208
- byte_stream = io.BytesIO()
209
- v.save(byte_stream, format=settings.OUTPUT_IMAGE_FORMAT)
210
- encoded[k] = base64.b64encode(byte_stream.getvalue()).decode(
211
- settings.OUTPUT_ENCODING
212
- )
213
 
214
- return {
215
- "format": params.output_format,
216
- "output": text,
217
- "images": encoded,
218
- "metadata": metadata,
219
- "success": True,
220
- }
221
 
222
 
223
  @app.post("/marker")
224
  async def convert_pdf(params: CommonParams, token_verified: bool = Depends(verify_token)):
225
- return await _convert_pdf(params)
 
 
226
 
227
 
228
  @app.post("/marker/upload")
@@ -247,6 +327,7 @@ async def convert_pdf_upload(
247
  ),
248
  token_verified: bool = Depends(verify_token),
249
  ):
 
250
  upload_path = os.path.join(UPLOAD_DIRECTORY, file.filename)
251
  with open(upload_path, "wb+") as upload_file:
252
  file_contents = await file.read()
@@ -270,11 +351,151 @@ async def convert_pdf_upload(
270
  converter_cls=converter_cls,
271
  llm_service=llm_service,
272
  )
273
- results = await _convert_pdf(params)
 
274
  os.remove(upload_path)
275
  return results
276
 
277
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
278
  @click.command()
279
  @click.option("--port", type=int, default=8000, help="Port to run the server on")
280
  @click.option("--host", type=str, default="127.0.0.1", help="Host to run the server on")
 
1
  import traceback
2
+ import time
3
+ import logging
4
+ import uuid
5
+ import asyncio
6
+ from concurrent.futures import ThreadPoolExecutor
7
+ from threading import Lock
8
+ from enum import Enum
9
 
10
  import click
11
  import os
12
 
13
  from pydantic import BaseModel, Field
14
+ from starlette.responses import HTMLResponse, JSONResponse
15
  from fastapi import FastAPI, Form, File, UploadFile, Depends, HTTPException, status
16
  from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
17
 
 
20
 
21
  import base64
22
  from contextlib import asynccontextmanager
23
+ from typing import Optional, Annotated, Dict, Any
24
  import io
25
 
26
  from marker.converters.pdf import PdfConverter
27
  from marker.models import create_model_dict
28
  from marker.settings import settings
29
 
30
+ # Configure logging
31
+ logging.basicConfig(level=logging.INFO)
32
+ logger = logging.getLogger(__name__)
33
+
34
+ # Task status enum
35
+ class TaskStatus(str, Enum):
36
+ PENDING = "pending"
37
+ PROCESSING = "processing"
38
+ COMPLETED = "completed"
39
+ FAILED = "failed"
40
+
41
+ # Global task storage and executor
42
+ task_storage: Dict[str, Dict[str, Any]] = {}
43
+ task_lock = Lock()
44
+ executor = ThreadPoolExecutor(max_workers=2) # Limit concurrent conversions
45
+
46
  app_data = {}
47
 
48
 
 
197
  ] = None
198
 
199
 
200
+ def _convert_pdf_sync(params: CommonParams, task_id: str):
201
+ """Synchronous PDF conversion function to run in thread pool"""
202
  assert params.output_format in ["markdown", "json", "html", "chunks"], (
203
  "Invalid output format"
204
  )
205
+ start_time = time.time()
206
+ logger.info(f"[Task {task_id}] Starting PDF conversion for file: {params.filepath}")
207
+
208
+ # Update task status
209
+ with task_lock:
210
+ task_storage[task_id]["status"] = TaskStatus.PROCESSING
211
+ task_storage[task_id]["started_at"] = time.time()
212
+
213
  try:
214
+ # Get file size for logging
215
+ file_size = os.path.getsize(params.filepath) if params.filepath and os.path.exists(params.filepath) else 0
216
+ logger.info(f"[Task {task_id}] File size: {file_size / 1024:.2f} KB")
217
+
218
  options = params.model_dump(exclude_none=True)
219
  # Remove None values to avoid passing them to ConfigParser
220
  options = {k: v for k, v in options.items() if v is not None}
221
+ logger.info(f"[Task {task_id}] Conversion options: {options}")
222
+
223
  config_parser = ConfigParser(options)
224
  config_dict = config_parser.generate_config_dict()
225
  config_dict["pdftext_workers"] = 1
226
  converter_cls = config_parser.get_converter_cls()
227
+
228
+ logger.info(f"[Task {task_id}] Creating converter...")
229
  converter = converter_cls(
230
  config=config_dict,
231
  artifact_dict=app_data["models"],
 
233
  renderer=config_parser.get_renderer(),
234
  llm_service=config_parser.get_llm_service(),
235
  )
236
+
237
+ logger.info(f"[Task {task_id}] Starting PDF conversion (this may take a while)...")
238
+ conversion_start = time.time()
239
  rendered = converter(params.filepath)
240
+ conversion_time = time.time() - conversion_start
241
+ logger.info(f"[Task {task_id}] PDF conversion completed in {conversion_time:.2f} seconds")
242
+
243
+ logger.info(f"[Task {task_id}] Extracting text and images...")
244
  text, _, images = text_from_rendered(rendered)
245
  metadata = rendered.metadata
246
+ logger.info(f"[Task {task_id}] Extracted {len(text)} characters and {len(images)} images")
247
+
248
+ logger.info(f"[Task {task_id}] Encoding images...")
249
+ encoded = {}
250
+ for k, v in images.items():
251
+ byte_stream = io.BytesIO()
252
+ v.save(byte_stream, format=settings.OUTPUT_IMAGE_FORMAT)
253
+ encoded[k] = base64.b64encode(byte_stream.getvalue()).decode(
254
+ settings.OUTPUT_ENCODING
255
+ )
256
+
257
+ total_time = time.time() - start_time
258
+ logger.info(f"[Task {task_id}] Total processing time: {total_time:.2f} seconds")
259
+
260
+ result = {
261
+ "format": params.output_format,
262
+ "output": text,
263
+ "images": encoded,
264
+ "metadata": metadata,
265
+ "success": True,
266
+ "processing_time": round(total_time, 2),
267
+ }
268
+
269
+ # Update task with result
270
+ with task_lock:
271
+ task_storage[task_id]["status"] = TaskStatus.COMPLETED
272
+ task_storage[task_id]["result"] = result
273
+ task_storage[task_id]["completed_at"] = time.time()
274
+
275
+ return result
276
+
277
  except Exception as e:
278
+ logger.error(f"[Task {task_id}] Error during conversion: {str(e)}")
279
  traceback.print_exc()
280
+ error_result = {
281
  "success": False,
282
  "error": str(e),
283
  }
284
+
285
+ # Update task with error
286
+ with task_lock:
287
+ task_storage[task_id]["status"] = TaskStatus.FAILED
288
+ task_storage[task_id]["result"] = error_result
289
+ task_storage[task_id]["completed_at"] = time.time()
290
+ task_storage[task_id]["error"] = str(e)
291
+
292
+ return error_result
293
 
 
 
 
 
 
 
 
294
 
295
+ async def _convert_pdf_async(params: CommonParams, task_id: str):
296
+ """Async wrapper for PDF conversion"""
297
+ loop = asyncio.get_event_loop()
298
+ return await loop.run_in_executor(executor, _convert_pdf_sync, params, task_id)
 
 
 
299
 
300
 
301
  @app.post("/marker")
302
  async def convert_pdf(params: CommonParams, token_verified: bool = Depends(verify_token)):
303
+ """Synchronous endpoint - returns result directly (may take a long time)"""
304
+ task_id = f"sync-{uuid.uuid4().hex[:8]}"
305
+ return await _convert_pdf_async(params, task_id)
306
 
307
 
308
  @app.post("/marker/upload")
 
327
  ),
328
  token_verified: bool = Depends(verify_token),
329
  ):
330
+ """Synchronous upload endpoint - returns result directly (may take a long time)"""
331
  upload_path = os.path.join(UPLOAD_DIRECTORY, file.filename)
332
  with open(upload_path, "wb+") as upload_file:
333
  file_contents = await file.read()
 
351
  converter_cls=converter_cls,
352
  llm_service=llm_service,
353
  )
354
+ task_id = f"upload-{uuid.uuid4().hex[:8]}"
355
+ results = await _convert_pdf_async(params, task_id)
356
  os.remove(upload_path)
357
  return results
358
 
359
 
360
+ @app.post("/marker/upload/async")
361
+ async def convert_pdf_upload_async(
362
+ page_range: Optional[str] = Form(default=None),
363
+ force_ocr: Optional[bool] = Form(default=False),
364
+ paginate_output: Optional[bool] = Form(default=False),
365
+ output_format: Optional[str] = Form(default="markdown"),
366
+ output_dir: Optional[str] = Form(default=None),
367
+ use_llm: Optional[bool] = Form(default=False),
368
+ block_correction_prompt: Optional[str] = Form(default=None),
369
+ strip_existing_ocr: Optional[bool] = Form(default=False),
370
+ redo_inline_math: Optional[bool] = Form(default=False),
371
+ disable_image_extraction: Optional[bool] = Form(default=False),
372
+ debug: Optional[bool] = Form(default=False),
373
+ processors: Optional[str] = Form(default=None),
374
+ config_json: Optional[str] = Form(default=None),
375
+ converter_cls: Optional[str] = Form(default=None),
376
+ llm_service: Optional[str] = Form(default=None),
377
+ file: UploadFile = File(
378
+ ..., description="The PDF file to convert.", media_type="application/pdf"
379
+ ),
380
+ token_verified: bool = Depends(verify_token),
381
+ ):
382
+ """Asynchronous upload endpoint - returns task ID immediately"""
383
+ task_id = str(uuid.uuid4())
384
+ upload_path = os.path.join(UPLOAD_DIRECTORY, f"{task_id}_{file.filename}")
385
+
386
+ # Save uploaded file
387
+ with open(upload_path, "wb+") as upload_file:
388
+ file_contents = await file.read()
389
+ upload_file.write(file_contents)
390
+
391
+ params = CommonParams(
392
+ filepath=upload_path,
393
+ page_range=page_range,
394
+ force_ocr=force_ocr,
395
+ paginate_output=paginate_output,
396
+ output_format=output_format,
397
+ output_dir=output_dir,
398
+ use_llm=use_llm,
399
+ block_correction_prompt=block_correction_prompt,
400
+ strip_existing_ocr=strip_existing_ocr,
401
+ redo_inline_math=redo_inline_math,
402
+ disable_image_extraction=disable_image_extraction,
403
+ debug=debug,
404
+ processors=processors,
405
+ config_json=config_json,
406
+ converter_cls=converter_cls,
407
+ llm_service=llm_service,
408
+ )
409
+
410
+ # Initialize task
411
+ with task_lock:
412
+ task_storage[task_id] = {
413
+ "status": TaskStatus.PENDING,
414
+ "created_at": time.time(),
415
+ "params": params.model_dump(),
416
+ "result": None,
417
+ "error": None,
418
+ "upload_path": upload_path, # Store path for cleanup
419
+ }
420
+
421
+ # Start conversion in background
422
+ asyncio.create_task(_convert_pdf_async_with_cleanup(params, task_id, upload_path))
423
+
424
+ return {
425
+ "task_id": task_id,
426
+ "status": TaskStatus.PENDING,
427
+ "message": "Task created. Use /marker/task/{task_id} to check status and get results.",
428
+ "status_url": f"/marker/task/{task_id}",
429
+ }
430
+
431
+
432
+ async def _convert_pdf_async_with_cleanup(params: CommonParams, task_id: str, upload_path: str):
433
+ """Async wrapper with file cleanup"""
434
+ try:
435
+ await _convert_pdf_async(params, task_id)
436
+ finally:
437
+ # Clean up uploaded file
438
+ try:
439
+ if os.path.exists(upload_path):
440
+ os.remove(upload_path)
441
+ except Exception as e:
442
+ logger.warning(f"[Task {task_id}] Failed to cleanup file {upload_path}: {e}")
443
+
444
+
445
+ @app.get("/marker/task/{task_id}")
446
+ async def get_task_status(task_id: str, token_verified: bool = Depends(verify_token)):
447
+ """Get task status and result"""
448
+ with task_lock:
449
+ task = task_storage.get(task_id)
450
+
451
+ if not task:
452
+ raise HTTPException(status_code=404, detail="Task not found")
453
+
454
+ response = {
455
+ "task_id": task_id,
456
+ "status": task["status"],
457
+ "created_at": task.get("created_at"),
458
+ "started_at": task.get("started_at"),
459
+ "completed_at": task.get("completed_at"),
460
+ }
461
+
462
+ if task["status"] == TaskStatus.COMPLETED:
463
+ response["result"] = task["result"]
464
+ elif task["status"] == TaskStatus.FAILED:
465
+ response["error"] = task.get("error", "Unknown error")
466
+ response["result"] = task.get("result")
467
+
468
+ # Calculate elapsed time if processing
469
+ if task["status"] == TaskStatus.PROCESSING and "started_at" in task:
470
+ response["elapsed_time"] = round(time.time() - task["started_at"], 2)
471
+
472
+ return response
473
+
474
+
475
+ @app.delete("/marker/task/{task_id}")
476
+ async def delete_task(task_id: str, token_verified: bool = Depends(verify_token)):
477
+ """Delete a completed or failed task"""
478
+ with task_lock:
479
+ task = task_storage.get(task_id)
480
+ if not task:
481
+ raise HTTPException(status_code=404, detail="Task not found")
482
+
483
+ if task["status"] == TaskStatus.PROCESSING:
484
+ raise HTTPException(status_code=400, detail="Cannot delete task that is currently processing")
485
+
486
+ # Clean up uploaded file if exists
487
+ upload_path = task.get("upload_path")
488
+ if upload_path and os.path.exists(upload_path):
489
+ try:
490
+ os.remove(upload_path)
491
+ except Exception as e:
492
+ logger.warning(f"Failed to cleanup file {upload_path}: {e}")
493
+
494
+ del task_storage[task_id]
495
+
496
+ return {"message": "Task deleted successfully"}
497
+
498
+
499
  @click.command()
500
  @click.option("--port", type=int, default=8000, help="Port to run the server on")
501
  @click.option("--host", type=str, default="127.0.0.1", help="Host to run the server on")