Victor Yang
committed on
Commit
·
c408da0
1
Parent(s):
4381ae5
Update .gitignore to include Ruby and PDF files, enhance README with synchronous and asynchronous API endpoint details, and implement task management for PDF conversions in server.py with logging and error handling.
Browse files- .gitignore +5 -1
- README.md +49 -3
- marker/scripts/server.py +241 -20
.gitignore
CHANGED
|
@@ -176,4 +176,8 @@ cython_debug/
|
|
| 176 |
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
| 177 |
.idea/
|
| 178 |
|
| 179 |
-
.vscode/
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
| 177 |
.idea/
|
| 178 |
|
| 179 |
+
.vscode/
|
| 180 |
+
|
| 181 |
+
# Test scripts
|
| 182 |
+
*.rb
|
| 183 |
+
*.pdf
|
README.md
CHANGED
|
@@ -505,12 +505,58 @@ result = response.json()
|
|
| 505 |
|
| 506 |
### API Endpoints
|
| 507 |
|
| 508 |
-
|
| 509 |
-
- `GET /docs` - Interactive API documentation (no authentication required)
|
| 510 |
- `POST /marker` - Convert PDF using file path (requires authentication)
|
| 511 |
- `POST /marker/upload` - Convert PDF via file upload (requires authentication)
|
| 512 |
|
| 513 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 514 |
|
| 515 |
# Troubleshooting
|
| 516 |
|
|
|
|
| 505 |
|
| 506 |
### API Endpoints
|
| 507 |
|
| 508 |
+
**Synchronous Endpoints** (wait for result, may take a long time):
|
|
|
|
| 509 |
- `POST /marker` - Convert PDF using file path (requires authentication)
|
| 510 |
- `POST /marker/upload` - Convert PDF via file upload (requires authentication)
|
| 511 |
|
| 512 |
+
**Asynchronous Endpoints** (return immediately with task ID):
|
| 513 |
+
- `POST /marker/async` - Submit PDF conversion task using file path (returns task ID)
|
| 514 |
+
- `POST /marker/upload/async` - Submit PDF conversion task via file upload (returns task ID)
|
| 515 |
+
- `GET /marker/task/{task_id}` - Check task status and get results
|
| 516 |
+
- `DELETE /marker/task/{task_id}` - Delete a completed or failed task
|
| 517 |
+
|
| 518 |
+
**Other Endpoints**:
|
| 519 |
+
- `GET /` - API information page (no authentication required)
|
| 520 |
+
- `GET /docs` - Interactive API documentation (no authentication required)
|
| 521 |
+
|
| 522 |
+
All POST/GET/DELETE endpoints (except `/` and `/docs`) require the `Authorization: Bearer <token>` header with a valid token.
|
| 523 |
+
|
| 524 |
+
### Using Asynchronous Endpoints
|
| 525 |
+
|
| 526 |
+
For long-running conversions, use the async endpoints:
|
| 527 |
+
|
| 528 |
+
```python
|
| 529 |
+
import requests
|
| 530 |
+
import time
|
| 531 |
+
|
| 532 |
+
# 1. Submit task
|
| 533 |
+
url = "https://your-space.hf.space/marker/upload/async"
|
| 534 |
+
headers = {"Authorization": f"Bearer {token}"}
|
| 535 |
+
files = {"file": ("document.pdf", open("document.pdf", "rb"), "application/pdf")}
|
| 536 |
+
data = {"output_format": "markdown"}
|
| 537 |
+
|
| 538 |
+
response = requests.post(url, headers=headers, files=files, data=data)
|
| 539 |
+
task = response.json()
|
| 540 |
+
task_id = task["task_id"]
|
| 541 |
+
print(f"Task created: {task_id}")
|
| 542 |
+
|
| 543 |
+
# 2. Poll for status
|
| 544 |
+
status_url = f"https://your-space.hf.space/marker/task/{task_id}"
|
| 545 |
+
while True:
|
| 546 |
+
response = requests.get(status_url, headers=headers)
|
| 547 |
+
status = response.json()
|
| 548 |
+
|
| 549 |
+
if status["status"] == "completed":
|
| 550 |
+
result = status["result"]
|
| 551 |
+
print(f"Success! Output: {result['output'][:100]}...")
|
| 552 |
+
break
|
| 553 |
+
elif status["status"] == "failed":
|
| 554 |
+
print(f"Failed: {status.get('error', 'Unknown error')}")
|
| 555 |
+
break
|
| 556 |
+
else:
|
| 557 |
+
print(f"Status: {status['status']}, elapsed: {status.get('elapsed_time', 0)}s")
|
| 558 |
+
time.sleep(2) # Wait 2 seconds before checking again
|
| 559 |
+
```
|
| 560 |
|
| 561 |
# Troubleshooting
|
| 562 |
|
marker/scripts/server.py
CHANGED
|
@@ -1,10 +1,17 @@
|
|
| 1 |
import traceback
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
import click
|
| 4 |
import os
|
| 5 |
|
| 6 |
from pydantic import BaseModel, Field
|
| 7 |
-
from starlette.responses import HTMLResponse
|
| 8 |
from fastapi import FastAPI, Form, File, UploadFile, Depends, HTTPException, status
|
| 9 |
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
|
| 10 |
|
|
@@ -13,13 +20,29 @@ from marker.output import text_from_rendered
|
|
| 13 |
|
| 14 |
import base64
|
| 15 |
from contextlib import asynccontextmanager
|
| 16 |
-
from typing import Optional, Annotated
|
| 17 |
import io
|
| 18 |
|
| 19 |
from marker.converters.pdf import PdfConverter
|
| 20 |
from marker.models import create_model_dict
|
| 21 |
from marker.settings import settings
|
| 22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
app_data = {}
|
| 24 |
|
| 25 |
|
|
@@ -174,18 +197,35 @@ class CommonParams(BaseModel):
|
|
| 174 |
] = None
|
| 175 |
|
| 176 |
|
| 177 |
-
|
|
|
|
| 178 |
assert params.output_format in ["markdown", "json", "html", "chunks"], (
|
| 179 |
"Invalid output format"
|
| 180 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 181 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 182 |
options = params.model_dump(exclude_none=True)
|
| 183 |
# Remove None values to avoid passing them to ConfigParser
|
| 184 |
options = {k: v for k, v in options.items() if v is not None}
|
|
|
|
|
|
|
| 185 |
config_parser = ConfigParser(options)
|
| 186 |
config_dict = config_parser.generate_config_dict()
|
| 187 |
config_dict["pdftext_workers"] = 1
|
| 188 |
converter_cls = config_parser.get_converter_cls()
|
|
|
|
|
|
|
| 189 |
converter = converter_cls(
|
| 190 |
config=config_dict,
|
| 191 |
artifact_dict=app_data["models"],
|
|
@@ -193,36 +233,76 @@ async def _convert_pdf(params: CommonParams):
|
|
| 193 |
renderer=config_parser.get_renderer(),
|
| 194 |
llm_service=config_parser.get_llm_service(),
|
| 195 |
)
|
|
|
|
|
|
|
|
|
|
| 196 |
rendered = converter(params.filepath)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 197 |
text, _, images = text_from_rendered(rendered)
|
| 198 |
metadata = rendered.metadata
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 199 |
except Exception as e:
|
|
|
|
| 200 |
traceback.print_exc()
|
| 201 |
-
|
| 202 |
"success": False,
|
| 203 |
"error": str(e),
|
| 204 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 205 |
|
| 206 |
-
encoded = {}
|
| 207 |
-
for k, v in images.items():
|
| 208 |
-
byte_stream = io.BytesIO()
|
| 209 |
-
v.save(byte_stream, format=settings.OUTPUT_IMAGE_FORMAT)
|
| 210 |
-
encoded[k] = base64.b64encode(byte_stream.getvalue()).decode(
|
| 211 |
-
settings.OUTPUT_ENCODING
|
| 212 |
-
)
|
| 213 |
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
"metadata": metadata,
|
| 219 |
-
"success": True,
|
| 220 |
-
}
|
| 221 |
|
| 222 |
|
| 223 |
@app.post("/marker")
|
| 224 |
async def convert_pdf(params: CommonParams, token_verified: bool = Depends(verify_token)):
|
| 225 |
-
|
|
|
|
|
|
|
| 226 |
|
| 227 |
|
| 228 |
@app.post("/marker/upload")
|
|
@@ -247,6 +327,7 @@ async def convert_pdf_upload(
|
|
| 247 |
),
|
| 248 |
token_verified: bool = Depends(verify_token),
|
| 249 |
):
|
|
|
|
| 250 |
upload_path = os.path.join(UPLOAD_DIRECTORY, file.filename)
|
| 251 |
with open(upload_path, "wb+") as upload_file:
|
| 252 |
file_contents = await file.read()
|
|
@@ -270,11 +351,151 @@ async def convert_pdf_upload(
|
|
| 270 |
converter_cls=converter_cls,
|
| 271 |
llm_service=llm_service,
|
| 272 |
)
|
| 273 |
-
|
|
|
|
| 274 |
os.remove(upload_path)
|
| 275 |
return results
|
| 276 |
|
| 277 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 278 |
@click.command()
|
| 279 |
@click.option("--port", type=int, default=8000, help="Port to run the server on")
|
| 280 |
@click.option("--host", type=str, default="127.0.0.1", help="Host to run the server on")
|
|
|
|
| 1 |
import traceback
|
| 2 |
+
import time
|
| 3 |
+
import logging
|
| 4 |
+
import uuid
|
| 5 |
+
import asyncio
|
| 6 |
+
from concurrent.futures import ThreadPoolExecutor
|
| 7 |
+
from threading import Lock
|
| 8 |
+
from enum import Enum
|
| 9 |
|
| 10 |
import click
|
| 11 |
import os
|
| 12 |
|
| 13 |
from pydantic import BaseModel, Field
|
| 14 |
+
from starlette.responses import HTMLResponse, JSONResponse
|
| 15 |
from fastapi import FastAPI, Form, File, UploadFile, Depends, HTTPException, status
|
| 16 |
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
|
| 17 |
|
|
|
|
| 20 |
|
| 21 |
import base64
|
| 22 |
from contextlib import asynccontextmanager
|
| 23 |
+
from typing import Optional, Annotated, Dict, Any
|
| 24 |
import io
|
| 25 |
|
| 26 |
from marker.converters.pdf import PdfConverter
|
| 27 |
from marker.models import create_model_dict
|
| 28 |
from marker.settings import settings
|
| 29 |
|
| 30 |
+
# Configure logging
|
| 31 |
+
logging.basicConfig(level=logging.INFO)
|
| 32 |
+
logger = logging.getLogger(__name__)
|
| 33 |
+
|
| 34 |
+
# Task status enum
|
| 35 |
+
class TaskStatus(str, Enum):
|
| 36 |
+
PENDING = "pending"
|
| 37 |
+
PROCESSING = "processing"
|
| 38 |
+
COMPLETED = "completed"
|
| 39 |
+
FAILED = "failed"
|
| 40 |
+
|
| 41 |
+
# Global task storage and executor
|
| 42 |
+
task_storage: Dict[str, Dict[str, Any]] = {}
|
| 43 |
+
task_lock = Lock()
|
| 44 |
+
executor = ThreadPoolExecutor(max_workers=2) # Limit concurrent conversions
|
| 45 |
+
|
| 46 |
app_data = {}
|
| 47 |
|
| 48 |
|
|
|
|
| 197 |
] = None
|
| 198 |
|
| 199 |
|
| 200 |
+
def _convert_pdf_sync(params: CommonParams, task_id: str):
|
| 201 |
+
"""Synchronous PDF conversion function to run in thread pool"""
|
| 202 |
assert params.output_format in ["markdown", "json", "html", "chunks"], (
|
| 203 |
"Invalid output format"
|
| 204 |
)
|
| 205 |
+
start_time = time.time()
|
| 206 |
+
logger.info(f"[Task {task_id}] Starting PDF conversion for file: {params.filepath}")
|
| 207 |
+
|
| 208 |
+
# Update task status
|
| 209 |
+
with task_lock:
|
| 210 |
+
task_storage[task_id]["status"] = TaskStatus.PROCESSING
|
| 211 |
+
task_storage[task_id]["started_at"] = time.time()
|
| 212 |
+
|
| 213 |
try:
|
| 214 |
+
# Get file size for logging
|
| 215 |
+
file_size = os.path.getsize(params.filepath) if params.filepath and os.path.exists(params.filepath) else 0
|
| 216 |
+
logger.info(f"[Task {task_id}] File size: {file_size / 1024:.2f} KB")
|
| 217 |
+
|
| 218 |
options = params.model_dump(exclude_none=True)
|
| 219 |
# Remove None values to avoid passing them to ConfigParser
|
| 220 |
options = {k: v for k, v in options.items() if v is not None}
|
| 221 |
+
logger.info(f"[Task {task_id}] Conversion options: {options}")
|
| 222 |
+
|
| 223 |
config_parser = ConfigParser(options)
|
| 224 |
config_dict = config_parser.generate_config_dict()
|
| 225 |
config_dict["pdftext_workers"] = 1
|
| 226 |
converter_cls = config_parser.get_converter_cls()
|
| 227 |
+
|
| 228 |
+
logger.info(f"[Task {task_id}] Creating converter...")
|
| 229 |
converter = converter_cls(
|
| 230 |
config=config_dict,
|
| 231 |
artifact_dict=app_data["models"],
|
|
|
|
| 233 |
renderer=config_parser.get_renderer(),
|
| 234 |
llm_service=config_parser.get_llm_service(),
|
| 235 |
)
|
| 236 |
+
|
| 237 |
+
logger.info(f"[Task {task_id}] Starting PDF conversion (this may take a while)...")
|
| 238 |
+
conversion_start = time.time()
|
| 239 |
rendered = converter(params.filepath)
|
| 240 |
+
conversion_time = time.time() - conversion_start
|
| 241 |
+
logger.info(f"[Task {task_id}] PDF conversion completed in {conversion_time:.2f} seconds")
|
| 242 |
+
|
| 243 |
+
logger.info(f"[Task {task_id}] Extracting text and images...")
|
| 244 |
text, _, images = text_from_rendered(rendered)
|
| 245 |
metadata = rendered.metadata
|
| 246 |
+
logger.info(f"[Task {task_id}] Extracted {len(text)} characters and {len(images)} images")
|
| 247 |
+
|
| 248 |
+
logger.info(f"[Task {task_id}] Encoding images...")
|
| 249 |
+
encoded = {}
|
| 250 |
+
for k, v in images.items():
|
| 251 |
+
byte_stream = io.BytesIO()
|
| 252 |
+
v.save(byte_stream, format=settings.OUTPUT_IMAGE_FORMAT)
|
| 253 |
+
encoded[k] = base64.b64encode(byte_stream.getvalue()).decode(
|
| 254 |
+
settings.OUTPUT_ENCODING
|
| 255 |
+
)
|
| 256 |
+
|
| 257 |
+
total_time = time.time() - start_time
|
| 258 |
+
logger.info(f"[Task {task_id}] Total processing time: {total_time:.2f} seconds")
|
| 259 |
+
|
| 260 |
+
result = {
|
| 261 |
+
"format": params.output_format,
|
| 262 |
+
"output": text,
|
| 263 |
+
"images": encoded,
|
| 264 |
+
"metadata": metadata,
|
| 265 |
+
"success": True,
|
| 266 |
+
"processing_time": round(total_time, 2),
|
| 267 |
+
}
|
| 268 |
+
|
| 269 |
+
# Update task with result
|
| 270 |
+
with task_lock:
|
| 271 |
+
task_storage[task_id]["status"] = TaskStatus.COMPLETED
|
| 272 |
+
task_storage[task_id]["result"] = result
|
| 273 |
+
task_storage[task_id]["completed_at"] = time.time()
|
| 274 |
+
|
| 275 |
+
return result
|
| 276 |
+
|
| 277 |
except Exception as e:
|
| 278 |
+
logger.error(f"[Task {task_id}] Error during conversion: {str(e)}")
|
| 279 |
traceback.print_exc()
|
| 280 |
+
error_result = {
|
| 281 |
"success": False,
|
| 282 |
"error": str(e),
|
| 283 |
}
|
| 284 |
+
|
| 285 |
+
# Update task with error
|
| 286 |
+
with task_lock:
|
| 287 |
+
task_storage[task_id]["status"] = TaskStatus.FAILED
|
| 288 |
+
task_storage[task_id]["result"] = error_result
|
| 289 |
+
task_storage[task_id]["completed_at"] = time.time()
|
| 290 |
+
task_storage[task_id]["error"] = str(e)
|
| 291 |
+
|
| 292 |
+
return error_result
|
| 293 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 294 |
|
| 295 |
+
async def _convert_pdf_async(params: CommonParams, task_id: str):
|
| 296 |
+
"""Async wrapper for PDF conversion"""
|
| 297 |
+
loop = asyncio.get_event_loop()
|
| 298 |
+
return await loop.run_in_executor(executor, _convert_pdf_sync, params, task_id)
|
|
|
|
|
|
|
|
|
|
| 299 |
|
| 300 |
|
| 301 |
@app.post("/marker")
|
| 302 |
async def convert_pdf(params: CommonParams, token_verified: bool = Depends(verify_token)):
|
| 303 |
+
"""Synchronous endpoint - returns result directly (may take a long time)"""
|
| 304 |
+
task_id = f"sync-{uuid.uuid4().hex[:8]}"
|
| 305 |
+
return await _convert_pdf_async(params, task_id)
|
| 306 |
|
| 307 |
|
| 308 |
@app.post("/marker/upload")
|
|
|
|
| 327 |
),
|
| 328 |
token_verified: bool = Depends(verify_token),
|
| 329 |
):
|
| 330 |
+
"""Synchronous upload endpoint - returns result directly (may take a long time)"""
|
| 331 |
upload_path = os.path.join(UPLOAD_DIRECTORY, file.filename)
|
| 332 |
with open(upload_path, "wb+") as upload_file:
|
| 333 |
file_contents = await file.read()
|
|
|
|
| 351 |
converter_cls=converter_cls,
|
| 352 |
llm_service=llm_service,
|
| 353 |
)
|
| 354 |
+
task_id = f"upload-{uuid.uuid4().hex[:8]}"
|
| 355 |
+
results = await _convert_pdf_async(params, task_id)
|
| 356 |
os.remove(upload_path)
|
| 357 |
return results
|
| 358 |
|
| 359 |
|
| 360 |
+
@app.post("/marker/upload/async")
|
| 361 |
+
async def convert_pdf_upload_async(
|
| 362 |
+
page_range: Optional[str] = Form(default=None),
|
| 363 |
+
force_ocr: Optional[bool] = Form(default=False),
|
| 364 |
+
paginate_output: Optional[bool] = Form(default=False),
|
| 365 |
+
output_format: Optional[str] = Form(default="markdown"),
|
| 366 |
+
output_dir: Optional[str] = Form(default=None),
|
| 367 |
+
use_llm: Optional[bool] = Form(default=False),
|
| 368 |
+
block_correction_prompt: Optional[str] = Form(default=None),
|
| 369 |
+
strip_existing_ocr: Optional[bool] = Form(default=False),
|
| 370 |
+
redo_inline_math: Optional[bool] = Form(default=False),
|
| 371 |
+
disable_image_extraction: Optional[bool] = Form(default=False),
|
| 372 |
+
debug: Optional[bool] = Form(default=False),
|
| 373 |
+
processors: Optional[str] = Form(default=None),
|
| 374 |
+
config_json: Optional[str] = Form(default=None),
|
| 375 |
+
converter_cls: Optional[str] = Form(default=None),
|
| 376 |
+
llm_service: Optional[str] = Form(default=None),
|
| 377 |
+
file: UploadFile = File(
|
| 378 |
+
..., description="The PDF file to convert.", media_type="application/pdf"
|
| 379 |
+
),
|
| 380 |
+
token_verified: bool = Depends(verify_token),
|
| 381 |
+
):
|
| 382 |
+
"""Asynchronous upload endpoint - returns task ID immediately"""
|
| 383 |
+
task_id = str(uuid.uuid4())
|
| 384 |
+
upload_path = os.path.join(UPLOAD_DIRECTORY, f"{task_id}_{file.filename}")
|
| 385 |
+
|
| 386 |
+
# Save uploaded file
|
| 387 |
+
with open(upload_path, "wb+") as upload_file:
|
| 388 |
+
file_contents = await file.read()
|
| 389 |
+
upload_file.write(file_contents)
|
| 390 |
+
|
| 391 |
+
params = CommonParams(
|
| 392 |
+
filepath=upload_path,
|
| 393 |
+
page_range=page_range,
|
| 394 |
+
force_ocr=force_ocr,
|
| 395 |
+
paginate_output=paginate_output,
|
| 396 |
+
output_format=output_format,
|
| 397 |
+
output_dir=output_dir,
|
| 398 |
+
use_llm=use_llm,
|
| 399 |
+
block_correction_prompt=block_correction_prompt,
|
| 400 |
+
strip_existing_ocr=strip_existing_ocr,
|
| 401 |
+
redo_inline_math=redo_inline_math,
|
| 402 |
+
disable_image_extraction=disable_image_extraction,
|
| 403 |
+
debug=debug,
|
| 404 |
+
processors=processors,
|
| 405 |
+
config_json=config_json,
|
| 406 |
+
converter_cls=converter_cls,
|
| 407 |
+
llm_service=llm_service,
|
| 408 |
+
)
|
| 409 |
+
|
| 410 |
+
# Initialize task
|
| 411 |
+
with task_lock:
|
| 412 |
+
task_storage[task_id] = {
|
| 413 |
+
"status": TaskStatus.PENDING,
|
| 414 |
+
"created_at": time.time(),
|
| 415 |
+
"params": params.model_dump(),
|
| 416 |
+
"result": None,
|
| 417 |
+
"error": None,
|
| 418 |
+
"upload_path": upload_path, # Store path for cleanup
|
| 419 |
+
}
|
| 420 |
+
|
| 421 |
+
# Start conversion in background
|
| 422 |
+
asyncio.create_task(_convert_pdf_async_with_cleanup(params, task_id, upload_path))
|
| 423 |
+
|
| 424 |
+
return {
|
| 425 |
+
"task_id": task_id,
|
| 426 |
+
"status": TaskStatus.PENDING,
|
| 427 |
+
"message": "Task created. Use /marker/task/{task_id} to check status and get results.",
|
| 428 |
+
"status_url": f"/marker/task/{task_id}",
|
| 429 |
+
}
|
| 430 |
+
|
| 431 |
+
|
| 432 |
+
async def _convert_pdf_async_with_cleanup(params: CommonParams, task_id: str, upload_path: str):
|
| 433 |
+
"""Async wrapper with file cleanup"""
|
| 434 |
+
try:
|
| 435 |
+
await _convert_pdf_async(params, task_id)
|
| 436 |
+
finally:
|
| 437 |
+
# Clean up uploaded file
|
| 438 |
+
try:
|
| 439 |
+
if os.path.exists(upload_path):
|
| 440 |
+
os.remove(upload_path)
|
| 441 |
+
except Exception as e:
|
| 442 |
+
logger.warning(f"[Task {task_id}] Failed to cleanup file {upload_path}: {e}")
|
| 443 |
+
|
| 444 |
+
|
| 445 |
+
@app.get("/marker/task/{task_id}")
|
| 446 |
+
async def get_task_status(task_id: str, token_verified: bool = Depends(verify_token)):
|
| 447 |
+
"""Get task status and result"""
|
| 448 |
+
with task_lock:
|
| 449 |
+
task = task_storage.get(task_id)
|
| 450 |
+
|
| 451 |
+
if not task:
|
| 452 |
+
raise HTTPException(status_code=404, detail="Task not found")
|
| 453 |
+
|
| 454 |
+
response = {
|
| 455 |
+
"task_id": task_id,
|
| 456 |
+
"status": task["status"],
|
| 457 |
+
"created_at": task.get("created_at"),
|
| 458 |
+
"started_at": task.get("started_at"),
|
| 459 |
+
"completed_at": task.get("completed_at"),
|
| 460 |
+
}
|
| 461 |
+
|
| 462 |
+
if task["status"] == TaskStatus.COMPLETED:
|
| 463 |
+
response["result"] = task["result"]
|
| 464 |
+
elif task["status"] == TaskStatus.FAILED:
|
| 465 |
+
response["error"] = task.get("error", "Unknown error")
|
| 466 |
+
response["result"] = task.get("result")
|
| 467 |
+
|
| 468 |
+
# Calculate elapsed time if processing
|
| 469 |
+
if task["status"] == TaskStatus.PROCESSING and "started_at" in task:
|
| 470 |
+
response["elapsed_time"] = round(time.time() - task["started_at"], 2)
|
| 471 |
+
|
| 472 |
+
return response
|
| 473 |
+
|
| 474 |
+
|
| 475 |
+
@app.delete("/marker/task/{task_id}")
|
| 476 |
+
async def delete_task(task_id: str, token_verified: bool = Depends(verify_token)):
|
| 477 |
+
"""Delete a completed or failed task"""
|
| 478 |
+
with task_lock:
|
| 479 |
+
task = task_storage.get(task_id)
|
| 480 |
+
if not task:
|
| 481 |
+
raise HTTPException(status_code=404, detail="Task not found")
|
| 482 |
+
|
| 483 |
+
if task["status"] == TaskStatus.PROCESSING:
|
| 484 |
+
raise HTTPException(status_code=400, detail="Cannot delete task that is currently processing")
|
| 485 |
+
|
| 486 |
+
# Clean up uploaded file if exists
|
| 487 |
+
upload_path = task.get("upload_path")
|
| 488 |
+
if upload_path and os.path.exists(upload_path):
|
| 489 |
+
try:
|
| 490 |
+
os.remove(upload_path)
|
| 491 |
+
except Exception as e:
|
| 492 |
+
logger.warning(f"Failed to cleanup file {upload_path}: {e}")
|
| 493 |
+
|
| 494 |
+
del task_storage[task_id]
|
| 495 |
+
|
| 496 |
+
return {"message": "Task deleted successfully"}
|
| 497 |
+
|
| 498 |
+
|
| 499 |
@click.command()
|
| 500 |
@click.option("--port", type=int, default=8000, help="Port to run the server on")
|
| 501 |
@click.option("--host", type=str, default="127.0.0.1", help="Host to run the server on")
|