Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| FastAPI server for hybrid PDF extraction. | |
| Endpoints: | |
| - GET /health | |
| - POST /extract-pdf-text (multipart: file, max_pages, ocr_lang) | |
| """ | |
| from __future__ import annotations | |
| import os | |
| import tempfile | |
| from pathlib import Path | |
| from typing import Optional | |
| from fastapi import FastAPI, File, Form, Header, HTTPException, UploadFile | |
| from fastapi.responses import JSONResponse | |
| try: | |
| from .extract_pdf_text import run as extract_run | |
| except ImportError: | |
| # Fallback when running as a plain script from this folder. | |
| from extract_pdf_text import run as extract_run | |
# FastAPI application instance for the PDF extraction service.
app = FastAPI(
    title="ScriptAI PDF Extractor API",
    version="1.0.0",
)
def root_health() -> dict:
    """Return a health payload that also names the extraction endpoint.

    Many platforms probe ``GET /`` for health checks, so this handler gives
    them a cheap, always-successful answer.

    Returns:
        A dict with ``ok``, ``service`` and ``endpoint`` entries.
    """
    status = dict(ok=True, service="pdf-extractor")
    status["endpoint"] = "/extract-pdf-text"
    return status
def health() -> dict:
    """Return the minimal health payload for this service.

    Returns:
        A dict with ``ok`` set to ``True`` and the ``service`` name.
    """
    status = {
        "ok": True,
        "service": "pdf-extractor",
    }
    return status
def ensure_authorized(authorization: Optional[str]) -> None:
    """Validate the ``Authorization`` header against the configured token.

    The expected token is read from the ``PYTHON_EXTRACTOR_TOKEN`` environment
    variable on every call. When that variable is unset or blank the check is
    disabled and every request is accepted.

    Args:
        authorization: Raw ``Authorization`` header value, or ``None`` when
            the client sent no such header.

    Raises:
        HTTPException: 401 when a token is configured and the header is
            missing, does not use the Bearer scheme, or carries a different
            token.
    """
    import hmac

    expected_token = (os.getenv("PYTHON_EXTRACTOR_TOKEN") or "").strip()
    if not expected_token:
        return

    scheme, _, credentials = (authorization or "").strip().partition(" ")
    received = credentials.strip()

    # HTTP authentication scheme names are case-insensitive, so "bearer" and
    # "BEARER" must be accepted alongside "Bearer".
    scheme_ok = scheme.lower() == "bearer"
    # Constant-time comparison so response timing does not reveal how much of
    # a guessed token matched. Compare bytes: compare_digest rejects str
    # arguments that contain non-ASCII characters.
    token_ok = hmac.compare_digest(
        received.encode("utf-8"),
        expected_token.encode("utf-8"),
    )
    if not (scheme_ok and token_ok):
        raise HTTPException(
            status_code=401,
            detail="Unauthorized",
            # A 401 response is expected to carry a challenge for the client.
            headers={"WWW-Authenticate": "Bearer"},
        )
async def extract_pdf_text(
    file: UploadFile = File(...),
    max_pages: int = Form(20),
    ocr_lang: str = Form("ind+eng"),
    authorization: Optional[str] = Header(default=None),
) -> JSONResponse:
    """Extract text from an uploaded PDF and return the extractor's payload.

    The upload is streamed to a temporary ``.pdf`` file, that path is handed
    to ``extract_run``, and the temporary file is removed afterwards.

    Args:
        file: Uploaded PDF (multipart form field).
        max_pages: Requested page limit; clamped to the range 1..80.
        ocr_lang: Language string forwarded unchanged to ``extract_run``.
        authorization: ``Authorization`` header value, checked by
            ``ensure_authorized`` before any work is done.

    Returns:
        A ``JSONResponse`` carrying the extractor payload, with status 200
        when its ``"success"`` entry is truthy and 422 otherwise. When an
        unexpected exception occurs, a fixed-shape error payload is returned
        with status 500.

    Raises:
        HTTPException: Propagated unchanged, e.g. the 401 raised by the
            authorization check.
    """
    import asyncio
    import functools

    ensure_authorized(authorization)
    max_pages = max(1, min(max_pages, 80))

    temp_path: Optional[Path] = None
    try:
        # delete=False: the file must outlive the ``with`` block so the
        # extractor can reopen it by path; it is removed in ``finally``.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            temp_path = Path(tmp.name)
            # Copy in 1 MiB chunks so large uploads are never held in memory.
            while chunk := await file.read(1024 * 1024):
                tmp.write(chunk)

        # ``extract_run`` is a plain synchronous call. Running it directly in
        # this coroutine would stall the event loop (and every other request,
        # health checks included) for the whole extraction, so hand it to the
        # loop's default thread pool instead.
        job = functools.partial(
            extract_run,
            str(temp_path),
            max_pages=max_pages,
            ocr_lang=ocr_lang,
        )
        payload = await asyncio.get_running_loop().run_in_executor(None, job)

        status = 200 if payload.get("success") else 422
        return JSONResponse(payload, status_code=status)
    except HTTPException:
        raise
    except Exception as exc:
        # NOTE(review): the raw exception text is returned to the client;
        # confirm that callers rely on it before trimming it down.
        return JSONResponse(
            {
                "success": False,
                "mode": "error",
                "engine": "none",
                "text": "",
                "error": str(exc),
            },
            status_code=500,
        )
    finally:
        # Nested so the temp file is removed even if closing the upload fails.
        try:
            await file.close()
        finally:
            if temp_path is not None:
                temp_path.unlink(missing_ok=True)
async def extract_pdf_text_root(
    file: UploadFile = File(...),
    max_pages: int = Form(20),
    ocr_lang: str = Form("ind+eng"),
    authorization: Optional[str] = Header(default=None),
) -> JSONResponse:
    """Alias of ``extract_pdf_text`` for clients that post to the base URL.

    Takes the same parameters as ``extract_pdf_text``, forwards them
    unchanged, and returns its response.
    """
    response = await extract_pdf_text(
        file=file,
        max_pages=max_pages,
        ocr_lang=ocr_lang,
        authorization=authorization,
    )
    return response
async def extract_pdf_text_api_alias(
    file: UploadFile = File(...),
    max_pages: int = Form(20),
    ocr_lang: str = Form("ind+eng"),
    authorization: Optional[str] = Header(default=None),
) -> JSONResponse:
    """Alias of ``extract_pdf_text`` used by the Laravel fallback endpoint list.

    Takes the same parameters as ``extract_pdf_text``, forwards them
    unchanged, and returns its response.
    """
    response = await extract_pdf_text(
        file=file,
        max_pages=max_pages,
        ocr_lang=ocr_lang,
        authorization=authorization,
    )
    return response
async def extract_pdf_text_legacy_alias(
    file: UploadFile = File(...),
    max_pages: int = Form(20),
    ocr_lang: str = Form("ind+eng"),
    authorization: Optional[str] = Header(default=None),
) -> JSONResponse:
    """Legacy alias of ``extract_pdf_text`` kept for older clients.

    Takes the same parameters as ``extract_pdf_text``, forwards them
    unchanged, and returns its response.
    """
    response = await extract_pdf_text(
        file=file,
        max_pages=max_pages,
        ocr_lang=ocr_lang,
        authorization=authorization,
    )
    return response