#!/usr/bin/env python3 """ FastAPI server for hybrid PDF extraction. Endpoints: - GET /health - POST /extract-pdf-text (multipart: file, max_pages, ocr_lang) """ from __future__ import annotations import os import tempfile from pathlib import Path from typing import Optional from fastapi import FastAPI, File, Form, Header, HTTPException, UploadFile from fastapi.responses import JSONResponse try: from .extract_pdf_text import run as extract_run except ImportError: # Fallback when running as a plain script from this folder. from extract_pdf_text import run as extract_run app = FastAPI(title="ScriptAI PDF Extractor API", version="1.0.0") @app.get("/") def root_health() -> dict: # Many platforms probe GET / for health checks. return {"ok": True, "service": "pdf-extractor", "endpoint": "/extract-pdf-text"} @app.get("/health") def health() -> dict: return {"ok": True, "service": "pdf-extractor"} def ensure_authorized(authorization: Optional[str]) -> None: expected_token = (os.getenv("PYTHON_EXTRACTOR_TOKEN") or "").strip() if expected_token == "": return bearer = (authorization or "").strip() if not bearer.startswith("Bearer "): raise HTTPException(status_code=401, detail="Unauthorized") received = bearer[7:].strip() if received != expected_token: raise HTTPException(status_code=401, detail="Unauthorized") @app.post("/extract-pdf-text") async def extract_pdf_text( file: UploadFile = File(...), max_pages: int = Form(20), ocr_lang: str = Form("ind+eng"), authorization: Optional[str] = Header(default=None), ) -> JSONResponse: ensure_authorized(authorization) max_pages = max(1, min(max_pages, 80)) suffix = ".pdf" temp_path: Optional[Path] = None try: with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp: temp_path = Path(tmp.name) while True: chunk = await file.read(1024 * 1024) if not chunk: break tmp.write(chunk) payload = extract_run(str(temp_path), max_pages=max_pages, ocr_lang=ocr_lang) status = 200 if payload.get("success") else 422 return JSONResponse(payload, status_code=status) except HTTPException: raise except Exception as exc: return JSONResponse( { "success": False, "mode": "error", "engine": "none", "text": "", "error": str(exc), }, status_code=500, ) finally: await file.close() if temp_path and temp_path.exists(): temp_path.unlink(missing_ok=True) @app.post("/") async def extract_pdf_text_root( file: UploadFile = File(...), max_pages: int = Form(20), ocr_lang: str = Form("ind+eng"), authorization: Optional[str] = Header(default=None), ) -> JSONResponse: # Alias endpoint to keep compatibility with simple base URL posting. return await extract_pdf_text(file=file, max_pages=max_pages, ocr_lang=ocr_lang, authorization=authorization) @app.post("/api/extract-pdf-text") async def extract_pdf_text_api_alias( file: UploadFile = File(...), max_pages: int = Form(20), ocr_lang: str = Form("ind+eng"), authorization: Optional[str] = Header(default=None), ) -> JSONResponse: # Compatibility alias used by Laravel fallback endpoint list. return await extract_pdf_text(file=file, max_pages=max_pages, ocr_lang=ocr_lang, authorization=authorization) @app.post("/extract/pdf-text") async def extract_pdf_text_legacy_alias( file: UploadFile = File(...), max_pages: int = Form(20), ocr_lang: str = Form("ind+eng"), authorization: Optional[str] = Header(default=None), ) -> JSONResponse: # Legacy compatibility alias used by older clients. return await extract_pdf_text(file=file, max_pages=max_pages, ocr_lang=ocr_lang, authorization=authorization)