scriptai-backend / api_server.py
kodetr's picture
update
519d951 verified
#!/usr/bin/env python3
"""
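FastAPI server for hybrid PDF extraction.

Endpoints:
- GET / and GET /health: liveness checks.
- POST /extract-pdf-text (multipart: file, max_pages, ocr_lang)
- POST /, POST /api/extract-pdf-text, POST /extract/pdf-text: compatibility
  aliases of POST /extract-pdf-text.

When the PYTHON_EXTRACTOR_TOKEN environment variable is set, the POST
endpoints require an "Authorization: Bearer <token>" header.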
"""
from __future__ import annotations

import asyncio
import functools
import hmac
import os
import tempfile
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from typing import Optional

from fastapi import FastAPI, File, Form, Header, HTTPException, UploadFile
from fastapi.responses import JSONResponse
try:
from .extract_pdf_text import run as extract_run
except ImportError:
# Fallback when running as a plain script from this folder.
from extract_pdf_text import run as extract_run
# FastAPI application instance; the route decorators in this module register
# their handlers on it.
app = FastAPI(title="ScriptAI PDF Extractor API", version="1.0.0")
@app.get("/")
def root_health() -> dict:
    """Answer ``GET /`` with a static status payload.

    Many hosting platforms probe the root path for health checks, so this
    route reports the same status as ``/health`` and additionally names the
    extraction endpoint.
    """
    status = {"ok": True, "service": "pdf-extractor"}
    status["endpoint"] = "/extract-pdf-text"
    return status
@app.get("/health")
def health() -> dict:
    """Return a static payload indicating that the service is running."""
    return dict(ok=True, service="pdf-extractor")
def ensure_authorized(authorization: Optional[str]) -> None:
    """Validate the ``Authorization`` header against the configured token.

    The expected token is read from the ``PYTHON_EXTRACTOR_TOKEN`` environment
    variable on every call. When that variable is unset or blank,
    authentication is disabled and every request is accepted.

    Args:
        authorization: Raw ``Authorization`` header value, or ``None`` when
            the header is absent. Expected form: ``Bearer <token>``.

    Raises:
        HTTPException: 401 when a token is configured and the header is
            missing, uses a scheme other than Bearer, or carries a different
            token.
    """
    expected_token = (os.getenv("PYTHON_EXTRACTOR_TOKEN") or "").strip()
    if not expected_token:
        return

    scheme, separator, credentials = (authorization or "").strip().partition(" ")
    # Authentication scheme names are case-insensitive (RFC 7235), so
    # "bearer <token>" is accepted as well as "Bearer <token>".
    if not separator or scheme.lower() != "bearer":
        raise HTTPException(status_code=401, detail="Unauthorized")

    # Constant-time comparison so response timing does not reveal how much of
    # the token matched. Bytes are compared because compare_digest rejects
    # non-ASCII str arguments.
    received = credentials.strip()
    if not hmac.compare_digest(
        received.encode("utf-8"), expected_token.encode("utf-8")
    ):
        raise HTTPException(status_code=401, detail="Unauthorized")
# Extraction runs on one dedicated worker thread. This keeps the event loop
# (and therefore the health-check routes) responsive while a PDF is being
# processed, and still runs at most one extraction at a time per process, as
# the previous direct call inside the coroutine did. extract_run's
# thread-safety is not visible from this file, so calls are not parallelized.
_EXTRACT_EXECUTOR = ThreadPoolExecutor(max_workers=1, thread_name_prefix="pdf-extract")


@app.post("/extract-pdf-text")
async def extract_pdf_text(
    file: UploadFile = File(...),
    max_pages: int = Form(20),
    ocr_lang: str = Form("ind+eng"),
    authorization: Optional[str] = Header(default=None),
) -> JSONResponse:
    """Extract text from an uploaded PDF and return the extractor's payload.

    The upload is streamed to a temporary ``.pdf`` file, handed to
    ``extract_run`` by path, and the temporary file is removed afterwards.

    Args:
        file: Uploaded PDF (multipart form field ``file``).
        max_pages: Page limit passed to the extractor; clamped to 1..80.
        ocr_lang: OCR language string passed through to the extractor
            unchanged.
        authorization: ``Authorization`` header, checked by
            ``ensure_authorized``.

    Returns:
        The extractor payload with status 200 when its ``success`` entry is
        truthy, otherwise 422. Any unexpected exception yields status 500
        with an error payload carrying ``str(exc)``.

    Raises:
        HTTPException: 401 from ``ensure_authorized`` when the token check
            fails.
    """
    ensure_authorized(authorization)
    max_pages = max(1, min(max_pages, 80))
    temp_path: Optional[Path] = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            temp_path = Path(tmp.name)
            # Copy the upload in 1 MiB chunks to bound memory use.
            while chunk := await file.read(1024 * 1024):
                tmp.write(chunk)

        # The temp file is closed (and so flushed) before the extractor opens
        # it by path.
        extract = functools.partial(
            extract_run, str(temp_path), max_pages=max_pages, ocr_lang=ocr_lang
        )
        loop = asyncio.get_running_loop()
        payload = await loop.run_in_executor(_EXTRACT_EXECUTOR, extract)

        status = 200 if payload.get("success") else 422
        return JSONResponse(payload, status_code=status)
    except HTTPException:
        raise
    except Exception as exc:
        return JSONResponse(
            {
                "success": False,
                "mode": "error",
                "engine": "none",
                "text": "",
                "error": str(exc),
            },
            status_code=500,
        )
    finally:
        # Remove the temp file even if closing the upload fails.
        try:
            await file.close()
        finally:
            if temp_path is not None:
                temp_path.unlink(missing_ok=True)
@app.post("/")
async def extract_pdf_text_root(
    file: UploadFile = File(...),
    max_pages: int = Form(20),
    ocr_lang: str = Form("ind+eng"),
    authorization: Optional[str] = Header(default=None),
) -> JSONResponse:
    """Serve ``POST /`` by delegating to ``extract_pdf_text``.

    Alias kept so clients that post straight to the base URL keep working.
    """
    response = await extract_pdf_text(
        file=file,
        max_pages=max_pages,
        ocr_lang=ocr_lang,
        authorization=authorization,
    )
    return response
@app.post("/api/extract-pdf-text")
async def extract_pdf_text_api_alias(
    file: UploadFile = File(...),
    max_pages: int = Form(20),
    ocr_lang: str = Form("ind+eng"),
    authorization: Optional[str] = Header(default=None),
) -> JSONResponse:
    """Serve ``POST /api/extract-pdf-text`` by delegating to ``extract_pdf_text``.

    Compatibility alias used by the Laravel fallback endpoint list.
    """
    response = await extract_pdf_text(
        file=file,
        max_pages=max_pages,
        ocr_lang=ocr_lang,
        authorization=authorization,
    )
    return response
@app.post("/extract/pdf-text")
async def extract_pdf_text_legacy_alias(
    file: UploadFile = File(...),
    max_pages: int = Form(20),
    ocr_lang: str = Form("ind+eng"),
    authorization: Optional[str] = Header(default=None),
) -> JSONResponse:
    """Serve ``POST /extract/pdf-text`` by delegating to ``extract_pdf_text``.

    Legacy compatibility alias used by older clients.
    """
    response = await extract_pdf_text(
        file=file,
        max_pages=max_pages,
        ocr_lang=ocr_lang,
        authorization=authorization,
    )
    return response