# stubdude's picture
# Add document parser Docker service
# fbba60e
"""
Hosted document parser API for Fresh Catch Inventory.
Deploy to Hugging Face Spaces (Docker), Fly.io, or any VM with Python + Tesseract.
"""
from __future__ import annotations

import asyncio
import hmac
import json
import os
import subprocess
import sys
import tempfile
from pathlib import Path

from fastapi import FastAPI, File, Header, HTTPException, Query, UploadFile
from fastapi.middleware.cors import CORSMiddleware
# Directory two levels above the folder that contains this file.
REPO_ROOT = Path(__file__).resolve().parents[2]

# Parser script that the /parse endpoint runs in a child process.
PARSE_SCRIPT = REPO_ROOT.joinpath("scripts", "parse_vendor_document.py")

# Shared bearer secret; when empty, verify_auth() lets every request through.
SERVICE_SECRET = os.getenv("DOCUMENT_PARSER_SERVICE_SECRET", "").strip()
app = FastAPI(title="Fresh Catch Document Parser", version="1.0.0")

# DOCUMENT_PARSER_CORS_ORIGINS is a comma-separated list. Entries are trimmed
# so "https://a.example, https://b.example" matches both origins, and blank
# entries (e.g. from a trailing comma) are dropped.
_cors_origins = [
    origin.strip()
    for origin in os.environ.get("DOCUMENT_PARSER_CORS_ORIGINS", "*").split(",")
    if origin.strip()
]

app.add_middleware(
    CORSMiddleware,
    allow_origins=_cors_origins,
    # The CORS spec forbids combining credentialed requests with a wildcard
    # origin, so credentials are only enabled for an explicit origin list.
    # Clients authenticate with an explicit Authorization header, which does
    # not require credentialed CORS.
    allow_credentials="*" not in _cors_origins,
    allow_methods=["POST", "GET"],
    allow_headers=["*"],
)
def verify_auth(authorization: str | None) -> None:
    """Reject the request unless it carries the configured bearer secret.

    Authentication is skipped entirely when ``SERVICE_SECRET`` is empty.

    Args:
        authorization: Raw ``Authorization`` header value, or ``None`` when
            the header was not sent.

    Raises:
        HTTPException: 401 when a secret is configured and the header is
            missing or is not exactly ``Bearer <secret>``.
    """
    if not SERVICE_SECRET:
        return

    expected = f"Bearer {SERVICE_SECRET}".encode("utf-8")
    supplied = (authorization or "").encode("utf-8")
    # Constant-time comparison so response timing does not reveal how much of
    # the secret a guess got right. Bytes are used because compare_digest()
    # rejects non-ASCII str arguments.
    if not hmac.compare_digest(supplied, expected):
        raise HTTPException(status_code=401, detail="Unauthorized")
@app.get("/health")
def health() -> dict[str, str]:
    """Liveness probe: always answers with a fixed ``ok`` status payload."""
    payload = {"status": "ok"}
    return payload
@app.post("/parse")
async def parse_document(
    file: UploadFile = File(...),
    type: str = Query("auto", pattern="^(auto|invoice|receipt)$"),
    authorization: str | None = Header(default=None),
) -> dict:
    """Parse an uploaded document by running the parser script on it.

    The upload is written to a temporary file and passed to
    ``PARSE_SCRIPT`` in a child process; the script's stdout is decoded as
    JSON and returned as the response body.

    Args:
        file: Uploaded document. Its filename suffix (``.png`` when absent)
            is kept on the temporary file.
        type: Value forwarded to the script's ``--type`` option; one of
            ``auto``, ``invoice`` or ``receipt``. The name shadows the
            builtin because it is the public query-parameter name.
        authorization: ``Authorization`` header, checked by ``verify_auth``.

    Returns:
        The JSON value printed by the parser script.

    Raises:
        HTTPException: 401 when authentication fails, 400 for an empty
            upload, 504 when the parser exceeds the timeout, and 500 when
            the script is missing, exits non-zero, or prints invalid JSON.
    """
    verify_auth(authorization)
    if not PARSE_SCRIPT.exists():
        raise HTTPException(status_code=500, detail="parse_vendor_document.py not found")

    contents = await file.read()
    if not contents:
        raise HTTPException(status_code=400, detail="Empty file")

    # DOCUMENT_PARSER_TIMEOUT_MS is in milliseconds; true division keeps
    # sub-second values instead of truncating them to a zero timeout.
    timeout_seconds = int(os.environ.get("DOCUMENT_PARSER_TIMEOUT_MS", "120000")) / 1000

    suffix = Path(file.filename or "upload.png").suffix or ".png"
    tmp = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
    image_path = tmp.name
    try:
        # Write and close inside the try so the file is removed even when the
        # write itself fails; the child process reopens it by path.
        with tmp:
            tmp.write(contents)

        # subprocess.run() blocks until the child exits; running it in a
        # worker thread keeps the event loop free for other requests.
        completed = await asyncio.to_thread(
            subprocess.run,
            [sys.executable, str(PARSE_SCRIPT), "--image", image_path, "--type", type],
            capture_output=True,
            text=True,
            timeout=timeout_seconds,
            cwd=str(REPO_ROOT),
        )
    except subprocess.TimeoutExpired as error:
        raise HTTPException(
            status_code=504,
            detail=f"Parser timed out after {timeout_seconds:g} seconds",
        ) from error
    finally:
        Path(image_path).unlink(missing_ok=True)

    if completed.returncode != 0:
        detail = (completed.stderr or completed.stdout or "Parse failed").strip()
        # Cap the echoed parser output so error responses stay small.
        raise HTTPException(status_code=500, detail=detail[:2000])

    try:
        return json.loads(completed.stdout)
    except json.JSONDecodeError as error:
        raise HTTPException(status_code=500, detail=f"Invalid parser output: {error}") from error