# Source: handbook-ocr-engine / app/api/routes.py
# Uploaded by: internationalscholarsprogram
# Commit: "Initial deploy: ISP Handbook OCR Engine" (b12284c, verified)
"""API routes — OCR extraction, export, and health endpoints.

Endpoint summary:
    GET  /                   → HF health probe
    GET  /health             → detailed health check
    POST /extract            → plain text extraction
    POST /extract-structured → full structured extraction with tables
    POST /export             → export edited handbook to PDF/DOCX/HTML/JSON
    POST /save               → persist edited handbook (delegates to PHP)
"""
from __future__ import annotations
import logging
import os
import shutil
import tempfile
from pathlib import Path
from fastapi import APIRouter, File, HTTPException, UploadFile
from fastapi.responses import Response
from app.core.config import get_settings
from app.schemas.extraction import (
ErrorResponse,
ExportRequest,
ExtractionResult,
HealthResponse,
PlainExtractionResult,
SaveHandbookRequest,
)
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
# Router that the endpoint decorators in this module register their handlers on.
router = APIRouter()
# ── Allowed MIME types ──
_ALLOWED_TYPES = {
"application/pdf",
"image/png",
"image/jpeg",
"image/tiff",
"image/bmp",
"image/webp",
}
_ALLOWED_EXTENSIONS = {".pdf", ".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp", ".webp"}
def _validate_upload(upload: UploadFile) -> None:
    """Reject uploads whose file extension or content type is not allowed.

    NOTE(review): this function does not enforce any size limit; if one is
    required it has to be applied elsewhere (or added here once the relevant
    setting is known).

    Args:
        upload: The multipart file received by an endpoint.

    Raises:
        HTTPException: 400 when the filename's extension is not in
            ``_ALLOWED_EXTENSIONS``, or when a specific (non-generic)
            content type is supplied that is not in ``_ALLOWED_TYPES``.
    """
    # Extension check (a missing filename yields an empty extension and is rejected)
    ext = Path(upload.filename or "").suffix.lower()
    if ext not in _ALLOWED_EXTENSIONS:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported file extension: {ext}. Allowed: {', '.join(sorted(_ALLOWED_EXTENSIONS))}",
        )

    # Content-type check (loose — browsers may send generic types)
    ct = (upload.content_type or "").lower()
    # Compare only the media type itself: a header such as
    # "application/pdf; charset=binary" carries parameters after the ";"
    # that must not cause an otherwise valid upload to be rejected.
    media_type = ct.split(";", 1)[0].strip()
    if media_type and media_type != "application/octet-stream" and media_type not in _ALLOWED_TYPES:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported content type: {ct}",
        )
def _save_upload(upload: UploadFile) -> Path:
    """Persist an uploaded file under the configured upload directory.

    The upload directory is created if missing. The temporary file keeps the
    extension of the client-supplied filename, falling back to ``.pdf`` when
    the filename has none. The file is not deleted automatically; the caller
    owns the returned path.

    Args:
        upload: The multipart file whose stream is copied to disk.

    Returns:
        Path of the newly written temporary file.

    Raises:
        Exception: Any error raised while copying or closing is re-raised
            after the partially written file has been removed.
    """
    upload_dir = get_settings().upload_dir
    os.makedirs(upload_dir, exist_ok=True)

    client_name = Path(upload.filename or "upload")
    extension = client_name.suffix if client_name.suffix else ".pdf"

    handle = tempfile.NamedTemporaryFile(dir=upload_dir, suffix=extension, delete=False)
    destination = Path(handle.name)
    try:
        shutil.copyfileobj(upload.file, handle)
        handle.close()
    except Exception:
        # Do not leave a half-written file behind when the copy fails.
        handle.close()
        destination.unlink()
        raise
    return destination
# ── Health ──
@router.get("/", tags=["system"])
async def root():
    """Minimal probe endpoint: always answers with a static ``status: ok`` body."""
    probe_reply = {"status": "ok"}
    return probe_reply
@router.get("/health", response_model=HealthResponse, tags=["system"])
async def health_check():
    """Report service identity and whether the OCR dependencies are usable.

    Returns:
        A ``HealthResponse`` carrying the configured application name and
        version, the result of ``tesseract_available()``, and a flag telling
        whether the ``fitz`` module could be imported.
    """
    cfg = get_settings()
    from app.services.ocr_extractor import tesseract_available

    # Probe for the PDF backend by attempting the import; only a failed
    # import counts as "unavailable".
    try:
        import fitz  # noqa: F401
    except ImportError:
        has_pymupdf = False
    else:
        has_pymupdf = True

    return HealthResponse(
        status="ok",
        service=cfg.app_name,
        version=cfg.app_version,
        tesseract_available=tesseract_available(),
        pymupdf_available=has_pymupdf,
    )
# ── Plain extraction ──
@router.post(
    "/extract",
    response_model=PlainExtractionResult,
    tags=["extraction"],
    summary="Extract plain text from an uploaded handbook",
)
async def extract_plain_text(file: UploadFile = File(...)):
    """Upload a PDF or image and receive plain text back.

    The upload is validated, written to a temporary file, handed to
    ``extract_plain`` and the temporary file is always removed afterwards.

    Args:
        file: The uploaded document (multipart form field ``file``).

    Returns:
        Whatever ``extract_plain`` returns, serialised through the
        ``PlainExtractionResult`` response model.

    Raises:
        HTTPException: 400 from upload validation; any ``HTTPException``
            raised by the pipeline is passed through unchanged; every other
            failure is logged and reported as a 500.
    """
    _validate_upload(file)
    tmp_path = _save_upload(file)
    try:
        from app.services.extraction_pipeline import extract_plain

        return extract_plain(str(tmp_path))
    except HTTPException:
        # A deliberate HTTP error must keep its own status code instead of
        # being rewritten into a generic 500 by the handler below.
        raise
    except Exception as exc:
        logger.exception("Plain extraction failed")
        raise HTTPException(status_code=500, detail=str(exc)) from exc
    finally:
        tmp_path.unlink(missing_ok=True)
# ── Structured extraction ──
@router.post(
    "/extract-structured",
    response_model=ExtractionResult,
    tags=["extraction"],
    summary="Extract structured content (headings, paragraphs, tables) from an uploaded handbook",
)
async def extract_structured(file: UploadFile = File(...)):
    """Upload a PDF or image and receive structured blocks, tables, and metadata.

    The upload is validated, written to a temporary file, handed to the
    pipeline's ``extract_structured`` function and the temporary file is
    always removed afterwards.

    Args:
        file: The uploaded document (multipart form field ``file``).

    Returns:
        Whatever the pipeline returns, serialised through the
        ``ExtractionResult`` response model.

    Raises:
        HTTPException: 400 from upload validation; any ``HTTPException``
            raised by the pipeline is passed through unchanged; every other
            failure is logged and reported as a 500.
    """
    _validate_upload(file)
    tmp_path = _save_upload(file)
    try:
        # Aliased on import so it does not shadow this endpoint function.
        from app.services.extraction_pipeline import extract_structured as _do

        return _do(str(tmp_path))
    except HTTPException:
        # A deliberate HTTP error must keep its own status code instead of
        # being rewritten into a generic 500 by the handler below.
        raise
    except Exception as exc:
        logger.exception("Structured extraction failed")
        raise HTTPException(status_code=500, detail=str(exc)) from exc
    finally:
        tmp_path.unlink(missing_ok=True)
# ── Export ──
def _export_content_disposition(title: str | None, ext: str) -> str:
    """Build a header-safe ``Content-Disposition`` value for an exported file.

    The title is user-supplied, so characters that would corrupt the header
    are neutralised. For a plain ASCII title the result is exactly
    ``attachment; filename="<title>.<ext>"``.

    Args:
        title: Requested document title; empty/``None`` falls back to
            ``handbook``.
        ext: File extension without the leading dot.

    Returns:
        The header value, with an additional RFC 5987 ``filename*`` parameter
        when the filename contains non-ASCII characters.
    """
    import re
    from urllib.parse import quote

    # Control characters allow header injection, quotes/backslashes break the
    # quoted-string, and path separators are meaningless in a download name.
    stem = re.sub(r'[\x00-\x1f\x7f"\\/]+', "_", title or "").strip(" .")
    filename = f"{stem or 'handbook'}.{ext}"

    # Header values must be encodable as latin-1; give legacy clients an
    # ASCII-only fallback and modern clients the exact UTF-8 name.
    ascii_name = re.sub(r"[^\x20-\x7e]", "_", filename)
    value = f'attachment; filename="{ascii_name}"'
    if ascii_name != filename:
        value += f"; filename*=UTF-8''{quote(filename, safe='')}"
    return value


@router.post(
    "/export",
    tags=["export"],
    summary="Export edited handbook to PDF, DOCX, HTML, or JSON",
)
async def export_handbook(req: ExportRequest):
    """Receive edited page data and return the exported file.

    Args:
        req: The export request; it is passed as-is to ``export_document``
            and its ``title`` is used to name the download.

    Returns:
        A ``Response`` whose body, media type and file extension come from
        ``export_document``, sent as an attachment.

    Raises:
        HTTPException: Any ``HTTPException`` raised during export is passed
            through unchanged; every other failure is logged and reported
            as a 500.
    """
    try:
        from app.services.export_service import export_document

        data, content_type, ext = export_document(req)
        return Response(
            content=data,
            media_type=content_type,
            headers={
                "Content-Disposition": _export_content_disposition(req.title, ext),
                "Content-Length": str(len(data)),
            },
        )
    except HTTPException:
        # A deliberate HTTP error must keep its own status code instead of
        # being rewritten into a generic 500 by the handler below.
        raise
    except Exception as exc:
        logger.exception("Export failed")
        raise HTTPException(status_code=500, detail=str(exc)) from exc
# ── Save (delegates to PHP backend) ──
@router.post(
    "/save",
    tags=["persistence"],
    summary="Save edited handbook to the platform database",
)
async def save_handbook(req: SaveHandbookRequest):
    """Forward the edited handbook to the PHP backend and relay its JSON reply.

    The request is repackaged as a ``save_import`` action and POSTed as JSON
    to the configured ``handbook_import_url`` with a 30 second timeout.

    Args:
        req: The edited handbook (document id, title and pages).

    Returns:
        The decoded JSON body of the backend's response.

    Raises:
        HTTPException: 502 when the backend answers with an error status
            (the first 500 characters of its body are logged); 500 for any
            other failure while sending the request or decoding the reply.
    """
    cfg = get_settings()
    import httpx

    body = {
        "action": "save_import",
        "document_id": req.document_id,
        "title": req.title,
        "pages": [page.model_dump(mode="json") for page in req.pages],
    }

    try:
        async with httpx.AsyncClient(timeout=30) as http:
            upstream = await http.post(cfg.handbook_import_url, json=body)
            upstream.raise_for_status()
            return upstream.json()
    except httpx.HTTPStatusError as status_err:
        # Only a truncated copy of the backend's body is logged; the client
        # receives a generic message.
        logger.error("PHP save failed: %s", status_err.response.text[:500])
        raise HTTPException(status_code=502, detail="Backend save failed")
    except Exception as err:
        logger.exception("Save request failed")
        raise HTTPException(status_code=500, detail=str(err))