"""API routes — OCR extraction, export, and health endpoints. Endpoint summary: GET / → HF health probe GET /health → detailed health check POST /extract → plain text extraction POST /extract-structured → full structured extraction with tables POST /export → export edited handbook to PDF/DOCX/HTML/JSON POST /save → persist edited handbook (delegates to PHP) """ from __future__ import annotations import logging import os import shutil import tempfile from pathlib import Path from fastapi import APIRouter, File, HTTPException, UploadFile from fastapi.responses import Response from app.core.config import get_settings from app.schemas.extraction import ( ErrorResponse, ExportRequest, ExtractionResult, HealthResponse, PlainExtractionResult, SaveHandbookRequest, ) logger = logging.getLogger(__name__) router = APIRouter() # ── Allowed MIME types ── _ALLOWED_TYPES = { "application/pdf", "image/png", "image/jpeg", "image/tiff", "image/bmp", "image/webp", } _ALLOWED_EXTENSIONS = {".pdf", ".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp", ".webp"} def _validate_upload(upload: UploadFile) -> None: """Reject files that are too large or have disallowed types.""" settings = get_settings() # Extension check ext = Path(upload.filename or "").suffix.lower() if ext not in _ALLOWED_EXTENSIONS: raise HTTPException( status_code=400, detail=f"Unsupported file extension: {ext}. Allowed: {', '.join(sorted(_ALLOWED_EXTENSIONS))}", ) # Content-type check (loose — browsers may send generic types) ct = (upload.content_type or "").lower() if ct and ct != "application/octet-stream" and ct not in _ALLOWED_TYPES: raise HTTPException( status_code=400, detail=f"Unsupported content type: {ct}", ) def _save_upload(upload: UploadFile) -> Path: """Save the uploaded file to a temp location and return its path.""" settings = get_settings() os.makedirs(settings.upload_dir, exist_ok=True) suffix = Path(upload.filename or "upload").suffix or ".pdf" tmp = tempfile.NamedTemporaryFile( dir=settings.upload_dir, suffix=suffix, delete=False, ) try: shutil.copyfileobj(upload.file, tmp) tmp.close() return Path(tmp.name) except Exception: tmp.close() os.unlink(tmp.name) raise # ── Health ── @router.get("/", tags=["system"]) async def root(): return {"status": "ok"} @router.get("/health", response_model=HealthResponse, tags=["system"]) async def health_check(): settings = get_settings() from app.services.ocr_extractor import tesseract_available pymupdf_ok = False try: import fitz # noqa: F401 pymupdf_ok = True except ImportError: pass return HealthResponse( status="ok", service=settings.app_name, version=settings.app_version, tesseract_available=tesseract_available(), pymupdf_available=pymupdf_ok, ) # ── Plain extraction ── @router.post( "/extract", response_model=PlainExtractionResult, tags=["extraction"], summary="Extract plain text from an uploaded handbook", ) async def extract_plain_text(file: UploadFile = File(...)): """Upload a PDF or image and receive plain text back.""" _validate_upload(file) tmp_path = _save_upload(file) try: from app.services.extraction_pipeline import extract_plain result = extract_plain(str(tmp_path)) return result except Exception as exc: logger.exception("Plain extraction failed") raise HTTPException(status_code=500, detail=str(exc)) finally: tmp_path.unlink(missing_ok=True) # ── Structured extraction ── @router.post( "/extract-structured", response_model=ExtractionResult, tags=["extraction"], summary="Extract structured content (headings, paragraphs, tables) from an uploaded handbook", ) async def extract_structured(file: UploadFile = File(...)): """Upload a PDF or image and receive structured blocks, tables, and metadata.""" _validate_upload(file) tmp_path = _save_upload(file) try: from app.services.extraction_pipeline import extract_structured as _do result = _do(str(tmp_path)) return result except Exception as exc: logger.exception("Structured extraction failed") raise HTTPException(status_code=500, detail=str(exc)) finally: tmp_path.unlink(missing_ok=True) # ── Export ── @router.post( "/export", tags=["export"], summary="Export edited handbook to PDF, DOCX, HTML, or JSON", ) async def export_handbook(req: ExportRequest): """Receive edited page data and return the exported file.""" try: from app.services.export_service import export_document data, content_type, ext = export_document(req) filename = f"{req.title or 'handbook'}.{ext}" return Response( content=data, media_type=content_type, headers={ "Content-Disposition": f'attachment; filename="{filename}"', "Content-Length": str(len(data)), }, ) except Exception as exc: logger.exception("Export failed") raise HTTPException(status_code=500, detail=str(exc)) # ── Save (delegates to PHP backend) ── @router.post( "/save", tags=["persistence"], summary="Save edited handbook to the platform database", ) async def save_handbook(req: SaveHandbookRequest): """Forward the edited handbook data to the PHP backend for persistence.""" settings = get_settings() import httpx payload = { "action": "save_import", "document_id": req.document_id, "title": req.title, "pages": [p.model_dump(mode="json") for p in req.pages], } try: async with httpx.AsyncClient(timeout=30) as client: resp = await client.post(settings.handbook_import_url, json=payload) resp.raise_for_status() return resp.json() except httpx.HTTPStatusError as exc: logger.error("PHP save failed: %s", exc.response.text[:500]) raise HTTPException(status_code=502, detail="Backend save failed") except Exception as exc: logger.exception("Save request failed") raise HTTPException(status_code=500, detail=str(exc))