| """API routes — OCR extraction, export, and health endpoints.
|
|
|
| Endpoint summary:
|
| GET / — HF health probe
|
| GET /health — detailed health check
|
| POST /extract — plain text extraction
|
| POST /extract-structured — full structured extraction with tables
|
| POST /export — export edited handbook to PDF/DOCX/HTML/JSON
|
| POST /save — persist edited handbook (delegates to PHP)
|
| """
|
|
|
| from __future__ import annotations
|
|
|
| import logging
|
| import os
|
| import shutil
|
| import tempfile
|
| from pathlib import Path
|
|
|
| from fastapi import APIRouter, File, HTTPException, UploadFile
|
| from fastapi.responses import Response
|
|
|
| from app.core.config import get_settings
|
| from app.schemas.extraction import (
|
| ErrorResponse,
|
| ExportRequest,
|
| ExtractionResult,
|
| HealthResponse,
|
| PlainExtractionResult,
|
| SaveHandbookRequest,
|
| )
|
|
|
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)

# Router object on which every endpoint in this module is registered.
router = APIRouter()
|
|
|
|
|
|
|
# Declared MIME types accepted by the upload validator.
_ALLOWED_TYPES = {
    # documents
    "application/pdf",
    # raster images
    "image/bmp",
    "image/jpeg",
    "image/png",
    "image/tiff",
    "image/webp",
}

# Lower-case filename suffixes (leading dot included) accepted by the
# upload validator.
_ALLOWED_EXTENSIONS = {
    ".pdf",
    ".bmp",
    ".jpg",
    ".jpeg",
    ".png",
    ".tif",
    ".tiff",
    ".webp",
}
|
|
|
|
|
def _validate_upload(upload: UploadFile) -> None:
    """Reject uploads whose extension or declared content type is not allowed.

    Only the filename suffix and the client-declared content type are
    inspected. The size of the upload is NOT checked here.

    Args:
        upload: The incoming multipart file.

    Raises:
        HTTPException: 400 when the filename suffix is not in
            ``_ALLOWED_EXTENSIONS``, or when a content type is declared that
            is neither ``application/octet-stream`` nor in ``_ALLOWED_TYPES``.
    """
    ext = Path(upload.filename or "").suffix.lower()
    if ext not in _ALLOWED_EXTENSIONS:
        raise HTTPException(
            status_code=400,
            # An empty suffix would otherwise render as "extension: ." which
            # reads like a formatting glitch rather than a missing extension.
            detail=(
                f"Unsupported file extension: {ext or '(none)'}. "
                f"Allowed: {', '.join(sorted(_ALLOWED_EXTENSIONS))}"
            ),
        )

    # Compare only the bare media type: clients may append parameters
    # (e.g. "image/png; charset=binary") that must not cause a rejection.
    ct = (upload.content_type or "").split(";", 1)[0].strip().lower()
    # A missing or generic content type is tolerated; the extension check
    # above has already passed in that case.
    if ct and ct != "application/octet-stream" and ct not in _ALLOWED_TYPES:
        raise HTTPException(
            status_code=400,
            detail=f"Unsupported content type: {ct}",
        )
|
|
|
|
|
def _save_upload(upload: UploadFile) -> Path:
    """Copy the uploaded stream into a fresh temp file and return its path.

    The file is created inside the configured upload directory (created on
    demand) and is not deleted automatically; the caller owns its removal.
    If the copy fails, the partially written file is removed before the
    exception propagates.
    """
    upload_dir = get_settings().upload_dir
    os.makedirs(upload_dir, exist_ok=True)

    # Keep the client's extension; default to ".pdf" when the name has none.
    extension = Path(upload.filename or "upload").suffix or ".pdf"
    handle = tempfile.NamedTemporaryFile(
        dir=upload_dir,
        suffix=extension,
        delete=False,
    )
    try:
        shutil.copyfileobj(upload.file, handle)
        handle.close()
    except Exception:
        # Close first so the file can be unlinked on every platform.
        handle.close()
        os.unlink(handle.name)
        raise
    return Path(handle.name)
|
|
|
|
|
|
|
|
|
|
|
@router.get("/", tags=["system"])
async def root():
    # Minimal probe endpoint: always answers with a constant status payload.
    # Deliberately documented with a comment rather than a docstring so the
    # generated API description for this route stays as it was.
    return dict(status="ok")
|
|
|
|
|
@router.get("/health", response_model=HealthResponse, tags=["system"])
async def health_check():
    # Detailed health report: service name/version from settings, plus
    # whether the Tesseract and PyMuPDF backends can be used.
    settings = get_settings()
    from app.services.ocr_extractor import tesseract_available

    # PyMuPDF is considered available when its `fitz` module can be imported.
    try:
        import fitz  # noqa: F401 - imported only to probe availability
    except ImportError:
        has_pymupdf = False
    else:
        has_pymupdf = True

    return HealthResponse(
        status="ok",
        service=settings.app_name,
        version=settings.app_version,
        tesseract_available=tesseract_available(),
        pymupdf_available=has_pymupdf,
    )
|
|
|
|
|
|
|
|
|
|
|
@router.post(
    "/extract",
    response_model=PlainExtractionResult,
    tags=["extraction"],
    summary="Extract plain text from an uploaded handbook",
)
async def extract_plain_text(file: UploadFile = File(...)):
    """Upload a PDF or image and receive plain text back."""
    # Flow: validate the upload, spool it to a temp file, run the plain-text
    # pipeline on that file, and always delete the temp file afterwards.
    #
    # Errors:
    #   * 400 from _validate_upload for a disallowed extension/content type.
    #   * Any HTTPException raised while extracting is passed through as-is.
    #   * Every other failure is logged with its traceback and reported as
    #     500 with the exception text as detail.
    _validate_upload(file)
    tmp_path = _save_upload(file)
    try:
        from app.services.extraction_pipeline import extract_plain

        return extract_plain(str(tmp_path))
    except HTTPException:
        # Do not let the catch-all below rewrite a deliberate HTTP error
        # (and its status code) into a generic 500.
        raise
    except Exception as exc:
        logger.exception("Plain extraction failed")
        # Chain the cause so the original traceback stays attached.
        raise HTTPException(status_code=500, detail=str(exc)) from exc
    finally:
        tmp_path.unlink(missing_ok=True)
|
|
|
|
|
|
|
|
|
|
|
@router.post(
    "/extract-structured",
    response_model=ExtractionResult,
    tags=["extraction"],
    summary="Extract structured content (headings, paragraphs, tables) from an uploaded handbook",
)
async def extract_structured(file: UploadFile = File(...)):
    """Upload a PDF or image and receive structured blocks, tables, and metadata."""
    # Flow: validate the upload, spool it to a temp file, run the structured
    # pipeline on that file, and always delete the temp file afterwards.
    #
    # Errors:
    #   * 400 from _validate_upload for a disallowed extension/content type.
    #   * Any HTTPException raised while extracting is passed through as-is.
    #   * Every other failure is logged with its traceback and reported as
    #     500 with the exception text as detail.
    _validate_upload(file)
    tmp_path = _save_upload(file)
    try:
        # Imported under an alias because this endpoint has the same name.
        from app.services.extraction_pipeline import extract_structured as _do

        return _do(str(tmp_path))
    except HTTPException:
        # Do not let the catch-all below rewrite a deliberate HTTP error
        # (and its status code) into a generic 500.
        raise
    except Exception as exc:
        logger.exception("Structured extraction failed")
        # Chain the cause so the original traceback stays attached.
        raise HTTPException(status_code=500, detail=str(exc)) from exc
    finally:
        tmp_path.unlink(missing_ok=True)
|
|
|
|
|
|
|
|
|
|
|
def _attachment_disposition(title: str | None, ext: str) -> str:
    """Build a ``Content-Disposition: attachment`` value that is header-safe.

    The download name is ``<title or 'handbook'>.<ext>``. Because the title
    is user-supplied, the quoted ``filename`` parameter is restricted to
    printable ASCII without quotes, backslashes, or path separators; every
    other character is replaced by ``_``. When that replacement changed
    anything, the exact name is additionally sent percent-encoded in a
    ``filename*=UTF-8''...`` parameter. For a plain ASCII title the result
    is just ``attachment; filename="<name>"``.
    """
    import re
    from urllib.parse import quote

    filename = f"{title or 'handbook'}.{ext}"
    fallback = re.sub(r'[^\x20-\x7e]|["\\/]', "_", filename)
    header = f'attachment; filename="{fallback}"'
    if fallback != filename:
        header += f"; filename*=UTF-8''{quote(filename, safe='')}"
    return header


@router.post(
    "/export",
    tags=["export"],
    summary="Export edited handbook to PDF, DOCX, HTML, or JSON",
)
async def export_handbook(req: ExportRequest):
    """Receive edited page data and return the exported file."""
    # Flow: hand the request to export_document, which yields the payload,
    # its media type, and the file extension; reply with the payload as a
    # download named after the handbook title.
    #
    # Errors:
    #   * Any HTTPException raised while exporting is passed through as-is.
    #   * Every other failure is logged with its traceback and reported as
    #     500 with the exception text as detail.
    try:
        from app.services.export_service import export_document

        data, content_type, ext = export_document(req)

        return Response(
            content=data,
            media_type=content_type,
            headers={
                # Built by a helper so that an arbitrary title can neither
                # break the header syntax nor fail header encoding.
                "Content-Disposition": _attachment_disposition(req.title, ext),
                "Content-Length": str(len(data)),
            },
        )
    except HTTPException:
        # Keep deliberate HTTP errors (and their status codes) intact.
        raise
    except Exception as exc:
        logger.exception("Export failed")
        # Chain the cause so the original traceback stays attached.
        raise HTTPException(status_code=500, detail=str(exc)) from exc
|
|
|
|
|
|
|
|
|
|
|
@router.post(
    "/save",
    tags=["persistence"],
    summary="Save edited handbook to the platform database",
)
async def save_handbook(req: SaveHandbookRequest):
    """Forward the edited handbook data to the PHP backend for persistence."""
    # Flow: POST the document id, title, and JSON-dumped pages to the
    # configured import URL and relay the backend's JSON reply.
    #
    # Errors:
    #   * 502 when the backend answers with an error status, cannot be
    #     reached (connection failure, timeout, ...), or replies with a body
    #     that is not JSON. These are upstream faults, so the client gets a
    #     fixed message and the details go to the log only.
    #   * 500 with the exception text for anything else.
    settings = get_settings()
    import httpx

    payload = {
        "action": "save_import",
        "document_id": req.document_id,
        "title": req.title,
        "pages": [p.model_dump(mode="json") for p in req.pages],
    }

    try:
        async with httpx.AsyncClient(timeout=30) as client:
            resp = await client.post(settings.handbook_import_url, json=payload)
            resp.raise_for_status()
    except httpx.HTTPStatusError as exc:
        logger.error("PHP save failed: %s", exc.response.text[:500])
        raise HTTPException(status_code=502, detail="Backend save failed") from exc
    except httpx.RequestError as exc:
        # Transport-level failure: no usable response was received.
        logger.exception("PHP save request could not be completed")
        raise HTTPException(status_code=502, detail="Backend unreachable") from exc
    except Exception as exc:
        logger.exception("Save request failed")
        raise HTTPException(status_code=500, detail=str(exc)) from exc

    # The body was fully read by client.post(), so it can be decoded after
    # the client has been closed.
    try:
        return resp.json()
    except ValueError as exc:
        logger.error("PHP save returned a non-JSON body: %s", resp.text[:500])
        raise HTTPException(
            status_code=502,
            detail="Backend returned an invalid response",
        ) from exc
|
|
|