"""Service metadata endpoints: health, server info, supported formats, spaCy labels."""

from __future__ import annotations

import platform
import time
from datetime import datetime, timezone

from fastapi import APIRouter, Depends

from app.api.deps import get_extraction_service, require_auth
from app.config import get_settings
from app.core.constants import (
    ARCHIVE_EXTENSIONS,
    AUDIO_EXTENSIONS,
    DOCUMENT_EXTENSIONS,
    IMAGE_EXTENSIONS,
    OFFICE_EXTENSIONS,
    SUPPORTED_EXTENSIONS,
    TEXT_EXTENSIONS,
    WEB_EXTENSIONS,
)
from app.models.schemas import (
    HealthResponse,
    InfoResponse,
    SpacyLabelsResponse,
    SupportedFormatsResponse,
)
from app.services.extraction_service import ExtractionService

router = APIRouter()
_settings = get_settings()

# Wall-clock time captured when this module is imported; uptime is measured
# relative to it.
_START_TIME = time.time()

# Extensions reported under the "data" category of /formats.
_DATA_EXTENSIONS = frozenset({".csv", ".json", ".xml"})

# Category name -> extension collection, in the order the categories are
# emitted by /formats. Only extensions that are also present in
# SUPPORTED_EXTENSIONS are reported for each category.
_FORMAT_CATEGORIES = {
    "documents": DOCUMENT_EXTENSIONS,
    "office": OFFICE_EXTENSIONS,
    "data": _DATA_EXTENSIONS,
    "web": WEB_EXTENSIONS,
    "text": TEXT_EXTENSIONS,
    "images": IMAGE_EXTENSIONS,
    "audio": AUDIO_EXTENSIONS,
    "archives": ARCHIVE_EXTENSIONS,
}


def _uptime_seconds() -> float:
    """Return seconds elapsed since module import, rounded to 2 decimals."""
    return round(time.time() - _START_TIME, 2)


def _utc_timestamp() -> str:
    """Return the current UTC time as an ISO 8601 string."""
    return datetime.now(timezone.utc).isoformat()


# NOTE: in every handler below the ``token`` parameter exists only so that
# FastAPI resolves the ``require_auth`` dependency; its value is never read.


@router.get("/health", response_model=HealthResponse, summary="Health check")
async def health(
    token: str = Depends(require_auth),
) -> HealthResponse:
    """Report service status, version, uptime and the current UTC timestamp."""
    return HealthResponse(
        success=True,
        status="ok",
        version=_settings.app_version,
        uptime_seconds=_uptime_seconds(),
        timestamp=_utc_timestamp(),
    )


@router.get(
    "/info",
    response_model=InfoResponse,
    summary="Server and environment information",
)
async def info(
    token: str = Depends(require_auth),
) -> InfoResponse:
    """Report application, interpreter and platform details."""
    return InfoResponse(
        success=True,
        app=_settings.app_name,
        version=_settings.app_version,
        python_version=platform.python_version(),
        platform=platform.system(),
        uptime_seconds=_uptime_seconds(),
        max_upload_mb=_settings.max_upload_mb,
        # A count of supported extensions, not the list itself.
        supported_extensions=len(SUPPORTED_EXTENSIONS),
        timestamp=_utc_timestamp(),
    )


@router.get(
    "/formats",
    response_model=SupportedFormatsResponse,
    summary="List supported file formats",
)
async def list_formats(
    token: str = Depends(require_auth),
) -> SupportedFormatsResponse:
    """List every supported extension, both flat and grouped by category."""
    by_category = {
        category: sorted(
            ext for ext in SUPPORTED_EXTENSIONS if ext in members
        )
        for category, members in _FORMAT_CATEGORIES.items()
    }
    return SupportedFormatsResponse(
        success=True,
        total_count=len(SUPPORTED_EXTENSIONS),
        all_extensions=sorted(SUPPORTED_EXTENSIONS),
        by_category=by_category,
    )


@router.get(
    "/spacy-labels",
    response_model=SpacyLabelsResponse,
    summary="List available spaCy NER labels",
)
async def list_spacy_labels(
    token: str = Depends(require_auth),
    extraction_service: ExtractionService = Depends(get_extraction_service),
) -> SpacyLabelsResponse:
    """List the spaCy labels from the extraction service with usage examples."""
    return SpacyLabelsResponse(
        success=True,
        spacy_labels=extraction_service.get_spacy_labels(),
        source_types={
            "entity": "Extract using spaCy NER labels (ORG, PERSON, DATE, etc.)",
            "regex": "Extract using custom regular expressions",
            "token_attr": "Extract using token attributes (text, pos_, tag_, etc.)",
        },
        example_mappings={
            "company": {"source_type": "entity", "label": "ORG"},
            "person": {"source_type": "entity", "label": "PERSON"},
            "date": {"source_type": "entity", "label": "DATE"},
            "money": {"source_type": "entity", "label": "MONEY"},
            "email": {
                "source_type": "regex",
                # TLD class is [A-Za-z]: a "|" inside a character class is a
                # literal pipe, not alternation, so it must not appear here.
                "pattern": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
            },
            "phone": {"source_type": "regex", "pattern": r"\b\d{3}-\d{3}-\d{4}\b"},
        },
    )