Spaces:
Running
Running
| from __future__ import annotations | |
| import platform | |
| import time | |
| from datetime import datetime, timezone | |
| from fastapi import APIRouter, Depends | |
| from app.api.deps import get_extraction_service, require_auth | |
| from app.config import get_settings | |
| from app.core.constants import ( | |
| ARCHIVE_EXTENSIONS, | |
| AUDIO_EXTENSIONS, | |
| DOCUMENT_EXTENSIONS, | |
| IMAGE_EXTENSIONS, | |
| OFFICE_EXTENSIONS, | |
| SUPPORTED_EXTENSIONS, | |
| TEXT_EXTENSIONS, | |
| WEB_EXTENSIONS, | |
| ) | |
| from app.models.schemas import ( | |
| HealthResponse, | |
| InfoResponse, | |
| SpacyLabelsResponse, | |
| SupportedFormatsResponse, | |
| ) | |
| from app.services.extraction_service import ExtractionService | |
# API router instance exposed by this module.
router = APIRouter()

# Application settings, fetched once when this module is imported.
_settings = get_settings()

# Wall-clock time captured at import; handlers subtract it from the current
# time to report how long the module has been loaded.
_START_TIME = time.time()
async def health(token: str = Depends(require_auth)):
    """Report that the service is up.

    Args:
        token: Value resolved by the ``require_auth`` dependency. It is not
            read inside this function.

    Returns:
        A ``HealthResponse`` carrying ``status="ok"``, the configured
        application version, the seconds elapsed since ``_START_TIME``
        (rounded to two decimals) and the current UTC time as an ISO-format
        string.
    """
    elapsed = round(time.time() - _START_TIME, 2)
    now_utc = datetime.now(timezone.utc)
    return HealthResponse(
        success=True,
        status="ok",
        version=_settings.app_version,
        uptime_seconds=elapsed,
        timestamp=now_utc.isoformat(),
    )
async def info(token: str = Depends(require_auth)):
    """Describe the running application and its host environment.

    Args:
        token: Value resolved by the ``require_auth`` dependency. It is not
            read inside this function.

    Returns:
        An ``InfoResponse`` with the application name and version from the
        settings, the interpreter version, the operating-system name, the
        uptime in seconds (two decimals), the configured upload limit, the
        number of supported extensions and the current UTC time as an
        ISO-format string.
    """
    # Gather the runtime facts first so the response construction reads flat.
    interpreter = platform.python_version()
    os_name = platform.system()
    elapsed = round(time.time() - _START_TIME, 2)
    extension_count = len(SUPPORTED_EXTENSIONS)
    stamp = datetime.now(timezone.utc).isoformat()

    return InfoResponse(
        success=True,
        app=_settings.app_name,
        version=_settings.app_version,
        python_version=interpreter,
        platform=os_name,
        uptime_seconds=elapsed,
        max_upload_mb=_settings.max_upload_mb,
        supported_extensions=extension_count,
        timestamp=stamp,
    )
async def list_formats(token: str = Depends(require_auth)):
    """List every supported file extension, overall and grouped by category.

    Args:
        token: Value resolved by the ``require_auth`` dependency. It is not
            read inside this function.

    Returns:
        A ``SupportedFormatsResponse`` with the total number of supported
        extensions, all of them sorted, and a mapping from category name to
        the sorted supported extensions that belong to that category.
    """
    # Category name -> collection of extensions that defines the category.
    # "data" has no dedicated constant in scope here, so it is spelled inline.
    category_members = {
        "documents": DOCUMENT_EXTENSIONS,
        "office": OFFICE_EXTENSIONS,
        "data": {".csv", ".json", ".xml"},
        "web": WEB_EXTENSIONS,
        "text": TEXT_EXTENSIONS,
        "images": IMAGE_EXTENSIONS,
        "audio": AUDIO_EXTENSIONS,
        "archives": ARCHIVE_EXTENSIONS,
    }

    # Keep only extensions that are actually supported, sorted per category.
    grouped = {}
    for category, members in category_members.items():
        grouped[category] = sorted(
            ext for ext in SUPPORTED_EXTENSIONS if ext in members
        )

    return SupportedFormatsResponse(
        success=True,
        total_count=len(SUPPORTED_EXTENSIONS),
        all_extensions=sorted(SUPPORTED_EXTENSIONS),
        by_category=grouped,
    )
async def list_spacy_labels(
    token: str = Depends(require_auth),
    extraction_service: ExtractionService = Depends(get_extraction_service),
):
    """Describe the extraction sources a client can use in a field mapping.

    Args:
        token: Value resolved by the ``require_auth`` dependency. It is not
            read inside this function.
        extraction_service: Service resolved by ``get_extraction_service``;
            its ``get_spacy_labels()`` result is returned as-is.

    Returns:
        A ``SpacyLabelsResponse`` containing the labels reported by the
        extraction service, a short description of each ``source_type`` and
        a set of example mappings.
    """
    # Example regex for e-mail addresses. The TLD class is ``[A-Za-z]``:
    # inside a character class ``|`` is a literal pipe, not alternation, so
    # the previous ``[A-Z|a-z]`` wrongly accepted "|" in the TLD.
    email_pattern = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"

    return SpacyLabelsResponse(
        success=True,
        spacy_labels=extraction_service.get_spacy_labels(),
        source_types={
            "entity": "Extract using spaCy NER labels (ORG, PERSON, DATE, etc.)",
            "regex": "Extract using custom regular expressions",
            "token_attr": "Extract using token attributes (text, pos_, tag_, etc.)",
        },
        example_mappings={
            "company": {"source_type": "entity", "label": "ORG"},
            "person": {"source_type": "entity", "label": "PERSON"},
            "date": {"source_type": "entity", "label": "DATE"},
            "money": {"source_type": "entity", "label": "MONEY"},
            "email": {
                "source_type": "regex",
                "pattern": email_pattern,
            },
            "phone": {"source_type": "regex", "pattern": r"\b\d{3}-\d{3}-\d{4}\b"},
        },
    )