""" Pydantic models for API request/response validation. Provides schemas for: - Document submission - Processing status - DCAT-AP results - Error responses """ from typing import List, Optional, Dict, Any from datetime import datetime, timezone from enum import Enum from pydantic import BaseModel, Field, HttpUrl, ConfigDict class DocumentType(str, Enum): """UNESCO document types.""" REPORT = "report" RESOLUTION = "resolution" DECISION = "decision" MEETING = "meeting" STRATEGY = "strategy" GUIDELINE = "guideline" class ProcessingStatus(str, Enum): """Pipeline processing status.""" PENDING = "pending" PARSING = "parsing" EXTRACTING = "extracting" GROUNDING = "grounding" FORMATTING = "formatting" COMPLETED = "completed" FAILED = "failed" class DocumentSubmission(BaseModel): """Request model for document submission.""" document_id: str = Field( ..., pattern=r"^[\w\.\-]{1,128}$", description="Unique document identifier (alphanumeric, hyphens, underscores, dots; 1–128 chars)", ) file_url: Optional[HttpUrl] = Field(None, description="URL to PDF file") file_content: Optional[str] = Field( None, description="Base64 encoded PDF content", ) text_content: Optional[str] = Field( None, max_length=500_000, # ~500 KB plain text description="Raw text to process (bypasses PDF parsing)", ) source_url: Optional[HttpUrl] = Field(None, description="Website URL to scrape for content") languages: Optional[List[str]] = Field(default=None, description="Document languages (ISO codes); auto-detected if omitted") document_family: Optional[str] = Field(None, description="UNESCO family (CLT, SHS, ED, etc.)") priority: str = Field(default="normal", description="Processing priority (high/normal/low)") webhook_url: Optional[HttpUrl] = Field(None, description="Webhook URL for completion notification") model_config = ConfigDict( json_schema_extra={ "example": { "document_id": "UNESDOC-2024-001", "text_content": "UNESCO Category 2 Centre focused on human rights education...", "source_url": "https://example-centre.org", "languages": ["en", "fr"], "document_family": "SHS", "priority": "normal" } } ) class BatchSubmission(BaseModel): """Request model for batch document submission.""" documents: List[DocumentSubmission] = Field(..., max_length=100) model_config = ConfigDict( json_schema_extra={ "example": { "documents": [ { "document_id": "DOC-001", "file_url": "https://example.com/1.pdf" }, { "document_id": "DOC-002", "file_url": "https://example.com/2.pdf" } ] } } ) class ProcessingResult(BaseModel): """Response model for processing status.""" document_id: str status: ProcessingStatus created_at: datetime updated_at: datetime current_stage: Optional[str] = None progress_percent: int = Field(0, ge=0, le=100) error_message: Optional[str] = None model_config = ConfigDict( json_schema_extra={ "example": { "document_id": "UNESDOC-2024-001", "status": "processing", "created_at": "2024-01-15T10:00:00Z", "updated_at": "2024-01-15T10:00:30Z", "current_stage": "extracting", "progress_percent": 45 } } ) class DCATEntity(BaseModel): """DCAT-AP entity representation.""" uri: str label: Optional[str] = None entity_type: Optional[str] = None # GLiNER2 label, e.g. "geo.country", "org.name" source: str confidence: float = Field(..., ge=0.0, le=1.0) class CountryEntity(BaseModel): """Country with ISO3 code representation.""" name: str = Field(..., description="Country name (e.g., 'France')") iso3: str = Field(..., description="ISO 3166-1 alpha-3 code (e.g., 'FRA')") iso2: Optional[str] = Field(None, description="ISO 3166-1 alpha-2 code (e.g., 'FR')") confidence: float = Field(1.0, ge=0.0, le=1.0) thesaurus_uri: Optional[str] = Field(None, description="UNESCO Thesaurus domain7 URI") class RegionEntity(BaseModel): """UNESCO region representation.""" name: str = Field(..., description="Region name (e.g., 'Africa')") entity_type: str = Field(default="region", description="Type: region, area, or continent") confidence: float = Field(1.0, ge=0.0, le=1.0) thesaurus_uri: Optional[str] = Field(None, description="UNESCO Thesaurus domain7 URI") class GeographicalCoverage(BaseModel): """Structured geographical coverage with ISO3 codes.""" countries: List[CountryEntity] = Field( default=[], description="Countries with ISO3 codes" ) regions: List[RegionEntity] = Field( default=[], description="UNESCO regions and areas (no ISO3)" ) iso3_codes: List[str] = Field( default=[], description="List of all ISO3 codes for filtering" ) model_config = ConfigDict( json_schema_extra={ "example": { "countries": [ {"name": "France", "iso3": "FRA", "iso2": "FR"}, {"name": "Germany", "iso3": "DEU", "iso2": "DE"} ], "regions": [ {"name": "Europe", "entity_type": "region"} ], "iso3_codes": ["FRA", "DEU"] } } ) class TimeCoverage(BaseModel): """Structured temporal coverage.""" year: Optional[int] = Field(None, description="Primary year") date: Optional[str] = Field(None, description="Full date (ISO 8601)") years_mentioned: List[int] = Field( default=[], description="All years mentioned in document" ) sessions: List[str] = Field( default=[], description="Biennium sessions (e.g., '2023-2024')" ) adoption_date: Optional[str] = Field( None, description="Document adoption date (ISO 8601)" ) model_config = ConfigDict( json_schema_extra={ "example": { "year": 2024, "date": "2024", "years_mentioned": [2024, 2025], "sessions": ["2023-2024"], "adoption_date": None } } ) class DCATResult(BaseModel): """DCAT-AP formatted result with coverage fields.""" document_id: str status: str dcat_metadata: Dict[str, Any] = Field(..., description="DCAT-AP JSON-LD") entities: List[DCATEntity] sdg_goals: List[Dict[str, Any]] justifications: List[Dict[str, str]] processing_time_ms: float created_at: datetime # Coverage fields (6.3 addition) geographical_coverage: GeographicalCoverage = Field( default_factory=GeographicalCoverage, description="Geographical coverage with ISO3 codes" ) time_coverage: TimeCoverage = Field( default_factory=TimeCoverage, description="Temporal coverage information" ) model_config = ConfigDict( json_schema_extra={ "example": { "document_id": "UNESDOC-2024-001", "status": "completed", "dcat_metadata": { "@context": ["https://www.w3.org/ns/dcat/v3"], "@id": "http://unesdoc.unesco.org/ark:/48223/pf0000XXXXX", "dcterms:title": {"@language": "en", "@value": "Test Document"} }, "entities": [ { "uri": "http://vocabularies.unesco.org/thesaurus/concept122", "label": "Education", "source": "GLiNER2", "confidence": 0.94 } ], "sdg_goals": [ {"goal": "SDG4", "confidence": 0.88} ], "justifications": [], "processing_time_ms": 2345.0, "created_at": "2024-01-15T10:00:00Z", "geographical_coverage": { "countries": [ {"name": "France", "iso3": "FRA", "iso2": "FR"} ], "regions": [ {"name": "Europe", "entity_type": "region"} ], "iso3_codes": ["FRA"] }, "time_coverage": { "year": 2024, "date": "2024", "years_mentioned": [2024] } } } ) class BatchResult(BaseModel): """Response model for batch submission.""" batch_id: str total_documents: int submitted_documents: List[str] status: str estimated_completion: Optional[datetime] = None class HealthCheck(BaseModel): """Health check response.""" status: str version: str timestamp: datetime components: Dict[str, str] model_config = ConfigDict( json_schema_extra={ "example": { "status": "healthy", "version": "2.0.0", "timestamp": "2024-01-15T10:00:00Z", "components": { "database": "ok", "pipeline": "ok", "storage": "ok" } } } ) class ErrorResponse(BaseModel): """Error response model.""" error: str detail: Optional[str] = None code: Optional[str] = None timestamp: datetime = Field(default_factory=lambda: datetime.now(timezone.utc)) model_config = ConfigDict( json_schema_extra={ "example": { "error": "Document not found", "detail": "Document ID 'INVALID-001' does not exist", "code": "DOC_NOT_FOUND", "timestamp": "2024-01-15T10:00:00Z" } } ) class WebhookPayload(BaseModel): """Webhook notification payload.""" event: str = Field(..., description="Event type: processing.completed, processing.failed") document_id: str status: ProcessingStatus result_url: Optional[str] = None timestamp: datetime = Field(default_factory=lambda: datetime.now(timezone.utc)) model_config = ConfigDict( json_schema_extra={ "example": { "event": "processing.completed", "document_id": "UNESDOC-2024-001", "status": "completed", "result_url": "http://api.example.com/api/v1/result/UNESDOC-2024-001", "timestamp": "2024-01-15T10:05:00Z" } } )