"""
Pydantic models for API request/response validation.

Provides schemas for:
- Document submission (single and batch)
- Processing status
- DCAT-AP results (entities, geographical and temporal coverage)
- Health checks
- Error responses
- Webhook notifications
"""
|
|
| from typing import List, Optional, Dict, Any |
| from datetime import datetime, timezone |
| from enum import Enum |
| from pydantic import BaseModel, Field, HttpUrl, ConfigDict |
|
|
|
|
class DocumentType(str, Enum):
    """UNESCO document types."""

    # The ``str`` mixin makes every member a real string, so a member
    # compares equal to its lowercase value (``DocumentType.REPORT == "report"``).
    REPORT = "report"
    RESOLUTION = "resolution"
    DECISION = "decision"
    MEETING = "meeting"
    STRATEGY = "strategy"
    GUIDELINE = "guideline"
|
|
|
|
class ProcessingStatus(str, Enum):
    """Pipeline processing status."""

    # Members are strings (``str`` mixin) and are declared in this order:
    # one initial state, four intermediate states, two final states.
    PENDING = "pending"

    PARSING = "parsing"
    EXTRACTING = "extracting"
    GROUNDING = "grounding"
    FORMATTING = "formatting"

    COMPLETED = "completed"
    FAILED = "failed"
|
|
|
|
class DocumentSubmission(BaseModel):
    """Request model for document submission."""

    # Sample payload attached to the generated JSON schema.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "document_id": "UNESDOC-2024-001",
                "text_content": "UNESCO Category 2 Centre focused on human rights education...",
                "source_url": "https://example-centre.org",
                "languages": ["en", "fr"],
                "document_family": "SHS",
                "priority": "normal",
            },
        },
    )

    # Required. The anchored pattern restricts the identifier to 1-128
    # characters drawn from word characters, dots and hyphens.
    document_id: str = Field(
        ...,
        pattern=r"^[\w\.\-]{1,128}$",
        description="Unique document identifier (alphanumeric, hyphens, underscores, dots; 1–128 chars)",
    )

    # Content sources. All four are optional and default to None.
    # NOTE(review): nothing in this model requires that at least one of them
    # is supplied -- confirm whether the endpoint enforces that.
    file_url: Optional[HttpUrl] = Field(
        default=None,
        description="URL to PDF file",
    )
    file_content: Optional[str] = Field(
        default=None,
        description="Base64 encoded PDF content",
    )
    text_content: Optional[str] = Field(
        default=None,
        max_length=500_000,  # upper bound on the raw text length
        description="Raw text to process (bypasses PDF parsing)",
    )
    source_url: Optional[HttpUrl] = Field(
        default=None,
        description="Website URL to scrape for content",
    )

    # Optional processing hints.
    languages: Optional[List[str]] = Field(
        default=None,
        description="Document languages (ISO codes); auto-detected if omitted",
    )
    document_family: Optional[str] = Field(
        default=None,
        description="UNESCO family (CLT, SHS, ED, etc.)",
    )
    # Plain string: the description lists high/normal/low, but no constraint
    # is declared on the field itself.
    priority: str = Field(
        default="normal",
        description="Processing priority (high/normal/low)",
    )
    webhook_url: Optional[HttpUrl] = Field(
        default=None,
        description="Webhook URL for completion notification",
    )
|
|
|
|
class BatchSubmission(BaseModel):
    """Request model for batch document submission."""

    # Sample payload attached to the generated JSON schema.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "documents": [
                    {"document_id": "DOC-001", "file_url": "https://example.com/1.pdf"},
                    {"document_id": "DOC-002", "file_url": "https://example.com/2.pdf"},
                ],
            },
        },
    )

    # Required list of submissions; at most 100 entries are accepted.
    # No minimum length is declared, so an empty list validates.
    documents: List[DocumentSubmission] = Field(
        ...,
        max_length=100,
    )
|
|
|
|
class ProcessingResult(BaseModel):
    """Response model for processing status."""

    document_id: str
    # Validated against the ProcessingStatus enum.
    status: ProcessingStatus
    created_at: datetime
    updated_at: datetime
    # Free-form stage label; optional and None by default.
    current_stage: Optional[str] = None
    # Integer percentage constrained to the inclusive range 0..100; defaults to 0.
    progress_percent: int = Field(0, ge=0, le=100)
    # Optional error text; None by default.
    error_message: Optional[str] = None

    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "document_id": "UNESDOC-2024-001",
                # Must be a valid ProcessingStatus value so the documented
                # example passes this model's own validation. The previous
                # value "processing" is not a member of that enum.
                "status": "extracting",
                "created_at": "2024-01-15T10:00:00Z",
                "updated_at": "2024-01-15T10:00:30Z",
                "current_stage": "extracting",
                "progress_percent": 45
            }
        }
    )
|
|
|
|
class DCATEntity(BaseModel):
    """DCAT-AP entity representation."""

    # Required fields.
    uri: str
    source: str
    # Required score, validated to lie in the inclusive range 0.0..1.0.
    confidence: float = Field(..., ge=0.0, le=1.0)

    # Optional fields; both default to None.
    label: Optional[str] = None
    entity_type: Optional[str] = None
|
|
|
|
class CountryEntity(BaseModel):
    """Country with ISO3 code representation."""

    # Required identification. Only the type (str) is validated here; the
    # code length/format mentioned in the descriptions is not enforced.
    name: str = Field(
        ...,
        description="Country name (e.g., 'France')",
    )
    iso3: str = Field(
        ...,
        description="ISO 3166-1 alpha-3 code (e.g., 'FRA')",
    )
    iso2: Optional[str] = Field(
        default=None,
        description="ISO 3166-1 alpha-2 code (e.g., 'FR')",
    )
    # Defaults to 1.0 when omitted; validated to lie in 0.0..1.0.
    confidence: float = Field(default=1.0, ge=0.0, le=1.0)
    thesaurus_uri: Optional[str] = Field(
        default=None,
        description="UNESCO Thesaurus domain7 URI",
    )
|
|
|
|
class RegionEntity(BaseModel):
    """UNESCO region representation."""

    name: str = Field(
        ...,
        description="Region name (e.g., 'Africa')",
    )
    # Plain string defaulting to "region"; the alternatives named in the
    # description are not enforced by a constraint.
    entity_type: str = Field(
        default="region",
        description="Type: region, area, or continent",
    )
    # Defaults to 1.0 when omitted; validated to lie in 0.0..1.0.
    confidence: float = Field(default=1.0, ge=0.0, le=1.0)
    thesaurus_uri: Optional[str] = Field(
        default=None,
        description="UNESCO Thesaurus domain7 URI",
    )
|
|
|
|
class GeographicalCoverage(BaseModel):
    """Structured geographical coverage with ISO3 codes."""

    # Sample payload attached to the generated JSON schema.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "countries": [
                    {"name": "France", "iso3": "FRA", "iso2": "FR"},
                    {"name": "Germany", "iso3": "DEU", "iso2": "DE"},
                ],
                "regions": [
                    {"name": "Europe", "entity_type": "region"},
                ],
                "iso3_codes": ["FRA", "DEU"],
            },
        },
    )

    # All three collections are optional and default to an empty list, so the
    # model can be constructed with no arguments.
    countries: List[CountryEntity] = Field(
        default=[],
        description="Countries with ISO3 codes",
    )
    regions: List[RegionEntity] = Field(
        default=[],
        description="UNESCO regions and areas (no ISO3)",
    )
    # NOTE(review): nothing here keeps this list in sync with ``countries``;
    # presumably the producer fills both -- confirm.
    iso3_codes: List[str] = Field(
        default=[],
        description="List of all ISO3 codes for filtering",
    )
|
|
|
|
class TimeCoverage(BaseModel):
    """Structured temporal coverage."""

    # Sample payload attached to the generated JSON schema.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "year": 2024,
                "date": "2024",
                "years_mentioned": [2024, 2025],
                "sessions": ["2023-2024"],
                "adoption_date": None,
            },
        },
    )

    # Every field is optional, so the model can be built with no arguments.
    year: Optional[int] = Field(
        default=None,
        description="Primary year",
    )
    # Dates are carried as plain strings; no date parsing is applied here.
    date: Optional[str] = Field(
        default=None,
        description="Full date (ISO 8601)",
    )
    years_mentioned: List[int] = Field(
        default=[],
        description="All years mentioned in document",
    )
    sessions: List[str] = Field(
        default=[],
        description="Biennium sessions (e.g., '2023-2024')",
    )
    adoption_date: Optional[str] = Field(
        default=None,
        description="Document adoption date (ISO 8601)",
    )
|
|
|
|
class DCATResult(BaseModel):
    """DCAT-AP formatted result with coverage fields."""

    # Sample payload attached to the generated JSON schema.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "document_id": "UNESDOC-2024-001",
                "status": "completed",
                "dcat_metadata": {
                    "@context": ["https://www.w3.org/ns/dcat/v3"],
                    "@id": "http://unesdoc.unesco.org/ark:/48223/pf0000XXXXX",
                    "dcterms:title": {"@language": "en", "@value": "Test Document"},
                },
                "entities": [
                    {
                        "uri": "http://vocabularies.unesco.org/thesaurus/concept122",
                        "label": "Education",
                        "source": "GLiNER2",
                        "confidence": 0.94,
                    },
                ],
                "sdg_goals": [
                    {"goal": "SDG4", "confidence": 0.88},
                ],
                "justifications": [],
                "processing_time_ms": 2345.0,
                "created_at": "2024-01-15T10:00:00Z",
                "geographical_coverage": {
                    "countries": [
                        {"name": "France", "iso3": "FRA", "iso2": "FR"},
                    ],
                    "regions": [
                        {"name": "Europe", "entity_type": "region"},
                    ],
                    "iso3_codes": ["FRA"],
                },
                "time_coverage": {
                    "year": 2024,
                    "date": "2024",
                    "years_mentioned": [2024],
                },
            },
        },
    )

    # --- Required fields -------------------------------------------------
    document_id: str
    # Plain string here, not the ProcessingStatus enum.
    status: str
    dcat_metadata: Dict[str, Any] = Field(
        ...,
        description="DCAT-AP JSON-LD",
    )
    entities: List[DCATEntity]
    sdg_goals: List[Dict[str, Any]]
    # Each justification is a mapping whose keys and values are both strings.
    justifications: List[Dict[str, str]]
    processing_time_ms: float
    created_at: datetime

    # --- Optional coverage blocks -----------------------------------------
    # Each defaults to a freshly constructed, empty coverage model.
    geographical_coverage: GeographicalCoverage = Field(
        default_factory=GeographicalCoverage,
        description="Geographical coverage with ISO3 codes",
    )
    time_coverage: TimeCoverage = Field(
        default_factory=TimeCoverage,
        description="Temporal coverage information",
    )
|
|
|
|
class BatchResult(BaseModel):
    """Response model for batch submission."""

    # Required fields.
    batch_id: str
    total_documents: int
    # List of strings; presumably the accepted document IDs -- confirm
    # against the endpoint that builds this response.
    submitted_documents: List[str]
    # Plain string, not the ProcessingStatus enum.
    status: str

    # Optional; None when no estimate is supplied.
    estimated_completion: Optional[datetime] = None
|
|
|
|
class HealthCheck(BaseModel):
    """Health check response."""

    # Sample payload attached to the generated JSON schema.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "status": "healthy",
                "version": "2.0.0",
                "timestamp": "2024-01-15T10:00:00Z",
                "components": {
                    "database": "ok",
                    "pipeline": "ok",
                    "storage": "ok",
                },
            },
        },
    )

    # All four fields are required.
    status: str
    version: str
    timestamp: datetime
    # Maps a component name to a status string (see the example above).
    components: Dict[str, str]
|
|
|
|
class ErrorResponse(BaseModel):
    """Error response model."""

    # Sample payload attached to the generated JSON schema.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "error": "Document not found",
                "detail": "Document ID 'INVALID-001' does not exist",
                "code": "DOC_NOT_FOUND",
                "timestamp": "2024-01-15T10:00:00Z",
            },
        },
    )

    # Only ``error`` is required; ``detail`` and ``code`` default to None.
    error: str
    detail: Optional[str] = None
    code: Optional[str] = None
    # When omitted, filled with the current timezone-aware UTC time at the
    # moment the instance is created.
    timestamp: datetime = Field(
        default_factory=lambda: datetime.now(tz=timezone.utc),
    )
|
|
|
|
class WebhookPayload(BaseModel):
    """Webhook notification payload."""

    # Sample payload attached to the generated JSON schema.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "event": "processing.completed",
                "document_id": "UNESDOC-2024-001",
                "status": "completed",
                "result_url": "http://api.example.com/api/v1/result/UNESDOC-2024-001",
                "timestamp": "2024-01-15T10:05:00Z",
            },
        },
    )

    # Plain string; the event names in the description are not enforced.
    event: str = Field(
        ...,
        description="Event type: processing.completed, processing.failed",
    )
    document_id: str
    # Validated against the ProcessingStatus enum.
    status: ProcessingStatus
    # Optional; a plain string rather than a validated URL type.
    result_url: Optional[str] = None
    # When omitted, filled with the current timezone-aware UTC time at the
    # moment the instance is created.
    timestamp: datetime = Field(
        default_factory=lambda: datetime.now(tz=timezone.utc),
    )
|
|