# Source: unesco-data-ai -- Deploy v2.0.0 [api-dev], commit 5ca88f7 (verified)
"""
Pydantic models for API request/response validation.
Provides schemas for:
- Document submission
- Processing status
- DCAT-AP results
- Error responses
"""
from typing import List, Optional, Dict, Any
from datetime import datetime, timezone
from enum import Enum
from pydantic import BaseModel, Field, HttpUrl, ConfigDict
class DocumentType(str, Enum):
    """Kinds of UNESCO document recognised by the API.

    Members also subclass ``str``, so each one compares equal to its
    lowercase wire value (for example ``DocumentType.REPORT == "report"``).
    """

    REPORT = "report"
    RESOLUTION = "resolution"
    DECISION = "decision"
    MEETING = "meeting"
    STRATEGY = "strategy"
    GUIDELINE = "guideline"
class ProcessingStatus(str, Enum):
    """Status values reported for a document in the processing pipeline.

    Members also subclass ``str``, so each one compares equal to its
    lowercase wire value (for example ``ProcessingStatus.FAILED == "failed"``).
    """

    PENDING = "pending"

    # Stage-named statuses. NOTE(review): presumably listed in the order the
    # pipeline runs them -- confirm against the pipeline implementation.
    PARSING = "parsing"
    EXTRACTING = "extracting"
    GROUNDING = "grounding"
    FORMATTING = "formatting"

    COMPLETED = "completed"
    FAILED = "failed"
class DocumentSubmission(BaseModel):
    """Request body for submitting one document for processing.

    Only ``document_id`` is mandatory. Content may be supplied as a PDF URL,
    base64-encoded PDF data, raw text, or a website URL to scrape.
    """

    # Identifier: word characters, dots and hyphens only, 1-128 characters.
    document_id: str = Field(
        ...,
        pattern=r"^[\w\.\-]{1,128}$",
        description="Unique document identifier (alphanumeric, hyphens, underscores, dots; 1–128 chars)",
    )

    # Content sources. All four are optional, and this model does not itself
    # check that at least one of them is provided.
    file_url: Optional[HttpUrl] = Field(default=None, description="URL to PDF file")
    file_content: Optional[str] = Field(
        default=None,
        description="Base64 encoded PDF content",
    )
    text_content: Optional[str] = Field(
        default=None,
        max_length=500_000,  # ~500 KB plain text
        description="Raw text to process (bypasses PDF parsing)",
    )
    source_url: Optional[HttpUrl] = Field(
        default=None,
        description="Website URL to scrape for content",
    )

    # Processing hints.
    languages: Optional[List[str]] = Field(
        default=None,
        description="Document languages (ISO codes); auto-detected if omitted",
    )
    document_family: Optional[str] = Field(
        default=None,
        description="UNESCO family (CLT, SHS, ED, etc.)",
    )
    # Plain string: the values named in the description are not enforced here.
    priority: str = Field(
        default="normal",
        description="Processing priority (high/normal/low)",
    )
    webhook_url: Optional[HttpUrl] = Field(
        default=None,
        description="Webhook URL for completion notification",
    )

    # Sample payload attached to the generated JSON schema.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "document_id": "UNESDOC-2024-001",
                "text_content": "UNESCO Category 2 Centre focused on human rights education...",
                "source_url": "https://example-centre.org",
                "languages": ["en", "fr"],
                "document_family": "SHS",
                "priority": "normal",
            },
        },
    )
class BatchSubmission(BaseModel):
    """Request body for submitting several documents in a single call.

    ``documents`` is required and may hold at most 100 submissions.
    """

    # Upper bound only: no minimum length is declared, so an empty list is
    # accepted by this model.
    documents: List[DocumentSubmission] = Field(..., max_length=100)

    # Sample payload attached to the generated JSON schema.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "documents": [
                    {"document_id": "DOC-001", "file_url": "https://example.com/1.pdf"},
                    {"document_id": "DOC-002", "file_url": "https://example.com/2.pdf"},
                ],
            },
        },
    )
class ProcessingResult(BaseModel):
    """Response body describing the processing state of one document.

    ``status`` is restricted to ``ProcessingStatus`` values and
    ``progress_percent`` to the inclusive range 0-100.
    """

    document_id: str
    status: ProcessingStatus
    created_at: datetime
    updated_at: datetime
    # Free-form stage label; optional.
    current_stage: Optional[str] = None
    progress_percent: int = Field(0, ge=0, le=100)
    # Optional error text; defaults to None.
    error_message: Optional[str] = None

    # Sample payload attached to the generated JSON schema. The status must be
    # a real ProcessingStatus value so the example validates against this
    # model; the previous "processing" is not a member of that enum.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "document_id": "UNESDOC-2024-001",
                "status": "extracting",
                "created_at": "2024-01-15T10:00:00Z",
                "updated_at": "2024-01-15T10:00:30Z",
                "current_stage": "extracting",
                "progress_percent": 45
            }
        }
    )
class DCATEntity(BaseModel):
    """A single entity reported alongside the DCAT-AP metadata.

    ``uri``, ``source`` and ``confidence`` are required; ``label`` and
    ``entity_type`` are optional.
    """

    uri: str
    label: Optional[str] = None
    # GLiNER2 label, e.g. "geo.country", "org.name"
    entity_type: Optional[str] = None
    # Free-form origin of the entity (the DCATResult example uses "GLiNER2").
    source: str
    # Score constrained to the closed interval [0.0, 1.0].
    confidence: float = Field(..., ge=0.0, le=1.0)
class CountryEntity(BaseModel):
    """A country identified by name and ISO 3166-1 codes.

    ``name`` and ``iso3`` are required; ``confidence`` defaults to 1.0.
    """

    name: str = Field(..., description="Country name (e.g., 'France')")
    # Declared as plain strings: code length and case are not validated here.
    iso3: str = Field(..., description="ISO 3166-1 alpha-3 code (e.g., 'FRA')")
    iso2: Optional[str] = Field(
        default=None,
        description="ISO 3166-1 alpha-2 code (e.g., 'FR')",
    )
    # Score constrained to the closed interval [0.0, 1.0].
    confidence: float = Field(default=1.0, ge=0.0, le=1.0)
    thesaurus_uri: Optional[str] = Field(
        default=None,
        description="UNESCO Thesaurus domain7 URI",
    )
class RegionEntity(BaseModel):
    """A UNESCO region, area or continent referenced by name.

    Only ``name`` is required; ``entity_type`` defaults to ``"region"`` and
    ``confidence`` to 1.0.
    """

    name: str = Field(..., description="Region name (e.g., 'Africa')")
    # Plain string: the kinds named in the description are not enforced here.
    entity_type: str = Field(
        default="region",
        description="Type: region, area, or continent",
    )
    # Score constrained to the closed interval [0.0, 1.0].
    confidence: float = Field(default=1.0, ge=0.0, le=1.0)
    thesaurus_uri: Optional[str] = Field(
        default=None,
        description="UNESCO Thesaurus domain7 URI",
    )
class GeographicalCoverage(BaseModel):
    """Geographical coverage: countries, regions and a flat ISO3 code list.

    Every field defaults to an empty list, so the model can be built with no
    arguments.
    """

    # NOTE: Pydantic copies mutable field defaults for each instance, so the
    # literal [] defaults below are not shared between models.
    countries: List[CountryEntity] = Field(default=[], description="Countries with ISO3 codes")
    regions: List[RegionEntity] = Field(default=[], description="UNESCO regions and areas (no ISO3)")
    # Stored independently: nothing in this model derives it from `countries`.
    iso3_codes: List[str] = Field(default=[], description="List of all ISO3 codes for filtering")

    # Sample payload attached to the generated JSON schema.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "countries": [
                    {"name": "France", "iso3": "FRA", "iso2": "FR"},
                    {"name": "Germany", "iso3": "DEU", "iso2": "DE"},
                ],
                "regions": [
                    {"name": "Europe", "entity_type": "region"},
                ],
                "iso3_codes": ["FRA", "DEU"],
            },
        },
    )
class TimeCoverage(BaseModel):
    """Temporal coverage: primary year, dates, mentioned years and sessions.

    Every field is optional or defaults to an empty list, so the model can be
    built with no arguments.
    """

    year: Optional[int] = Field(default=None, description="Primary year")
    # Dates are carried as strings; their format is not validated here.
    date: Optional[str] = Field(default=None, description="Full date (ISO 8601)")
    # NOTE: Pydantic copies mutable field defaults for each instance, so the
    # literal [] defaults below are not shared between models.
    years_mentioned: List[int] = Field(default=[], description="All years mentioned in document")
    sessions: List[str] = Field(default=[], description="Biennium sessions (e.g., '2023-2024')")
    adoption_date: Optional[str] = Field(
        default=None,
        description="Document adoption date (ISO 8601)",
    )

    # Sample payload attached to the generated JSON schema.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "year": 2024,
                "date": "2024",
                "years_mentioned": [2024, 2025],
                "sessions": ["2023-2024"],
                "adoption_date": None,
            },
        },
    )
class DCATResult(BaseModel):
    """Result payload carrying DCAT-AP metadata plus coverage information.

    All fields are required except the two coverage fields, which default to
    empty ``GeographicalCoverage`` and ``TimeCoverage`` instances.
    """

    document_id: str
    # Plain string here, unlike the enum-typed status on ProcessingResult.
    status: str
    dcat_metadata: Dict[str, Any] = Field(..., description="DCAT-AP JSON-LD")
    entities: List[DCATEntity]
    # Loosely typed dicts; the example uses "goal" and "confidence" keys.
    sdg_goals: List[Dict[str, Any]]
    justifications: List[Dict[str, str]]
    processing_time_ms: float
    created_at: datetime

    # Coverage fields (6.3 addition)
    geographical_coverage: GeographicalCoverage = Field(
        default_factory=GeographicalCoverage,
        description="Geographical coverage with ISO3 codes",
    )
    time_coverage: TimeCoverage = Field(
        default_factory=TimeCoverage,
        description="Temporal coverage information",
    )

    # Sample payload attached to the generated JSON schema.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "document_id": "UNESDOC-2024-001",
                "status": "completed",
                "dcat_metadata": {
                    "@context": ["https://www.w3.org/ns/dcat/v3"],
                    "@id": "http://unesdoc.unesco.org/ark:/48223/pf0000XXXXX",
                    "dcterms:title": {"@language": "en", "@value": "Test Document"},
                },
                "entities": [
                    {
                        "uri": "http://vocabularies.unesco.org/thesaurus/concept122",
                        "label": "Education",
                        "source": "GLiNER2",
                        "confidence": 0.94,
                    },
                ],
                "sdg_goals": [
                    {"goal": "SDG4", "confidence": 0.88},
                ],
                "justifications": [],
                "processing_time_ms": 2345.0,
                "created_at": "2024-01-15T10:00:00Z",
                "geographical_coverage": {
                    "countries": [
                        {"name": "France", "iso3": "FRA", "iso2": "FR"},
                    ],
                    "regions": [
                        {"name": "Europe", "entity_type": "region"},
                    ],
                    "iso3_codes": ["FRA"],
                },
                "time_coverage": {
                    "year": 2024,
                    "date": "2024",
                    "years_mentioned": [2024],
                },
            },
        },
    )
class BatchResult(BaseModel):
    """Response body returned for a batch submission.

    Every field is required except ``estimated_completion``.
    """

    batch_id: str
    total_documents: int
    # List of strings. NOTE(review): presumably the document IDs that were
    # submitted -- confirm against the endpoint that builds this response.
    submitted_documents: List[str]
    status: str
    estimated_completion: Optional[datetime] = None
class HealthCheck(BaseModel):
    """Response body for the service health check.

    All four fields are required.
    """

    status: str
    version: str
    timestamp: datetime
    # Maps a component name to its state string, e.g. {"database": "ok"}.
    components: Dict[str, str]

    # Sample payload attached to the generated JSON schema.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "status": "healthy",
                "version": "2.0.0",
                "timestamp": "2024-01-15T10:00:00Z",
                "components": {
                    "database": "ok",
                    "pipeline": "ok",
                    "storage": "ok",
                },
            },
        },
    )
class ErrorResponse(BaseModel):
    """Response body describing an error.

    Only ``error`` is required; ``timestamp`` is filled in with the current
    UTC time when it is not supplied.
    """

    error: str
    detail: Optional[str] = None
    # Optional error code string, e.g. "DOC_NOT_FOUND" in the example below.
    code: Optional[str] = None
    # The factory runs per instance, producing a timezone-aware UTC datetime.
    timestamp: datetime = Field(
        default_factory=lambda: datetime.now(tz=timezone.utc),
    )

    # Sample payload attached to the generated JSON schema.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "error": "Document not found",
                "detail": "Document ID 'INVALID-001' does not exist",
                "code": "DOC_NOT_FOUND",
                "timestamp": "2024-01-15T10:00:00Z",
            },
        },
    )
class WebhookPayload(BaseModel):
    """Body of a webhook notification about a document's processing outcome.

    ``event``, ``document_id`` and ``status`` are required; ``timestamp`` is
    filled in with the current UTC time when it is not supplied.
    """

    # Plain string: the event names in the description are not enforced here.
    event: str = Field(
        ...,
        description="Event type: processing.completed, processing.failed",
    )
    document_id: str
    status: ProcessingStatus
    # Kept as a plain string, so it is not validated as a URL.
    result_url: Optional[str] = None
    # The factory runs per instance, producing a timezone-aware UTC datetime.
    timestamp: datetime = Field(
        default_factory=lambda: datetime.now(tz=timezone.utc),
    )

    # Sample payload attached to the generated JSON schema.
    model_config = ConfigDict(
        json_schema_extra={
            "example": {
                "event": "processing.completed",
                "document_id": "UNESDOC-2024-001",
                "status": "completed",
                "result_url": "http://api.example.com/api/v1/result/UNESDOC-2024-001",
                "timestamp": "2024-01-15T10:05:00Z",
            },
        },
    )