from __future__ import annotations

from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Literal

from pydantic import (
    BaseModel,
    ConfigDict,
    Field,
    field_validator,
    model_validator,
)

# Closed set of categories a Tag may belong to.
TagCategory = Literal["genre", "mood", "instrument", "vocal", "theme", "other"]


# A single scored label. ``name`` and ``source`` are whitespace-stripped and
# must be non-empty, ``score`` is constrained to [0.0, 1.0], and ``category``
# must be one of ``TagCategory``. Unknown keys are rejected.
class Tag(BaseModel):
    model_config = ConfigDict(extra="forbid")

    name: str
    score: float = Field(ge=0.0, le=1.0)
    category: TagCategory
    source: str

    @field_validator("name", "source")
    @classmethod
    def _non_empty(cls, v: str) -> str:
        """Return ``v`` stripped of surrounding whitespace.

        Raises:
            ValueError: if nothing is left after stripping.
        """
        stripped = v.strip()
        if stripped:
            return stripped
        raise ValueError("must be non-empty")


class KeywordSet(BaseModel):
    """LLM-synthesized sync-licensing search keywords (see keywords.py).

    Open free-text discovery terms, distinct from the controlled taxonomy
    tags: English + Turkish search terms plus concrete sync placement
    scenarios. Each list is stripped, de-duplicated (case-insensitive,
    order-preserving), and capped so a hallucinating model cannot bloat the
    record.
    """

    model_config = ConfigDict(extra="forbid")

    search_en: list[str] = Field(default_factory=list)
    search_tr: list[str] = Field(default_factory=list)
    use_cases: list[str] = Field(default_factory=list)

    @field_validator("search_en", "search_tr", "use_cases")
    @classmethod
    def _clean_terms(cls, values: list[str]) -> list[str]:
        """Strip, drop blanks, de-duplicate case-insensitively, cap at 15.

        The first spelling seen for a given casefolded key is the one that
        is kept, and the input order of those first occurrences is preserved.
        """
        # dict keeps insertion order, and setdefault never overwrites, so the
        # earliest occurrence of each casefolded key wins.
        first_by_key: dict[str, str] = {}
        for raw in values:
            term = str(raw).strip()
            if term:
                first_by_key.setdefault(term.casefold(), term)
        return list(first_by_key.values())[:15]


CONTENT_SAFETY_CATEGORIES: tuple[str, ...] = (
    "strong_language",
    "substance_abuse",
    "sexual_reference",
    "references_to_violence",
    "discriminatory_language",
)


class ContentSafety(BaseModel):
    """Lyric-derived brand-safety flags (see content_safety.py).

    ``explicit`` is the top-level parent; ``categories`` are the
    non-exclusive child reasons (subset of ``CONTENT_SAFETY_CATEGORIES``).
    Per the annotation guidelines, when ``explicit`` is False there are no
    child categories.

    Review-assist for sync licensing — not source-verified ground truth.
    """

    model_config = ConfigDict(extra="forbid")

    explicit: bool = False
    categories: list[str] = Field(default_factory=list)
    confidence: Literal["high", "medium", "low"] = "low"
    rationale: str = ""

    @field_validator("categories")
    @classmethod
    def _known_categories(cls, values: list[str]) -> list[str]:
        """Normalize entries and keep only known, unique categories.

        Each entry is stripped and lower-cased; anything not listed in
        ``CONTENT_SAFETY_CATEGORIES`` is silently dropped, and duplicates are
        removed while preserving first-seen order.
        """
        normalized = (str(raw).strip().lower() for raw in values)
        recognized = (c for c in normalized if c in CONTENT_SAFETY_CATEGORIES)
        # dict.fromkeys de-duplicates while preserving first-seen order.
        return list(dict.fromkeys(recognized))

    @model_validator(mode="after")
    def _no_categories_when_clean(self) -> "ContentSafety":
        """Clear ``categories`` whenever ``explicit`` is False."""
        if self.explicit:
            return self
        self.categories = []
        return self


# A time span with an optional label and an optional confidence in
# [0.0, 1.0]. ``start`` and ``end`` are non-negative and ``end`` must be
# strictly greater than ``start`` (units presumably seconds — TODO confirm).
class Segment(BaseModel):
    model_config = ConfigDict(extra="forbid")

    start: float = Field(ge=0.0)
    end: float = Field(ge=0.0)
    label: str | None = None
    confidence: float | None = Field(default=None, ge=0.0, le=1.0)

    @model_validator(mode="after")
    def _end_after_start(self) -> "Segment":
        """Reject zero-length and reversed spans.

        Raises:
            ValueError: if ``end`` is less than or equal to ``start``.
        """
        start, end = self.start, self.end
        if end <= start:
            raise ValueError(f"end ({end}) must be > start ({start})")
        return self


# Per-track record combining audio facts (path, duration, sample rate) with
# the optional analysis layers defined in this module.
class TrackRecord(BaseModel):
    # Unknown top-level keys are rejected (extra="forbid"); free-form data
    # goes in ``metadata``, which is an open dict[str, Any].
    # arbitrary_types_allowed was added for Path.
    # NOTE(review): pydantic appears to validate pathlib.Path natively, so the
    # flag may be unnecessary — confirm before removing.
    model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True)

    track_id: str
    path: Path
    duration_sec: float = Field(ge=0.0)
    sample_rate: int = Field(gt=0)
    tags: list[Tag] = Field(default_factory=list)
    segments: list[Segment] = Field(default_factory=list)
    embeddings: dict[str, list[float]] | None = None
    lyrics: str | None = None

    # caption: future audio-captioning model output produced directly from
    #   the waveform (LP-MusicCaps-style). Reserved; not populated by the
    #   current pipeline.
    # description: LLM-synthesized prose built from the structured tag layers
    #   (MAEST + CLAP). Sync-licensing-grade English copy, see description.py.
    caption: str | None = None
    description: str | None = None

    # keywords: LLM-synthesized open search/discovery terms for sync licensing
    #   (EN + TR + use-cases), distinct from controlled taxonomy tags. See
    #   keywords.py. Optional; populated by the generate-keywords stage.
    keywords: KeywordSet | None = None

    # content_safety: lyric-derived brand-safety flags (explicit + reasons).
    #   Review-assist for sync licensing; see content_safety.py.
    content_safety: ContentSafety | None = None

    metadata: dict[str, Any] = Field(default_factory=dict)
    # Timezone-aware UTC timestamp taken when the record is instantiated
    # without an explicit value.
    created_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc)
    )
    pipeline_version: str

    @field_validator("track_id")
    @classmethod
    def _track_id_non_empty(cls, v: str) -> str:
        """Return ``v`` stripped of surrounding whitespace.

        Raises:
            ValueError: if nothing is left after stripping.
        """
        cleaned = v.strip()
        if cleaned:
            return cleaned
        raise ValueError("track_id must be non-empty")