from __future__ import annotations

from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Literal

from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator

# Closed set of category labels; used as the type of ``Tag.category``, so any
# other string fails validation there.
TagCategory = Literal["genre", "mood", "instrument", "vocal", "theme", "other"]


class Tag(BaseModel):
    # One scored label on a track. ``score`` is constrained to [0.0, 1.0],
    # ``category`` to the ``TagCategory`` literals, and ``name`` / ``source``
    # are whitespace-stripped and must not end up empty. Unknown fields are
    # rejected (extra="forbid").
    model_config = ConfigDict(extra="forbid")

    name: str
    score: float = Field(ge=0.0, le=1.0)
    category: TagCategory
    source: str

    @field_validator("name", "source")
    @classmethod
    def _non_empty(cls, v: str) -> str:
        """Return ``v`` without surrounding whitespace.

        Raises:
            ValueError: if nothing is left after stripping.
        """
        trimmed = v.strip()
        if trimmed:
            return trimmed
        raise ValueError("must be non-empty")


class KeywordSet(BaseModel):
    """LLM-synthesized sync-licensing search keywords (see keywords.py).

    Open free-text discovery terms, distinct from the controlled taxonomy tags:
    English + Turkish search terms plus concrete sync placement scenarios. Each
    list is stripped, de-duplicated (case-insensitive, order-preserving), and
    capped so a hallucinating model cannot bloat the record.
    """

    model_config = ConfigDict(extra="forbid")

    search_en: list[str] = Field(default_factory=list)
    search_tr: list[str] = Field(default_factory=list)
    use_cases: list[str] = Field(default_factory=list)

    @field_validator("search_en", "search_tr", "use_cases")
    @classmethod
    def _clean_terms(cls, values: list[str]) -> list[str]:
        """Normalize one keyword list.

        Each entry is stringified and stripped; blank entries are dropped.
        Entries that match an earlier one after ``casefold()`` are dropped too,
        keeping the first spelling. At most the first 15 survivors are returned,
        in their original order.
        """
        # A dict keeps insertion order, and setdefault never overwrites, so the
        # first spelling seen for each case-folded key is the one retained.
        first_seen: dict[str, str] = {}
        for raw in values:
            term = str(raw).strip()
            if term:
                first_seen.setdefault(term.casefold(), term)
        return list(first_seen.values())[:15]


# The only labels accepted in ``ContentSafety.categories``; that model's
# validator drops any value not listed here.
CONTENT_SAFETY_CATEGORIES: tuple[str, ...] = (
    "strong_language",
    "substance_abuse",
    "sexual_reference",
    "references_to_violence",
    "discriminatory_language",
)


class ContentSafety(BaseModel):
    """Lyric-derived brand-safety flags (see content_safety.py).

    ``explicit`` is the top-level parent; ``categories`` are the non-exclusive
    child reasons (subset of ``CONTENT_SAFETY_CATEGORIES``). Per the annotation
    guidelines, when ``explicit`` is False there are no child categories.
    Review-assist for sync licensing — not source-verified ground truth.
    """

    model_config = ConfigDict(extra="forbid")

    explicit: bool = False
    categories: list[str] = Field(default_factory=list)
    confidence: Literal["high", "medium", "low"] = "low"
    rationale: str = ""

    @field_validator("categories")
    @classmethod
    def _known_categories(cls, values: list[str]) -> list[str]:
        """Normalize the category list.

        Entries are stringified, stripped and lower-cased. Anything not in
        ``CONTENT_SAFETY_CATEGORIES`` is silently dropped, as are repeats;
        the order of first occurrence is preserved.
        """
        out: list[str] = []
        for v in values:
            v = str(v).strip().lower()
            if v in CONTENT_SAFETY_CATEGORIES and v not in out:
                out.append(v)
        return out

    @model_validator(mode="after")
    def _no_categories_when_clean(self) -> "ContentSafety":
        """Clear ``categories`` when the record is not flagged ``explicit``."""
        # Assign only when there is something to clear. Attribute assignment
        # records the field in ``model_fields_set``, so an unconditional reset
        # would mark ``categories`` as explicitly provided on every clean
        # record and leak it into ``model_dump(exclude_unset=True)``.
        if not self.explicit and self.categories:
            self.categories = []
        return self


class Segment(BaseModel):
    # A labelled span with non-negative ``start`` / ``end`` offsets and an
    # optional ``confidence`` in [0.0, 1.0]. The span must have positive
    # length; unknown fields are rejected (extra="forbid").
    model_config = ConfigDict(extra="forbid")

    start: float = Field(ge=0.0)
    end: float = Field(ge=0.0)
    label: str | None = None
    confidence: float | None = Field(default=None, ge=0.0, le=1.0)

    @model_validator(mode="after")
    def _end_after_start(self) -> "Segment":
        """Reject zero-length and reversed spans.

        Raises:
            ValueError: if ``end`` is not strictly greater than ``start``.
        """
        start, end = self.start, self.end
        if end <= start:
            message = f"end ({end}) must be > start ({start})"
            raise ValueError(message)
        return self


class TrackRecord(BaseModel):
    # Unknown top-level fields are rejected; free-form extras go in the
    # ``metadata`` dict instead. ``arbitrary_types_allowed`` was enabled for
    # the ``Path`` field.
    # NOTE(review): pydantic appears to validate pathlib.Path natively, so the
    # flag may be unnecessary — confirm before removing.
    model_config = ConfigDict(extra="forbid", arbitrary_types_allowed=True)

    track_id: str
    path: Path
    duration_sec: float = Field(ge=0.0)
    sample_rate: int = Field(gt=0)

    tags: list[Tag] = Field(default_factory=list)
    segments: list[Segment] = Field(default_factory=list)
    embeddings: dict[str, list[float]] | None = None
    lyrics: str | None = None
    # caption: reserved for a future audio-captioning model that works directly
    #   on the waveform (LP-MusicCaps-style); the current pipeline leaves it
    #   unset.
    # description: LLM-written English prose synthesized from the structured
    #   tag layers (MAEST + CLAP), intended as sync-licensing-grade copy. See
    #   description.py.
    caption: str | None = None
    description: str | None = None
    # keywords: open, LLM-synthesized search/discovery terms for sync licensing
    #   (EN + TR + use-cases), separate from the controlled taxonomy tags.
    #   Optional; filled in by the generate-keywords stage. See keywords.py.
    keywords: KeywordSet | None = None
    # content_safety: brand-safety flags derived from the lyrics (explicit +
    #   reasons). A review aid for sync licensing; see content_safety.py.
    content_safety: ContentSafety | None = None

    metadata: dict[str, Any] = Field(default_factory=dict)

    # Defaults to the timezone-aware UTC time at which the record is created.
    created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
    pipeline_version: str

    @field_validator("track_id")
    @classmethod
    def _track_id_non_empty(cls, v: str) -> str:
        """Return the id without surrounding whitespace.

        Raises:
            ValueError: if the id is empty after stripping.
        """
        cleaned = v.strip()
        if cleaned:
            return cleaned
        raise ValueError("track_id must be non-empty")