"""
VoiceVault — Core Data Models
==============================
All Pydantic schemas used across the pipeline.
These are the canonical data contracts between every module.

Schemas:
    DocumentChunk    — a single indexed text chunk with metadata
    Citation         — a source reference attached to an LLM answer
    QuerySession     — a full voice query → answer session record
    KnowledgeBase    — a named document collection
    Document         — a single document within a KB
    IngestionReport  — result of indexing a document
    TranscriptResult — result of Whisper ASR
    RetrievalResult  — a retrieved chunk with its relevance score
"""

import uuid
from datetime import datetime, timezone
from typing import Optional
from pydantic import BaseModel, Field


# ------------------------------------------------------------------ #
# Ingestion Models                                                      #
# ------------------------------------------------------------------ #


class DocumentChunk(BaseModel):
    """
    A single semantically chunked piece of a source document.
    Stored in ChromaDB (embedding) and SQLite (metadata).
    """

    # --- Identity ------------------------------------------------------
    # A fresh UUID4 string is generated per instance unless one is supplied.
    chunk_id: str = Field(
        description="UUID primary key for this chunk.",
        default_factory=lambda: str(uuid.uuid4()),
    )

    # --- Provenance (which KB / file / page the text came from) --------
    kb_name: str = Field(
        ...,
        description="Knowledge base this chunk belongs to.",
    )
    source_file: str = Field(
        ...,
        description="Original filename (not path) of the source document.",
    )
    page_number: int = Field(
        ...,
        description="Page number in the original document (1-indexed).",
    )
    # Optional: falls back to an empty string when no heading is provided.
    section: str = Field(
        "",
        description="Nearest heading above this chunk in the document.",
    )
    chunk_index: int = Field(
        ...,
        description="Position of this chunk within its source document.",
    )

    # --- Content --------------------------------------------------------
    text: str = Field(
        ...,
        description="Raw chunk text (400–600 tokens).",
    )
    # Supplied by the caller; this model does not compute or verify the hash.
    text_hash: str = Field(
        ...,
        description="SHA-256 of text — used for deduplication.",
    )
    token_count: int = Field(
        ...,
        description="Approximate token count of the text.",
    )
    language: str = Field(
        "en",
        description="ISO 639-1 language code.",
    )

    # --- Bookkeeping ----------------------------------------------------
    # Timezone-aware UTC "now", evaluated at instantiation time.
    ingested_at: datetime = Field(
        description="UTC timestamp when this chunk was indexed.",
        default_factory=lambda: datetime.now(tz=timezone.utc),
    )


class IngestionReport(BaseModel):
    """Result of ingesting a single document into a knowledge base."""

    # --- What was ingested (all required) --------------------------------
    doc_id: str = Field(
        ...,
        description="UUID of the ingested document.",
    )
    filename: str = Field(
        ...,
        description="Original filename.",
    )
    chunk_count: int = Field(
        ...,
        description="Number of chunks created from this document.",
    )
    page_count: int = Field(
        ...,
        description="Total pages parsed.",
    )

    # --- Outcome ---------------------------------------------------------
    # NOTE(review): typed as a plain `str`; the three documented values are
    # not enforced by this model.
    status: str = Field(
        ...,
        description="One of: success | error | skipped.",
    )
    message: str = Field(
        "",
        description="Human-readable status detail or error message.",
    )
    duration_ms: int = Field(
        0,
        description="Time taken to ingest this document in ms.",
    )


# ------------------------------------------------------------------ #
# Retrieval Models                                                      #
# ------------------------------------------------------------------ #


class RetrievalResult(BaseModel):
    """A single retrieved chunk with its composite relevance score."""

    # --- Chunk payload and provenance ------------------------------------
    chunk_id: str = Field(
        ...,
        description="Reference to the DocumentChunk.chunk_id.",
    )
    text: str = Field(
        ...,
        description="The chunk text.",
    )
    source_file: str = Field(
        ...,
        description="Source document filename.",
    )
    page_number: int = Field(
        ...,
        description="Page number in the source document.",
    )
    section: str = Field(
        "",
        description="Section heading above this chunk.",
    )

    # --- Scores (both default to 0.0 when not supplied) ------------------
    rrf_score: float = Field(
        0.0,
        description="Reciprocal Rank Fusion combined score.",
    )
    rerank_score: float = Field(
        0.0,
        description="Cross-encoder reranking score (higher = more relevant).",
    )


# ------------------------------------------------------------------ #
# Generation Models                                                     #
# ------------------------------------------------------------------ #


class Citation(BaseModel):
    """A single source citation attached to an LLM answer."""

    # --- Location of the cited material ----------------------------------
    source_file: str = Field(
        ...,
        description="Filename of the cited document.",
    )
    page_number: int = Field(
        ...,
        description="Page number of the cited chunk.",
    )
    section: str = Field(
        "",
        description="Section heading of the cited chunk.",
    )

    # --- Quoted evidence and its score -----------------------------------
    excerpt: str = Field(
        ...,
        description="Relevant sentence or phrase from the cited chunk.",
    )
    # NOTE(review): the 0.0 – 1.0 range is documented only; no bounds are
    # enforced on this field.
    relevance_score: float = Field(
        0.0,
        description="Reranking relevance score (0.0 – 1.0).",
    )


class QuerySession(BaseModel):
    """
    A complete record of one user voice query and its answer.
    Written to the SQLite audit log after every query.
    """

    # --- Identity ---------------------------------------------------------
    # A fresh UUID4 string is generated per instance unless one is supplied.
    session_id: str = Field(
        description="UUID for this query session.",
        default_factory=lambda: str(uuid.uuid4()),
    )

    # --- Query (required inputs) -----------------------------------------
    kb_names: list[str] = Field(
        ...,
        description="Knowledge bases queried.",
    )
    voice_query: str = Field(
        ...,
        description="Raw Whisper transcript.",
    )
    processed_query: str = Field(
        ...,
        description="Cleaned, normalized query string.",
    )
    # NOTE(review): plain `str`; the documented intents are not enforced here.
    query_type: str = Field(
        "factual",
        description="Classified query intent: factual | summary | compare.",
    )

    # --- Retrieval and generation output ---------------------------------
    # `default_factory=list` gives each instance its own empty list.
    retrieved_chunks: list[str] = Field(
        description="chunk_ids in ranked retrieval order.",
        default_factory=list,
    )
    answer: str = Field(
        "",
        description="Final generated answer with inline citations.",
    )
    citations: list[Citation] = Field(
        description="Structured citation list extracted from the answer.",
        default_factory=list,
    )

    # --- Latency and usage metrics (all default to 0) --------------------
    latency_asr_ms: int = Field(
        0,
        description="Whisper transcription latency in ms.",
    )
    latency_retrieval_ms: int = Field(
        0,
        description="Retrieval pipeline latency in ms.",
    )
    latency_llm_ms: int = Field(
        0,
        description="LLM generation latency in ms.",
    )
    # Stored as its own value; this model does not derive it from the
    # per-stage latencies above.
    total_latency_ms: int = Field(
        0,
        description="End-to-end latency in ms.",
    )
    groq_tokens_used: int = Field(
        0,
        description="Groq API tokens consumed.",
    )

    # --- Bookkeeping ------------------------------------------------------
    # Timezone-aware UTC "now", evaluated at instantiation time.
    timestamp: datetime = Field(
        description="UTC timestamp of the query.",
        default_factory=lambda: datetime.now(tz=timezone.utc),
    )


# ------------------------------------------------------------------ #
# Knowledge Base Models                                                 #
# ------------------------------------------------------------------ #


class KnowledgeBase(BaseModel):
    """A named, persistent collection of indexed documents."""

    # --- Identity ---------------------------------------------------------
    kb_name: str = Field(
        ...,
        description="Unique identifier (slug) for this knowledge base.",
    )
    display_name: str = Field(
        ...,
        description="Human-readable name shown in the UI.",
    )

    # --- Access control ---------------------------------------------------
    # Drives `is_protected` below: None -> public, anything else -> protected.
    password_hash: Optional[str] = Field(
        None,
        description="bcrypt hash of the KB password. None means public.",
    )
    owner: str = Field(
        "default",
        description="Owner identifier.",
    )

    # --- Counters (default to 0; not recomputed by this model) -----------
    doc_count: int = Field(
        0,
        description="Number of indexed documents.",
    )
    chunk_count: int = Field(
        0,
        description="Total indexed chunks.",
    )

    # --- Timestamps -------------------------------------------------------
    # Timezone-aware UTC "now", evaluated at instantiation time.
    created_at: datetime = Field(
        description="UTC timestamp of KB creation.",
        default_factory=lambda: datetime.now(tz=timezone.utc),
    )
    # Stays None until a caller sets it.
    last_updated: Optional[datetime] = Field(
        None,
        description="UTC timestamp of last document ingestion.",
    )

    @property
    def is_protected(self) -> bool:
        """Report whether a password hash is set on this knowledge base.

        Returns:
            False when ``password_hash`` is None, True for any other value
            (an empty string therefore still counts as protected).
        """
        if self.password_hash is None:
            return False
        return True


class Document(BaseModel):
    """A single source document registered within a knowledge base."""

    # --- Identity ---------------------------------------------------------
    # A fresh UUID4 string is generated per instance unless one is supplied.
    doc_id: str = Field(
        description="UUID primary key.",
        default_factory=lambda: str(uuid.uuid4()),
    )
    kb_name: str = Field(
        ...,
        description="Knowledge base this document belongs to.",
    )
    filename: str = Field(
        ...,
        description="Original filename.",
    )
    # Supplied by the caller; this model does not compute or verify the hash.
    file_hash: str = Field(
        ...,
        description="SHA-256 of file bytes — used for deduplication.",
    )

    # --- Size counters (default to 0) ------------------------------------
    page_count: int = Field(
        0,
        description="Total pages in the document.",
    )
    chunk_count: int = Field(
        0,
        description="Number of chunks created.",
    )

    # --- Visibility -------------------------------------------------------
    is_private: bool = Field(
        False,
        description="If True, this document is excluded from shared KB queries.",
    )

    # --- Bookkeeping ------------------------------------------------------
    # Timezone-aware UTC "now", evaluated at instantiation time.
    ingested_at: datetime = Field(
        description="UTC timestamp of ingestion.",
        default_factory=lambda: datetime.now(tz=timezone.utc),
    )


# ------------------------------------------------------------------ #
# ASR Model                                                             #
# ------------------------------------------------------------------ #


class TranscriptResult(BaseModel):
    """Result returned by the Whisper transcription pipeline."""

    # --- Transcript text (both required) ---------------------------------
    transcript: str = Field(
        ...,
        description="Cleaned transcription text.",
    )
    raw_transcript: str = Field(
        ...,
        description="Unprocessed Whisper output.",
    )

    # --- Recognition metadata --------------------------------------------
    language: str = Field(
        "en",
        description="Detected ISO 639-1 language code.",
    )
    # NOTE(review): the 0.0 – 1.0 range is documented only; no bounds are
    # enforced on this field.
    confidence: float = Field(
        1.0,
        description="Transcription confidence (0.0 – 1.0); 1.0 if not available.",
    )
    # NOTE(review): under Pydantic v2 a field name starting with `model_` may
    # collide with the protected namespace and emit a warning — confirm
    # against the installed Pydantic version before renaming or reconfiguring.
    model_used: str = Field(
        ...,
        description="Which ASR model produced this transcript (whisper-large-v3 or distil-large-v3).",
    )
    latency_ms: int = Field(
        0,
        description="Transcription time in milliseconds.",
    )

    # --- Downstream classification ---------------------------------------
    query_type: str = Field(
        "factual",
        description="Classified query intent after preprocessing.",
    )