""" VoiceVault — Core Data Models ============================== All Pydantic schemas used across the pipeline. These are the canonical data contracts between every module. Schemas: DocumentChunk — a single indexed text chunk with metadata Citation — a source reference attached to an LLM answer QuerySession — a full voice query → answer session record KnowledgeBase — a named document collection Document — a single document within a KB IngestionReport — result of indexing a document TranscriptResult— result of Whisper ASR RetrievalResult — a retrieved chunk with its relevance score """ import uuid from datetime import datetime, timezone from typing import Optional from pydantic import BaseModel, Field # ------------------------------------------------------------------ # # Ingestion Models # # ------------------------------------------------------------------ # class DocumentChunk(BaseModel): """ A single semantically chunked piece of a source document. Stored in ChromaDB (embedding) and SQLite (metadata). """ chunk_id: str = Field( default_factory=lambda: str(uuid.uuid4()), description="UUID primary key for this chunk.", ) kb_name: str = Field(description="Knowledge base this chunk belongs to.") source_file: str = Field(description="Original filename (not path) of the source document.") page_number: int = Field(description="Page number in the original document (1-indexed).") section: str = Field( default="", description="Nearest heading above this chunk in the document.", ) chunk_index: int = Field(description="Position of this chunk within its source document.") text: str = Field(description="Raw chunk text (400–600 tokens).") text_hash: str = Field(description="SHA-256 of text — used for deduplication.") token_count: int = Field(description="Approximate token count of the text.") language: str = Field(default="en", description="ISO 639-1 language code.") ingested_at: datetime = Field( default_factory=lambda: datetime.now(timezone.utc), description="UTC timestamp when this chunk was indexed.", ) class IngestionReport(BaseModel): """Result of ingesting a single document into a knowledge base.""" doc_id: str = Field(description="UUID of the ingested document.") filename: str = Field(description="Original filename.") chunk_count: int = Field(description="Number of chunks created from this document.") page_count: int = Field(description="Total pages parsed.") status: str = Field(description="One of: success | error | skipped.") message: str = Field(default="", description="Human-readable status detail or error message.") duration_ms: int = Field(default=0, description="Time taken to ingest this document in ms.") # ------------------------------------------------------------------ # # Retrieval Models # # ------------------------------------------------------------------ # class RetrievalResult(BaseModel): """A single retrieved chunk with its composite relevance score.""" chunk_id: str = Field(description="Reference to the DocumentChunk.chunk_id.") text: str = Field(description="The chunk text.") source_file: str = Field(description="Source document filename.") page_number: int = Field(description="Page number in the source document.") section: str = Field(default="", description="Section heading above this chunk.") rrf_score: float = Field(default=0.0, description="Reciprocal Rank Fusion combined score.") rerank_score: float = Field( default=0.0, description="Cross-encoder reranking score (higher = more relevant).", ) # ------------------------------------------------------------------ # # Generation Models # # ------------------------------------------------------------------ # class Citation(BaseModel): """A single source citation attached to an LLM answer.""" source_file: str = Field(description="Filename of the cited document.") page_number: int = Field(description="Page number of the cited chunk.") section: str = Field(default="", description="Section heading of the cited chunk.") excerpt: str = Field(description="Relevant sentence or phrase from the cited chunk.") relevance_score: float = Field( default=0.0, description="Reranking relevance score (0.0 – 1.0).", ) class QuerySession(BaseModel): """ A complete record of one user voice query and its answer. Written to the SQLite audit log after every query. """ session_id: str = Field( default_factory=lambda: str(uuid.uuid4()), description="UUID for this query session.", ) kb_names: list[str] = Field(description="Knowledge bases queried.") voice_query: str = Field(description="Raw Whisper transcript.") processed_query: str = Field(description="Cleaned, normalized query string.") query_type: str = Field( default="factual", description="Classified query intent: factual | summary | compare.", ) retrieved_chunks: list[str] = Field( default_factory=list, description="chunk_ids in ranked retrieval order.", ) answer: str = Field(default="", description="Final generated answer with inline citations.") citations: list[Citation] = Field( default_factory=list, description="Structured citation list extracted from the answer.", ) latency_asr_ms: int = Field(default=0, description="Whisper transcription latency in ms.") latency_retrieval_ms: int = Field(default=0, description="Retrieval pipeline latency in ms.") latency_llm_ms: int = Field(default=0, description="LLM generation latency in ms.") total_latency_ms: int = Field(default=0, description="End-to-end latency in ms.") groq_tokens_used: int = Field(default=0, description="Groq API tokens consumed.") timestamp: datetime = Field( default_factory=lambda: datetime.now(timezone.utc), description="UTC timestamp of the query.", ) # ------------------------------------------------------------------ # # Knowledge Base Models # # ------------------------------------------------------------------ # class KnowledgeBase(BaseModel): """A named, persistent collection of indexed documents.""" kb_name: str = Field(description="Unique identifier (slug) for this knowledge base.") display_name: str = Field(description="Human-readable name shown in the UI.") password_hash: Optional[str] = Field( default=None, description="bcrypt hash of the KB password. None means public.", ) owner: str = Field(default="default", description="Owner identifier.") doc_count: int = Field(default=0, description="Number of indexed documents.") chunk_count: int = Field(default=0, description="Total indexed chunks.") created_at: datetime = Field( default_factory=lambda: datetime.now(timezone.utc), description="UTC timestamp of KB creation.", ) last_updated: Optional[datetime] = Field( default=None, description="UTC timestamp of last document ingestion.", ) @property def is_protected(self) -> bool: """True if this knowledge base requires a password.""" return self.password_hash is not None class Document(BaseModel): """A single source document registered within a knowledge base.""" doc_id: str = Field( default_factory=lambda: str(uuid.uuid4()), description="UUID primary key.", ) kb_name: str = Field(description="Knowledge base this document belongs to.") filename: str = Field(description="Original filename.") file_hash: str = Field(description="SHA-256 of file bytes — used for deduplication.") page_count: int = Field(default=0, description="Total pages in the document.") chunk_count: int = Field(default=0, description="Number of chunks created.") is_private: bool = Field( default=False, description="If True, this document is excluded from shared KB queries.", ) ingested_at: datetime = Field( default_factory=lambda: datetime.now(timezone.utc), description="UTC timestamp of ingestion.", ) # ------------------------------------------------------------------ # # ASR Model # # ------------------------------------------------------------------ # class TranscriptResult(BaseModel): """Result returned by the Whisper transcription pipeline.""" transcript: str = Field(description="Cleaned transcription text.") raw_transcript: str = Field(description="Unprocessed Whisper output.") language: str = Field(default="en", description="Detected ISO 639-1 language code.") confidence: float = Field( default=1.0, description="Transcription confidence (0.0 – 1.0); 1.0 if not available.", ) model_used: str = Field( description="Which ASR model produced this transcript (whisper-large-v3 or distil-large-v3)." ) latency_ms: int = Field(default=0, description="Transcription time in milliseconds.") query_type: str = Field( default="factual", description="Classified query intent after preprocessing.", )