Spaces:
Running
Running
| """ | |
| VoiceVault — Core Data Models | |
| ============================== | |
| All Pydantic schemas used across the pipeline. | |
| These are the canonical data contracts between every module. | |
| Schemas: | |
| DocumentChunk — a single indexed text chunk with metadata | |
| Citation — a source reference attached to an LLM answer | |
| QuerySession — a full voice query → answer session record | |
| KnowledgeBase — a named document collection | |
| Document — a single document within a KB | |
| IngestionReport — result of indexing a document | |
| TranscriptResult — result of Whisper ASR | |
| RetrievalResult — a retrieved chunk with its relevance score | |
| """ | |
| import uuid | |
| from datetime import datetime, timezone | |
| from typing import Optional | |
| from pydantic import BaseModel, Field | |
| # ------------------------------------------------------------------ # | |
| # Ingestion Models # | |
| # ------------------------------------------------------------------ # | |
class DocumentChunk(BaseModel):
    """
    A single semantically chunked piece of a source document.
    Stored in ChromaDB (embedding) and SQLite (metadata).
    """

    # --- Identity -------------------------------------------------------
    # A fresh UUID4 string is generated per instance unless one is supplied.
    chunk_id: str = Field(
        description="UUID primary key for this chunk.",
        default_factory=lambda: str(uuid.uuid4()),
    )
    kb_name: str = Field(
        ...,
        description="Knowledge base this chunk belongs to.",
    )

    # --- Provenance within the source document --------------------------
    source_file: str = Field(
        ...,
        description="Original filename (not path) of the source document.",
    )
    page_number: int = Field(
        ...,
        description="Page number in the original document (1-indexed).",
    )
    section: str = Field(
        "",
        description="Nearest heading above this chunk in the document.",
    )
    chunk_index: int = Field(
        ...,
        description="Position of this chunk within its source document.",
    )

    # --- Content --------------------------------------------------------
    text: str = Field(..., description="Raw chunk text (400–600 tokens).")
    text_hash: str = Field(
        ...,
        description="SHA-256 of text — used for deduplication.",
    )
    token_count: int = Field(
        ...,
        description="Approximate token count of the text.",
    )
    language: str = Field("en", description="ISO 639-1 language code.")

    # --- Bookkeeping ----------------------------------------------------
    # Timezone-aware "now" in UTC, evaluated at instance creation time.
    ingested_at: datetime = Field(
        description="UTC timestamp when this chunk was indexed.",
        default_factory=lambda: datetime.now(tz=timezone.utc),
    )
class IngestionReport(BaseModel):
    """Result of ingesting a single document into a knowledge base."""

    # --- What was ingested ----------------------------------------------
    doc_id: str = Field(..., description="UUID of the ingested document.")
    filename: str = Field(..., description="Original filename.")

    # --- Size of the result ---------------------------------------------
    chunk_count: int = Field(
        ...,
        description="Number of chunks created from this document.",
    )
    page_count: int = Field(..., description="Total pages parsed.")

    # --- Outcome --------------------------------------------------------
    # `status` is a free-form string here; the allowed values are only
    # documented in the description, not enforced by the type.
    status: str = Field(..., description="One of: success | error | skipped.")
    message: str = Field(
        "",
        description="Human-readable status detail or error message.",
    )
    duration_ms: int = Field(
        0,
        description="Time taken to ingest this document in ms.",
    )
| # ------------------------------------------------------------------ # | |
| # Retrieval Models # | |
| # ------------------------------------------------------------------ # | |
class RetrievalResult(BaseModel):
    """A single retrieved chunk with its composite relevance score."""

    # --- The chunk that was retrieved -----------------------------------
    chunk_id: str = Field(
        ...,
        description="Reference to the DocumentChunk.chunk_id.",
    )
    text: str = Field(..., description="The chunk text.")
    source_file: str = Field(..., description="Source document filename.")
    page_number: int = Field(
        ...,
        description="Page number in the source document.",
    )
    section: str = Field("", description="Section heading above this chunk.")

    # --- Scores (both default to 0.0 when not supplied) -----------------
    rrf_score: float = Field(
        0.0,
        description="Reciprocal Rank Fusion combined score.",
    )
    rerank_score: float = Field(
        0.0,
        description="Cross-encoder reranking score (higher = more relevant).",
    )
| # ------------------------------------------------------------------ # | |
| # Generation Models # | |
| # ------------------------------------------------------------------ # | |
class Citation(BaseModel):
    """A single source citation attached to an LLM answer."""

    # --- Where the cited text lives -------------------------------------
    source_file: str = Field(
        ...,
        description="Filename of the cited document.",
    )
    page_number: int = Field(
        ...,
        description="Page number of the cited chunk.",
    )
    section: str = Field(
        "",
        description="Section heading of the cited chunk.",
    )

    # --- What is being cited and how relevant it was --------------------
    excerpt: str = Field(
        ...,
        description="Relevant sentence or phrase from the cited chunk.",
    )
    # The 0.0–1.0 range is documented only; no validator enforces it here.
    relevance_score: float = Field(
        0.0,
        description="Reranking relevance score (0.0 – 1.0).",
    )
class QuerySession(BaseModel):
    """
    A complete record of one user voice query and its answer.
    Written to the SQLite audit log after every query.
    """

    # --- Identity -------------------------------------------------------
    # A fresh UUID4 string is generated per instance unless one is supplied.
    session_id: str = Field(
        description="UUID for this query session.",
        default_factory=lambda: str(uuid.uuid4()),
    )

    # --- The question ---------------------------------------------------
    kb_names: list[str] = Field(..., description="Knowledge bases queried.")
    voice_query: str = Field(..., description="Raw Whisper transcript.")
    processed_query: str = Field(
        ...,
        description="Cleaned, normalized query string.",
    )
    query_type: str = Field(
        "factual",
        description="Classified query intent: factual | summary | compare.",
    )

    # --- Retrieval and answer -------------------------------------------
    # List defaults use a factory so instances never share one list object.
    retrieved_chunks: list[str] = Field(
        description="chunk_ids in ranked retrieval order.",
        default_factory=list,
    )
    answer: str = Field(
        "",
        description="Final generated answer with inline citations.",
    )
    citations: list[Citation] = Field(
        description="Structured citation list extracted from the answer.",
        default_factory=list,
    )

    # --- Timing and usage metrics (all default to 0) --------------------
    latency_asr_ms: int = Field(
        0,
        description="Whisper transcription latency in ms.",
    )
    latency_retrieval_ms: int = Field(
        0,
        description="Retrieval pipeline latency in ms.",
    )
    latency_llm_ms: int = Field(
        0,
        description="LLM generation latency in ms.",
    )
    total_latency_ms: int = Field(0, description="End-to-end latency in ms.")
    groq_tokens_used: int = Field(0, description="Groq API tokens consumed.")

    # --- Bookkeeping ----------------------------------------------------
    # Timezone-aware "now" in UTC, evaluated at instance creation time.
    timestamp: datetime = Field(
        description="UTC timestamp of the query.",
        default_factory=lambda: datetime.now(tz=timezone.utc),
    )
| # ------------------------------------------------------------------ # | |
| # Knowledge Base Models # | |
| # ------------------------------------------------------------------ # | |
class KnowledgeBase(BaseModel):
    """A named, persistent collection of indexed documents."""

    # --- Naming ---------------------------------------------------------
    kb_name: str = Field(
        ...,
        description="Unique identifier (slug) for this knowledge base.",
    )
    display_name: str = Field(
        ...,
        description="Human-readable name shown in the UI.",
    )

    # --- Access ---------------------------------------------------------
    # Absence of a hash (None) is what marks the KB as public; see
    # is_protected() below.
    password_hash: Optional[str] = Field(
        None,
        description="bcrypt hash of the KB password. None means public.",
    )
    owner: str = Field("default", description="Owner identifier.")

    # --- Counters -------------------------------------------------------
    doc_count: int = Field(0, description="Number of indexed documents.")
    chunk_count: int = Field(0, description="Total indexed chunks.")

    # --- Timestamps -----------------------------------------------------
    # Timezone-aware "now" in UTC, evaluated at instance creation time.
    created_at: datetime = Field(
        description="UTC timestamp of KB creation.",
        default_factory=lambda: datetime.now(tz=timezone.utc),
    )
    last_updated: Optional[datetime] = Field(
        None,
        description="UTC timestamp of last document ingestion.",
    )

    def is_protected(self) -> bool:
        """Return True when a password hash is set on this knowledge base."""
        return self.password_hash is not None
class Document(BaseModel):
    """A single source document registered within a knowledge base."""

    # --- Identity -------------------------------------------------------
    # A fresh UUID4 string is generated per instance unless one is supplied.
    doc_id: str = Field(
        description="UUID primary key.",
        default_factory=lambda: str(uuid.uuid4()),
    )
    kb_name: str = Field(
        ...,
        description="Knowledge base this document belongs to.",
    )

    # --- File details ---------------------------------------------------
    filename: str = Field(..., description="Original filename.")
    file_hash: str = Field(
        ...,
        description="SHA-256 of file bytes — used for deduplication.",
    )
    page_count: int = Field(0, description="Total pages in the document.")
    chunk_count: int = Field(0, description="Number of chunks created.")

    # --- Visibility -----------------------------------------------------
    is_private: bool = Field(
        False,
        description="If True, this document is excluded from shared KB queries.",
    )

    # --- Bookkeeping ----------------------------------------------------
    # Timezone-aware "now" in UTC, evaluated at instance creation time.
    ingested_at: datetime = Field(
        description="UTC timestamp of ingestion.",
        default_factory=lambda: datetime.now(tz=timezone.utc),
    )
| # ------------------------------------------------------------------ # | |
| # ASR Model # | |
| # ------------------------------------------------------------------ # | |
class TranscriptResult(BaseModel):
    """Result returned by the Whisper transcription pipeline."""

    # --- Transcribed text -----------------------------------------------
    transcript: str = Field(..., description="Cleaned transcription text.")
    raw_transcript: str = Field(..., description="Unprocessed Whisper output.")
    language: str = Field(
        "en",
        description="Detected ISO 639-1 language code.",
    )

    # --- Quality and provenance -----------------------------------------
    # Defaults to 1.0, which the description defines as "not available"
    # rather than "fully confident"; consumers should not over-read it.
    confidence: float = Field(
        1.0,
        description="Transcription confidence (0.0 – 1.0); 1.0 if not available.",
    )
    # NOTE(review): the `model_` prefix overlaps pydantic v2's protected
    # namespace and emits a UserWarning on some v2 releases — confirm the
    # pinned pydantic version before renaming or adjusting model config.
    model_used: str = Field(
        ...,
        description="Which ASR model produced this transcript (whisper-large-v3 or distil-large-v3).",
    )
    latency_ms: int = Field(
        0,
        description="Transcription time in milliseconds.",
    )

    # --- Downstream routing ---------------------------------------------
    query_type: str = Field(
        "factual",
        description="Classified query intent after preprocessing.",
    )