# VoiceVault / voicevault/models.py
# NinjainPJs's picture
# Initial release: VoiceVault v1.0.0 — Voice-First RAG Knowledge Agent
# 85f900d
"""
VoiceVault — Core Data Models
==============================
All Pydantic schemas used across the pipeline.
These are the canonical data contracts between every module.

Schemas:
    DocumentChunk    — a single indexed text chunk with metadata
    Citation         — a source reference attached to an LLM answer
    QuerySession     — a full voice query → answer session record
    KnowledgeBase    — a named document collection
    Document         — a single document within a KB
    IngestionReport  — result of indexing a document
    TranscriptResult — result of Whisper ASR
    RetrievalResult  — a retrieved chunk with its relevance score
"""
import uuid
from datetime import datetime, timezone
from typing import Optional
from pydantic import BaseModel, Field
# ------------------------------------------------------------------ #
# Ingestion Models #
# ------------------------------------------------------------------ #
class DocumentChunk(BaseModel):
    """
    A single semantically chunked piece of a source document.
    Stored in ChromaDB (embedding) and SQLite (metadata).
    """

    # --- Identity -------------------------------------------------------
    # A random UUID4 string is generated when the caller supplies no id.
    chunk_id: str = Field(
        description="UUID primary key for this chunk.",
        default_factory=lambda: str(uuid.uuid4()),
    )

    # --- Provenance (required unless a default is given) -----------------
    kb_name: str = Field(
        ...,
        description="Knowledge base this chunk belongs to.",
    )
    source_file: str = Field(
        ...,
        description="Original filename (not path) of the source document.",
    )
    page_number: int = Field(
        ...,
        description="Page number in the original document (1-indexed).",
    )
    section: str = Field(
        "",
        description="Nearest heading above this chunk in the document.",
    )
    chunk_index: int = Field(
        ...,
        description="Position of this chunk within its source document.",
    )

    # --- Content ---------------------------------------------------------
    # text_hash and token_count are supplied by the caller; nothing in this
    # model derives them from `text`.
    text: str = Field(..., description="Raw chunk text (400–600 tokens).")
    text_hash: str = Field(
        ...,
        description="SHA-256 of text — used for deduplication.",
    )
    token_count: int = Field(
        ...,
        description="Approximate token count of the text.",
    )
    language: str = Field("en", description="ISO 639-1 language code.")

    # --- Bookkeeping -----------------------------------------------------
    # Timezone-aware "now" in UTC, evaluated per instance.
    ingested_at: datetime = Field(
        description="UTC timestamp when this chunk was indexed.",
        default_factory=lambda: datetime.now(timezone.utc),
    )
class IngestionReport(BaseModel):
    """Result of ingesting a single document into a knowledge base."""

    # --- Required: which document was processed and how large it was -----
    doc_id: str = Field(..., description="UUID of the ingested document.")
    filename: str = Field(..., description="Original filename.")
    chunk_count: int = Field(
        ...,
        description="Number of chunks created from this document.",
    )
    page_count: int = Field(..., description="Total pages parsed.")

    # --- Outcome ---------------------------------------------------------
    # `status` is a free-form string; the allowed values are documented in
    # the description only and are not enforced by the model.
    status: str = Field(..., description="One of: success | error | skipped.")
    message: str = Field(
        "",
        description="Human-readable status detail or error message.",
    )
    duration_ms: int = Field(
        0,
        description="Time taken to ingest this document in ms.",
    )
# ------------------------------------------------------------------ #
# Retrieval Models #
# ------------------------------------------------------------------ #
class RetrievalResult(BaseModel):
    """A single retrieved chunk with its composite relevance score."""

    # --- Chunk payload and location (section falls back to "") -----------
    chunk_id: str = Field(
        ...,
        description="Reference to the DocumentChunk.chunk_id.",
    )
    text: str = Field(..., description="The chunk text.")
    source_file: str = Field(..., description="Source document filename.")
    page_number: int = Field(
        ...,
        description="Page number in the source document.",
    )
    section: str = Field("", description="Section heading above this chunk.")

    # --- Scores: both default to 0.0 when not supplied -------------------
    rrf_score: float = Field(
        0.0,
        description="Reciprocal Rank Fusion combined score.",
    )
    rerank_score: float = Field(
        0.0,
        description="Cross-encoder reranking score (higher = more relevant).",
    )
# ------------------------------------------------------------------ #
# Generation Models #
# ------------------------------------------------------------------ #
class Citation(BaseModel):
    """A single source citation attached to an LLM answer."""

    # --- Where the cited text lives ---------------------------------------
    source_file: str = Field(..., description="Filename of the cited document.")
    page_number: int = Field(..., description="Page number of the cited chunk.")
    section: str = Field("", description="Section heading of the cited chunk.")

    # --- What was cited and how relevant it was judged --------------------
    excerpt: str = Field(
        ...,
        description="Relevant sentence or phrase from the cited chunk.",
    )
    # The 0.0 – 1.0 range is documented only; the model does not enforce it.
    relevance_score: float = Field(
        0.0,
        description="Reranking relevance score (0.0 – 1.0).",
    )
class QuerySession(BaseModel):
    """
    A complete record of one user voice query and its answer.
    Written to the SQLite audit log after every query.
    """

    # --- Identity -------------------------------------------------------
    # A random UUID4 string is generated when the caller supplies no id.
    session_id: str = Field(
        description="UUID for this query session.",
        default_factory=lambda: str(uuid.uuid4()),
    )

    # --- Query (required) -----------------------------------------------
    kb_names: list[str] = Field(..., description="Knowledge bases queried.")
    voice_query: str = Field(..., description="Raw Whisper transcript.")
    processed_query: str = Field(
        ...,
        description="Cleaned, normalized query string.",
    )
    query_type: str = Field(
        "factual",
        description="Classified query intent: factual | summary | compare.",
    )

    # --- Retrieval and generation output ----------------------------------
    # List defaults use default_factory so each instance gets its own list.
    retrieved_chunks: list[str] = Field(
        description="chunk_ids in ranked retrieval order.",
        default_factory=list,
    )
    answer: str = Field(
        "",
        description="Final generated answer with inline citations.",
    )
    citations: list[Citation] = Field(
        description="Structured citation list extracted from the answer.",
        default_factory=list,
    )

    # --- Metrics: all default to 0; none is computed from the others -------
    latency_asr_ms: int = Field(
        0,
        description="Whisper transcription latency in ms.",
    )
    latency_retrieval_ms: int = Field(
        0,
        description="Retrieval pipeline latency in ms.",
    )
    latency_llm_ms: int = Field(0, description="LLM generation latency in ms.")
    total_latency_ms: int = Field(0, description="End-to-end latency in ms.")
    groq_tokens_used: int = Field(0, description="Groq API tokens consumed.")

    # --- Bookkeeping -----------------------------------------------------
    # Timezone-aware "now" in UTC, evaluated per instance.
    timestamp: datetime = Field(
        description="UTC timestamp of the query.",
        default_factory=lambda: datetime.now(timezone.utc),
    )
# ------------------------------------------------------------------ #
# Knowledge Base Models #
# ------------------------------------------------------------------ #
class KnowledgeBase(BaseModel):
    """A named, persistent collection of indexed documents."""

    # --- Identity (required) ----------------------------------------------
    kb_name: str = Field(
        ...,
        description="Unique identifier (slug) for this knowledge base.",
    )
    display_name: str = Field(
        ...,
        description="Human-readable name shown in the UI.",
    )

    # --- Access control -----------------------------------------------------
    # Only the hash is stored on the model; `is_protected` below derives the
    # "requires a password" flag from whether this is None.
    password_hash: Optional[str] = Field(
        None,
        description="bcrypt hash of the KB password. None means public.",
    )
    owner: str = Field("default", description="Owner identifier.")

    # --- Counters: start at zero -------------------------------------------
    doc_count: int = Field(0, description="Number of indexed documents.")
    chunk_count: int = Field(0, description="Total indexed chunks.")

    # --- Timestamps ---------------------------------------------------------
    # created_at is a timezone-aware UTC "now", evaluated per instance;
    # last_updated stays None until a caller sets it.
    created_at: datetime = Field(
        description="UTC timestamp of KB creation.",
        default_factory=lambda: datetime.now(timezone.utc),
    )
    last_updated: Optional[datetime] = Field(
        None,
        description="UTC timestamp of last document ingestion.",
    )

    @property
    def is_protected(self) -> bool:
        """True if this knowledge base requires a password."""
        is_public = self.password_hash is None
        return not is_public
class Document(BaseModel):
    """A single source document registered within a knowledge base."""

    # --- Identity -------------------------------------------------------
    # A random UUID4 string is generated when the caller supplies no id.
    doc_id: str = Field(
        description="UUID primary key.",
        default_factory=lambda: str(uuid.uuid4()),
    )

    # --- Required: owning KB, file name and content hash --------------------
    kb_name: str = Field(
        ...,
        description="Knowledge base this document belongs to.",
    )
    filename: str = Field(..., description="Original filename.")
    # Supplied by the caller; this model does not hash anything itself.
    file_hash: str = Field(
        ...,
        description="SHA-256 of file bytes — used for deduplication.",
    )

    # --- Size counters: start at zero ----------------------------------------
    page_count: int = Field(0, description="Total pages in the document.")
    chunk_count: int = Field(0, description="Number of chunks created.")

    # --- Visibility ------------------------------------------------------------
    is_private: bool = Field(
        False,
        description="If True, this document is excluded from shared KB queries.",
    )

    # --- Bookkeeping -----------------------------------------------------
    # Timezone-aware "now" in UTC, evaluated per instance.
    ingested_at: datetime = Field(
        description="UTC timestamp of ingestion.",
        default_factory=lambda: datetime.now(timezone.utc),
    )
# ------------------------------------------------------------------ #
# ASR Model #
# ------------------------------------------------------------------ #
class TranscriptResult(BaseModel):
    """Result returned by the Whisper transcription pipeline."""

    # NOTE(review): assumes Pydantic v2 — confirm against the pinned version.
    # Pydantic v2 reserves the ``model_`` attribute prefix for its own API,
    # and some v2 releases emit a UserWarning at class-definition time for a
    # field named ``model_used``. Clearing the protected namespaces for this
    # model silences that warning while keeping the public field name (and
    # therefore the serialized schema) unchanged. A plain dict is used so no
    # additional import is required.
    model_config = {"protected_namespaces": ()}

    # --- Transcript text (both required) -----------------------------------
    transcript: str = Field(description="Cleaned transcription text.")
    raw_transcript: str = Field(description="Unprocessed Whisper output.")

    # --- Recognition metadata ------------------------------------------------
    language: str = Field(
        default="en",
        description="Detected ISO 639-1 language code.",
    )
    # Defaults to 1.0, which the description defines as "not available";
    # the 0.0 – 1.0 range is documented only, not enforced.
    confidence: float = Field(
        default=1.0,
        description="Transcription confidence (0.0 – 1.0); 1.0 if not available.",
    )
    # Required free-form string; the model names in the description are not
    # enforced by the model.
    model_used: str = Field(
        description="Which ASR model produced this transcript (whisper-large-v3 or distil-large-v3)."
    )
    latency_ms: int = Field(
        default=0,
        description="Transcription time in milliseconds.",
    )

    # --- Downstream classification -------------------------------------------
    query_type: str = Field(
        default="factual",
        description="Classified query intent after preprocessing.",
    )