Spaces:

NinjainPJs
/

VoiceVault

Running

App Files Files Community

VoiceVault / voicevault /models.py

NinjainPJs

Initial release: VoiceVault v1.0.0 — Voice-First RAG Knowledge Agent

85f900d 3 months ago

raw

history blame contribute delete

9.59 kB

	"""
	VoiceVault — Core Data Models
	==============================
	All Pydantic schemas used across the pipeline.
	These are the canonical data contracts between every module.

	Schemas:
	DocumentChunk — a single indexed text chunk with metadata
	Citation — a source reference attached to an LLM answer
	QuerySession — a full voice query → answer session record
	KnowledgeBase — a named document collection
	Document — a single document within a KB
	IngestionReport — result of indexing a document
	TranscriptResult — result of Whisper ASR
	RetrievalResult — a retrieved chunk with its relevance score
	"""

	import uuid
	from datetime import datetime, timezone
	from typing import Optional
	from pydantic import BaseModel, Field


	# ------------------------------------------------------------------ #
	# Ingestion Models #
	# ------------------------------------------------------------------ #


	class DocumentChunk(BaseModel):
	    """
	    One semantically chunked piece of text cut from a source document.

	    Stored in ChromaDB (embedding) and SQLite (metadata).

	    ``chunk_id`` and ``ingested_at`` are generated when omitted, ``section``
	    and ``language`` fall back to static defaults, and every remaining
	    field must be supplied by the caller.
	    """

	    # Identity and provenance.
	    chunk_id: str = Field(
	        description="UUID primary key for this chunk.",
	        default_factory=lambda: str(uuid.uuid4()),
	    )
	    kb_name: str = Field(
	        ...,
	        description="Knowledge base this chunk belongs to.",
	    )
	    source_file: str = Field(
	        ...,
	        description="Original filename (not path) of the source document.",
	    )
	    page_number: int = Field(
	        ...,
	        description="Page number in the original document (1-indexed).",
	    )
	    section: str = Field(
	        description="Nearest heading above this chunk in the document.",
	        default="",
	    )
	    chunk_index: int = Field(
	        ...,
	        description="Position of this chunk within its source document.",
	    )

	    # Content and derived statistics.
	    text: str = Field(
	        ...,
	        description="Raw chunk text (400–600 tokens).",
	    )
	    text_hash: str = Field(
	        ...,
	        description="SHA-256 of text — used for deduplication.",
	    )
	    token_count: int = Field(
	        ...,
	        description="Approximate token count of the text.",
	    )
	    language: str = Field(
	        description="ISO 639-1 language code.",
	        default="en",
	    )

	    # Timezone-aware so stored timestamps are unambiguous.
	    ingested_at: datetime = Field(
	        description="UTC timestamp when this chunk was indexed.",
	        default_factory=lambda: datetime.now(tz=timezone.utc),
	    )


	class IngestionReport(BaseModel):
	    """
	    Result of ingesting a single document into a knowledge base.

	    ``doc_id``, ``filename``, ``chunk_count``, ``page_count`` and ``status``
	    are required; ``message`` defaults to an empty string and
	    ``duration_ms`` to 0.
	    """

	    doc_id: str = Field(description="UUID of the ingested document.")
	    filename: str = Field(description="Original filename.")
	    chunk_count: int = Field(description="Number of chunks created from this document.")
	    page_count: int = Field(description="Total pages parsed.")
	    # Plain "|" separators: "\|" is not a valid escape sequence in a
	    # non-raw string literal and triggers a warning at compile time.
	    status: str = Field(description="One of: success | error | skipped.")
	    message: str = Field(default="", description="Human-readable status detail or error message.")
	    duration_ms: int = Field(default=0, description="Time taken to ingest this document in ms.")


	# ------------------------------------------------------------------ #
	# Retrieval Models #
	# ------------------------------------------------------------------ #


	class RetrievalResult(BaseModel):
	    """
	    A single retrieved chunk together with its relevance scores.

	    The chunk reference, text, source file and page number are required.
	    ``section`` defaults to an empty string and both scores default to 0.0.
	    """

	    # Which chunk was retrieved and where it came from.
	    chunk_id: str = Field(
	        ...,
	        description="Reference to the DocumentChunk.chunk_id.",
	    )
	    text: str = Field(
	        ...,
	        description="The chunk text.",
	    )
	    source_file: str = Field(
	        ...,
	        description="Source document filename.",
	    )
	    page_number: int = Field(
	        ...,
	        description="Page number in the source document.",
	    )
	    section: str = Field(
	        description="Section heading above this chunk.",
	        default="",
	    )

	    # Ranking signals.
	    rrf_score: float = Field(
	        description="Reciprocal Rank Fusion combined score.",
	        default=0.0,
	    )
	    rerank_score: float = Field(
	        description="Cross-encoder reranking score (higher = more relevant).",
	        default=0.0,
	    )


	# ------------------------------------------------------------------ #
	# Generation Models #
	# ------------------------------------------------------------------ #


	class Citation(BaseModel):
	    """
	    A single source citation attached to an LLM answer.

	    ``source_file``, ``page_number`` and ``excerpt`` are required;
	    ``section`` defaults to an empty string and ``relevance_score`` to 0.0.
	    """

	    source_file: str = Field(
	        ...,
	        description="Filename of the cited document.",
	    )
	    page_number: int = Field(
	        ...,
	        description="Page number of the cited chunk.",
	    )
	    section: str = Field(
	        description="Section heading of the cited chunk.",
	        default="",
	    )
	    excerpt: str = Field(
	        ...,
	        description="Relevant sentence or phrase from the cited chunk.",
	    )
	    # The documented 0.0 – 1.0 range is descriptive only; no bounds are
	    # enforced on this field.
	    relevance_score: float = Field(
	        description="Reranking relevance score (0.0 – 1.0).",
	        default=0.0,
	    )


	class QuerySession(BaseModel):
	    """
	    A complete record of one user voice query and its answer.
	    Written to the SQLite audit log after every query.

	    Only ``kb_names``, ``voice_query`` and ``processed_query`` are required.
	    ``session_id`` and ``timestamp`` are generated when omitted, list fields
	    default to empty lists, and all latency and token counters default to 0.
	    """

	    session_id: str = Field(
	        default_factory=lambda: str(uuid.uuid4()),
	        description="UUID for this query session.",
	    )
	    kb_names: list[str] = Field(description="Knowledge bases queried.")
	    voice_query: str = Field(description="Raw Whisper transcript.")
	    processed_query: str = Field(description="Cleaned, normalized query string.")
	    # Plain "|" separators: "\|" is not a valid escape sequence in a
	    # non-raw string literal and triggers a warning at compile time.
	    query_type: str = Field(
	        default="factual",
	        description="Classified query intent: factual | summary | compare.",
	    )
	    retrieved_chunks: list[str] = Field(
	        default_factory=list,
	        description="chunk_ids in ranked retrieval order.",
	    )
	    answer: str = Field(default="", description="Final generated answer with inline citations.")
	    citations: list[Citation] = Field(
	        default_factory=list,
	        description="Structured citation list extracted from the answer.",
	    )

	    # Per-stage and end-to-end timings, all in milliseconds.
	    latency_asr_ms: int = Field(default=0, description="Whisper transcription latency in ms.")
	    latency_retrieval_ms: int = Field(default=0, description="Retrieval pipeline latency in ms.")
	    latency_llm_ms: int = Field(default=0, description="LLM generation latency in ms.")
	    total_latency_ms: int = Field(default=0, description="End-to-end latency in ms.")
	    groq_tokens_used: int = Field(default=0, description="Groq API tokens consumed.")
	    timestamp: datetime = Field(
	        default_factory=lambda: datetime.now(timezone.utc),
	        description="UTC timestamp of the query.",
	    )


	# ------------------------------------------------------------------ #
	# Knowledge Base Models #
	# ------------------------------------------------------------------ #


	class KnowledgeBase(BaseModel):
	    """
	    A named, persistent collection of indexed documents.

	    ``kb_name`` and ``display_name`` are required. ``password_hash`` and
	    ``last_updated`` default to ``None``, the counters default to 0, and
	    ``created_at`` is generated when omitted.
	    """

	    kb_name: str = Field(
	        ...,
	        description="Unique identifier (slug) for this knowledge base.",
	    )
	    display_name: str = Field(
	        ...,
	        description="Human-readable name shown in the UI.",
	    )
	    password_hash: Optional[str] = Field(
	        description="bcrypt hash of the KB password. None means public.",
	        default=None,
	    )
	    owner: str = Field(
	        description="Owner identifier.",
	        default="default",
	    )

	    # Aggregate counters.
	    doc_count: int = Field(
	        description="Number of indexed documents.",
	        default=0,
	    )
	    chunk_count: int = Field(
	        description="Total indexed chunks.",
	        default=0,
	    )

	    # Lifecycle timestamps.
	    created_at: datetime = Field(
	        description="UTC timestamp of KB creation.",
	        default_factory=lambda: datetime.now(tz=timezone.utc),
	    )
	    last_updated: Optional[datetime] = Field(
	        description="UTC timestamp of last document ingestion.",
	        default=None,
	    )

	    @property
	    def is_protected(self) -> bool:
	        """Return True when a password hash is set, False when it is None."""
	        if self.password_hash is None:
	            return False
	        return True


	class Document(BaseModel):
	    """
	    A single source document registered within a knowledge base.

	    ``kb_name``, ``filename`` and ``file_hash`` are required. ``doc_id`` and
	    ``ingested_at`` are generated when omitted, the counters default to 0,
	    and ``is_private`` defaults to False.
	    """

	    doc_id: str = Field(
	        description="UUID primary key.",
	        default_factory=lambda: str(uuid.uuid4()),
	    )
	    kb_name: str = Field(
	        ...,
	        description="Knowledge base this document belongs to.",
	    )
	    filename: str = Field(
	        ...,
	        description="Original filename.",
	    )
	    file_hash: str = Field(
	        ...,
	        description="SHA-256 of file bytes — used for deduplication.",
	    )
	    page_count: int = Field(
	        description="Total pages in the document.",
	        default=0,
	    )
	    chunk_count: int = Field(
	        description="Number of chunks created.",
	        default=0,
	    )
	    is_private: bool = Field(
	        description="If True, this document is excluded from shared KB queries.",
	        default=False,
	    )
	    ingested_at: datetime = Field(
	        description="UTC timestamp of ingestion.",
	        default_factory=lambda: datetime.now(tz=timezone.utc),
	    )


	# ------------------------------------------------------------------ #
	# ASR Model #
	# ------------------------------------------------------------------ #


	class TranscriptResult(BaseModel):
	    """
	    Result returned by the Whisper transcription pipeline.

	    ``transcript``, ``raw_transcript`` and ``model_used`` are required.
	    ``language`` defaults to "en", ``confidence`` to 1.0, ``latency_ms`` to 0
	    and ``query_type`` to "factual".
	    """

	    transcript: str = Field(
	        ...,
	        description="Cleaned transcription text.",
	    )
	    raw_transcript: str = Field(
	        ...,
	        description="Unprocessed Whisper output.",
	    )
	    language: str = Field(
	        description="Detected ISO 639-1 language code.",
	        default="en",
	    )
	    confidence: float = Field(
	        description="Transcription confidence (0.0 – 1.0); 1.0 if not available.",
	        default=1.0,
	    )
	    # NOTE(review): some pydantic v2 releases warn about field names that
	    # start with "model_" (protected namespace) — confirm against the
	    # pydantic version this project pins before renaming or reconfiguring.
	    model_used: str = Field(
	        ...,
	        description="Which ASR model produced this transcript (whisper-large-v3 or distil-large-v3).",
	    )
	    latency_ms: int = Field(
	        description="Transcription time in milliseconds.",
	        default=0,
	    )
	    query_type: str = Field(
	        description="Classified query intent after preprocessing.",
	        default="factual",
	    )