# VoiceVault / tests/test_phase0.py
# Page-header residue from the hosting site (not part of the module):
#   NinjainPJs's picture
#   Initial release: VoiceVault v1.0.0 — Voice-First RAG Knowledge Agent
#   85f900d
"""
tests/test_phase0.py
====================
Phase 0 — Foundation Smoke Tests
Verifies that the project scaffold is correct:
- Config loads with correct types and defaults
- SQLite schema creates all required tables
- All Pydantic data models instantiate without errors
- Runtime directories are created correctly
- Gradio app builds without errors
- All package imports succeed
These tests do NOT require heavy ML dependencies (Whisper, torch, spacy).
Run with: pytest tests/test_phase0.py -v
"""
from __future__ import annotations

import sqlite3
from contextlib import closing
from pathlib import Path

import pytest
# ------------------------------------------------------------------ #
# Config Tests #
# ------------------------------------------------------------------ #
class TestConfig:
    """Checks on the centralized configuration object and its helpers."""

    def test_config_imports(self) -> None:
        """Importing ``cfg`` from ``config`` must not raise."""
        from config import cfg  # noqa: F401

    def test_config_is_singleton(self) -> None:
        """Importing ``cfg`` twice must yield the very same object."""
        from config import cfg as first
        from config import cfg as second

        assert first is second

    def test_default_types(self) -> None:
        """Each checked config field must be an instance of its expected type."""
        from config import cfg

        # (attribute name, expected type), checked in this order.
        expected_types = (
            ("groq_api_key", str),
            ("gemini_api_key", str),
            ("data_dir", Path),
            ("bm25_top_k", int),
            ("vector_top_k", int),
            ("rrf_k", int),
            ("final_top_k", int),
            ("chunk_size_min", int),
            ("chunk_size_max", int),
            ("semantic_similarity_threshold", float),
            ("bcrypt_rounds", int),
            ("debug", bool),
        )
        for field_name, field_type in expected_types:
            assert isinstance(getattr(cfg, field_name), field_type)

    def test_default_values_are_sane(self) -> None:
        """Key defaults must satisfy the range and ordering constraints below."""
        from config import cfg as settings

        # Retrieval fan-out sizes must be positive.
        assert settings.bm25_top_k > 0
        assert settings.vector_top_k > 0
        assert settings.rrf_k == 60, "RRF k=60 is the standard value — must not be changed"
        # The final result count may not exceed the reranked pool.
        assert settings.final_top_k <= settings.rerank_top_k
        assert settings.chunk_size_min < settings.chunk_size_max
        assert 0.0 < settings.semantic_similarity_threshold < 1.0
        assert settings.bcrypt_rounds >= 12, "bcrypt rounds < 12 is insecure"
        assert settings.port == 7860

    def test_path_helpers_return_paths(self) -> None:
        """Path helpers and path attributes must yield ``Path`` objects."""
        from config import cfg

        kb_name = "test"
        # Per-knowledge-base helper methods.
        assert isinstance(cfg.kb_dir(kb_name), Path)
        assert isinstance(cfg.kb_chroma_dir(kb_name), Path)
        assert isinstance(cfg.kb_bm25_path(kb_name), Path)
        assert isinstance(cfg.kb_db_path(kb_name), Path)
        # Plain attributes.
        assert isinstance(cfg.uploads_dir, Path)
        assert isinstance(cfg.models_cache_dir, Path)

    def test_kb_path_hierarchy(self) -> None:
        """Each per-KB path must sit directly inside the KB root directory."""
        from config import cfg

        kb_name = "myknowledgebase"
        for child_helper in (cfg.kb_chroma_dir, cfg.kb_bm25_path, cfg.kb_db_path):
            assert child_helper(kb_name).parent == cfg.kb_dir(kb_name)

    def test_has_groq_key_false_when_empty(self) -> None:
        """``has_groq_key()`` must be ``False`` for an empty key string."""
        from config import VoiceVaultConfig

        keyless = VoiceVaultConfig(GROQ_API_KEY="")
        assert keyless.has_groq_key() is False

    def test_has_groq_key_true_when_set(self) -> None:
        """``has_groq_key()`` must be ``True`` for a non-empty key string."""
        from config import VoiceVaultConfig

        keyed = VoiceVaultConfig(GROQ_API_KEY="gsk_test123")
        assert keyed.has_groq_key() is True

    def test_allowed_extensions_contains_expected(self) -> None:
        """The extension whitelist must contain every format listed here."""
        from config import cfg

        must_allow = {".pdf", ".html", ".htm", ".docx", ".md", ".mdx", ".txt"}
        assert must_allow.issubset(cfg.allowed_extensions)

    def test_ensure_directories_creates_dirs(self, tmp_path: Path) -> None:
        """``ensure_directories()`` must create the data and uploads directories."""
        from config import VoiceVaultConfig

        data_root = tmp_path / "data"
        isolated = VoiceVaultConfig(DATA_DIR=str(data_root))
        isolated.ensure_directories()

        assert isolated.data_dir.exists()
        assert isolated.uploads_dir.exists()
# ------------------------------------------------------------------ #
# Data Model Tests #
# ------------------------------------------------------------------ #
class TestModels:
    """Instantiation and default-value checks for the data models."""

    def test_document_chunk_defaults(self) -> None:
        """A ``DocumentChunk`` built without ``chunk_id`` must get a 36-char id."""
        from voicevault.models import DocumentChunk

        fields = {
            "kb_name": "test",
            "source_file": "doc.pdf",
            "page_number": 1,
            "chunk_index": 0,
            "text": "Hello world.",
            "text_hash": "deadbeef",
            "token_count": 2,
        }
        chunk = DocumentChunk(**fields)

        assert chunk.chunk_id  # must be non-empty
        assert len(chunk.chunk_id) == 36  # length of a canonical UUID string
        assert chunk.language == "en"

    def test_document_chunk_uuid_unique(self) -> None:
        """Two chunks built from identical fields must still get distinct ids."""
        from voicevault.models import DocumentChunk

        shared_fields = {
            "kb_name": "test",
            "source_file": "doc.pdf",
            "page_number": 1,
            "chunk_index": 0,
            "text": "x",
            "text_hash": "y",
            "token_count": 1,
        }
        first = DocumentChunk(**shared_fields)
        second = DocumentChunk(**shared_fields)

        assert first.chunk_id != second.chunk_id

    def test_citation_model(self) -> None:
        """``Citation`` must expose the values it was constructed with."""
        from voicevault.models import Citation

        citation = Citation(
            source_file="report.pdf",
            page_number=42,
            section="Results",
            excerpt="The accuracy was 95%.",
            relevance_score=0.87,
        )

        assert citation.source_file == "report.pdf"
        assert citation.page_number == 42
        assert citation.relevance_score == pytest.approx(0.87)

    def test_query_session_defaults(self) -> None:
        """``QuerySession`` must fill in a session id and the defaults below."""
        from voicevault.models import QuerySession

        session = QuerySession(
            kb_names=["kb1"],
            voice_query="What is machine learning?",
            processed_query="what is machine learning",
        )

        assert session.session_id
        assert session.query_type == "factual"
        assert session.answer == ""
        assert session.citations == []

    def test_knowledge_base_is_protected(self) -> None:
        """``is_protected`` must be true only when a password hash was given."""
        from voicevault.models import KnowledgeBase

        open_kb = KnowledgeBase(kb_name="public", display_name="Public KB")
        locked_kb = KnowledgeBase(
            kb_name="private",
            display_name="Private KB",
            password_hash="$2b$12$hash",
        )

        assert open_kb.is_protected is False
        assert locked_kb.is_protected is True

    def test_document_model(self) -> None:
        """``Document`` must get a 36-char ``doc_id`` and default to non-private."""
        from voicevault.models import Document

        document = Document(kb_name="test", filename="test.pdf", file_hash="sha256abc")

        assert len(document.doc_id) == 36
        assert document.is_private is False

    def test_ingestion_report(self) -> None:
        """``IngestionReport`` must keep the status and chunk count it was given."""
        from voicevault.models import IngestionReport

        report = IngestionReport(
            doc_id="some-uuid",
            filename="file.pdf",
            chunk_count=42,
            page_count=10,
            status="success",
        )

        assert report.status == "success"
        assert report.chunk_count == 42

    def test_transcript_result(self) -> None:
        """``TranscriptResult`` must default confidence, language and query type."""
        from voicevault.models import TranscriptResult

        transcript = TranscriptResult(
            transcript="What is machine learning?",
            raw_transcript="Um, what is machine learning?",
            model_used="whisper-large-v3",
        )

        assert transcript.confidence == 1.0
        assert transcript.language == "en"
        assert transcript.query_type == "factual"

    def test_retrieval_result(self) -> None:
        """``RetrievalResult`` must keep the RRF and rerank scores it was given."""
        from voicevault.models import RetrievalResult

        hit = RetrievalResult(
            chunk_id="uuid-001",
            text="Machine learning is a subset of AI.",
            source_file="ml_intro.pdf",
            page_number=3,
            rrf_score=0.032,
            rerank_score=0.91,
        )

        assert hit.rrf_score == pytest.approx(0.032)
        assert hit.rerank_score == pytest.approx(0.91)
# ------------------------------------------------------------------ #
# SQLite Schema Tests #
# ------------------------------------------------------------------ #
class TestSQLiteSchema:
    """Verify that the SQLite schema contains the expected tables and columns.

    Connections are opened through ``contextlib.closing`` so they are closed
    even when a query raises. A ``sqlite3.Connection`` used directly as a
    context manager only manages the transaction; it does not close the
    connection.

    ``tmp_db`` is a fixture defined outside this file — presumably the path of
    a database already set up by ``initialize_database()``; confirm in conftest.
    """

    @staticmethod
    def _fetch_rows(db_path: Path, sql: str) -> list[tuple]:
        """Execute *sql* against the database at *db_path* and return all rows.

        Args:
            db_path: Location of the SQLite database file.
            sql: A complete SQL statement that takes no parameters.

        Returns:
            Every result row, as produced by ``fetchall()``.
        """
        with closing(sqlite3.connect(str(db_path))) as conn:
            return conn.execute(sql).fetchall()

    @classmethod
    def _column_names(cls, db_path: Path, table: str) -> set[str]:
        """Return the column names ``PRAGMA table_info`` reports for *table*.

        *table* is interpolated into the statement, so it must be a trusted,
        hard-coded identifier — never external input.
        """
        # Index 1 of each table_info row holds the column name.
        return {row[1] for row in cls._fetch_rows(db_path, f"PRAGMA table_info({table})")}

    def test_initialize_creates_tables(self, tmp_db: Path) -> None:
        """All four expected tables must be present in the database."""
        rows = self._fetch_rows(
            tmp_db,
            "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name",
        )
        tables = {row[0] for row in rows}

        expected = {"knowledge_bases", "documents", "chunks", "query_log"}
        assert expected.issubset(tables), f"Missing tables: {expected - tables}"

    def test_initialize_is_idempotent(self, tmp_path: Path) -> None:
        """Calling ``initialize_database()`` twice must not raise."""
        from voicevault.storage.sqlite_store import initialize_database

        db_path = tmp_path / "voicevault.db"
        initialize_database(db_path)
        initialize_database(db_path)  # Second call — must be silent

        rows = self._fetch_rows(
            db_path,
            "SELECT COUNT(*) FROM sqlite_master WHERE type='table'",
        )
        count = rows[0][0]
        assert count >= 4

    def test_initialize_raises_if_dir_missing(self, tmp_path: Path) -> None:
        """``initialize_database()`` must raise ``FileNotFoundError`` when the
        parent directory of the database path does not exist."""
        from voicevault.storage.sqlite_store import initialize_database

        bad_path = tmp_path / "nonexistent" / "voicevault.db"
        with pytest.raises(FileNotFoundError):
            initialize_database(bad_path)

    def test_kb_table_schema(self, tmp_db: Path) -> None:
        """The ``knowledge_bases`` table must have all required columns."""
        columns = self._column_names(tmp_db, "knowledge_bases")

        required = {
            "kb_name", "display_name", "password_hash", "owner",
            "doc_count", "chunk_count", "created_at", "last_updated",
        }
        assert required.issubset(columns)

    def test_documents_table_schema(self, tmp_db: Path) -> None:
        """The ``documents`` table must have all required columns."""
        columns = self._column_names(tmp_db, "documents")

        required = {
            "doc_id", "kb_name", "filename", "file_hash",
            "page_count", "chunk_count", "is_private", "ingested_at",
        }
        assert required.issubset(columns)

    def test_query_log_schema(self, tmp_db: Path) -> None:
        """The ``query_log`` table must store a query hash, not the raw query."""
        columns = self._column_names(tmp_db, "query_log")

        # Security: raw voice_query must NOT be stored — only hash
        assert "voice_query_hash" in columns, "Audit log must store query hash, not raw text"
        assert "voice_query" not in columns, "Raw voice query must NOT be in audit log"
# ------------------------------------------------------------------ #
# SQLite CRUD Tests #
# ------------------------------------------------------------------ #
class TestSQLiteCRUD:
    """Round-trip checks for the ``sqlite_store`` create/read/update/delete helpers.

    ``tmp_db`` is a fixture defined outside this file — presumably the path of
    an initialized, empty database; confirm in conftest.
    """

    def test_create_and_get_kb(self, tmp_db: Path) -> None:
        """A KB written by ``create_kb`` must be readable through ``get_kb``."""
        from voicevault.storage.sqlite_store import create_kb, get_kb

        kb_name = "my-kb"
        create_kb(tmp_db, kb_name, "My Knowledge Base", owner="navnit")
        row = get_kb(tmp_db, kb_name)

        assert row is not None
        assert row["kb_name"] == "my-kb"
        assert row["display_name"] == "My Knowledge Base"
        assert row["owner"] == "navnit"
        # No password was supplied, so no hash is expected.
        assert row["password_hash"] is None

    def test_create_kb_duplicate_raises(self, tmp_db: Path) -> None:
        """Creating a second KB with an existing name must raise."""
        from voicevault.storage.sqlite_store import create_kb

        create_kb(tmp_db, "dupe-kb", "Dupe")
        # NOTE(review): ``Exception`` is very broad — narrow it once the
        # concrete exception type raised by create_kb is confirmed.
        with pytest.raises(Exception):
            create_kb(tmp_db, "dupe-kb", "Dupe Again")

    def test_list_kbs_empty(self, tmp_db: Path) -> None:
        """``list_kbs`` must return an empty list when no KB was created."""
        from voicevault.storage.sqlite_store import list_kbs

        listed = list_kbs(tmp_db)
        assert listed == []

    def test_list_kbs_multiple(self, tmp_db: Path) -> None:
        """``list_kbs`` must report exactly the KBs that were created."""
        from voicevault.storage.sqlite_store import create_kb, list_kbs

        create_kb(tmp_db, "kb-a", "KB A")
        create_kb(tmp_db, "kb-b", "KB B")

        listed_names = {entry["kb_name"] for entry in list_kbs(tmp_db)}
        assert {"kb-a", "kb-b"} == listed_names

    def test_delete_kb(self, tmp_db: Path) -> None:
        """After ``delete_kb`` the KB must no longer be returned by ``get_kb``."""
        from voicevault.storage.sqlite_store import create_kb, delete_kb, get_kb

        kb_name = "to-delete"
        create_kb(tmp_db, kb_name, "To Delete")
        assert get_kb(tmp_db, kb_name) is not None

        delete_kb(tmp_db, kb_name)
        assert get_kb(tmp_db, kb_name) is None

    def test_update_kb_counts(self, tmp_db: Path) -> None:
        """``update_kb_counts`` must persist the given doc and chunk counts."""
        from voicevault.storage.sqlite_store import create_kb, get_kb, update_kb_counts

        kb_name = "count-kb"
        create_kb(tmp_db, kb_name, "Count KB")
        update_kb_counts(tmp_db, kb_name, doc_count=5, chunk_count=120)

        row = get_kb(tmp_db, kb_name)
        assert row["doc_count"] == 5
        assert row["chunk_count"] == 120

    def test_register_and_list_document(self, tmp_db: Path) -> None:
        """A registered document must show up in ``list_documents`` for its KB."""
        from voicevault.storage.sqlite_store import (
            create_kb,
            list_documents,
            register_document,
        )

        kb_name = "doc-kb"
        create_kb(tmp_db, kb_name, "Doc KB")
        register_document(
            tmp_db,
            doc_id="doc-uuid-001",
            kb_name=kb_name,
            filename="report.pdf",
            file_hash="sha256_abc",
            page_count=10,
            chunk_count=25,
        )

        listed = list_documents(tmp_db, kb_name)
        assert len(listed) == 1
        only_doc = listed[0]
        assert only_doc["filename"] == "report.pdf"
        assert only_doc["file_hash"] == "sha256_abc"

    def test_get_document_by_hash(self, tmp_db: Path) -> None:
        """``get_document_by_hash`` must find a registered hash and nothing else."""
        from voicevault.storage.sqlite_store import (
            create_kb,
            get_document_by_hash,
            register_document,
        )

        kb_name = "dedup-kb"
        create_kb(tmp_db, kb_name, "Dedup KB")
        register_document(tmp_db, "uuid-x", kb_name, "file.pdf", "unique_hash_123")

        hit = get_document_by_hash(tmp_db, "unique_hash_123", kb_name)
        assert hit is not None

        miss = get_document_by_hash(tmp_db, "nonexistent_hash", kb_name)
        assert miss is None

    def test_register_chunk_and_dedup(self, tmp_db: Path) -> None:
        """``chunk_hash_exists`` must be true only for a registered chunk hash."""
        from voicevault.storage.sqlite_store import (
            chunk_hash_exists,
            create_kb,
            register_chunk,
            register_document,
        )

        kb_name = "chunk-kb"
        create_kb(tmp_db, kb_name, "Chunk KB")
        register_document(tmp_db, "doc-001", kb_name, "doc.pdf", "file_hash_001")
        register_chunk(
            tmp_db,
            chunk_id="chunk-uuid-001",
            kb_name=kb_name,
            doc_id="doc-001",
            source_file="doc.pdf",
            page_number=1,
            section="Intro",
            chunk_index=0,
            text_hash="chunk_hash_abc",
            token_count=50,
        )

        assert chunk_hash_exists(tmp_db, "chunk_hash_abc", kb_name) is True
        assert chunk_hash_exists(tmp_db, "nonexistent_hash", kb_name) is False

    def test_delete_document_cascades_chunks(self, tmp_db: Path) -> None:
        """Deleting a document must also remove the chunk registered under it."""
        from voicevault.storage.sqlite_store import (
            chunk_hash_exists,
            create_kb,
            delete_document,
            register_chunk,
            register_document,
        )

        kb_name = "cascade-kb"
        doc_id = "doc-cascade"
        chunk_hash = "chunk_hash_cascade"

        create_kb(tmp_db, kb_name, "Cascade KB")
        register_document(tmp_db, doc_id, kb_name, "x.pdf", "hash_x")
        register_chunk(
            tmp_db, "chunk-cascade", kb_name, doc_id,
            "x.pdf", 1, "Intro", 0, chunk_hash, 10,
        )
        # The chunk is visible before the document is deleted ...
        assert chunk_hash_exists(tmp_db, chunk_hash, kb_name) is True

        delete_document(tmp_db, doc_id)
        # ... and gone afterwards.
        assert chunk_hash_exists(tmp_db, chunk_hash, kb_name) is False

    def test_log_query_and_retrieve(self, tmp_db: Path) -> None:
        """An entry written by ``log_query`` must come back from ``get_query_log``."""
        import uuid
        from voicevault.storage.sqlite_store import get_query_log, log_query

        log_id = str(uuid.uuid4())
        session_id = str(uuid.uuid4())
        log_query(
            tmp_db,
            log_id,
            session_id,
            kb_names=["kb1", "kb2"],
            voice_query_hash="sha256_of_query",
            processed_query="what is machine learning",
            query_type="factual",
            answer_length=150,
            citation_count=3,
            latency_asr_ms=2000,
            latency_ret_ms=300,
            latency_llm_ms=1500,
            total_latency_ms=3800,
            groq_tokens_used=200,
        )

        entries = get_query_log(tmp_db)
        assert len(entries) == 1
        entry = entries[0]
        assert entry["session_id"] == session_id
        assert entry["query_type"] == "factual"
        assert entry["citation_count"] == 3

    def test_get_query_stats_empty(self, tmp_db: Path) -> None:
        """``get_query_stats`` must report zero total queries when none were logged."""
        from voicevault.storage.sqlite_store import get_query_stats

        summary = get_query_stats(tmp_db)
        assert summary["total_queries"] == 0
# ------------------------------------------------------------------ #
# Package Import Tests #
# ------------------------------------------------------------------ #
class TestPackageImports:
    """Import smoke tests: every listed package, module and name must import."""

    def test_voicevault_package(self) -> None:
        """The top-level package must import and report version ``1.0.0``."""
        import voicevault as package

        assert package.__version__ == "1.0.0"

    def test_asr_package(self) -> None:
        """``voicevault.asr`` must import."""
        import voicevault.asr  # noqa: F401

    def test_ingestion_package(self) -> None:
        """``voicevault.ingestion`` must import."""
        import voicevault.ingestion  # noqa: F401

    def test_retrieval_package(self) -> None:
        """``voicevault.retrieval`` must import."""
        import voicevault.retrieval  # noqa: F401

    def test_generation_package(self) -> None:
        """``voicevault.generation`` must import."""
        import voicevault.generation  # noqa: F401

    def test_kb_package(self) -> None:
        """``voicevault.kb`` must import."""
        import voicevault.kb  # noqa: F401

    def test_tts_package(self) -> None:
        """``voicevault.tts`` must import."""
        import voicevault.tts  # noqa: F401

    def test_storage_package(self) -> None:
        """``voicevault.storage`` must import."""
        import voicevault.storage  # noqa: F401

    def test_ui_package(self) -> None:
        """``ui`` must import."""
        import ui  # noqa: F401

    def test_ui_tabs_package(self) -> None:
        """``ui.tabs`` must import."""
        import ui.tabs  # noqa: F401

    def test_ui_components_package(self) -> None:
        """``ui.components`` must import."""
        import ui.components  # noqa: F401

    def test_config_module(self) -> None:
        """``config`` must export ``cfg`` (not ``None``) and ``VoiceVaultConfig``."""
        from config import cfg, VoiceVaultConfig  # noqa: F401

        assert cfg is not None

    def test_models_module(self) -> None:
        """``voicevault.models`` must export every model class named here."""
        from voicevault.models import (  # noqa: F401
            Citation,
            Document,
            DocumentChunk,
            IngestionReport,
            KnowledgeBase,
            QuerySession,
            RetrievalResult,
            TranscriptResult,
        )

    def test_sqlite_store_module(self) -> None:
        """``sqlite_store`` must export every storage function named here."""
        from voicevault.storage.sqlite_store import (  # noqa: F401
            initialize_database,
            create_kb,
            get_kb,
            list_kbs,
            update_kb_counts,
            delete_kb,
            register_document,
            list_documents,
            delete_document,
            register_chunk,
            chunk_hash_exists,
            log_query,
            get_query_log,
        )

    def test_citation_panel_module(self) -> None:
        """``citation_panel`` must export ``format_citations_markdown``."""
        from ui.components.citation_panel import format_citations_markdown  # noqa: F401

    def test_audio_controls_module(self) -> None:
        """``audio_controls`` must export ``get_tts_html`` and ``WEB_SPEECH_JS``."""
        from ui.components.audio_controls import get_tts_html, WEB_SPEECH_JS  # noqa: F401
# ------------------------------------------------------------------ #
# UI Component Tests #
# ------------------------------------------------------------------ #
class TestUIComponents:
    """Output checks for the UI helper functions and the app builder."""

    def test_format_citations_empty(self) -> None:
        """With no citations the rendered text must contain ``No citations``."""
        from ui.components.citation_panel import format_citations_markdown

        rendered = format_citations_markdown([])
        assert "No citations" in rendered

    def test_format_citations_single(self, sample_citation) -> None:
        """The rendering of one citation must contain the fragments below.

        ``sample_citation`` is a fixture defined outside this file; the
        expected fragments presumably mirror its fields — confirm in conftest.
        """
        from ui.components.citation_panel import format_citations_markdown

        rendered = format_citations_markdown([sample_citation])
        # "1" stands for the page number; as a bare substring check it also
        # matches any other "1" in the output.
        for fragment in ("sample.pdf", "1", "Introduction"):
            assert fragment in rendered

    def test_format_citations_multiple(self) -> None:
        """Two citations must render with ``[1]``/``[2]`` markers and both filenames."""
        from voicevault.models import Citation
        from ui.components.citation_panel import format_citations_markdown

        first = Citation(
            source_file="a.pdf",
            page_number=1,
            section="S1",
            excerpt="Text A",
            relevance_score=0.9,
        )
        second = Citation(
            source_file="b.pdf",
            page_number=5,
            section="S2",
            excerpt="Text B",
            relevance_score=0.8,
        )

        rendered = format_citations_markdown([first, second])
        for fragment in ("[1]", "[2]", "a.pdf", "b.pdf"):
            assert fragment in rendered

    def test_tts_html_contains_js(self) -> None:
        """The TTS HTML must mention ``speechSynthesis`` and ``_vv_tts``."""
        from ui.components.audio_controls import get_tts_html

        markup = get_tts_html()
        assert "speechSynthesis" in markup
        assert "_vv_tts" in markup

    def test_gradio_app_builds(self, tmp_path) -> None:
        """``build_app()`` must return a ``gr.Blocks`` instance without raising."""
        from unittest.mock import MagicMock
        import gradio as gr
        from app import build_app
        from voicevault.kb.kb_manager import KBManager

        manager = KBManager(db_path=tmp_path / "test.db")
        # The two remaining build_app arguments are replaced by mocks.
        app_ui = build_app(manager, MagicMock(), MagicMock())

        assert isinstance(app_ui, gr.Blocks)