""" tests/test_phase0.py ==================== Phase 0 — Foundation Smoke Tests Verifies that the project scaffold is correct: - Config loads with correct types and defaults - SQLite schema creates all required tables - All Pydantic data models instantiate without errors - Runtime directories are created correctly - Gradio app builds without errors - All package imports succeed These tests do NOT require heavy ML dependencies (Whisper, torch, spacy). Run with: pytest tests/test_phase0.py -v """ from __future__ import annotations import sqlite3 from pathlib import Path import pytest # ------------------------------------------------------------------ # # Config Tests # # ------------------------------------------------------------------ # class TestConfig: """Verify centralized config loads correctly.""" def test_config_imports(self) -> None: """Config module must import without exceptions.""" from config import cfg # noqa: F401 def test_config_is_singleton(self) -> None: """Multiple imports must return the same object.""" from config import cfg as cfg1 from config import cfg as cfg2 assert cfg1 is cfg2 def test_default_types(self) -> None: """All fields must have correct types.""" from config import cfg assert isinstance(cfg.groq_api_key, str) assert isinstance(cfg.gemini_api_key, str) assert isinstance(cfg.data_dir, Path) assert isinstance(cfg.bm25_top_k, int) assert isinstance(cfg.vector_top_k, int) assert isinstance(cfg.rrf_k, int) assert isinstance(cfg.final_top_k, int) assert isinstance(cfg.chunk_size_min, int) assert isinstance(cfg.chunk_size_max, int) assert isinstance(cfg.semantic_similarity_threshold, float) assert isinstance(cfg.bcrypt_rounds, int) assert isinstance(cfg.debug, bool) def test_default_values_are_sane(self) -> None: """Critical defaults must be within valid operational ranges.""" from config import cfg assert cfg.bm25_top_k > 0 assert cfg.vector_top_k > 0 assert cfg.rrf_k == 60, "RRF k=60 is the standard value — must not be changed" assert cfg.final_top_k <= cfg.rerank_top_k assert cfg.chunk_size_min < cfg.chunk_size_max assert 0.0 < cfg.semantic_similarity_threshold < 1.0 assert cfg.bcrypt_rounds >= 12, "bcrypt rounds < 12 is insecure" assert cfg.port == 7860 def test_path_helpers_return_paths(self) -> None: """Path helper methods must return Path objects.""" from config import cfg assert isinstance(cfg.kb_dir("test"), Path) assert isinstance(cfg.kb_chroma_dir("test"), Path) assert isinstance(cfg.kb_bm25_path("test"), Path) assert isinstance(cfg.kb_db_path("test"), Path) assert isinstance(cfg.uploads_dir, Path) assert isinstance(cfg.models_cache_dir, Path) def test_kb_path_hierarchy(self) -> None: """KB subdirectory paths must be children of the KB root.""" from config import cfg kb = "myknowledgebase" assert cfg.kb_chroma_dir(kb).parent == cfg.kb_dir(kb) assert cfg.kb_bm25_path(kb).parent == cfg.kb_dir(kb) assert cfg.kb_db_path(kb).parent == cfg.kb_dir(kb) def test_has_groq_key_false_when_empty(self) -> None: """has_groq_key() must return False when key is empty string.""" from config import VoiceVaultConfig empty_cfg = VoiceVaultConfig(GROQ_API_KEY="") assert empty_cfg.has_groq_key() is False def test_has_groq_key_true_when_set(self) -> None: """has_groq_key() must return True when key is non-empty.""" from config import VoiceVaultConfig cfg_with_key = VoiceVaultConfig(GROQ_API_KEY="gsk_test123") assert cfg_with_key.has_groq_key() is True def test_allowed_extensions_contains_expected(self) -> None: """Security extension whitelist must include all supported formats.""" from config import cfg expected = {".pdf", ".html", ".htm", ".docx", ".md", ".mdx", ".txt"} assert expected.issubset(cfg.allowed_extensions) def test_ensure_directories_creates_dirs(self, tmp_path: Path) -> None: """ensure_directories() must create data/ and data/uploads/.""" from config import VoiceVaultConfig local_cfg = VoiceVaultConfig(DATA_DIR=str(tmp_path / "data")) local_cfg.ensure_directories() assert local_cfg.data_dir.exists() assert local_cfg.uploads_dir.exists() # ------------------------------------------------------------------ # # Data Model Tests # # ------------------------------------------------------------------ # class TestModels: """Verify all Pydantic data models instantiate and validate correctly.""" def test_document_chunk_defaults(self) -> None: """DocumentChunk must generate a UUID chunk_id automatically.""" from voicevault.models import DocumentChunk chunk = DocumentChunk( kb_name="test", source_file="doc.pdf", page_number=1, chunk_index=0, text="Hello world.", text_hash="deadbeef", token_count=2, ) assert chunk.chunk_id # non-empty UUID string assert len(chunk.chunk_id) == 36 # UUID4 format assert chunk.language == "en" def test_document_chunk_uuid_unique(self) -> None: """Two DocumentChunks created without explicit chunk_id must have different IDs.""" from voicevault.models import DocumentChunk base = dict( kb_name="test", source_file="doc.pdf", page_number=1, chunk_index=0, text="x", text_hash="y", token_count=1, ) a = DocumentChunk(**base) b = DocumentChunk(**base) assert a.chunk_id != b.chunk_id def test_citation_model(self) -> None: """Citation must store all fields correctly.""" from voicevault.models import Citation c = Citation( source_file="report.pdf", page_number=42, section="Results", excerpt="The accuracy was 95%.", relevance_score=0.87, ) assert c.source_file == "report.pdf" assert c.page_number == 42 assert c.relevance_score == pytest.approx(0.87) def test_query_session_defaults(self) -> None: """QuerySession must auto-generate session_id and timestamp.""" from voicevault.models import QuerySession session = QuerySession( kb_names=["kb1"], voice_query="What is machine learning?", processed_query="what is machine learning", ) assert session.session_id assert session.query_type == "factual" assert session.answer == "" assert session.citations == [] def test_knowledge_base_is_protected(self) -> None: """KnowledgeBase.is_protected must reflect password_hash presence.""" from voicevault.models import KnowledgeBase public_kb = KnowledgeBase(kb_name="public", display_name="Public KB") private_kb = KnowledgeBase( kb_name="private", display_name="Private KB", password_hash="$2b$12$hash" ) assert public_kb.is_protected is False assert private_kb.is_protected is True def test_document_model(self) -> None: """Document must auto-generate doc_id.""" from voicevault.models import Document doc = Document(kb_name="test", filename="test.pdf", file_hash="sha256abc") assert len(doc.doc_id) == 36 assert doc.is_private is False def test_ingestion_report(self) -> None: """IngestionReport must store status correctly.""" from voicevault.models import IngestionReport report = IngestionReport( doc_id="some-uuid", filename="file.pdf", chunk_count=42, page_count=10, status="success", ) assert report.status == "success" assert report.chunk_count == 42 def test_transcript_result(self) -> None: """TranscriptResult must store transcript and model_used.""" from voicevault.models import TranscriptResult result = TranscriptResult( transcript="What is machine learning?", raw_transcript="Um, what is machine learning?", model_used="whisper-large-v3", ) assert result.confidence == 1.0 assert result.language == "en" assert result.query_type == "factual" def test_retrieval_result(self) -> None: """RetrievalResult must store RRF and rerank scores.""" from voicevault.models import RetrievalResult result = RetrievalResult( chunk_id="uuid-001", text="Machine learning is a subset of AI.", source_file="ml_intro.pdf", page_number=3, rrf_score=0.032, rerank_score=0.91, ) assert result.rrf_score == pytest.approx(0.032) assert result.rerank_score == pytest.approx(0.91) # ------------------------------------------------------------------ # # SQLite Schema Tests # # ------------------------------------------------------------------ # class TestSQLiteSchema: """Verify the SQLite schema is created correctly.""" def test_initialize_creates_tables(self, tmp_db: Path) -> None: """All four tables must exist after initialize_database().""" conn = sqlite3.connect(str(tmp_db)) cursor = conn.execute( "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name" ) tables = {row[0] for row in cursor.fetchall()} conn.close() expected = {"knowledge_bases", "documents", "chunks", "query_log"} assert expected.issubset(tables), f"Missing tables: {expected - tables}" def test_initialize_is_idempotent(self, tmp_path: Path) -> None: """Calling initialize_database() twice must not raise or duplicate tables.""" from voicevault.storage.sqlite_store import initialize_database db_path = tmp_path / "voicevault.db" initialize_database(db_path) initialize_database(db_path) # Second call — must be silent conn = sqlite3.connect(str(db_path)) cursor = conn.execute( "SELECT COUNT(*) FROM sqlite_master WHERE type='table'" ) count = cursor.fetchone()[0] conn.close() assert count >= 4 def test_initialize_raises_if_dir_missing(self, tmp_path: Path) -> None: """initialize_database() must raise FileNotFoundError if parent dir is missing.""" from voicevault.storage.sqlite_store import initialize_database bad_path = tmp_path / "nonexistent" / "voicevault.db" with pytest.raises(FileNotFoundError): initialize_database(bad_path) def test_kb_table_schema(self, tmp_db: Path) -> None: """knowledge_bases table must have all required columns.""" conn = sqlite3.connect(str(tmp_db)) cursor = conn.execute("PRAGMA table_info(knowledge_bases)") columns = {row[1] for row in cursor.fetchall()} conn.close() required = {"kb_name", "display_name", "password_hash", "owner", "doc_count", "chunk_count", "created_at", "last_updated"} assert required.issubset(columns) def test_documents_table_schema(self, tmp_db: Path) -> None: """documents table must have all required columns.""" conn = sqlite3.connect(str(tmp_db)) cursor = conn.execute("PRAGMA table_info(documents)") columns = {row[1] for row in cursor.fetchall()} conn.close() required = {"doc_id", "kb_name", "filename", "file_hash", "page_count", "chunk_count", "is_private", "ingested_at"} assert required.issubset(columns) def test_query_log_schema(self, tmp_db: Path) -> None: """query_log table must have anonymized query hash, not raw query text.""" conn = sqlite3.connect(str(tmp_db)) cursor = conn.execute("PRAGMA table_info(query_log)") columns = {row[1] for row in cursor.fetchall()} conn.close() # Security: raw voice_query must NOT be stored — only hash assert "voice_query_hash" in columns, "Audit log must store query hash, not raw text" assert "voice_query" not in columns, "Raw voice query must NOT be in audit log" # ------------------------------------------------------------------ # # SQLite CRUD Tests # # ------------------------------------------------------------------ # class TestSQLiteCRUD: """Verify SQLite CRUD operations are correct and use parameterized queries.""" def test_create_and_get_kb(self, tmp_db: Path) -> None: """create_kb + get_kb round-trip.""" from voicevault.storage.sqlite_store import create_kb, get_kb create_kb(tmp_db, "my-kb", "My Knowledge Base", owner="navnit") kb = get_kb(tmp_db, "my-kb") assert kb is not None assert kb["kb_name"] == "my-kb" assert kb["display_name"] == "My Knowledge Base" assert kb["owner"] == "navnit" assert kb["password_hash"] is None def test_create_kb_duplicate_raises(self, tmp_db: Path) -> None: """Inserting a KB with a duplicate name must raise.""" from voicevault.storage.sqlite_store import create_kb create_kb(tmp_db, "dupe-kb", "Dupe") with pytest.raises(Exception): create_kb(tmp_db, "dupe-kb", "Dupe Again") def test_list_kbs_empty(self, tmp_db: Path) -> None: """list_kbs on empty DB returns empty list.""" from voicevault.storage.sqlite_store import list_kbs assert list_kbs(tmp_db) == [] def test_list_kbs_multiple(self, tmp_db: Path) -> None: """list_kbs returns all created KBs.""" from voicevault.storage.sqlite_store import create_kb, list_kbs create_kb(tmp_db, "kb-a", "KB A") create_kb(tmp_db, "kb-b", "KB B") result = list_kbs(tmp_db) names = {kb["kb_name"] for kb in result} assert {"kb-a", "kb-b"} == names def test_delete_kb(self, tmp_db: Path) -> None: """delete_kb removes the KB row.""" from voicevault.storage.sqlite_store import create_kb, delete_kb, get_kb create_kb(tmp_db, "to-delete", "To Delete") assert get_kb(tmp_db, "to-delete") is not None delete_kb(tmp_db, "to-delete") assert get_kb(tmp_db, "to-delete") is None def test_update_kb_counts(self, tmp_db: Path) -> None: """update_kb_counts updates doc/chunk counts correctly.""" from voicevault.storage.sqlite_store import create_kb, get_kb, update_kb_counts create_kb(tmp_db, "count-kb", "Count KB") update_kb_counts(tmp_db, "count-kb", doc_count=5, chunk_count=120) kb = get_kb(tmp_db, "count-kb") assert kb["doc_count"] == 5 assert kb["chunk_count"] == 120 def test_register_and_list_document(self, tmp_db: Path) -> None: """register_document + list_documents round-trip.""" from voicevault.storage.sqlite_store import ( create_kb, list_documents, register_document ) create_kb(tmp_db, "doc-kb", "Doc KB") register_document( tmp_db, doc_id="doc-uuid-001", kb_name="doc-kb", filename="report.pdf", file_hash="sha256_abc", page_count=10, chunk_count=25, ) docs = list_documents(tmp_db, "doc-kb") assert len(docs) == 1 assert docs[0]["filename"] == "report.pdf" assert docs[0]["file_hash"] == "sha256_abc" def test_get_document_by_hash(self, tmp_db: Path) -> None: """get_document_by_hash enables deduplication check.""" from voicevault.storage.sqlite_store import ( create_kb, get_document_by_hash, register_document ) create_kb(tmp_db, "dedup-kb", "Dedup KB") register_document(tmp_db, "uuid-x", "dedup-kb", "file.pdf", "unique_hash_123") found = get_document_by_hash(tmp_db, "unique_hash_123", "dedup-kb") assert found is not None not_found = get_document_by_hash(tmp_db, "nonexistent_hash", "dedup-kb") assert not_found is None def test_register_chunk_and_dedup(self, tmp_db: Path) -> None: """chunk_hash_exists returns True only for registered chunk hashes.""" from voicevault.storage.sqlite_store import ( chunk_hash_exists, create_kb, register_chunk, register_document ) create_kb(tmp_db, "chunk-kb", "Chunk KB") register_document(tmp_db, "doc-001", "chunk-kb", "doc.pdf", "file_hash_001") register_chunk( tmp_db, chunk_id="chunk-uuid-001", kb_name="chunk-kb", doc_id="doc-001", source_file="doc.pdf", page_number=1, section="Intro", chunk_index=0, text_hash="chunk_hash_abc", token_count=50, ) assert chunk_hash_exists(tmp_db, "chunk_hash_abc", "chunk-kb") is True assert chunk_hash_exists(tmp_db, "nonexistent_hash", "chunk-kb") is False def test_delete_document_cascades_chunks(self, tmp_db: Path) -> None: """Deleting a document must cascade-delete its chunks.""" from voicevault.storage.sqlite_store import ( chunk_hash_exists, create_kb, delete_document, register_chunk, register_document ) create_kb(tmp_db, "cascade-kb", "Cascade KB") register_document(tmp_db, "doc-cascade", "cascade-kb", "x.pdf", "hash_x") register_chunk( tmp_db, "chunk-cascade", "cascade-kb", "doc-cascade", "x.pdf", 1, "Intro", 0, "chunk_hash_cascade", 10 ) assert chunk_hash_exists(tmp_db, "chunk_hash_cascade", "cascade-kb") is True delete_document(tmp_db, "doc-cascade") assert chunk_hash_exists(tmp_db, "chunk_hash_cascade", "cascade-kb") is False def test_log_query_and_retrieve(self, tmp_db: Path) -> None: """log_query + get_query_log round-trip.""" import uuid from voicevault.storage.sqlite_store import get_query_log, log_query log_id = str(uuid.uuid4()) session_id = str(uuid.uuid4()) log_query( tmp_db, log_id, session_id, kb_names=["kb1", "kb2"], voice_query_hash="sha256_of_query", processed_query="what is machine learning", query_type="factual", answer_length=150, citation_count=3, latency_asr_ms=2000, latency_ret_ms=300, latency_llm_ms=1500, total_latency_ms=3800, groq_tokens_used=200, ) logs = get_query_log(tmp_db) assert len(logs) == 1 assert logs[0]["session_id"] == session_id assert logs[0]["query_type"] == "factual" assert logs[0]["citation_count"] == 3 def test_get_query_stats_empty(self, tmp_db: Path) -> None: """get_query_stats on empty log returns zeros.""" from voicevault.storage.sqlite_store import get_query_stats stats = get_query_stats(tmp_db) assert stats["total_queries"] == 0 # ------------------------------------------------------------------ # # Package Import Tests # # ------------------------------------------------------------------ # class TestPackageImports: """All package __init__.py files must import without errors.""" def test_voicevault_package(self) -> None: import voicevault # noqa: F401 assert voicevault.__version__ == "1.0.0" def test_asr_package(self) -> None: import voicevault.asr # noqa: F401 def test_ingestion_package(self) -> None: import voicevault.ingestion # noqa: F401 def test_retrieval_package(self) -> None: import voicevault.retrieval # noqa: F401 def test_generation_package(self) -> None: import voicevault.generation # noqa: F401 def test_kb_package(self) -> None: import voicevault.kb # noqa: F401 def test_tts_package(self) -> None: import voicevault.tts # noqa: F401 def test_storage_package(self) -> None: import voicevault.storage # noqa: F401 def test_ui_package(self) -> None: import ui # noqa: F401 def test_ui_tabs_package(self) -> None: import ui.tabs # noqa: F401 def test_ui_components_package(self) -> None: import ui.components # noqa: F401 def test_config_module(self) -> None: from config import cfg, VoiceVaultConfig # noqa: F401 assert cfg is not None def test_models_module(self) -> None: from voicevault.models import ( # noqa: F401 Citation, Document, DocumentChunk, IngestionReport, KnowledgeBase, QuerySession, RetrievalResult, TranscriptResult, ) def test_sqlite_store_module(self) -> None: from voicevault.storage.sqlite_store import ( # noqa: F401 initialize_database, create_kb, get_kb, list_kbs, update_kb_counts, delete_kb, register_document, list_documents, delete_document, register_chunk, chunk_hash_exists, log_query, get_query_log, ) def test_citation_panel_module(self) -> None: from ui.components.citation_panel import format_citations_markdown # noqa: F401 def test_audio_controls_module(self) -> None: from ui.components.audio_controls import get_tts_html, WEB_SPEECH_JS # noqa: F401 # ------------------------------------------------------------------ # # UI Component Tests # # ------------------------------------------------------------------ # class TestUIComponents: """Verify UI helper functions produce correct output.""" def test_format_citations_empty(self) -> None: """Empty citation list must return placeholder text.""" from ui.components.citation_panel import format_citations_markdown result = format_citations_markdown([]) assert "No citations" in result def test_format_citations_single(self, sample_citation) -> None: """Single citation must include filename and page number.""" from ui.components.citation_panel import format_citations_markdown result = format_citations_markdown([sample_citation]) assert "sample.pdf" in result assert "1" in result # page number assert "Introduction" in result def test_format_citations_multiple(self) -> None: """Multiple citations must be numbered correctly.""" from voicevault.models import Citation from ui.components.citation_panel import format_citations_markdown citations = [ Citation(source_file="a.pdf", page_number=1, section="S1", excerpt="Text A", relevance_score=0.9), Citation(source_file="b.pdf", page_number=5, section="S2", excerpt="Text B", relevance_score=0.8), ] result = format_citations_markdown(citations) assert "[1]" in result assert "[2]" in result assert "a.pdf" in result assert "b.pdf" in result def test_tts_html_contains_js(self) -> None: """Web Speech API bridge must include speechSynthesis.""" from ui.components.audio_controls import get_tts_html html = get_tts_html() assert "speechSynthesis" in html assert "_vv_tts" in html def test_gradio_app_builds(self, tmp_path) -> None: """build_app() must return a gr.Blocks instance without error.""" from unittest.mock import MagicMock import gradio as gr from app import build_app from voicevault.kb.kb_manager import KBManager kb_manager = KBManager(db_path=tmp_path / "test.db") demo = build_app(kb_manager, MagicMock(), MagicMock()) assert isinstance(demo, gr.Blocks)