Spaces:

NinjainPJs
/

VoiceVault

Running

App Files Files Community

VoiceVault / tests /test_phase0.py

NinjainPJs

Initial release: VoiceVault v1.0.0 — Voice-First RAG Knowledge Agent

85f900d 3 months ago

raw

history blame contribute delete

24.5 kB

	"""
	tests/test_phase0.py
	====================
	Phase 0 — Foundation Smoke Tests

	Verifies that the project scaffold is correct:
	- Config loads with correct types and defaults
	- SQLite schema creates all required tables
	- All Pydantic data models instantiate without errors
	- Runtime directories are created correctly
	- Gradio app builds without errors
	- All package imports succeed

	These tests do NOT require heavy ML dependencies (Whisper, torch, spacy).
	Run with: pytest tests/test_phase0.py -v
	"""

	from __future__ import annotations

	import sqlite3
	from pathlib import Path

	import pytest


	# ------------------------------------------------------------------ #
	# Config Tests #
	# ------------------------------------------------------------------ #


	class TestConfig:
	    """Smoke tests for the ``config`` module and its ``cfg`` object."""

	    def test_config_imports(self) -> None:
	        """Importing ``cfg`` from ``config`` must not raise."""
	        from config import cfg  # noqa: F401

	    def test_config_is_singleton(self) -> None:
	        """Two separate imports of ``cfg`` must yield the very same object."""
	        from config import cfg as first
	        from config import cfg as second

	        assert first is second

	    def test_default_types(self) -> None:
	        """Each checked config field must be an instance of its expected type."""
	        from config import cfg

	        # (field name, expected type) — checked in this order.
	        expected_types = (
	            ("groq_api_key", str),
	            ("gemini_api_key", str),
	            ("data_dir", Path),
	            ("bm25_top_k", int),
	            ("vector_top_k", int),
	            ("rrf_k", int),
	            ("final_top_k", int),
	            ("chunk_size_min", int),
	            ("chunk_size_max", int),
	            ("semantic_similarity_threshold", float),
	            ("bcrypt_rounds", int),
	            ("debug", bool),
	        )
	        for field_name, field_type in expected_types:
	            assert isinstance(getattr(cfg, field_name), field_type)

	    def test_default_values_are_sane(self) -> None:
	        """Key defaults must fall inside the ranges asserted below."""
	        from config import cfg

	        # Retrieval fan-out settings must be strictly positive.
	        assert cfg.bm25_top_k > 0
	        assert cfg.vector_top_k > 0
	        assert cfg.rrf_k == 60, "RRF k=60 is the standard value — must not be changed"
	        # The final result count may not exceed the rerank candidate count.
	        assert cfg.final_top_k <= cfg.rerank_top_k
	        assert cfg.chunk_size_min < cfg.chunk_size_max
	        # Threshold must lie strictly between 0 and 1.
	        assert 0.0 < cfg.semantic_similarity_threshold < 1.0
	        assert cfg.bcrypt_rounds >= 12, "bcrypt rounds < 12 is insecure"
	        assert cfg.port == 7860

	    def test_path_helpers_return_paths(self) -> None:
	        """Path helper methods and directory attributes must be ``Path`` objects."""
	        from config import cfg

	        per_kb_helpers = (
	            cfg.kb_dir,
	            cfg.kb_chroma_dir,
	            cfg.kb_bm25_path,
	            cfg.kb_db_path,
	        )
	        for helper in per_kb_helpers:
	            assert isinstance(helper("test"), Path)

	        for directory in (cfg.uploads_dir, cfg.models_cache_dir):
	            assert isinstance(directory, Path)

	    def test_kb_path_hierarchy(self) -> None:
	        """Each per-KB path must sit directly inside the KB root directory."""
	        from config import cfg

	        kb_name = "myknowledgebase"
	        for child_of in (cfg.kb_chroma_dir, cfg.kb_bm25_path, cfg.kb_db_path):
	            assert child_of(kb_name).parent == cfg.kb_dir(kb_name)

	    def test_has_groq_key_false_when_empty(self) -> None:
	        """``has_groq_key()`` must be ``False`` for an empty-string key."""
	        from config import VoiceVaultConfig

	        keyless = VoiceVaultConfig(GROQ_API_KEY="")
	        assert keyless.has_groq_key() is False

	    def test_has_groq_key_true_when_set(self) -> None:
	        """``has_groq_key()`` must be ``True`` for a non-empty key."""
	        from config import VoiceVaultConfig

	        keyed = VoiceVaultConfig(GROQ_API_KEY="gsk_test123")
	        assert keyed.has_groq_key() is True

	    def test_allowed_extensions_contains_expected(self) -> None:
	        """The extension whitelist must contain every format listed below."""
	        from config import cfg

	        required = {".pdf", ".html", ".htm", ".docx", ".md", ".mdx", ".txt"}
	        # An empty difference means every required extension is whitelisted.
	        assert not required.difference(cfg.allowed_extensions)

	    def test_ensure_directories_creates_dirs(self, tmp_path: Path) -> None:
	        """``ensure_directories()`` must create the data and uploads directories."""
	        from config import VoiceVaultConfig

	        data_root = tmp_path / "data"
	        isolated_cfg = VoiceVaultConfig(DATA_DIR=str(data_root))
	        isolated_cfg.ensure_directories()

	        assert isolated_cfg.data_dir.exists()
	        assert isolated_cfg.uploads_dir.exists()


	# ------------------------------------------------------------------ #
	# Data Model Tests #
	# ------------------------------------------------------------------ #


	class TestModels:
	    """Instantiation and default-value checks for the ``voicevault.models`` classes."""

	    def test_document_chunk_defaults(self) -> None:
	        """A DocumentChunk built without ``chunk_id`` gets a 36-character id."""
	        from voicevault.models import DocumentChunk

	        fields = {
	            "kb_name": "test",
	            "source_file": "doc.pdf",
	            "page_number": 1,
	            "chunk_index": 0,
	            "text": "Hello world.",
	            "text_hash": "deadbeef",
	            "token_count": 2,
	        }
	        dc = DocumentChunk(**fields)

	        assert dc.chunk_id  # must be a non-empty string
	        assert len(dc.chunk_id) == 36  # length of a hyphenated UUID string
	        assert dc.language == "en"

	    def test_document_chunk_uuid_unique(self) -> None:
	        """Two chunks built from identical fields must still get distinct ids."""
	        from voicevault.models import DocumentChunk

	        shared_fields = {
	            "kb_name": "test",
	            "source_file": "doc.pdf",
	            "page_number": 1,
	            "chunk_index": 0,
	            "text": "x",
	            "text_hash": "y",
	            "token_count": 1,
	        }
	        first = DocumentChunk(**shared_fields)
	        second = DocumentChunk(**shared_fields)

	        assert first.chunk_id != second.chunk_id

	    def test_citation_model(self) -> None:
	        """Citation must hand back the values it was constructed with."""
	        from voicevault.models import Citation

	        citation = Citation(
	            source_file="report.pdf",
	            page_number=42,
	            section="Results",
	            excerpt="The accuracy was 95%.",
	            relevance_score=0.87,
	        )

	        assert citation.source_file == "report.pdf"
	        assert citation.page_number == 42
	        # Float field: compare with a tolerance rather than exact equality.
	        assert citation.relevance_score == pytest.approx(0.87)

	    def test_query_session_defaults(self) -> None:
	        """QuerySession must fill in a session id and the defaults asserted below."""
	        from voicevault.models import QuerySession

	        qs = QuerySession(
	            kb_names=["kb1"],
	            voice_query="What is machine learning?",
	            processed_query="what is machine learning",
	        )

	        assert qs.session_id
	        assert qs.query_type == "factual"
	        assert qs.answer == ""
	        assert qs.citations == []

	    def test_knowledge_base_is_protected(self) -> None:
	        """``is_protected`` must be True only when a password hash is supplied."""
	        from voicevault.models import KnowledgeBase

	        open_kb = KnowledgeBase(kb_name="public", display_name="Public KB")
	        locked_kb = KnowledgeBase(
	            kb_name="private",
	            display_name="Private KB",
	            password_hash="$2b$12$hash",
	        )

	        assert open_kb.is_protected is False
	        assert locked_kb.is_protected is True

	    def test_document_model(self) -> None:
	        """Document must get a 36-character ``doc_id`` and default to non-private."""
	        from voicevault.models import Document

	        document = Document(
	            kb_name="test",
	            filename="test.pdf",
	            file_hash="sha256abc",
	        )

	        assert len(document.doc_id) == 36
	        assert document.is_private is False

	    def test_ingestion_report(self) -> None:
	        """IngestionReport must keep the status and chunk count it was given."""
	        from voicevault.models import IngestionReport

	        ingest = IngestionReport(
	            doc_id="some-uuid",
	            filename="file.pdf",
	            chunk_count=42,
	            page_count=10,
	            status="success",
	        )

	        assert ingest.status == "success"
	        assert ingest.chunk_count == 42

	    def test_transcript_result(self) -> None:
	        """TranscriptResult must default confidence, language and query type."""
	        from voicevault.models import TranscriptResult

	        transcript = TranscriptResult(
	            transcript="What is machine learning?",
	            raw_transcript="Um, what is machine learning?",
	            model_used="whisper-large-v3",
	        )

	        assert transcript.confidence == 1.0
	        assert transcript.language == "en"
	        assert transcript.query_type == "factual"

	    def test_retrieval_result(self) -> None:
	        """RetrievalResult must keep the RRF and rerank scores it was given."""
	        from voicevault.models import RetrievalResult

	        hit = RetrievalResult(
	            chunk_id="uuid-001",
	            text="Machine learning is a subset of AI.",
	            source_file="ml_intro.pdf",
	            page_number=3,
	            rrf_score=0.032,
	            rerank_score=0.91,
	        )

	        assert hit.rrf_score == pytest.approx(0.032)
	        assert hit.rerank_score == pytest.approx(0.91)


	# ------------------------------------------------------------------ #
	# SQLite Schema Tests #
	# ------------------------------------------------------------------ #


	class TestSQLiteSchema:
	    """Verify the SQLite schema is created correctly."""

	    @staticmethod
	    def _fetch_rows(db_path: Path, sql: str) -> list[tuple]:
	        """Run ``sql`` against the database at ``db_path`` and return all rows.

	        The connection is closed even when the query raises, so a failing
	        test never leaves an open handle on the temporary database file.
	        """
	        from contextlib import closing

	        # sqlite3's own context manager only commits/rolls back; closing()
	        # is what actually releases the connection.
	        with closing(sqlite3.connect(str(db_path))) as conn:
	            return conn.execute(sql).fetchall()

	    def test_initialize_creates_tables(self, tmp_db: Path) -> None:
	        """All four expected tables must exist in the ``tmp_db`` database."""
	        rows = self._fetch_rows(
	            tmp_db,
	            "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name",
	        )
	        tables = {row[0] for row in rows}
	        expected = {"knowledge_bases", "documents", "chunks", "query_log"}
	        assert expected.issubset(tables), f"Missing tables: {expected - tables}"

	    def test_initialize_is_idempotent(self, tmp_path: Path) -> None:
	        """Calling initialize_database() twice must not raise or change the table set."""
	        from voicevault.storage.sqlite_store import initialize_database

	        list_tables = "SELECT name FROM sqlite_master WHERE type='table'"
	        db_path = tmp_path / "voicevault.db"

	        initialize_database(db_path)
	        tables_after_first = {row[0] for row in self._fetch_rows(db_path, list_tables)}

	        initialize_database(db_path)  # Second call — must be silent
	        tables_after_second = {row[0] for row in self._fetch_rows(db_path, list_tables)}

	        assert len(tables_after_second) >= 4
	        # The second call must neither add nor drop tables.
	        assert tables_after_second == tables_after_first

	    def test_initialize_raises_if_dir_missing(self, tmp_path: Path) -> None:
	        """initialize_database() must raise FileNotFoundError if parent dir is missing."""
	        from voicevault.storage.sqlite_store import initialize_database

	        bad_path = tmp_path / "nonexistent" / "voicevault.db"
	        with pytest.raises(FileNotFoundError):
	            initialize_database(bad_path)

	    def test_kb_table_schema(self, tmp_db: Path) -> None:
	        """knowledge_bases table must have all required columns."""
	        rows = self._fetch_rows(tmp_db, "PRAGMA table_info(knowledge_bases)")
	        columns = {row[1] for row in rows}  # index 1 of table_info is the column name
	        required = {
	            "kb_name", "display_name", "password_hash", "owner",
	            "doc_count", "chunk_count", "created_at", "last_updated",
	        }
	        assert required.issubset(columns)

	    def test_documents_table_schema(self, tmp_db: Path) -> None:
	        """documents table must have all required columns."""
	        rows = self._fetch_rows(tmp_db, "PRAGMA table_info(documents)")
	        columns = {row[1] for row in rows}  # index 1 of table_info is the column name
	        required = {
	            "doc_id", "kb_name", "filename", "file_hash",
	            "page_count", "chunk_count", "is_private", "ingested_at",
	        }
	        assert required.issubset(columns)

	    def test_query_log_schema(self, tmp_db: Path) -> None:
	        """query_log table must have anonymized query hash, not raw query text."""
	        rows = self._fetch_rows(tmp_db, "PRAGMA table_info(query_log)")
	        columns = {row[1] for row in rows}  # index 1 of table_info is the column name
	        # Security: raw voice_query must NOT be stored — only hash
	        assert "voice_query_hash" in columns, "Audit log must store query hash, not raw text"
	        assert "voice_query" not in columns, "Raw voice query must NOT be in audit log"


	# ------------------------------------------------------------------ #
	# SQLite CRUD Tests #
	# ------------------------------------------------------------------ #


	class TestSQLiteCRUD:
	    """Round-trip tests for the CRUD helpers in ``voicevault.storage.sqlite_store``."""

	    def test_create_and_get_kb(self, tmp_db: Path) -> None:
	        """create_kb + get_kb round-trip."""
	        from voicevault.storage.sqlite_store import create_kb, get_kb

	        create_kb(tmp_db, "my-kb", "My Knowledge Base", owner="navnit")
	        kb = get_kb(tmp_db, "my-kb")

	        assert kb is not None
	        assert kb["kb_name"] == "my-kb"
	        assert kb["display_name"] == "My Knowledge Base"
	        assert kb["owner"] == "navnit"
	        # No password was supplied, so none may be stored.
	        assert kb["password_hash"] is None

	    def test_create_kb_duplicate_raises(self, tmp_db: Path) -> None:
	        """Inserting a KB with a duplicate name must raise."""
	        from voicevault.storage.sqlite_store import create_kb

	        create_kb(tmp_db, "dupe-kb", "Dupe")
	        # NOTE(review): Exception is very broad; narrow it to the concrete
	        # error create_kb raises once that contract is confirmed.
	        with pytest.raises(Exception):
	            create_kb(tmp_db, "dupe-kb", "Dupe Again")

	    def test_list_kbs_empty(self, tmp_db: Path) -> None:
	        """list_kbs on empty DB returns empty list."""
	        from voicevault.storage.sqlite_store import list_kbs

	        assert list_kbs(tmp_db) == []

	    def test_list_kbs_multiple(self, tmp_db: Path) -> None:
	        """list_kbs returns all created KBs."""
	        from voicevault.storage.sqlite_store import create_kb, list_kbs

	        create_kb(tmp_db, "kb-a", "KB A")
	        create_kb(tmp_db, "kb-b", "KB B")

	        names = {kb["kb_name"] for kb in list_kbs(tmp_db)}
	        assert names == {"kb-a", "kb-b"}

	    def test_delete_kb(self, tmp_db: Path) -> None:
	        """delete_kb removes the KB row."""
	        from voicevault.storage.sqlite_store import create_kb, delete_kb, get_kb

	        create_kb(tmp_db, "to-delete", "To Delete")
	        assert get_kb(tmp_db, "to-delete") is not None

	        delete_kb(tmp_db, "to-delete")
	        assert get_kb(tmp_db, "to-delete") is None

	    def test_update_kb_counts(self, tmp_db: Path) -> None:
	        """update_kb_counts updates doc/chunk counts correctly."""
	        from voicevault.storage.sqlite_store import create_kb, get_kb, update_kb_counts

	        create_kb(tmp_db, "count-kb", "Count KB")
	        update_kb_counts(tmp_db, "count-kb", doc_count=5, chunk_count=120)
	        kb = get_kb(tmp_db, "count-kb")

	        # Guard first so a missing row fails as an assertion, not a TypeError
	        # from subscripting None.
	        assert kb is not None
	        assert kb["doc_count"] == 5
	        assert kb["chunk_count"] == 120

	    def test_register_and_list_document(self, tmp_db: Path) -> None:
	        """register_document + list_documents round-trip."""
	        from voicevault.storage.sqlite_store import (
	            create_kb,
	            list_documents,
	            register_document,
	        )

	        create_kb(tmp_db, "doc-kb", "Doc KB")
	        register_document(
	            tmp_db,
	            doc_id="doc-uuid-001",
	            kb_name="doc-kb",
	            filename="report.pdf",
	            file_hash="sha256_abc",
	            page_count=10,
	            chunk_count=25,
	        )

	        docs = list_documents(tmp_db, "doc-kb")
	        assert len(docs) == 1
	        only_doc = docs[0]
	        assert only_doc["filename"] == "report.pdf"
	        assert only_doc["file_hash"] == "sha256_abc"

	    def test_get_document_by_hash(self, tmp_db: Path) -> None:
	        """get_document_by_hash finds a registered hash and returns None otherwise."""
	        from voicevault.storage.sqlite_store import (
	            create_kb,
	            get_document_by_hash,
	            register_document,
	        )

	        create_kb(tmp_db, "dedup-kb", "Dedup KB")
	        register_document(tmp_db, "uuid-x", "dedup-kb", "file.pdf", "unique_hash_123")

	        assert get_document_by_hash(tmp_db, "unique_hash_123", "dedup-kb") is not None
	        assert get_document_by_hash(tmp_db, "nonexistent_hash", "dedup-kb") is None

	    def test_register_chunk_and_dedup(self, tmp_db: Path) -> None:
	        """chunk_hash_exists returns True only for registered chunk hashes."""
	        from voicevault.storage.sqlite_store import (
	            chunk_hash_exists,
	            create_kb,
	            register_chunk,
	            register_document,
	        )

	        create_kb(tmp_db, "chunk-kb", "Chunk KB")
	        register_document(tmp_db, "doc-001", "chunk-kb", "doc.pdf", "file_hash_001")
	        register_chunk(
	            tmp_db,
	            chunk_id="chunk-uuid-001",
	            kb_name="chunk-kb",
	            doc_id="doc-001",
	            source_file="doc.pdf",
	            page_number=1,
	            section="Intro",
	            chunk_index=0,
	            text_hash="chunk_hash_abc",
	            token_count=50,
	        )

	        assert chunk_hash_exists(tmp_db, "chunk_hash_abc", "chunk-kb") is True
	        assert chunk_hash_exists(tmp_db, "nonexistent_hash", "chunk-kb") is False

	    def test_delete_document_cascades_chunks(self, tmp_db: Path) -> None:
	        """Deleting a document must cascade-delete its chunks."""
	        from voicevault.storage.sqlite_store import (
	            chunk_hash_exists,
	            create_kb,
	            delete_document,
	            register_chunk,
	            register_document,
	        )

	        create_kb(tmp_db, "cascade-kb", "Cascade KB")
	        register_document(tmp_db, "doc-cascade", "cascade-kb", "x.pdf", "hash_x")
	        register_chunk(
	            tmp_db, "chunk-cascade", "cascade-kb", "doc-cascade",
	            "x.pdf", 1, "Intro", 0, "chunk_hash_cascade", 10,
	        )
	        # Sanity check: the chunk is visible before the parent is deleted.
	        assert chunk_hash_exists(tmp_db, "chunk_hash_cascade", "cascade-kb") is True

	        delete_document(tmp_db, "doc-cascade")
	        assert chunk_hash_exists(tmp_db, "chunk_hash_cascade", "cascade-kb") is False

	    def test_log_query_and_retrieve(self, tmp_db: Path) -> None:
	        """log_query + get_query_log round-trip."""
	        import uuid

	        from voicevault.storage.sqlite_store import get_query_log, log_query

	        log_id = str(uuid.uuid4())
	        session_id = str(uuid.uuid4())
	        log_query(
	            tmp_db, log_id, session_id,
	            kb_names=["kb1", "kb2"],
	            voice_query_hash="sha256_of_query",
	            processed_query="what is machine learning",
	            query_type="factual",
	            answer_length=150,
	            citation_count=3,
	            latency_asr_ms=2000,
	            latency_ret_ms=300,
	            latency_llm_ms=1500,
	            total_latency_ms=3800,
	            groq_tokens_used=200,
	        )

	        logs = get_query_log(tmp_db)
	        assert len(logs) == 1
	        entry = logs[0]
	        assert entry["session_id"] == session_id
	        assert entry["query_type"] == "factual"
	        assert entry["citation_count"] == 3

	    def test_get_query_stats_empty(self, tmp_db: Path) -> None:
	        """get_query_stats on an empty log reports zero total queries."""
	        from voicevault.storage.sqlite_store import get_query_stats

	        stats = get_query_stats(tmp_db)
	        assert stats["total_queries"] == 0


	# ------------------------------------------------------------------ #
	# Package Import Tests #
	# ------------------------------------------------------------------ #


	class TestPackageImports:
	    """Each listed package or module must be importable without raising."""

	    def test_voicevault_package(self) -> None:
	        """The top-level package must import and report version 1.0.0."""
	        import voicevault  # noqa: F401

	        expected_version = "1.0.0"
	        assert voicevault.__version__ == expected_version

	    def test_asr_package(self) -> None:
	        """``voicevault.asr`` must import."""
	        import voicevault.asr  # noqa: F401

	    def test_ingestion_package(self) -> None:
	        """``voicevault.ingestion`` must import."""
	        import voicevault.ingestion  # noqa: F401

	    def test_retrieval_package(self) -> None:
	        """``voicevault.retrieval`` must import."""
	        import voicevault.retrieval  # noqa: F401

	    def test_generation_package(self) -> None:
	        """``voicevault.generation`` must import."""
	        import voicevault.generation  # noqa: F401

	    def test_kb_package(self) -> None:
	        """``voicevault.kb`` must import."""
	        import voicevault.kb  # noqa: F401

	    def test_tts_package(self) -> None:
	        """``voicevault.tts`` must import."""
	        import voicevault.tts  # noqa: F401

	    def test_storage_package(self) -> None:
	        """``voicevault.storage`` must import."""
	        import voicevault.storage  # noqa: F401

	    def test_ui_package(self) -> None:
	        """``ui`` must import."""
	        import ui  # noqa: F401

	    def test_ui_tabs_package(self) -> None:
	        """``ui.tabs`` must import."""
	        import ui.tabs  # noqa: F401

	    def test_ui_components_package(self) -> None:
	        """``ui.components`` must import."""
	        import ui.components  # noqa: F401

	    def test_config_module(self) -> None:
	        """``config`` must expose ``cfg`` (not None) and ``VoiceVaultConfig``."""
	        from config import cfg, VoiceVaultConfig  # noqa: F401

	        assert cfg is not None

	    def test_models_module(self) -> None:
	        """All eight data-model classes must be importable from ``voicevault.models``."""
	        from voicevault.models import (  # noqa: F401
	            Citation,
	            Document,
	            DocumentChunk,
	            IngestionReport,
	            KnowledgeBase,
	            QuerySession,
	            RetrievalResult,
	            TranscriptResult,
	        )

	    def test_sqlite_store_module(self) -> None:
	        """The storage helpers listed below must be importable by name."""
	        from voicevault.storage.sqlite_store import (  # noqa: F401
	            initialize_database,
	            create_kb,
	            get_kb,
	            list_kbs,
	            update_kb_counts,
	            delete_kb,
	            register_document,
	            list_documents,
	            delete_document,
	            register_chunk,
	            chunk_hash_exists,
	            log_query,
	            get_query_log,
	        )

	    def test_citation_panel_module(self) -> None:
	        """``format_citations_markdown`` must be importable from the citation panel."""
	        from ui.components.citation_panel import format_citations_markdown  # noqa: F401

	    def test_audio_controls_module(self) -> None:
	        """``get_tts_html`` and ``WEB_SPEECH_JS`` must be importable from audio controls."""
	        from ui.components.audio_controls import get_tts_html, WEB_SPEECH_JS  # noqa: F401


	# ------------------------------------------------------------------ #
	# UI Component Tests #
	# ------------------------------------------------------------------ #


	class TestUIComponents:
	    """Output checks for the UI helper functions and the app builder."""

	    def test_format_citations_empty(self) -> None:
	        """An empty citation list must render text containing "No citations"."""
	        from ui.components.citation_panel import format_citations_markdown

	        rendered = format_citations_markdown([])
	        assert "No citations" in rendered

	    def test_format_citations_single(self, sample_citation) -> None:
	        """A single citation must render its filename, page number and section."""
	        from ui.components.citation_panel import format_citations_markdown

	        rendered = format_citations_markdown([sample_citation])
	        # Filename, page number, section title — checked in that order.
	        for fragment in ("sample.pdf", "1", "Introduction"):
	            assert fragment in rendered

	    def test_format_citations_multiple(self) -> None:
	        """Two citations must render as "[1]" and "[2]" with both filenames."""
	        from voicevault.models import Citation
	        from ui.components.citation_panel import format_citations_markdown

	        first = Citation(
	            source_file="a.pdf",
	            page_number=1,
	            section="S1",
	            excerpt="Text A",
	            relevance_score=0.9,
	        )
	        second = Citation(
	            source_file="b.pdf",
	            page_number=5,
	            section="S2",
	            excerpt="Text B",
	            relevance_score=0.8,
	        )

	        rendered = format_citations_markdown([first, second])
	        for fragment in ("[1]", "[2]", "a.pdf", "b.pdf"):
	            assert fragment in rendered

	    def test_tts_html_contains_js(self) -> None:
	        """The TTS HTML must mention ``speechSynthesis`` and ``_vv_tts``."""
	        from ui.components.audio_controls import get_tts_html

	        markup = get_tts_html()
	        for token in ("speechSynthesis", "_vv_tts"):
	            assert token in markup

	    def test_gradio_app_builds(self, tmp_path) -> None:
	        """build_app() must return a gr.Blocks instance without error."""
	        from unittest.mock import MagicMock
	        import gradio as gr
	        from app import build_app
	        from voicevault.kb.kb_manager import KBManager

	        manager = KBManager(db_path=tmp_path / "test.db")
	        # The other two build_app arguments are replaced by mocks.
	        app_under_test = build_app(manager, MagicMock(), MagicMock())
	        assert isinstance(app_under_test, gr.Blocks)