# Source: Hugging Face Space "NinjainPJs/VoiceVault" (status: Running)
# File size: 24,466 bytes — revision 85f900d

"""
tests/test_phase0.py
====================
Phase 0 — Foundation Smoke Tests

Verifies that the project scaffold is correct:
  - Config loads with correct types and defaults
  - SQLite schema creates all required tables
  - All Pydantic data models instantiate without errors
  - Runtime directories are created correctly
  - Gradio app builds without errors
  - All package imports succeed

These tests do NOT require heavy ML dependencies (Whisper, torch, spacy).
Run with: pytest tests/test_phase0.py -v
"""

from __future__ import annotations

import sqlite3
from pathlib import Path

import pytest


# ------------------------------------------------------------------ #
# Config Tests                                                          #
# ------------------------------------------------------------------ #


class TestConfig:
    """Smoke checks for the centralized ``config`` module."""

    def test_config_imports(self) -> None:
        """Importing ``cfg`` from ``config`` must not raise."""
        from config import cfg  # noqa: F401

    def test_config_is_singleton(self) -> None:
        """Two separate imports of ``cfg`` must yield the identical object."""
        from config import cfg as first
        from config import cfg as second

        assert first is second

    def test_default_types(self) -> None:
        """Each checked config field must be an instance of its expected type."""
        from config import cfg

        expected_types = (
            ("groq_api_key", str),
            ("gemini_api_key", str),
            ("data_dir", Path),
            ("bm25_top_k", int),
            ("vector_top_k", int),
            ("rrf_k", int),
            ("final_top_k", int),
            ("chunk_size_min", int),
            ("chunk_size_max", int),
            ("semantic_similarity_threshold", float),
            ("bcrypt_rounds", int),
            ("debug", bool),
        )
        for field_name, field_type in expected_types:
            assert isinstance(getattr(cfg, field_name), field_type)

    def test_default_values_are_sane(self) -> None:
        """Retrieval, chunking, security and port defaults must be in range."""
        from config import cfg

        # Retrieval fan-out must be positive, and the final cut cannot exceed
        # the number of reranked candidates.
        assert 0 < cfg.bm25_top_k
        assert 0 < cfg.vector_top_k
        assert cfg.rrf_k == 60, "RRF k=60 is the standard value — must not be changed"
        assert cfg.rerank_top_k >= cfg.final_top_k

        # Chunk bounds must form a non-empty interval.
        assert cfg.chunk_size_max > cfg.chunk_size_min
        assert 0.0 < cfg.semantic_similarity_threshold < 1.0

        assert 12 <= cfg.bcrypt_rounds, "bcrypt rounds < 12 is insecure"
        assert cfg.port == 7860

    def test_path_helpers_return_paths(self) -> None:
        """Per-KB path helpers and directory attributes must be ``Path`` objects."""
        from config import cfg

        for helper_name in ("kb_dir", "kb_chroma_dir", "kb_bm25_path", "kb_db_path"):
            assert isinstance(getattr(cfg, helper_name)("test"), Path)
        for attr_name in ("uploads_dir", "models_cache_dir"):
            assert isinstance(getattr(cfg, attr_name), Path)

    def test_kb_path_hierarchy(self) -> None:
        """Chroma, BM25 and DB paths must sit directly inside the KB directory."""
        from config import cfg

        kb_name = "myknowledgebase"
        for child_helper in (cfg.kb_chroma_dir, cfg.kb_bm25_path, cfg.kb_db_path):
            assert child_helper(kb_name).parent == cfg.kb_dir(kb_name)

    def test_has_groq_key_false_when_empty(self) -> None:
        """An empty ``GROQ_API_KEY`` must make ``has_groq_key()`` return False."""
        from config import VoiceVaultConfig

        assert VoiceVaultConfig(GROQ_API_KEY="").has_groq_key() is False

    def test_has_groq_key_true_when_set(self) -> None:
        """A non-empty ``GROQ_API_KEY`` must make ``has_groq_key()`` return True."""
        from config import VoiceVaultConfig

        assert VoiceVaultConfig(GROQ_API_KEY="gsk_test123").has_groq_key() is True

    def test_allowed_extensions_contains_expected(self) -> None:
        """The upload extension whitelist must cover every supported format."""
        from config import cfg

        supported = {".pdf", ".html", ".htm", ".docx", ".md", ".mdx", ".txt"}
        missing = supported.difference(cfg.allowed_extensions)
        assert not missing

    def test_ensure_directories_creates_dirs(self, tmp_path: Path) -> None:
        """``ensure_directories()`` must create the data and uploads directories."""
        from config import VoiceVaultConfig

        data_root = tmp_path / "data"
        scoped_cfg = VoiceVaultConfig(DATA_DIR=str(data_root))
        scoped_cfg.ensure_directories()

        for created in (scoped_cfg.data_dir, scoped_cfg.uploads_dir):
            assert created.exists()


# ------------------------------------------------------------------ #
# Data Model Tests                                                      #
# ------------------------------------------------------------------ #


class TestModels:
    """Construct each data model once and check its stored and default fields."""

    def test_document_chunk_defaults(self) -> None:
        """A chunk built without ``chunk_id`` gets a 36-char id and language "en"."""
        from voicevault.models import DocumentChunk

        fields = {
            "kb_name": "test",
            "source_file": "doc.pdf",
            "page_number": 1,
            "chunk_index": 0,
            "text": "Hello world.",
            "text_hash": "deadbeef",
            "token_count": 2,
        }
        chunk = DocumentChunk(**fields)

        assert chunk.chunk_id  # must be non-empty
        assert len(chunk.chunk_id) == 36  # canonical UUID string length
        assert chunk.language == "en"

    def test_document_chunk_uuid_unique(self) -> None:
        """Two chunks built from identical fields must not share a ``chunk_id``."""
        from voicevault.models import DocumentChunk

        shared_fields = {
            "kb_name": "test",
            "source_file": "doc.pdf",
            "page_number": 1,
            "chunk_index": 0,
            "text": "x",
            "text_hash": "y",
            "token_count": 1,
        }
        first = DocumentChunk(**shared_fields)
        second = DocumentChunk(**shared_fields)

        assert first.chunk_id != second.chunk_id

    def test_citation_model(self) -> None:
        """Citation must return the source file, page and score it was given."""
        from voicevault.models import Citation

        citation = Citation(
            source_file="report.pdf",
            page_number=42,
            section="Results",
            excerpt="The accuracy was 95%.",
            relevance_score=0.87,
        )

        assert citation.source_file == "report.pdf"
        assert citation.page_number == 42
        assert citation.relevance_score == pytest.approx(0.87)

    def test_query_session_defaults(self) -> None:
        """QuerySession must fill session_id and default the remaining fields."""
        from voicevault.models import QuerySession

        query_session = QuerySession(
            kb_names=["kb1"],
            voice_query="What is machine learning?",
            processed_query="what is machine learning",
        )

        assert query_session.session_id
        assert query_session.query_type == "factual"
        assert query_session.answer == ""
        assert query_session.citations == []

    def test_knowledge_base_is_protected(self) -> None:
        """``is_protected`` must be True only when a password hash is supplied."""
        from voicevault.models import KnowledgeBase

        open_kb = KnowledgeBase(kb_name="public", display_name="Public KB")
        locked_kb = KnowledgeBase(
            kb_name="private", display_name="Private KB", password_hash="$2b$12$hash"
        )

        assert open_kb.is_protected is False
        assert locked_kb.is_protected is True

    def test_document_model(self) -> None:
        """Document must get a 36-char ``doc_id`` and default to non-private."""
        from voicevault.models import Document

        document = Document(kb_name="test", filename="test.pdf", file_hash="sha256abc")

        assert len(document.doc_id) == 36
        assert document.is_private is False

    def test_ingestion_report(self) -> None:
        """IngestionReport must return the status and chunk count it was given."""
        from voicevault.models import IngestionReport

        ingestion = IngestionReport(
            doc_id="some-uuid",
            filename="file.pdf",
            chunk_count=42,
            page_count=10,
            status="success",
        )

        assert ingestion.status == "success"
        assert ingestion.chunk_count == 42

    def test_transcript_result(self) -> None:
        """TranscriptResult must default confidence, language and query_type."""
        from voicevault.models import TranscriptResult

        transcript = TranscriptResult(
            transcript="What is machine learning?",
            raw_transcript="Um, what is machine learning?",
            model_used="whisper-large-v3",
        )

        assert transcript.confidence == 1.0
        assert transcript.language == "en"
        assert transcript.query_type == "factual"

    def test_retrieval_result(self) -> None:
        """RetrievalResult must return the RRF and rerank scores it was given."""
        from voicevault.models import RetrievalResult

        hit = RetrievalResult(
            chunk_id="uuid-001",
            text="Machine learning is a subset of AI.",
            source_file="ml_intro.pdf",
            page_number=3,
            rrf_score=0.032,
            rerank_score=0.91,
        )

        assert hit.rrf_score == pytest.approx(0.032)
        assert hit.rerank_score == pytest.approx(0.91)


# ------------------------------------------------------------------ #
# SQLite Schema Tests                                                   #
# ------------------------------------------------------------------ #


class TestSQLiteSchema:
    """Verify the SQLite schema is created correctly.

    Tests that take ``tmp_db`` rely on a fixture defined outside this file;
    it is presumably the path of an already-initialized database — confirm
    against conftest.py.
    """

    @staticmethod
    def _collect(db_path: Path, sql: str, column: int) -> set[str]:
        """Run *sql* against *db_path* and return one result column as a set.

        Args:
            db_path: Path of the SQLite database file to open.
            sql: Query to execute; it takes no parameters.
            column: Index of the result column to collect.

        Returns:
            The distinct values found in the requested column.
        """
        from contextlib import closing

        # closing() guarantees the connection is released even when the query
        # raises; a bare sqlite3 connection used as a context manager only
        # scopes the transaction and leaves the connection open.
        with closing(sqlite3.connect(str(db_path))) as conn:
            return {row[column] for row in conn.execute(sql)}

    @classmethod
    def _table_names(cls, db_path: Path) -> set[str]:
        """Return the names of all tables recorded in ``sqlite_master``."""
        return cls._collect(
            db_path,
            "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name",
            0,
        )

    @classmethod
    def _column_names(cls, db_path: Path, table: str) -> set[str]:
        """Return the column names of *table* as reported by ``PRAGMA table_info``.

        *table* is interpolated into the statement, so it must only ever be a
        hard-coded identifier from this test module.
        """
        # Index 1 of each table_info row holds the column name.
        return cls._collect(db_path, f"PRAGMA table_info({table})", 1)

    def test_initialize_creates_tables(self, tmp_db: Path) -> None:
        """All four tables must exist after initialize_database()."""
        tables = self._table_names(tmp_db)
        expected = {"knowledge_bases", "documents", "chunks", "query_log"}
        assert expected.issubset(tables), f"Missing tables: {expected - tables}"

    def test_initialize_is_idempotent(self, tmp_path: Path) -> None:
        """Calling initialize_database() twice must not raise or alter the table set."""
        from voicevault.storage.sqlite_store import initialize_database

        db_path = tmp_path / "voicevault.db"
        initialize_database(db_path)
        tables_after_first = self._table_names(db_path)

        initialize_database(db_path)  # Second call — must be silent
        tables_after_second = self._table_names(db_path)

        assert len(tables_after_second) >= 4
        # A bare count cannot show that the second call was a no-op; comparing
        # the table names before and after does.
        assert tables_after_second == tables_after_first, (
            "Second initialize_database() call changed the table set: "
            f"{tables_after_first ^ tables_after_second}"
        )

    def test_initialize_raises_if_dir_missing(self, tmp_path: Path) -> None:
        """initialize_database() must raise FileNotFoundError if parent dir is missing."""
        from voicevault.storage.sqlite_store import initialize_database

        bad_path = tmp_path / "nonexistent" / "voicevault.db"
        with pytest.raises(FileNotFoundError):
            initialize_database(bad_path)

    def test_kb_table_schema(self, tmp_db: Path) -> None:
        """knowledge_bases table must have all required columns."""
        columns = self._column_names(tmp_db, "knowledge_bases")
        required = {"kb_name", "display_name", "password_hash", "owner",
                    "doc_count", "chunk_count", "created_at", "last_updated"}
        assert required.issubset(columns)

    def test_documents_table_schema(self, tmp_db: Path) -> None:
        """documents table must have all required columns."""
        columns = self._column_names(tmp_db, "documents")
        required = {"doc_id", "kb_name", "filename", "file_hash",
                    "page_count", "chunk_count", "is_private", "ingested_at"}
        assert required.issubset(columns)

    def test_query_log_schema(self, tmp_db: Path) -> None:
        """query_log table must have anonymized query hash, not raw query text."""
        columns = self._column_names(tmp_db, "query_log")
        # Security: raw voice_query must NOT be stored — only hash
        assert "voice_query_hash" in columns, "Audit log must store query hash, not raw text"
        assert "voice_query" not in columns, "Raw voice query must NOT be in audit log"


# ------------------------------------------------------------------ #
# SQLite CRUD Tests                                                     #
# ------------------------------------------------------------------ #


class TestSQLiteCRUD:
    """Round-trip tests for the CRUD helpers in ``voicevault.storage.sqlite_store``.

    Every test receives ``tmp_db``, a fixture defined outside this file; it is
    presumably the path of a freshly initialized, empty database — confirm
    against conftest.py.
    """

    def test_create_and_get_kb(self, tmp_db: Path) -> None:
        """create_kb + get_kb round-trip; password_hash is None when not given."""
        from voicevault.storage.sqlite_store import create_kb, get_kb
        create_kb(tmp_db, "my-kb", "My Knowledge Base", owner="navnit")
        kb = get_kb(tmp_db, "my-kb")
        assert kb is not None
        assert kb["kb_name"] == "my-kb"
        assert kb["display_name"] == "My Knowledge Base"
        assert kb["owner"] == "navnit"
        assert kb["password_hash"] is None

    def test_create_kb_duplicate_raises(self, tmp_db: Path) -> None:
        """Inserting a KB with a duplicate name must raise."""
        from voicevault.storage.sqlite_store import create_kb
        create_kb(tmp_db, "dupe-kb", "Dupe")
        # NOTE(review): the concrete exception type raised by create_kb is not
        # visible from this file (presumably sqlite3.IntegrityError or a
        # wrapper) — narrow this once confirmed, so unrelated failures cannot
        # satisfy the test.
        with pytest.raises(Exception):
            create_kb(tmp_db, "dupe-kb", "Dupe Again")

    def test_list_kbs_empty(self, tmp_db: Path) -> None:
        """list_kbs on empty DB returns empty list."""
        from voicevault.storage.sqlite_store import list_kbs
        assert list_kbs(tmp_db) == []

    def test_list_kbs_multiple(self, tmp_db: Path) -> None:
        """list_kbs returns all created KBs."""
        from voicevault.storage.sqlite_store import create_kb, list_kbs
        create_kb(tmp_db, "kb-a", "KB A")
        create_kb(tmp_db, "kb-b", "KB B")
        result = list_kbs(tmp_db)
        names = {kb["kb_name"] for kb in result}
        assert {"kb-a", "kb-b"} == names

    def test_delete_kb(self, tmp_db: Path) -> None:
        """delete_kb removes the KB row."""
        from voicevault.storage.sqlite_store import create_kb, delete_kb, get_kb
        create_kb(tmp_db, "to-delete", "To Delete")
        assert get_kb(tmp_db, "to-delete") is not None
        delete_kb(tmp_db, "to-delete")
        assert get_kb(tmp_db, "to-delete") is None

    def test_update_kb_counts(self, tmp_db: Path) -> None:
        """update_kb_counts updates doc/chunk counts correctly."""
        from voicevault.storage.sqlite_store import create_kb, get_kb, update_kb_counts
        create_kb(tmp_db, "count-kb", "Count KB")
        update_kb_counts(tmp_db, "count-kb", doc_count=5, chunk_count=120)
        kb = get_kb(tmp_db, "count-kb")
        # Guard before subscripting so a missing row fails as a clear assertion
        # rather than a TypeError on None.
        assert kb is not None
        assert kb["doc_count"] == 5
        assert kb["chunk_count"] == 120

    def test_register_and_list_document(self, tmp_db: Path) -> None:
        """register_document + list_documents round-trip."""
        from voicevault.storage.sqlite_store import (
            create_kb, list_documents, register_document
        )
        create_kb(tmp_db, "doc-kb", "Doc KB")
        register_document(
            tmp_db,
            doc_id="doc-uuid-001",
            kb_name="doc-kb",
            filename="report.pdf",
            file_hash="sha256_abc",
            page_count=10,
            chunk_count=25,
        )
        docs = list_documents(tmp_db, "doc-kb")
        assert len(docs) == 1
        assert docs[0]["filename"] == "report.pdf"
        assert docs[0]["file_hash"] == "sha256_abc"

    def test_get_document_by_hash(self, tmp_db: Path) -> None:
        """get_document_by_hash finds a registered hash and returns None otherwise."""
        from voicevault.storage.sqlite_store import (
            create_kb, get_document_by_hash, register_document
        )
        create_kb(tmp_db, "dedup-kb", "Dedup KB")
        register_document(tmp_db, "uuid-x", "dedup-kb", "file.pdf", "unique_hash_123")
        found = get_document_by_hash(tmp_db, "unique_hash_123", "dedup-kb")
        assert found is not None
        not_found = get_document_by_hash(tmp_db, "nonexistent_hash", "dedup-kb")
        assert not_found is None

    def test_register_chunk_and_dedup(self, tmp_db: Path) -> None:
        """chunk_hash_exists returns True only for registered chunk hashes."""
        from voicevault.storage.sqlite_store import (
            chunk_hash_exists, create_kb, register_chunk, register_document
        )
        # The KB and its parent document are created first, then the chunk.
        create_kb(tmp_db, "chunk-kb", "Chunk KB")
        register_document(tmp_db, "doc-001", "chunk-kb", "doc.pdf", "file_hash_001")
        register_chunk(
            tmp_db,
            chunk_id="chunk-uuid-001",
            kb_name="chunk-kb",
            doc_id="doc-001",
            source_file="doc.pdf",
            page_number=1,
            section="Intro",
            chunk_index=0,
            text_hash="chunk_hash_abc",
            token_count=50,
        )
        assert chunk_hash_exists(tmp_db, "chunk_hash_abc", "chunk-kb") is True
        assert chunk_hash_exists(tmp_db, "nonexistent_hash", "chunk-kb") is False

    def test_delete_document_cascades_chunks(self, tmp_db: Path) -> None:
        """Deleting a document must cascade-delete its chunks."""
        from voicevault.storage.sqlite_store import (
            chunk_hash_exists, create_kb, delete_document,
            register_chunk, register_document
        )
        create_kb(tmp_db, "cascade-kb", "Cascade KB")
        register_document(tmp_db, "doc-cascade", "cascade-kb", "x.pdf", "hash_x")
        register_chunk(
            tmp_db, "chunk-cascade", "cascade-kb", "doc-cascade",
            "x.pdf", 1, "Intro", 0, "chunk_hash_cascade", 10
        )
        # The chunk must be visible before the delete for the cascade check
        # below to be meaningful.
        assert chunk_hash_exists(tmp_db, "chunk_hash_cascade", "cascade-kb") is True
        delete_document(tmp_db, "doc-cascade")
        assert chunk_hash_exists(tmp_db, "chunk_hash_cascade", "cascade-kb") is False

    def test_log_query_and_retrieve(self, tmp_db: Path) -> None:
        """log_query + get_query_log round-trip."""
        import uuid
        from voicevault.storage.sqlite_store import get_query_log, log_query
        log_id = str(uuid.uuid4())
        session_id = str(uuid.uuid4())
        log_query(
            tmp_db, log_id, session_id,
            kb_names=["kb1", "kb2"],
            voice_query_hash="sha256_of_query",
            processed_query="what is machine learning",
            query_type="factual",
            answer_length=150,
            citation_count=3,
            latency_asr_ms=2000,
            latency_ret_ms=300,
            latency_llm_ms=1500,
            total_latency_ms=3800,
            groq_tokens_used=200,
        )
        logs = get_query_log(tmp_db)
        assert len(logs) == 1
        assert logs[0]["session_id"] == session_id
        assert logs[0]["query_type"] == "factual"
        assert logs[0]["citation_count"] == 3

    def test_get_query_stats_empty(self, tmp_db: Path) -> None:
        """get_query_stats on an empty log reports zero total queries."""
        from voicevault.storage.sqlite_store import get_query_stats
        stats = get_query_stats(tmp_db)
        assert stats["total_queries"] == 0


# ------------------------------------------------------------------ #
# Package Import Tests                                                  #
# ------------------------------------------------------------------ #


class TestPackageImports:
    """All package __init__.py files must import without errors.

    Each test passes when its import statement completes; an ImportError (or
    any exception raised at import time) fails the test.
    """

    def test_voicevault_package(self) -> None:
        """The top-level package must import and report version 1.0.0."""
        import voicevault
        assert voicevault.__version__ == "1.0.0"

    def test_asr_package(self) -> None:
        """``voicevault.asr`` must import cleanly."""
        import voicevault.asr  # noqa: F401

    def test_ingestion_package(self) -> None:
        """``voicevault.ingestion`` must import cleanly."""
        import voicevault.ingestion  # noqa: F401

    def test_retrieval_package(self) -> None:
        """``voicevault.retrieval`` must import cleanly."""
        import voicevault.retrieval  # noqa: F401

    def test_generation_package(self) -> None:
        """``voicevault.generation`` must import cleanly."""
        import voicevault.generation  # noqa: F401

    def test_kb_package(self) -> None:
        """``voicevault.kb`` must import cleanly."""
        import voicevault.kb  # noqa: F401

    def test_tts_package(self) -> None:
        """``voicevault.tts`` must import cleanly."""
        import voicevault.tts  # noqa: F401

    def test_storage_package(self) -> None:
        """``voicevault.storage`` must import cleanly."""
        import voicevault.storage  # noqa: F401

    def test_ui_package(self) -> None:
        """The ``ui`` package must import cleanly."""
        import ui  # noqa: F401

    def test_ui_tabs_package(self) -> None:
        """``ui.tabs`` must import cleanly."""
        import ui.tabs  # noqa: F401

    def test_ui_components_package(self) -> None:
        """``ui.components`` must import cleanly."""
        import ui.components  # noqa: F401

    def test_config_module(self) -> None:
        """``config`` must expose both ``cfg`` and ``VoiceVaultConfig``."""
        from config import cfg, VoiceVaultConfig  # noqa: F401
        assert cfg is not None

    def test_models_module(self) -> None:
        """``voicevault.models`` must expose every data model used by the tests."""
        from voicevault.models import (  # noqa: F401
            Citation, Document, DocumentChunk, IngestionReport,
            KnowledgeBase, QuerySession, RetrievalResult, TranscriptResult,
        )

    def test_sqlite_store_module(self) -> None:
        """``sqlite_store`` must expose every helper the tests in this file use."""
        # get_document_by_hash and get_query_stats are imported by the CRUD
        # tests in this module, so they belong in this import smoke check too.
        from voicevault.storage.sqlite_store import (  # noqa: F401
            initialize_database, create_kb, get_kb, list_kbs,
            update_kb_counts, delete_kb, register_document,
            list_documents, get_document_by_hash, delete_document,
            register_chunk, chunk_hash_exists, log_query, get_query_log,
            get_query_stats,
        )

    def test_citation_panel_module(self) -> None:
        """``format_citations_markdown`` must be importable from the citation panel."""
        from ui.components.citation_panel import format_citations_markdown  # noqa: F401

    def test_audio_controls_module(self) -> None:
        """``get_tts_html`` and ``WEB_SPEECH_JS`` must be importable."""
        from ui.components.audio_controls import get_tts_html, WEB_SPEECH_JS  # noqa: F401


# ------------------------------------------------------------------ #
# UI Component Tests                                                    #
# ------------------------------------------------------------------ #


class TestUIComponents:
    """Check the output of the UI helper functions and the app builder."""

    def test_format_citations_empty(self) -> None:
        """An empty citation list must render text containing "No citations"."""
        from ui.components.citation_panel import format_citations_markdown

        rendered = format_citations_markdown([])
        assert "No citations" in rendered

    def test_format_citations_single(self, sample_citation) -> None:
        """One citation must render its filename, page number and section."""
        from ui.components.citation_panel import format_citations_markdown

        rendered = format_citations_markdown([sample_citation])
        # "1" is the page number of the sample citation.
        for fragment in ("sample.pdf", "1", "Introduction"):
            assert fragment in rendered

    def test_format_citations_multiple(self) -> None:
        """Two citations must render as entries [1] and [2] with their filenames."""
        from voicevault.models import Citation
        from ui.components.citation_panel import format_citations_markdown

        first = Citation(source_file="a.pdf", page_number=1, section="S1",
                         excerpt="Text A", relevance_score=0.9)
        second = Citation(source_file="b.pdf", page_number=5, section="S2",
                          excerpt="Text B", relevance_score=0.8)

        rendered = format_citations_markdown([first, second])
        for fragment in ("[1]", "[2]", "a.pdf", "b.pdf"):
            assert fragment in rendered

    def test_tts_html_contains_js(self) -> None:
        """The TTS HTML must mention ``speechSynthesis`` and ``_vv_tts``."""
        from ui.components.audio_controls import get_tts_html

        markup = get_tts_html()
        for token in ("speechSynthesis", "_vv_tts"):
            assert token in markup

    def test_gradio_app_builds(self, tmp_path) -> None:
        """``build_app()`` must hand back a ``gr.Blocks`` instance."""
        from unittest.mock import MagicMock
        import gradio as gr
        from app import build_app
        from voicevault.kb.kb_manager import KBManager

        manager = KBManager(db_path=tmp_path / "test.db")
        # The two remaining collaborators are replaced by mocks.
        second_dependency = MagicMock()
        third_dependency = MagicMock()

        app = build_app(manager, second_dependency, third_dependency)
        assert isinstance(app, gr.Blocks)