File size: 3,621 Bytes
634117a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
"""
tests/test_core.py
Unit tests for the KerdosRAG public API (no HF token required).
"""

import os
import sys
import tempfile

import pytest

sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

from kerdos_rag import KerdosRAG


# ── Fixtures ──────────────────────────────────────────────────────────────────

@pytest.fixture
def engine():
    """Provide a brand-new KerdosRAG, with nothing indexed, to each test."""
    # A placeholder token is sufficient: the indexing tests never use it.
    fresh = KerdosRAG(hf_token="hf_dummy")
    return fresh


@pytest.fixture
def indexed_engine(tmp_path):
    """Provide a KerdosRAG that has already indexed one plain-text file.

    The file is written as ``policy.txt`` inside pytest's ``tmp_path``
    directory and then passed to ``index()`` as a one-element path list.
    """
    policy_text = (
        "The refund policy allows returns within 30 days of purchase. "
        "Contact support at support@example.com for assistance."
    )
    policy_file = tmp_path / "policy.txt"
    policy_file.write_text(policy_text, encoding="utf-8")

    rag = KerdosRAG(hf_token="hf_dummy")
    rag.index([str(policy_file)])
    return rag


# ── Tests ─────────────────────────────────────────────────────────────────────

def test_initial_state(engine):
    """A freshly built engine is not ready, has zero chunks and no sources."""
    ready = engine.is_ready
    assert ready is False

    chunks = engine.chunk_count
    assert chunks == 0

    sources = engine.indexed_sources
    assert sources == set()


def test_index_returns_correct_metadata(indexed_engine):
    """After indexing, the engine is ready, holds chunks and lists the file."""
    ready = indexed_engine.is_ready
    assert ready

    chunks = indexed_engine.chunk_count
    assert chunks > 0

    sources = indexed_engine.indexed_sources
    assert "policy.txt" in sources


def test_index_skips_duplicates(indexed_engine, tmp_path):
    """Indexing ``policy.txt`` again reports it as skipped, not indexed."""
    # Rewrite the file with different content before submitting it again;
    # the assertions below expect it to be skipped regardless.
    duplicate = tmp_path / "policy.txt"
    duplicate.write_text("Some extra content.", encoding="utf-8")

    outcome = indexed_engine.index([str(duplicate)])

    skipped = outcome["skipped"]
    assert "policy.txt" in skipped
    indexed = outcome["indexed"]
    assert "policy.txt" not in indexed


def test_index_multiple_files(engine, tmp_path):
    """Two files passed in one ``index()`` call are both reported as indexed."""
    contents = {
        "a.txt": "Alpha content here.",
        "b.txt": "Beta content here.",
    }

    # Write each file and collect its path in the same a-then-b order.
    paths = []
    for filename, text in contents.items():
        target = tmp_path / filename
        target.write_text(text, encoding="utf-8")
        paths.append(str(target))

    report = engine.index(paths)
    assert len(report["indexed"]) == 2
    assert report["chunk_count"] > 0


def test_reset_clears_index(indexed_engine):
    """``reset()`` returns an indexed engine to its empty, not-ready state."""
    # Precondition: the fixture really did index something.
    ready_before = indexed_engine.is_ready
    assert ready_before

    indexed_engine.reset()

    ready_after = indexed_engine.is_ready
    assert not ready_after
    chunks_after = indexed_engine.chunk_count
    assert chunks_after == 0
    sources_after = indexed_engine.indexed_sources
    assert sources_after == set()


def test_chat_raises_when_not_indexed(engine):
    """Chatting before any document is indexed raises ``RuntimeError``."""
    expect_error = pytest.raises(RuntimeError, match="No documents indexed")
    with expect_error:
        # list() consumes the result, so the error surfaces even if chat()
        # produces its output lazily.
        list(engine.chat("What is the policy?"))


def test_chat_raises_without_token(tmp_path):
    """Chatting with an empty HF token raises ``ValueError``, even when indexed."""
    sample = tmp_path / "doc.txt"
    sample.write_text("Hello world.", encoding="utf-8")

    # Index first, so the only thing missing at chat time is the token.
    tokenless = KerdosRAG(hf_token="")
    tokenless.index([str(sample)])

    expect_error = pytest.raises(ValueError, match="No Hugging Face token")
    with expect_error:
        # list() consumes the result, so the error surfaces even if chat()
        # produces its output lazily.
        list(tokenless.chat("What does it say?"))


def test_save_and_load(indexed_engine, tmp_path):
    """A saved index writes both artifact files and reloads with equal state."""
    target = tmp_path / "saved_index"
    indexed_engine.save(str(target))

    # Both on-disk artifacts must exist after save().
    for artifact in ("kerdos_index.faiss", "kerdos_meta.pkl"):
        assert (target / artifact).exists()

    reloaded = KerdosRAG.load(str(target), hf_token="hf_dummy")
    assert reloaded.is_ready
    assert reloaded.chunk_count == indexed_engine.chunk_count
    assert reloaded.indexed_sources == indexed_engine.indexed_sources


def test_save_raises_when_empty(engine, tmp_path):
    """Saving an engine with nothing indexed raises ``RuntimeError``."""
    destination = str(tmp_path / "empty")
    expect_error = pytest.raises(RuntimeError, match="Nothing to save")
    with expect_error:
        engine.save(destination)