"""Tests for kvcos.engram.knowledge_index — HNSW knowledge search."""
import json
from pathlib import Path
import pytest
import torch
from kvcos.engram.embedder import get_fingerprint
from kvcos.engram.format import EigramEncoder
from kvcos.engram.knowledge_index import KnowledgeIndex
@pytest.fixture
def knowledge_dir(tmp_path):
    """Populate ``tmp_path`` with a small on-disk knowledge base for tests.

    A ``test_project`` subdirectory is created and, for each of five sample
    texts, an encoded ``<doc_id>.eng`` blob plus a ``<doc_id>.eng.meta.json``
    sidecar are written into it.

    Returns:
        ``tmp_path`` itself, i.e. the parent of the project directory.
    """
    samples = [
        ("doc_ml", "Machine learning model training and optimization"),
        ("doc_db", "PostgreSQL database schema migration tools"),
        ("doc_api", "REST API endpoint authentication and authorization"),
        ("doc_test", "Unit testing with pytest fixtures and mocking"),
        ("doc_deploy", "Docker container deployment to Kubernetes cluster"),
    ]

    eng_encoder = EigramEncoder()
    project_root = tmp_path / "test_project"
    project_root.mkdir()

    for cache_id, description in samples:
        fingerprint, fp_source = get_fingerprint(description)
        fp_norm = float(torch.norm(fingerprint).item())
        # vec_fourier is only supplied when the fingerprint has 2048 entries;
        # vec_fourier_v2 receives the fingerprint unconditionally.
        fourier_v1 = fingerprint if fingerprint.shape[0] == 2048 else None

        payload = eng_encoder.encode(
            vec_perdoc=torch.zeros(116),
            vec_fcdb=torch.zeros(116),
            joint_center=torch.zeros(128),
            corpus_hash="test" * 8,
            model_id=fp_source[:16],
            basis_rank=116,
            n_corpus=0,
            layer_range=(0, 0),
            context_len=len(description),
            l2_norm=fp_norm,
            scs=0.0,
            margin_proof=0.0,
            task_description=description[:256],
            cache_id=cache_id,
            vec_fourier=fourier_v1,
            vec_fourier_v2=fingerprint,
            confusion_flag=False,
        )
        eng_file = project_root / f"{cache_id}.eng"
        eng_file.write_bytes(payload)

        # Sidecar metadata lives next to the blob as "<name>.eng.meta.json".
        sidecar = {
            "cache_id": cache_id,
            "task_description": description,
            "source_path": f"/test/{cache_id}.md",
            "project": "test_project",
            "fp_source": fp_source,
            "chunk_index": 0,
            "chunk_total": 1,
            "headers": [],
        }
        sidecar_file = eng_file.with_name(eng_file.name + ".meta.json")
        sidecar_file.write_text(json.dumps(sidecar))

    return tmp_path
class TestKnowledgeIndexBuild:
    """Building a KnowledgeIndex from a knowledge directory."""

    def test_build_from_directory(self, knowledge_dir):
        """The fixture directory yields an index holding all five documents."""
        index = KnowledgeIndex.build_from_knowledge_dir(knowledge_dir, verbose=False)
        n_indexed = len(index)
        assert n_indexed == 5

    def test_build_empty_directory(self, tmp_path):
        """A directory containing no .eng files is rejected with ValueError."""
        expect_no_files = pytest.raises(ValueError, match="No .eng files")
        with expect_no_files:
            KnowledgeIndex.build_from_knowledge_dir(tmp_path, verbose=False)
class TestKnowledgeIndexSearch:
    """Querying a KnowledgeIndex built from the fixture directory."""

    @staticmethod
    def _build(knowledge_dir):
        """Build a non-verbose index over *knowledge_dir*."""
        return KnowledgeIndex.build_from_knowledge_dir(knowledge_dir, verbose=False)

    def test_search_returns_results(self, knowledge_dir):
        """A text query with k=3 returns three hits, each with a positive score."""
        hits = self._build(knowledge_dir).search("database query optimization", k=3)
        assert len(hits) == 3
        for hit in hits:
            assert hit.score > 0

    def test_search_result_fields(self, knowledge_dir):
        """The first hit exposes doc_id, a float score, rank 0 and the project."""
        index = self._build(knowledge_dir)
        top = index.search("testing", k=1)[0]
        assert top.doc_id
        assert isinstance(top.score, float)
        assert top.rank == 0
        assert top.project == "test_project"

    def test_search_with_tensor(self, knowledge_dir):
        """A precomputed fingerprint is accepted as the query instead of text."""
        index = self._build(knowledge_dir)
        query_vec, _source = get_fingerprint("unit tests")
        hits = index.search(query_vec, k=2)
        assert len(hits) == 2

    def test_search_margin(self, knowledge_dir):
        """The top-ranked hit carries a non-negative margin."""
        index = self._build(knowledge_dir)
        hits = index.search("testing", k=3)
        best = hits[0]
        assert best.margin >= 0
class TestKnowledgeIndexPersistence:
    """Saving a KnowledgeIndex to disk and loading it back."""

    def test_save_and_load(self, knowledge_dir, tmp_path):
        """A saved index reloads with the same length and remains searchable."""
        original = KnowledgeIndex.build_from_knowledge_dir(knowledge_dir, verbose=False)
        target = tmp_path / "index"
        original.save(target)

        restored = KnowledgeIndex.load(target)
        assert len(restored) == len(original)

        # The reloaded index must still answer queries.
        hits = restored.search("database", k=2)
        assert len(hits) == 2

    def test_load_nonexistent(self, tmp_path):
        """Loading from a path that does not exist raises FileNotFoundError."""
        missing = tmp_path / "nonexistent"
        with pytest.raises(FileNotFoundError):
            KnowledgeIndex.load(missing)
|