# hackathon-advisor/tests/test_build_project_index.py
# JacobLinCool's picture
# fix: use full embedding context
# e493b7e verified
from __future__ import annotations
from hashlib import sha256
import json
from pathlib import Path
from hackathon_advisor.data import Project
from scripts import build_project_index
def test_build_project_index_reuses_matching_digest_vectors(
    monkeypatch, tmp_path: Path
) -> None:
    """Check that a matching reuse index supplies the vector without embedding.

    The reuse index carries the same project id, the same text digest and the
    same embedding settings (repo, file, ``DEFAULT_N_CTX``) that the build is
    invoked with, so ``build_payload`` is expected to return the stored vector
    and never construct ``LlamaCppEmbedder``.
    """
    row = {
        "id": "build-small-hackathon/reused-project",
        "title": "Reused Project",
        "summary": "compact local model demo",
        "tags": ["gradio"],
        "models": [],
        "datasets": [],
        "likes": 0,
        "sdk": "gradio",
        "license": "",
        "created_at": "",
        "last_modified": "",
        "host": "",
        "url": "https://example.test",
    }
    project = Project.from_dict(row)
    expected_digest = sha256(project.searchable_text.encode("utf-8")).hexdigest()
    stored_vector = [1.0, 0.0, 0.0]

    # Input catalogue containing the single project under test.
    projects_file = tmp_path / "projects.json"
    catalogue = {
        "generated_at": "2026-06-08T00:00:00+00:00",
        "source": "test",
        "projects": [row],
    }
    projects_file.write_text(json.dumps(catalogue), encoding="utf-8")

    # Previously built index whose embedding settings match this build.
    reuse_file = tmp_path / "reuse.json"
    previous_index = {
        "embedding": {
            "model_repo": "test/repo",
            "model_file": "model.gguf",
            "n_ctx": build_project_index.DEFAULT_N_CTX,
        },
        "documents": [
            {
                "project_id": project.id,
                "text_digest": expected_digest,
                "vector": stored_vector,
            }
        ],
    }
    reuse_file.write_text(json.dumps(previous_index), encoding="utf-8")

    def exploding_embedder(**_kwargs):
        # Any attempt to build an embedder means reuse did not happen.
        raise AssertionError("matching digest vectors should not initialize llama.cpp")

    monkeypatch.setattr(build_project_index, "LlamaCppEmbedder", exploding_embedder)

    payload = build_project_index.build_payload(
        projects_file,
        model_repo="test/repo",
        model_file="model.gguf",
        build_source="test",
        builder="test",
        reuse_index_path=reuse_file,
    )

    assert payload["document_count"] == 1
    document = payload["documents"][0]
    assert document["project_id"] == project.id
    assert document["text_digest"] == expected_digest
    assert document["vector"] == [1.0, 0.0, 0.0]
def test_build_project_index_rejects_vectors_when_embedding_config_changes(
    monkeypatch, tmp_path: Path
) -> None:
    """Check that a reuse index built with another ``n_ctx`` is not reused.

    The reuse index matches on project id and text digest but records
    ``n_ctx`` 768, while the build is invoked with ``n_ctx=2048``. The
    resulting document must carry the vector produced by the (fake) embedder,
    and that embedder must have been constructed with the requested ``n_ctx``.
    """
    stale_n_ctx = 768
    requested_n_ctx = 2048

    row = {
        "id": "build-small-hackathon/rebuilt-project",
        "title": "Rebuilt Project",
        "summary": "compact local model demo",
        "tags": ["gradio"],
        "models": [],
        "datasets": [],
        "likes": 0,
        "sdk": "gradio",
        "license": "",
        "created_at": "",
        "last_modified": "",
        "host": "",
        "url": "https://example.test",
    }
    project = Project.from_dict(row)
    expected_digest = sha256(project.searchable_text.encode("utf-8")).hexdigest()

    # Input catalogue containing the single project under test.
    projects_file = tmp_path / "projects.json"
    catalogue = {
        "generated_at": "2026-06-08T00:00:00+00:00",
        "source": "test",
        "projects": [row],
    }
    projects_file.write_text(json.dumps(catalogue), encoding="utf-8")

    # Previously built index: same digest, but a different context size.
    reuse_file = tmp_path / "reuse.json"
    previous_index = {
        "embedding": {
            "model_repo": "test/repo",
            "model_file": "model.gguf",
            "n_ctx": stale_n_ctx,
        },
        "documents": [
            {
                "project_id": project.id,
                "text_digest": expected_digest,
                "vector": [1.0, 0.0, 0.0],
            }
        ],
    }
    reuse_file.write_text(json.dumps(previous_index), encoding="utf-8")

    class StubEmbedder:
        """Stand-in for ``LlamaCppEmbedder`` that returns a fixed vector."""

        def __init__(self, **kwargs) -> None:
            # The build must pass the requested context size through.
            assert kwargs["n_ctx"] == requested_n_ctx

        def embed(self, _text: str) -> list[float]:
            return [0.0, 1.0, 0.0]

    monkeypatch.setattr(build_project_index, "LlamaCppEmbedder", StubEmbedder)

    payload = build_project_index.build_payload(
        projects_file,
        model_repo="test/repo",
        model_file="model.gguf",
        n_ctx=requested_n_ctx,
        build_source="test",
        builder="test",
        reuse_index_path=reuse_file,
    )

    assert payload["document_count"] == 1
    document = payload["documents"][0]
    assert document["project_id"] == project.id
    assert document["text_digest"] == expected_digest
    # A freshly embedded vector, not the stale one from the reuse index.
    assert document["vector"] == [0.0, 1.0, 0.0]