Spaces:
Running on Zero
Running on Zero
File size: 4,267 Bytes
d1e80bb e493b7e d1e80bb e493b7e | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 | from __future__ import annotations
from hashlib import sha256
import json
from pathlib import Path
from hackathon_advisor.data import Project
from scripts import build_project_index
def test_build_project_index_reuses_matching_digest_vectors(monkeypatch, tmp_path: Path) -> None:
    """Check that a stored vector is reused when digest and embedding config match.

    A reuse index is written whose ``embedding`` section names the same model
    repo and model file that ``build_payload`` is called with, and whose
    ``n_ctx`` is ``build_project_index.DEFAULT_N_CTX`` (the call itself passes
    no ``n_ctx``). Its single document carries the SHA-256 digest of the
    project's searchable text. ``LlamaCppEmbedder`` is replaced by a callable
    that raises, so the test fails if the builder tries to create an embedder.
    """
    row = {
        "id": "build-small-hackathon/reused-project",
        "title": "Reused Project",
        "summary": "compact local model demo",
        "tags": ["gradio"],
        "models": [],
        "datasets": [],
        "likes": 0,
        "sdk": "gradio",
        "license": "",
        "created_at": "",
        "last_modified": "",
        "host": "",
        "url": "https://example.test",
    }
    project = Project.from_dict(row)
    expected_digest = sha256(project.searchable_text.encode("utf-8")).hexdigest()
    stored_vector = [1.0, 0.0, 0.0]

    # Input snapshot consumed by the index builder.
    projects_doc = {
        "generated_at": "2026-06-08T00:00:00+00:00",
        "source": "test",
        "projects": [row],
    }
    # Previously built index offered for reuse; its embedding settings mirror the call below.
    reuse_doc = {
        "embedding": {
            "model_repo": "test/repo",
            "model_file": "model.gguf",
            "n_ctx": build_project_index.DEFAULT_N_CTX,
        },
        "documents": [
            {"project_id": project.id, "text_digest": expected_digest, "vector": stored_vector},
        ],
    }

    projects_file = tmp_path / "projects.json"
    reuse_file = tmp_path / "reuse.json"
    projects_file.write_text(json.dumps(projects_doc), encoding="utf-8")
    reuse_file.write_text(json.dumps(reuse_doc), encoding="utf-8")

    def forbidden_embedder(**_kwargs):
        # Any attempt to build an embedder means the stored vector was not reused.
        raise AssertionError("matching digest vectors should not initialize llama.cpp")

    monkeypatch.setattr(build_project_index, "LlamaCppEmbedder", forbidden_embedder)

    payload = build_project_index.build_payload(
        projects_file,
        model_repo="test/repo",
        model_file="model.gguf",
        build_source="test",
        builder="test",
        reuse_index_path=reuse_file,
    )

    assert payload["document_count"] == 1
    first_document = payload["documents"][0]
    assert first_document["project_id"] == project.id
    assert first_document["text_digest"] == expected_digest
    assert first_document["vector"] == [1.0, 0.0, 0.0]
def test_build_project_index_rejects_vectors_when_embedding_config_changes(monkeypatch, tmp_path: Path) -> None:
    """Check that a stored vector is not reused when ``n_ctx`` differs.

    The reuse index records ``n_ctx`` 768 while ``build_payload`` is called
    with ``n_ctx=2048``; model repo, model file and text digest all match.
    ``LlamaCppEmbedder`` is replaced by a fake that asserts it receives the
    requested ``n_ctx`` and returns a vector different from the stored one,
    so the final assertion shows which vector ended up in the payload.
    """
    stale_n_ctx = 768
    requested_n_ctx = 2048
    stale_vector = [1.0, 0.0, 0.0]
    fresh_vector = [0.0, 1.0, 0.0]

    row = {
        "id": "build-small-hackathon/rebuilt-project",
        "title": "Rebuilt Project",
        "summary": "compact local model demo",
        "tags": ["gradio"],
        "models": [],
        "datasets": [],
        "likes": 0,
        "sdk": "gradio",
        "license": "",
        "created_at": "",
        "last_modified": "",
        "host": "",
        "url": "https://example.test",
    }
    project = Project.from_dict(row)
    expected_digest = sha256(project.searchable_text.encode("utf-8")).hexdigest()

    # Input snapshot consumed by the index builder.
    projects_doc = {
        "generated_at": "2026-06-08T00:00:00+00:00",
        "source": "test",
        "projects": [row],
    }
    # Same model and digest as the call below, but built with a different context size.
    reuse_doc = {
        "embedding": {
            "model_repo": "test/repo",
            "model_file": "model.gguf",
            "n_ctx": stale_n_ctx,
        },
        "documents": [
            {"project_id": project.id, "text_digest": expected_digest, "vector": stale_vector},
        ],
    }

    projects_file = tmp_path / "projects.json"
    reuse_file = tmp_path / "reuse.json"
    projects_file.write_text(json.dumps(projects_doc), encoding="utf-8")
    reuse_file.write_text(json.dumps(reuse_doc), encoding="utf-8")

    class StubEmbedder:
        """Stand-in embedder that checks its configuration and returns a fixed vector."""

        def __init__(self, **kwargs) -> None:
            # The builder must forward the n_ctx it was asked to use.
            assert kwargs["n_ctx"] == requested_n_ctx

        def embed(self, _text: str) -> list[float]:
            # Hand back a fresh list on every call.
            return list(fresh_vector)

    monkeypatch.setattr(build_project_index, "LlamaCppEmbedder", StubEmbedder)

    payload = build_project_index.build_payload(
        projects_file,
        model_repo="test/repo",
        model_file="model.gguf",
        n_ctx=requested_n_ctx,
        build_source="test",
        builder="test",
        reuse_index_path=reuse_file,
    )

    assert payload["document_count"] == 1
    first_document = payload["documents"][0]
    assert first_document["project_id"] == project.id
    assert first_document["text_digest"] == expected_digest
    assert first_document["vector"] == [0.0, 1.0, 0.0]
|