Spaces:
Running on Zero
Running on Zero
| from __future__ import annotations | |
| from hashlib import sha256 | |
| import json | |
| from pathlib import Path | |
| from hackathon_advisor.data import Project | |
| from scripts import build_project_index | |
def test_build_project_index_reuses_matching_digest_vectors(monkeypatch, tmp_path: Path) -> None:
    """Check that a stored vector is copied from the reuse index when it matches.

    The reuse index is written with the same model repo, model file and
    default context size that ``build_payload`` is called with, and its single
    document carries the digest of the project's searchable text. The embedder
    class is replaced with a callable that raises, so the test fails if the
    builder tries to construct an embedder instead of reusing the vector.
    """
    row = {
        "id": "build-small-hackathon/reused-project",
        "title": "Reused Project",
        "summary": "compact local model demo",
        "tags": ["gradio"],
        "models": [],
        "datasets": [],
        "likes": 0,
        "sdk": "gradio",
        "license": "",
        "created_at": "",
        "last_modified": "",
        "host": "",
        "url": "https://example.test",
    }
    project = Project.from_dict(row)
    text_digest = sha256(project.searchable_text.encode("utf-8")).hexdigest()
    cached_vector = [1.0, 0.0, 0.0]

    # Input file: the project listing that the index is built from.
    projects_file = tmp_path / "projects.json"
    projects_json = json.dumps(
        {"generated_at": "2026-06-08T00:00:00+00:00", "source": "test", "projects": [row]}
    )
    projects_file.write_text(projects_json, encoding="utf-8")

    # Previously built index whose embedding settings equal the ones requested below.
    previous_index_file = tmp_path / "reuse.json"
    previous_embedding = {
        "model_repo": "test/repo",
        "model_file": "model.gguf",
        "n_ctx": build_project_index.DEFAULT_N_CTX,
    }
    previous_documents = [
        {"project_id": project.id, "text_digest": text_digest, "vector": cached_vector},
    ]
    previous_index_file.write_text(
        json.dumps({"embedding": previous_embedding, "documents": previous_documents}),
        encoding="utf-8",
    )

    def exploding_embedder(**_kwargs):
        # Any attempt to build an embedder means the cached vector was not reused.
        raise AssertionError("matching digest vectors should not initialize llama.cpp")

    monkeypatch.setattr(build_project_index, "LlamaCppEmbedder", exploding_embedder)

    payload = build_project_index.build_payload(
        projects_file,
        model_repo="test/repo",
        model_file="model.gguf",
        build_source="test",
        builder="test",
        reuse_index_path=previous_index_file,
    )

    assert payload["document_count"] == 1
    first_document = payload["documents"][0]
    assert first_document["project_id"] == project.id
    assert first_document["text_digest"] == text_digest
    assert first_document["vector"] == [1.0, 0.0, 0.0]
def test_build_project_index_rejects_vectors_when_embedding_config_changes(monkeypatch, tmp_path: Path) -> None:
    """Check that a stored vector is not reused when the context size differs.

    The reuse index records ``n_ctx`` 768 while ``build_payload`` is asked for
    2048. The text digest still matches, so the only difference is the
    embedding configuration. The resulting document must carry the vector
    produced by the stub embedder rather than the one stored in the reuse
    index.
    """
    stale_n_ctx = 768
    requested_n_ctx = 2048

    row = {
        "id": "build-small-hackathon/rebuilt-project",
        "title": "Rebuilt Project",
        "summary": "compact local model demo",
        "tags": ["gradio"],
        "models": [],
        "datasets": [],
        "likes": 0,
        "sdk": "gradio",
        "license": "",
        "created_at": "",
        "last_modified": "",
        "host": "",
        "url": "https://example.test",
    }
    project = Project.from_dict(row)
    text_digest = sha256(project.searchable_text.encode("utf-8")).hexdigest()

    # Input file: the project listing that the index is built from.
    projects_file = tmp_path / "projects.json"
    projects_json = json.dumps(
        {"generated_at": "2026-06-08T00:00:00+00:00", "source": "test", "projects": [row]}
    )
    projects_file.write_text(projects_json, encoding="utf-8")

    # Previously built index: same model, same digest, but a different context size.
    previous_index_file = tmp_path / "reuse.json"
    previous_embedding = {
        "model_repo": "test/repo",
        "model_file": "model.gguf",
        "n_ctx": stale_n_ctx,
    }
    previous_documents = [
        {"project_id": project.id, "text_digest": text_digest, "vector": [1.0, 0.0, 0.0]},
    ]
    previous_index_file.write_text(
        json.dumps({"embedding": previous_embedding, "documents": previous_documents}),
        encoding="utf-8",
    )

    class StubEmbedder:
        """Stand-in embedder that checks the requested context size and returns a fixed vector."""

        def __init__(self, **kwargs) -> None:
            # The builder must pass the requested context size through to the embedder.
            assert kwargs["n_ctx"] == requested_n_ctx

        def embed(self, _text: str) -> list[float]:
            # Distinct from the stored [1.0, 0.0, 0.0] so reuse would be detectable.
            return [0.0, 1.0, 0.0]

    monkeypatch.setattr(build_project_index, "LlamaCppEmbedder", StubEmbedder)

    payload = build_project_index.build_payload(
        projects_file,
        model_repo="test/repo",
        model_file="model.gguf",
        n_ctx=requested_n_ctx,
        build_source="test",
        builder="test",
        reuse_index_path=previous_index_file,
    )

    assert payload["document_count"] == 1
    first_document = payload["documents"][0]
    assert first_document["project_id"] == project.id
    assert first_document["text_digest"] == text_digest
    assert first_document["vector"] == [0.0, 1.0, 0.0]