Spaces:

build-small-hackathon
/

hackathon-advisor

Running on Zero

App Files Files Community

hackathon-advisor / tests /test_build_project_index.py

JacobLinCool

fix: use full embedding context

e493b7e verified 1 day ago

raw

history blame contribute delete

4.27 kB

	from __future__ import annotations

	from hashlib import sha256
	import json
	from pathlib import Path

	from hackathon_advisor.data import Project
	from scripts import build_project_index


	def test_build_project_index_reuses_matching_digest_vectors(monkeypatch, tmp_path: Path) -> None:
	project_row = {
	"id": "build-small-hackathon/reused-project",
	"title": "Reused Project",
	"summary": "compact local model demo",
	"tags": ["gradio"],
	"models": [],
	"datasets": [],
	"likes": 0,
	"sdk": "gradio",
	"license": "",
	"created_at": "",
	"last_modified": "",
	"host": "",
	"url": "https://example.test",
	}
	project = Project.from_dict(project_row)
	digest = sha256(project.searchable_text.encode("utf-8")).hexdigest()
	project_path = tmp_path / "projects.json"
	reuse_path = tmp_path / "reuse.json"
	project_path.write_text(
	json.dumps({"generated_at": "2026-06-08T00:00:00+00:00", "source": "test", "projects": [project_row]}),
	encoding="utf-8",
	)
	reuse_path.write_text(
	json.dumps(
	{
	"embedding": {
	"model_repo": "test/repo",
	"model_file": "model.gguf",
	"n_ctx": build_project_index.DEFAULT_N_CTX,
	},
	"documents": [{"project_id": project.id, "text_digest": digest, "vector": [1.0, 0.0, 0.0]}],
	}
	),
	encoding="utf-8",
	)

	def fail_embedder(**_kwargs):
	raise AssertionError("matching digest vectors should not initialize llama.cpp")

	monkeypatch.setattr(build_project_index, "LlamaCppEmbedder", fail_embedder)

	payload = build_project_index.build_payload(
	project_path,
	model_repo="test/repo",
	model_file="model.gguf",
	build_source="test",
	builder="test",
	reuse_index_path=reuse_path,
	)

	assert payload["document_count"] == 1
	assert payload["documents"][0]["project_id"] == project.id
	assert payload["documents"][0]["text_digest"] == digest
	assert payload["documents"][0]["vector"] == [1.0, 0.0, 0.0]


	def test_build_project_index_rejects_vectors_when_embedding_config_changes(monkeypatch, tmp_path: Path) -> None:
	project_row = {
	"id": "build-small-hackathon/rebuilt-project",
	"title": "Rebuilt Project",
	"summary": "compact local model demo",
	"tags": ["gradio"],
	"models": [],
	"datasets": [],
	"likes": 0,
	"sdk": "gradio",
	"license": "",
	"created_at": "",
	"last_modified": "",
	"host": "",
	"url": "https://example.test",
	}
	project = Project.from_dict(project_row)
	digest = sha256(project.searchable_text.encode("utf-8")).hexdigest()
	project_path = tmp_path / "projects.json"
	reuse_path = tmp_path / "reuse.json"
	project_path.write_text(
	json.dumps({"generated_at": "2026-06-08T00:00:00+00:00", "source": "test", "projects": [project_row]}),
	encoding="utf-8",
	)
	reuse_path.write_text(
	json.dumps(
	{
	"embedding": {
	"model_repo": "test/repo",
	"model_file": "model.gguf",
	"n_ctx": 768,
	},
	"documents": [{"project_id": project.id, "text_digest": digest, "vector": [1.0, 0.0, 0.0]}],
	}
	),
	encoding="utf-8",
	)

	class FakeEmbedder:
	def __init__(self, **kwargs) -> None:
	assert kwargs["n_ctx"] == 2048

	def embed(self, _text: str) -> list[float]:
	return [0.0, 1.0, 0.0]

	monkeypatch.setattr(build_project_index, "LlamaCppEmbedder", FakeEmbedder)

	payload = build_project_index.build_payload(
	project_path,
	model_repo="test/repo",
	model_file="model.gguf",
	n_ctx=2048,
	build_source="test",
	builder="test",
	reuse_index_path=reuse_path,
	)

	assert payload["document_count"] == 1
	assert payload["documents"][0]["project_id"] == project.id
	assert payload["documents"][0]["text_digest"] == digest
	assert payload["documents"][0]["vector"] == [0.0, 1.0, 0.0]