Spaces:

build-small-hackathon
/

hackathon-advisor

Running on Zero

File size: 4,267 Bytes

from __future__ import annotations

from hashlib import sha256
import json
from pathlib import Path

from hackathon_advisor.data import Project
from scripts import build_project_index


def test_build_project_index_reuses_matching_digest_vectors(monkeypatch, tmp_path: Path) -> None:
    """Expect a stored vector to be carried over when digest and embedding settings match.

    A one-project catalogue and a reuse index are written under ``tmp_path``.
    The reuse index holds a document whose ``text_digest`` is the SHA-256 of the
    project's ``searchable_text`` and whose embedding settings equal the ones
    passed to ``build_payload`` (``n_ctx`` is left at ``DEFAULT_N_CTX``).
    ``LlamaCppEmbedder`` is swapped for a callable that raises, so the test
    fails if the builder tries to construct an embedder.
    """
    row = {
        "id": "build-small-hackathon/reused-project",
        "title": "Reused Project",
        "summary": "compact local model demo",
        "tags": ["gradio"],
        "models": [],
        "datasets": [],
        "likes": 0,
        "sdk": "gradio",
        "license": "",
        "created_at": "",
        "last_modified": "",
        "host": "",
        "url": "https://example.test",
    }
    stored_vector = [1.0, 0.0, 0.0]
    record = Project.from_dict(row)
    text_digest = sha256(record.searchable_text.encode("utf-8")).hexdigest()

    # Catalogue file that the builder reads the projects from.
    catalog_file = tmp_path / "projects.json"
    catalog = {"generated_at": "2026-06-08T00:00:00+00:00", "source": "test", "projects": [row]}
    catalog_file.write_text(json.dumps(catalog), encoding="utf-8")

    # Previously built index whose embedding settings match this build.
    prior_index_file = tmp_path / "reuse.json"
    prior_embedding = {
        "model_repo": "test/repo",
        "model_file": "model.gguf",
        "n_ctx": build_project_index.DEFAULT_N_CTX,
    }
    prior_document = {"project_id": record.id, "text_digest": text_digest, "vector": stored_vector}
    prior_index = {"embedding": prior_embedding, "documents": [prior_document]}
    prior_index_file.write_text(json.dumps(prior_index), encoding="utf-8")

    def _refuse_to_embed(**_ignored):
        raise AssertionError("matching digest vectors should not initialize llama.cpp")

    monkeypatch.setattr(build_project_index, "LlamaCppEmbedder", _refuse_to_embed)

    result = build_project_index.build_payload(
        catalog_file,
        model_repo="test/repo",
        model_file="model.gguf",
        build_source="test",
        builder="test",
        reuse_index_path=prior_index_file,
    )

    assert result["document_count"] == 1
    only_document = result["documents"][0]
    assert only_document["project_id"] == record.id
    assert only_document["text_digest"] == text_digest
    assert only_document["vector"] == stored_vector


def test_build_project_index_rejects_vectors_when_embedding_config_changes(monkeypatch, tmp_path: Path) -> None:
    """Expect a fresh embedding when the reuse index was built with a different ``n_ctx``.

    The reuse index stores a vector for the project with a matching digest but
    records ``n_ctx`` 768, while ``build_payload`` is called with ``n_ctx`` 2048.
    ``LlamaCppEmbedder`` is replaced by a stub that checks it receives the
    requested ``n_ctx`` and returns a vector different from the stored one; the
    payload must contain the stub's vector.
    """
    stale_n_ctx = 768
    requested_n_ctx = 2048
    stale_vector = [1.0, 0.0, 0.0]

    row = {
        "id": "build-small-hackathon/rebuilt-project",
        "title": "Rebuilt Project",
        "summary": "compact local model demo",
        "tags": ["gradio"],
        "models": [],
        "datasets": [],
        "likes": 0,
        "sdk": "gradio",
        "license": "",
        "created_at": "",
        "last_modified": "",
        "host": "",
        "url": "https://example.test",
    }
    record = Project.from_dict(row)
    text_digest = sha256(record.searchable_text.encode("utf-8")).hexdigest()

    # Catalogue file that the builder reads the projects from.
    catalog_file = tmp_path / "projects.json"
    catalog = {"generated_at": "2026-06-08T00:00:00+00:00", "source": "test", "projects": [row]}
    catalog_file.write_text(json.dumps(catalog), encoding="utf-8")

    # Previously built index: same model and digest, but a different context size.
    prior_index_file = tmp_path / "reuse.json"
    prior_embedding = {
        "model_repo": "test/repo",
        "model_file": "model.gguf",
        "n_ctx": stale_n_ctx,
    }
    prior_document = {"project_id": record.id, "text_digest": text_digest, "vector": stale_vector}
    prior_index = {"embedding": prior_embedding, "documents": [prior_document]}
    prior_index_file.write_text(json.dumps(prior_index), encoding="utf-8")

    class _StubEmbedder:
        def __init__(self, **options) -> None:
            # The builder must forward the requested context size to the embedder.
            assert options["n_ctx"] == requested_n_ctx

        def embed(self, _text: str) -> list[float]:
            return [0.0, 1.0, 0.0]

    monkeypatch.setattr(build_project_index, "LlamaCppEmbedder", _StubEmbedder)

    result = build_project_index.build_payload(
        catalog_file,
        model_repo="test/repo",
        model_file="model.gguf",
        n_ctx=requested_n_ctx,
        build_source="test",
        builder="test",
        reuse_index_path=prior_index_file,
    )

    assert result["document_count"] == 1
    only_document = result["documents"][0]
    assert only_document["project_id"] == record.id
    assert only_document["text_digest"] == text_digest
    # The vector must come from the stub, not from the stale reuse index.
    assert only_document["vector"] == [0.0, 1.0, 0.0]