File size: 4,267 Bytes
d1e80bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e493b7e
 
 
 
 
 
 
 
 
 
d1e80bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e493b7e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
from __future__ import annotations

from hashlib import sha256
import json
from pathlib import Path

from hackathon_advisor.data import Project
from scripts import build_project_index


def test_build_project_index_reuses_matching_digest_vectors(monkeypatch, tmp_path: Path) -> None:
    """Check that a stored vector is carried over when its text digest matches.

    The reuse file records the same model repo/file as the build request and
    ``DEFAULT_N_CTX`` as its context size, and its single document carries the
    digest of the project's current searchable text. The embedder is replaced
    with a callable that fails on construction, so the test only passes if
    ``build_payload`` never instantiates it.
    """
    cached_vector = [1.0, 0.0, 0.0]
    row = {
        "id": "build-small-hackathon/reused-project",
        "title": "Reused Project",
        "summary": "compact local model demo",
        "tags": ["gradio"],
        "models": [],
        "datasets": [],
        "likes": 0,
        "sdk": "gradio",
        "license": "",
        "created_at": "",
        "last_modified": "",
        "host": "",
        "url": "https://example.test",
    }
    project = Project.from_dict(row)
    text_digest = sha256(project.searchable_text.encode("utf-8")).hexdigest()

    # Input project list consumed by the builder.
    projects_file = tmp_path / "projects.json"
    projects_doc = {
        "generated_at": "2026-06-08T00:00:00+00:00",
        "source": "test",
        "projects": [row],
    }
    projects_file.write_text(json.dumps(projects_doc), encoding="utf-8")

    # Previously built index offering a vector for the very same text digest.
    previous_index_file = tmp_path / "reuse.json"
    previous_index = {
        "embedding": {
            "model_repo": "test/repo",
            "model_file": "model.gguf",
            "n_ctx": build_project_index.DEFAULT_N_CTX,
        },
        "documents": [
            {"project_id": project.id, "text_digest": text_digest, "vector": cached_vector},
        ],
    }
    previous_index_file.write_text(json.dumps(previous_index), encoding="utf-8")

    def forbidden_embedder(**_kwargs):
        # Any attempt to construct the embedder means the cache was bypassed.
        raise AssertionError("matching digest vectors should not initialize llama.cpp")

    monkeypatch.setattr(build_project_index, "LlamaCppEmbedder", forbidden_embedder)

    payload = build_project_index.build_payload(
        projects_file,
        model_repo="test/repo",
        model_file="model.gguf",
        build_source="test",
        builder="test",
        reuse_index_path=previous_index_file,
    )

    assert payload["document_count"] == 1
    document = payload["documents"][0]
    assert document["project_id"] == project.id
    assert document["text_digest"] == text_digest
    assert document["vector"] == cached_vector


def test_build_project_index_rejects_vectors_when_embedding_config_changes(monkeypatch, tmp_path: Path) -> None:
    """Check that a stored vector is discarded when the context size differs.

    The reuse file holds a vector with a matching text digest and matching
    model repo/file, but it was recorded with ``n_ctx`` 768 while the build
    requests 2048. The payload must therefore contain the vector produced by
    the stand-in embedder rather than the stored one.
    """
    stale_n_ctx = 768
    requested_n_ctx = 2048
    stale_vector = [1.0, 0.0, 0.0]
    fresh_vector = [0.0, 1.0, 0.0]

    row = {
        "id": "build-small-hackathon/rebuilt-project",
        "title": "Rebuilt Project",
        "summary": "compact local model demo",
        "tags": ["gradio"],
        "models": [],
        "datasets": [],
        "likes": 0,
        "sdk": "gradio",
        "license": "",
        "created_at": "",
        "last_modified": "",
        "host": "",
        "url": "https://example.test",
    }
    project = Project.from_dict(row)
    text_digest = sha256(project.searchable_text.encode("utf-8")).hexdigest()

    # Input project list consumed by the builder.
    projects_file = tmp_path / "projects.json"
    projects_doc = {
        "generated_at": "2026-06-08T00:00:00+00:00",
        "source": "test",
        "projects": [row],
    }
    projects_file.write_text(json.dumps(projects_doc), encoding="utf-8")

    # Previous index: same text digest, but built with a different context size.
    previous_index_file = tmp_path / "reuse.json"
    previous_index = {
        "embedding": {
            "model_repo": "test/repo",
            "model_file": "model.gguf",
            "n_ctx": stale_n_ctx,
        },
        "documents": [
            {"project_id": project.id, "text_digest": text_digest, "vector": stale_vector},
        ],
    }
    previous_index_file.write_text(json.dumps(previous_index), encoding="utf-8")

    class FakeEmbedder:
        """Stand-in embedder that checks its context size and returns a fixed vector."""

        def __init__(self, **kwargs) -> None:
            # The builder must forward the requested context size.
            assert kwargs["n_ctx"] == requested_n_ctx

        def embed(self, _text: str) -> list[float]:
            # Hand back a new list on every call.
            return list(fresh_vector)

    monkeypatch.setattr(build_project_index, "LlamaCppEmbedder", FakeEmbedder)

    payload = build_project_index.build_payload(
        projects_file,
        model_repo="test/repo",
        model_file="model.gguf",
        n_ctx=requested_n_ctx,
        build_source="test",
        builder="test",
        reuse_index_path=previous_index_file,
    )

    assert payload["document_count"] == 1
    document = payload["documents"][0]
    assert document["project_id"] == project.id
    assert document["text_digest"] == text_digest
    assert document["vector"] == fresh_vector