File size: 4,603 Bytes
f44aac9
e12a049
 
9219266
f44aac9
d0718ca
 
 
 
 
 
f44aac9
 
 
e12a049
f44aac9
 
 
 
 
902a11f
e12a049
f44aac9
 
 
e12a049
f44aac9
 
 
 
 
9219266
 
490a71e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d0718ca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
04ad98e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
742999b
f984c60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
742999b
f984c60
 
 
9219266
 
 
 
 
 
 
 
 
 
 
 
c9f8f52
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
from pathlib import Path

from tests.helpers import load_test_index
import json

from hackathon_advisor.data import (
    Project,
    ProjectIndex,
    public_project_summary,
    public_project_title,
)


def test_project_index_searches_snapshot() -> None:
    """Search the test index and check the first hit plus the index algorithm."""
    project_index = load_test_index()

    results = project_index.search("lullaby children audio", limit=3)

    # An empty result would make the positional checks below meaningless.
    assert results
    top_hit = results[0]
    assert top_hit.project.id.startswith("build-small-hackathon/")
    assert top_hit.page_number >= 1
    assert project_index.index_algorithm == "llama-cpp-embedding-v1"


def test_project_index_whitespace() -> None:
    """``find_whitespace(limit=3)`` returns three items, each with a truthy label."""
    whitespace_items = load_test_index().find_whitespace(limit=3)

    assert len(whitespace_items) == 3
    for entry in whitespace_items:
        assert entry.label


def test_public_project_cards_hide_generic_submission_copy() -> None:
    """Check that placeholder titles and summaries are hidden from public output.

    Exercises the standalone helpers first, then the same placeholder values
    routed through ``Project.to_public_dict()``.
    """
    generic_title = "My Build Small Hackathon"
    generic_summary = "This is my submission for the build-small-hackathon"

    assert public_project_title(generic_title) == "Untitled project"

    # (raw summary, expected public summary) pairs.
    summary_cases = (
        (generic_summary, ""),
        ("Todo", ""),
        ("Local-first personal knowledge agent", "Local-first personal knowledge agent"),
    )
    for raw_summary, expected_summary in summary_cases:
        assert public_project_summary(raw_summary) == expected_summary

    project_fields = {
        "id": "build-small-hackathon/my-build-small-hackathon",
        "title": generic_title,
        "summary": generic_summary,
        "tags": (),
        "models": (),
        "datasets": (),
        "likes": 0,
        "sdk": "gradio",
        "license": "",
        "created_at": "",
        "last_modified": "",
        "host": "",
        "url": "https://example.test",
    }
    card = Project(**project_fields).to_public_dict()

    assert card["title"] == "Untitled project"
    assert card["summary"] == ""


def test_searchable_text_includes_main_app_file_signals() -> None:
    """``searchable_text`` contains the app file name and its embedding text."""
    project_fields = {
        "id": "build-small-hackathon/idea-canvas",
        "title": "Idea Canvas",
        "summary": "",
        "tags": ("gradio",),
        "models": (),
        "datasets": (),
        "likes": 0,
        "sdk": "gradio",
        "license": "",
        "created_at": "",
        "last_modified": "",
        "host": "",
        "url": "https://example.test",
        "app_file": "app.py",
        "app_file_embedding_text": "score_idea\ngr.Textbox\nProject idea",
    }

    text = Project(**project_fields).searchable_text

    expected_fragments = ("main app file: app.py", "score_idea", "Project idea")
    for fragment in expected_fragments:
        assert fragment in text


def test_public_project_tags_exclude_hosting_metadata() -> None:
    """Raw tags are kept on the project; the public dict omits some of them.

    The ``region:*`` entries and the repeated ``gradio`` tag must not appear
    in ``to_public_dict()["tags"]``, while ``project.tags`` keeps the input
    unchanged as a tuple.
    """
    raw_tags = ("gradio", "region:us", "local-first", "region:eu", "gradio")
    record = {
        "id": "build-small-hackathon/idea-canvas",
        "title": "Idea Canvas",
        "summary": "",
        "tags": list(raw_tags),
        "models": [],
        "datasets": [],
        "url": "https://example.test",
    }

    project = Project.from_dict(record)
    public_tags = project.to_public_dict()["tags"]

    assert project.tags == raw_tags
    assert public_tags == ["gradio", "local-first"]


def test_searchable_text_excludes_refresh_readme_body_for_stable_reuse() -> None:
    """``searchable_text`` contains neither a ``readme:`` section nor README content."""
    marker = "middle should not be embedded"
    # Marker text sandwiched between 2500 filler characters on each side.
    long_readme = "".join(("a" * 2500, marker, "b" * 2500))

    project_fields = {
        "id": "build-small-hackathon/long-readme",
        "title": "Long README",
        "summary": "",
        "tags": (),
        "models": (),
        "datasets": (),
        "likes": 0,
        "sdk": "gradio",
        "license": "",
        "created_at": "",
        "last_modified": "",
        "host": "",
        "url": "https://example.test",
        "readme_body": long_readme,
    }

    text = Project(**project_fields).searchable_text

    for forbidden in ("readme:", marker):
        assert forbidden not in text


def test_project_index_rejects_mismatched_snapshot(tmp_path: Path) -> None:
    """Loading an index with an altered snapshot timestamp raises ``ValueError``.

    A copy of the on-disk index payload is written to ``tmp_path`` with
    ``snapshot_generated_at`` overwritten, then loaded against the real
    projects file.
    """
    source_index = Path("data/project_index.json")
    tampered = json.loads(source_index.read_text(encoding="utf-8"))
    tampered["snapshot_generated_at"] = "2000-01-01T00:00:00+00:00"

    tampered_path = tmp_path / "project_index.json"
    tampered_path.write_text(json.dumps(tampered), encoding="utf-8")

    rejection = None
    try:
        ProjectIndex.from_files(Path("data/projects.json"), tampered_path)
    except ValueError as error:
        rejection = error

    # No exception at all means the mismatch went unnoticed.
    if rejection is None:
        raise AssertionError("mismatched index should be rejected")
    assert "different snapshot timestamp" in str(rejection)


def test_project_index_retains_validated_payload() -> None:
    """The loaded index exposes the index payload it was built from.

    Checks that ``index_payload`` carries the same ``snapshot_digest`` as the
    file on disk and one document per loaded project.
    """
    index_path = Path("data/project_index.json")
    on_disk = json.loads(index_path.read_text(encoding="utf-8"))

    loaded = ProjectIndex.from_files(Path("data/projects.json"), index_path)
    retained = loaded.index_payload

    assert retained["snapshot_digest"] == on_disk["snapshot_digest"]
    assert len(retained["documents"]) == len(loaded.projects)