Spaces:
Running on Zero
Running on Zero
fix: bound dashboard embedding text
Browse files
Sync GitHub commit 29b7083; bound README text used for retrieval embeddings while keeping full README/app source for MiniCPM quest analysis.
- data/project_index.json +2 -2
- hackathon_advisor/data.py +4 -1
- tests/test_data.py +24 -0
data/project_index.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
{
|
| 2 |
"schema_version": 3,
|
| 3 |
"algorithm": "llama-cpp-embedding-v1",
|
| 4 |
-
"generated_at": "2026-06-
|
| 5 |
"snapshot_generated_at": "2026-06-07T11:51:09+00:00",
|
| 6 |
"snapshot_source": "https://huggingface.co/api/spaces?author=build-small-hackathon",
|
| 7 |
"snapshot_digest": "a442d8146fa01965567a27e43d42587de7048b93352e016151a524d2ecbf2034",
|
|
@@ -14,7 +14,7 @@
|
|
| 14 |
"normalized": true,
|
| 15 |
"build_source": "local",
|
| 16 |
"builder": "scripts/build_project_index.py",
|
| 17 |
-
"llama_cpp_python_version": "0.3.
|
| 18 |
"n_ctx": 2048
|
| 19 |
},
|
| 20 |
"documents": [
|
|
|
|
| 1 |
{
|
| 2 |
"schema_version": 3,
|
| 3 |
"algorithm": "llama-cpp-embedding-v1",
|
| 4 |
+
"generated_at": "2026-06-08T00:28:50+00:00",
|
| 5 |
"snapshot_generated_at": "2026-06-07T11:51:09+00:00",
|
| 6 |
"snapshot_source": "https://huggingface.co/api/spaces?author=build-small-hackathon",
|
| 7 |
"snapshot_digest": "a442d8146fa01965567a27e43d42587de7048b93352e016151a524d2ecbf2034",
|
|
|
|
| 14 |
"normalized": true,
|
| 15 |
"build_source": "local",
|
| 16 |
"builder": "scripts/build_project_index.py",
|
| 17 |
+
"llama_cpp_python_version": "0.3.27",
|
| 18 |
"n_ctx": 2048
|
| 19 |
},
|
| 20 |
"documents": [
|
hackathon_advisor/data.py
CHANGED
|
@@ -32,6 +32,7 @@ DEFAULT_EMBEDDING_MODEL_REPO = "ggml-org/embeddinggemma-300m-qat-q8_0-GGUF"
|
|
| 32 |
DEFAULT_EMBEDDING_MODEL_FILE = "embeddinggemma-300m-qat-Q8_0.gguf"
|
| 33 |
DEFAULT_EMBEDDING_RUNTIME = "llama.cpp via llama-cpp-python"
|
| 34 |
APP_FILE_EMBEDDING_CHAR_LIMIT = 8000
|
|
|
|
| 35 |
|
| 36 |
|
| 37 |
EmbeddingFunction = Callable[[str], Sequence[float]]
|
|
@@ -91,7 +92,9 @@ class Project:
|
|
| 91 |
f"title: {self.title}",
|
| 92 |
f"slug: {self.slug.replace('-', ' ').replace('_', ' ')}",
|
| 93 |
f"summary: {self.summary}",
|
| 94 |
-
f"readme:\n{self.readme_body}"
|
|
|
|
|
|
|
| 95 |
f"tags: {' '.join(self.tags)}",
|
| 96 |
f"models: {' '.join(self.models)}",
|
| 97 |
f"datasets: {' '.join(self.datasets)}",
|
|
|
|
| 32 |
DEFAULT_EMBEDDING_MODEL_FILE = "embeddinggemma-300m-qat-Q8_0.gguf"
|
| 33 |
DEFAULT_EMBEDDING_RUNTIME = "llama.cpp via llama-cpp-python"
|
| 34 |
APP_FILE_EMBEDDING_CHAR_LIMIT = 8000
|
| 35 |
+
README_EMBEDDING_CHAR_LIMIT = 4000
|
| 36 |
|
| 37 |
|
| 38 |
EmbeddingFunction = Callable[[str], Sequence[float]]
|
|
|
|
| 92 |
f"title: {self.title}",
|
| 93 |
f"slug: {self.slug.replace('-', ' ').replace('_', ' ')}",
|
| 94 |
f"summary: {self.summary}",
|
| 95 |
+
f"readme:\n{bounded_embedding_text(self.readme_body, README_EMBEDDING_CHAR_LIMIT)}"
|
| 96 |
+
if self.readme_body
|
| 97 |
+
else "",
|
| 98 |
f"tags: {' '.join(self.tags)}",
|
| 99 |
f"models: {' '.join(self.models)}",
|
| 100 |
f"datasets: {' '.join(self.datasets)}",
|
tests/test_data.py
CHANGED
|
@@ -85,6 +85,30 @@ def test_searchable_text_includes_main_app_file_signals() -> None:
|
|
| 85 |
assert "Project idea" in searchable
|
| 86 |
|
| 87 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
def test_project_index_rejects_mismatched_snapshot(tmp_path: Path) -> None:
|
| 89 |
payload = json.loads(Path("data/project_index.json").read_text(encoding="utf-8"))
|
| 90 |
payload["snapshot_generated_at"] = "2000-01-01T00:00:00+00:00"
|
|
|
|
| 85 |
assert "Project idea" in searchable
|
| 86 |
|
| 87 |
|
| 88 |
+
def test_searchable_text_bounds_readme_body_for_embedding() -> None:
|
| 89 |
+
project = Project(
|
| 90 |
+
id="build-small-hackathon/long-readme",
|
| 91 |
+
title="Long README",
|
| 92 |
+
summary="",
|
| 93 |
+
tags=(),
|
| 94 |
+
models=(),
|
| 95 |
+
datasets=(),
|
| 96 |
+
likes=0,
|
| 97 |
+
sdk="gradio",
|
| 98 |
+
license="",
|
| 99 |
+
created_at="",
|
| 100 |
+
last_modified="",
|
| 101 |
+
host="",
|
| 102 |
+
url="https://example.test",
|
| 103 |
+
readme_body="a" * 2500 + "middle should not be embedded" + "b" * 2500,
|
| 104 |
+
)
|
| 105 |
+
|
| 106 |
+
searchable = project.searchable_text
|
| 107 |
+
|
| 108 |
+
assert "middle should not be embedded" not in searchable
|
| 109 |
+
assert len(searchable) < 4300
|
| 110 |
+
|
| 111 |
+
|
| 112 |
def test_project_index_rejects_mismatched_snapshot(tmp_path: Path) -> None:
|
| 113 |
payload = json.loads(Path("data/project_index.json").read_text(encoding="utf-8"))
|
| 114 |
payload["snapshot_generated_at"] = "2000-01-01T00:00:00+00:00"
|