JacobLinCool committed on
Commit
f984c60
·
verified ·
1 Parent(s): b7d5967

fix: bound dashboard embedding text

Browse files

Sync GitHub commit 29b7083; bound README text used for retrieval embeddings while keeping full README/app source for MiniCPM quest analysis.

data/project_index.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "schema_version": 3,
3
  "algorithm": "llama-cpp-embedding-v1",
4
- "generated_at": "2026-06-07T11:52:16+00:00",
5
  "snapshot_generated_at": "2026-06-07T11:51:09+00:00",
6
  "snapshot_source": "https://huggingface.co/api/spaces?author=build-small-hackathon",
7
  "snapshot_digest": "a442d8146fa01965567a27e43d42587de7048b93352e016151a524d2ecbf2034",
@@ -14,7 +14,7 @@
14
  "normalized": true,
15
  "build_source": "local",
16
  "builder": "scripts/build_project_index.py",
17
- "llama_cpp_python_version": "0.3.26",
18
  "n_ctx": 2048
19
  },
20
  "documents": [
 
1
  {
2
  "schema_version": 3,
3
  "algorithm": "llama-cpp-embedding-v1",
4
+ "generated_at": "2026-06-08T00:28:50+00:00",
5
  "snapshot_generated_at": "2026-06-07T11:51:09+00:00",
6
  "snapshot_source": "https://huggingface.co/api/spaces?author=build-small-hackathon",
7
  "snapshot_digest": "a442d8146fa01965567a27e43d42587de7048b93352e016151a524d2ecbf2034",
 
14
  "normalized": true,
15
  "build_source": "local",
16
  "builder": "scripts/build_project_index.py",
17
+ "llama_cpp_python_version": "0.3.27",
18
  "n_ctx": 2048
19
  },
20
  "documents": [
hackathon_advisor/data.py CHANGED
@@ -32,6 +32,7 @@ DEFAULT_EMBEDDING_MODEL_REPO = "ggml-org/embeddinggemma-300m-qat-q8_0-GGUF"
32
  DEFAULT_EMBEDDING_MODEL_FILE = "embeddinggemma-300m-qat-Q8_0.gguf"
33
  DEFAULT_EMBEDDING_RUNTIME = "llama.cpp via llama-cpp-python"
34
  APP_FILE_EMBEDDING_CHAR_LIMIT = 8000
 
35
 
36
 
37
  EmbeddingFunction = Callable[[str], Sequence[float]]
@@ -91,7 +92,9 @@ class Project:
91
  f"title: {self.title}",
92
  f"slug: {self.slug.replace('-', ' ').replace('_', ' ')}",
93
  f"summary: {self.summary}",
94
- f"readme:\n{self.readme_body}" if self.readme_body else "",
 
 
95
  f"tags: {' '.join(self.tags)}",
96
  f"models: {' '.join(self.models)}",
97
  f"datasets: {' '.join(self.datasets)}",
 
32
  DEFAULT_EMBEDDING_MODEL_FILE = "embeddinggemma-300m-qat-Q8_0.gguf"
33
  DEFAULT_EMBEDDING_RUNTIME = "llama.cpp via llama-cpp-python"
34
  APP_FILE_EMBEDDING_CHAR_LIMIT = 8000
35
+ README_EMBEDDING_CHAR_LIMIT = 4000
36
 
37
 
38
  EmbeddingFunction = Callable[[str], Sequence[float]]
 
92
  f"title: {self.title}",
93
  f"slug: {self.slug.replace('-', ' ').replace('_', ' ')}",
94
  f"summary: {self.summary}",
95
+ f"readme:\n{bounded_embedding_text(self.readme_body, README_EMBEDDING_CHAR_LIMIT)}"
96
+ if self.readme_body
97
+ else "",
98
  f"tags: {' '.join(self.tags)}",
99
  f"models: {' '.join(self.models)}",
100
  f"datasets: {' '.join(self.datasets)}",
tests/test_data.py CHANGED
@@ -85,6 +85,30 @@ def test_searchable_text_includes_main_app_file_signals() -> None:
85
  assert "Project idea" in searchable
86
 
87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
  def test_project_index_rejects_mismatched_snapshot(tmp_path: Path) -> None:
89
  payload = json.loads(Path("data/project_index.json").read_text(encoding="utf-8"))
90
  payload["snapshot_generated_at"] = "2000-01-01T00:00:00+00:00"
 
85
  assert "Project idea" in searchable
86
 
87
 
88
+ def test_searchable_text_bounds_readme_body_for_embedding() -> None:
89
+ project = Project(
90
+ id="build-small-hackathon/long-readme",
91
+ title="Long README",
92
+ summary="",
93
+ tags=(),
94
+ models=(),
95
+ datasets=(),
96
+ likes=0,
97
+ sdk="gradio",
98
+ license="",
99
+ created_at="",
100
+ last_modified="",
101
+ host="",
102
+ url="https://example.test",
103
+ readme_body="a" * 2500 + "middle should not be embedded" + "b" * 2500,
104
+ )
105
+
106
+ searchable = project.searchable_text
107
+
108
+ assert "middle should not be embedded" not in searchable
109
+ assert len(searchable) < 4300
110
+
111
+
112
  def test_project_index_rejects_mismatched_snapshot(tmp_path: Path) -> None:
113
  payload = json.loads(Path("data/project_index.json").read_text(encoding="utf-8"))
114
  payload["snapshot_generated_at"] = "2000-01-01T00:00:00+00:00"