JacobLinCool committed on
Commit
99bcb68
·
verified ·
1 Parent(s): f984c60

fix: speed up refresh embeddings

Browse files

Sync GitHub commit 01f6182; reduce retrieval embedding text bounds so Space refresh can rebuild the live atlas promptly.

data/project_index.json CHANGED
The diff for this file is too large to render. See raw diff
 
hackathon_advisor/data.py CHANGED
@@ -31,8 +31,8 @@ INDEX_ALGORITHM = "llama-cpp-embedding-v1"
31
  DEFAULT_EMBEDDING_MODEL_REPO = "ggml-org/embeddinggemma-300m-qat-q8_0-GGUF"
32
  DEFAULT_EMBEDDING_MODEL_FILE = "embeddinggemma-300m-qat-Q8_0.gguf"
33
  DEFAULT_EMBEDDING_RUNTIME = "llama.cpp via llama-cpp-python"
34
- APP_FILE_EMBEDDING_CHAR_LIMIT = 8000
35
- README_EMBEDDING_CHAR_LIMIT = 4000
36
 
37
 
38
  EmbeddingFunction = Callable[[str], Sequence[float]]
@@ -99,7 +99,8 @@ class Project:
99
  f"models: {' '.join(self.models)}",
100
  f"datasets: {' '.join(self.datasets)}",
101
  f"main app file: {self.app_file}" if self.app_file else "",
102
- f"main app file content:\n{self.app_file_embedding_text}"
 
103
  if self.app_file_embedding_text
104
  else "",
105
  ]
 
31
  DEFAULT_EMBEDDING_MODEL_REPO = "ggml-org/embeddinggemma-300m-qat-q8_0-GGUF"
32
  DEFAULT_EMBEDDING_MODEL_FILE = "embeddinggemma-300m-qat-Q8_0.gguf"
33
  DEFAULT_EMBEDDING_RUNTIME = "llama.cpp via llama-cpp-python"
34
+ APP_FILE_EMBEDDING_CHAR_LIMIT = 2000
35
+ README_EMBEDDING_CHAR_LIMIT = 1200
36
 
37
 
38
  EmbeddingFunction = Callable[[str], Sequence[float]]
 
99
  f"models: {' '.join(self.models)}",
100
  f"datasets: {' '.join(self.datasets)}",
101
  f"main app file: {self.app_file}" if self.app_file else "",
102
+ "main app file content:\n"
103
+ f"{bounded_embedding_text(self.app_file_embedding_text, APP_FILE_EMBEDDING_CHAR_LIMIT)}"
104
  if self.app_file_embedding_text
105
  else "",
106
  ]
tests/test_data.py CHANGED
@@ -106,7 +106,7 @@ def test_searchable_text_bounds_readme_body_for_embedding() -> None:
106
  searchable = project.searchable_text
107
 
108
  assert "middle should not be embedded" not in searchable
109
- assert len(searchable) < 4300
110
 
111
 
112
  def test_project_index_rejects_mismatched_snapshot(tmp_path: Path) -> None:
 
106
  searchable = project.searchable_text
107
 
108
  assert "middle should not be embedded" not in searchable
109
+ assert len(searchable) < 1600
110
 
111
 
112
  def test_project_index_rejects_mismatched_snapshot(tmp_path: Path) -> None: