JacobLinCool committed on
Commit
d1e80bb
·
verified ·
1 Parent(s): 8cd6de3

fix: reuse unchanged project embeddings

Browse files

Sync GitHub commit 2bff45e; dashboard refresh now reuses matching text-digest vectors and embeds only new or changed projects.

app.py CHANGED
@@ -239,7 +239,14 @@ def _build_refresh_payloads(run_id: str) -> tuple[dict[str, Any], dict[str, Any]
239
  with tempfile.TemporaryDirectory(prefix="advisor-refresh-") as directory:
240
  project_path = Path(directory) / "projects.json"
241
  project_path.write_text(json.dumps(projects_payload, ensure_ascii=False), encoding="utf-8")
242
- index_payload = _build_refresh_index_payload(project_path, Path(directory) / "project_index.json")
 
 
 
 
 
 
 
243
 
244
  projects = [Project.from_dict(item) for item in projects_payload["projects"]]
245
  refreshed_index = ProjectIndex(
@@ -260,7 +267,12 @@ def _build_refresh_payloads(run_id: str) -> tuple[dict[str, Any], dict[str, Any]
260
  return projects_payload, index_payload, refreshed_dashboard
261
 
262
 
263
- def _build_refresh_index_payload(project_path: Path, index_path: Path) -> dict[str, Any]:
 
 
 
 
 
264
  command = [
265
  sys.executable,
266
  str(ROOT / "scripts" / "build_project_index.py"),
@@ -277,6 +289,8 @@ def _build_refresh_index_payload(project_path: Path, index_path: Path) -> dict[s
277
  "--builder",
278
  "app.py:/api/dashboard/refresh",
279
  ]
 
 
280
  model_path = os.environ.get("ADVISOR_EMBEDDING_MODEL_PATH", "").strip()
281
  if model_path:
282
  command.extend(["--model-path", model_path])
 
239
  with tempfile.TemporaryDirectory(prefix="advisor-refresh-") as directory:
240
  project_path = Path(directory) / "projects.json"
241
  project_path.write_text(json.dumps(projects_payload, ensure_ascii=False), encoding="utf-8")
242
+ reuse_index_path = Path(directory) / "reuse_project_index.json"
243
+ with _runtime_lock:
244
+ reuse_index_path.write_text(json.dumps(index.index_payload, ensure_ascii=False), encoding="utf-8")
245
+ index_payload = _build_refresh_index_payload(
246
+ project_path,
247
+ Path(directory) / "project_index.json",
248
+ reuse_index_path=reuse_index_path,
249
+ )
250
 
251
  projects = [Project.from_dict(item) for item in projects_payload["projects"]]
252
  refreshed_index = ProjectIndex(
 
267
  return projects_payload, index_payload, refreshed_dashboard
268
 
269
 
270
+ def _build_refresh_index_payload(
271
+ project_path: Path,
272
+ index_path: Path,
273
+ *,
274
+ reuse_index_path: Path | None = None,
275
+ ) -> dict[str, Any]:
276
  command = [
277
  sys.executable,
278
  str(ROOT / "scripts" / "build_project_index.py"),
 
289
  "--builder",
290
  "app.py:/api/dashboard/refresh",
291
  ]
292
+ if reuse_index_path is not None:
293
+ command.extend(["--reuse-index", str(reuse_index_path)])
294
  model_path = os.environ.get("ADVISOR_EMBEDDING_MODEL_PATH", "").strip()
295
  if model_path:
296
  command.extend(["--model-path", model_path])
scripts/build_project_index.py CHANGED
@@ -2,6 +2,7 @@
2
  from __future__ import annotations
3
 
4
  import argparse
 
5
  import importlib.metadata
6
  import json
7
  from pathlib import Path
@@ -32,6 +33,7 @@ def main() -> None:
32
  parser.add_argument("--n-threads", type=int, default=0)
33
  parser.add_argument("--build-source", default="local")
34
  parser.add_argument("--builder", default="scripts/build_project_index.py")
 
35
  args = parser.parse_args()
36
 
37
  payload = build_payload(
@@ -43,6 +45,7 @@ def main() -> None:
43
  n_threads=args.n_threads or None,
44
  build_source=args.build_source,
45
  builder=args.builder,
 
46
  )
47
  output = Path(args.out)
48
  output.parent.mkdir(parents=True, exist_ok=True)
@@ -65,28 +68,47 @@ def build_payload(
65
  build_source: str,
66
  builder: str,
67
  modal_app: str = "",
 
68
  ) -> dict:
69
  data = json.loads(project_path.read_text(encoding="utf-8"))
70
  projects = [Project.from_dict(item) for item in data["projects"]]
71
  print(f"loaded {len(projects)} projects from {project_path}", flush=True)
72
- embedder = LlamaCppEmbedder(
73
- model_repo=model_repo,
74
- model_file=model_file,
75
- model_path=model_path,
76
- n_ctx=n_ctx,
77
- n_threads=n_threads,
78
- verbose=False,
79
- )
80
  print(
81
  "embedding projects with "
82
  f"{model_repo}/{model_file}; first vector may download and load the GGUF model",
83
  flush=True,
84
  )
85
  embeddings = []
 
 
 
86
  for index, project in enumerate(projects, start=1):
87
- embeddings.append(embedder.embed(project.searchable_text))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
  if index == 1 or index % 10 == 0 or index == len(projects):
89
- print(f"embedded {index}/{len(projects)} projects", flush=True)
 
 
 
 
90
  metadata = {
91
  "model_repo": model_repo,
92
  "model_file": model_file,
@@ -106,5 +128,24 @@ def build_payload(
106
  )
107
 
108
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  if __name__ == "__main__":
110
  main()
 
2
  from __future__ import annotations
3
 
4
  import argparse
5
+ from hashlib import sha256
6
  import importlib.metadata
7
  import json
8
  from pathlib import Path
 
33
  parser.add_argument("--n-threads", type=int, default=0)
34
  parser.add_argument("--build-source", default="local")
35
  parser.add_argument("--builder", default="scripts/build_project_index.py")
36
+ parser.add_argument("--reuse-index", default="")
37
  args = parser.parse_args()
38
 
39
  payload = build_payload(
 
45
  n_threads=args.n_threads or None,
46
  build_source=args.build_source,
47
  builder=args.builder,
48
+ reuse_index_path=Path(args.reuse_index) if args.reuse_index else None,
49
  )
50
  output = Path(args.out)
51
  output.parent.mkdir(parents=True, exist_ok=True)
 
68
  build_source: str,
69
  builder: str,
70
  modal_app: str = "",
71
+ reuse_index_path: Path | None = None,
72
  ) -> dict:
73
  data = json.loads(project_path.read_text(encoding="utf-8"))
74
  projects = [Project.from_dict(item) for item in data["projects"]]
75
  print(f"loaded {len(projects)} projects from {project_path}", flush=True)
76
+ reusable_vectors = load_reusable_vectors(reuse_index_path)
77
+ if reusable_vectors:
78
+ print(f"loaded {len(reusable_vectors)} reusable vectors from {reuse_index_path}", flush=True)
 
 
 
 
 
79
  print(
80
  "embedding projects with "
81
  f"{model_repo}/{model_file}; first vector may download and load the GGUF model",
82
  flush=True,
83
  )
84
  embeddings = []
85
+ embedder = None
86
+ reused_count = 0
87
+ embedded_count = 0
88
  for index, project in enumerate(projects, start=1):
89
+ digest = sha256(project.searchable_text.encode("utf-8")).hexdigest()
90
+ reusable_vector = reusable_vectors.get((project.id, digest))
91
+ if reusable_vector is not None:
92
+ embeddings.append(reusable_vector)
93
+ reused_count += 1
94
+ else:
95
+ if embedder is None:
96
+ embedder = LlamaCppEmbedder(
97
+ model_repo=model_repo,
98
+ model_file=model_file,
99
+ model_path=model_path,
100
+ n_ctx=n_ctx,
101
+ n_threads=n_threads,
102
+ verbose=False,
103
+ )
104
+ embeddings.append(embedder.embed(project.searchable_text))
105
+ embedded_count += 1
106
  if index == 1 or index % 10 == 0 or index == len(projects):
107
+ print(
108
+ f"indexed {index}/{len(projects)} projects "
109
+ f"(reused={reused_count}, embedded={embedded_count})",
110
+ flush=True,
111
+ )
112
  metadata = {
113
  "model_repo": model_repo,
114
  "model_file": model_file,
 
128
  )
129
 
130
 
131
+ def load_reusable_vectors(reuse_index_path: Path | None) -> dict[tuple[str, str], list[float]]:
132
+ if reuse_index_path is None:
133
+ return {}
134
+ payload = json.loads(reuse_index_path.read_text(encoding="utf-8"))
135
+ documents = payload.get("documents")
136
+ if not isinstance(documents, list):
137
+ return {}
138
+ reusable: dict[tuple[str, str], list[float]] = {}
139
+ for document in documents:
140
+ if not isinstance(document, dict):
141
+ continue
142
+ project_id = str(document.get("project_id") or "")
143
+ text_digest = str(document.get("text_digest") or "")
144
+ vector = document.get("vector")
145
+ if project_id and text_digest and isinstance(vector, list) and vector:
146
+ reusable[(project_id, text_digest)] = [float(value) for value in vector]
147
+ return reusable
148
+
149
+
150
  if __name__ == "__main__":
151
  main()
tests/test_app.py CHANGED
@@ -164,10 +164,12 @@ def test_dashboard_refresh_rejects_concurrent_run(monkeypatch, tmp_path) -> None
164
  def test_dashboard_refresh_embedding_build_runs_in_subprocess(monkeypatch, tmp_path) -> None:
165
  project_path = tmp_path / "projects.json"
166
  index_path = tmp_path / "project_index.json"
 
167
  project_path.write_text(
168
  json.dumps({"generated_at": "2026-06-08T00:00:00+00:00", "source": "test", "projects": []}),
169
  encoding="utf-8",
170
  )
 
171
  monkeypatch.setenv("ADVISOR_EMBEDDING_MODEL_REPO", "test/repo")
172
  monkeypatch.setenv("ADVISOR_EMBEDDING_MODEL_FILE", "model.gguf")
173
  monkeypatch.setenv("ADVISOR_EMBEDDING_MODEL_PATH", "/tmp/model.gguf")
@@ -179,7 +181,7 @@ def test_dashboard_refresh_embedding_build_runs_in_subprocess(monkeypatch, tmp_p
179
 
180
  monkeypatch.setattr(app_module, "_run_refresh_index_command", fake_run_refresh_index_command)
181
 
182
- payload = app_module._build_refresh_index_payload(project_path, index_path)
183
 
184
  command = captured["command"]
185
  assert payload == {"schema": "ok"}
@@ -187,6 +189,7 @@ def test_dashboard_refresh_embedding_build_runs_in_subprocess(monkeypatch, tmp_p
187
  assert command[command.index("--model-repo") + 1] == "test/repo"
188
  assert command[command.index("--model-file") + 1] == "model.gguf"
189
  assert command[command.index("--model-path") + 1] == "/tmp/model.gguf"
 
190
  assert command[command.index("--build-source") + 1] == "space dashboard refresh"
191
  assert command[command.index("--builder") + 1] == "app.py:/api/dashboard/refresh"
192
 
 
164
  def test_dashboard_refresh_embedding_build_runs_in_subprocess(monkeypatch, tmp_path) -> None:
165
  project_path = tmp_path / "projects.json"
166
  index_path = tmp_path / "project_index.json"
167
+ reuse_index_path = tmp_path / "reuse_project_index.json"
168
  project_path.write_text(
169
  json.dumps({"generated_at": "2026-06-08T00:00:00+00:00", "source": "test", "projects": []}),
170
  encoding="utf-8",
171
  )
172
+ reuse_index_path.write_text(json.dumps({"documents": []}), encoding="utf-8")
173
  monkeypatch.setenv("ADVISOR_EMBEDDING_MODEL_REPO", "test/repo")
174
  monkeypatch.setenv("ADVISOR_EMBEDDING_MODEL_FILE", "model.gguf")
175
  monkeypatch.setenv("ADVISOR_EMBEDDING_MODEL_PATH", "/tmp/model.gguf")
 
181
 
182
  monkeypatch.setattr(app_module, "_run_refresh_index_command", fake_run_refresh_index_command)
183
 
184
+ payload = app_module._build_refresh_index_payload(project_path, index_path, reuse_index_path=reuse_index_path)
185
 
186
  command = captured["command"]
187
  assert payload == {"schema": "ok"}
 
189
  assert command[command.index("--model-repo") + 1] == "test/repo"
190
  assert command[command.index("--model-file") + 1] == "model.gguf"
191
  assert command[command.index("--model-path") + 1] == "/tmp/model.gguf"
192
+ assert command[command.index("--reuse-index") + 1] == str(reuse_index_path)
193
  assert command[command.index("--build-source") + 1] == "space dashboard refresh"
194
  assert command[command.index("--builder") + 1] == "app.py:/api/dashboard/refresh"
195
 
tests/test_build_project_index.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from hashlib import sha256
4
+ import json
5
+ from pathlib import Path
6
+
7
+ from hackathon_advisor.data import Project
8
+ from scripts import build_project_index
9
+
10
+
11
def test_build_project_index_reuses_matching_digest_vectors(monkeypatch, tmp_path: Path) -> None:
    """A project whose id and text digest match the reuse index keeps its stored vector.

    ``LlamaCppEmbedder`` is swapped for a callable that raises as soon as it is
    called, so the test can only pass when ``build_payload`` serves the vector
    from the reuse index without constructing an embedder.
    """
    cached_vector = [1.0, 0.0, 0.0]
    row = {
        "id": "build-small-hackathon/reused-project",
        "title": "Reused Project",
        "summary": "compact local model demo",
        "tags": ["gradio"],
        "models": [],
        "datasets": [],
        "likes": 0,
        "sdk": "gradio",
        "license": "",
        "created_at": "",
        "last_modified": "",
        "host": "",
        "url": "https://example.test",
    }
    expected = Project.from_dict(row)
    # Same digest recipe the index builder applies to each project's text.
    text_digest = sha256(expected.searchable_text.encode("utf-8")).hexdigest()

    projects_file = tmp_path / "projects.json"
    projects_doc = {"generated_at": "2026-06-08T00:00:00+00:00", "source": "test", "projects": [row]}
    projects_file.write_text(json.dumps(projects_doc), encoding="utf-8")

    reuse_file = tmp_path / "reuse.json"
    reuse_doc = {
        "documents": [
            {"project_id": expected.id, "text_digest": text_digest, "vector": cached_vector},
        ]
    }
    reuse_file.write_text(json.dumps(reuse_doc), encoding="utf-8")

    def refuse_to_embed(**_ignored):
        raise AssertionError("matching digest vectors should not initialize llama.cpp")

    monkeypatch.setattr(build_project_index, "LlamaCppEmbedder", refuse_to_embed)

    result = build_project_index.build_payload(
        projects_file,
        model_repo="test/repo",
        model_file="model.gguf",
        build_source="test",
        builder="test",
        reuse_index_path=reuse_file,
    )

    assert result["document_count"] == 1
    first_document = result["documents"][0]
    assert first_document["project_id"] == expected.id
    assert first_document["text_digest"] == text_digest
    assert first_document["vector"] == cached_vector