Spaces:

build-small-hackathon
/

hackathon-advisor

Running on Zero

App Files Files Community

JacobLinCool committed on Jun 8

Commit

d1e80bb

verified ·

1 Parent(s): 8cd6de3

fix: reuse unchanged project embeddings

Browse files

Sync GitHub commit 2bff45e; dashboard refresh now reuses matching text-digest vectors and embeds only new or changed projects.

Files changed (4) hide show

app.py +16 -2
scripts/build_project_index.py +51 -10
tests/test_app.py +4 -1
tests/test_build_project_index.py +57 -0

app.py CHANGED Viewed

@@ -239,7 +239,14 @@ def _build_refresh_payloads(run_id: str) -> tuple[dict[str, Any], dict[str, Any]
     with tempfile.TemporaryDirectory(prefix="advisor-refresh-") as directory:
         project_path = Path(directory) / "projects.json"
         project_path.write_text(json.dumps(projects_payload, ensure_ascii=False), encoding="utf-8")
-        index_payload = _build_refresh_index_payload(project_path, Path(directory) / "project_index.json")
     projects = [Project.from_dict(item) for item in projects_payload["projects"]]
     refreshed_index = ProjectIndex(
@@ -260,7 +267,12 @@ def _build_refresh_payloads(run_id: str) -> tuple[dict[str, Any], dict[str, Any]
     return projects_payload, index_payload, refreshed_dashboard
-def _build_refresh_index_payload(project_path: Path, index_path: Path) -> dict[str, Any]:
     command = [
         sys.executable,
         str(ROOT / "scripts" / "build_project_index.py"),
@@ -277,6 +289,8 @@ def _build_refresh_index_payload(project_path: Path, index_path: Path) -> dict[s
         "--builder",
         "app.py:/api/dashboard/refresh",
     ]
     model_path = os.environ.get("ADVISOR_EMBEDDING_MODEL_PATH", "").strip()
     if model_path:
         command.extend(["--model-path", model_path])

     with tempfile.TemporaryDirectory(prefix="advisor-refresh-") as directory:
         project_path = Path(directory) / "projects.json"
         project_path.write_text(json.dumps(projects_payload, ensure_ascii=False), encoding="utf-8")
+        reuse_index_path = Path(directory) / "reuse_project_index.json"
+        with _runtime_lock:
+            reuse_index_path.write_text(json.dumps(index.index_payload, ensure_ascii=False), encoding="utf-8")
+        index_payload = _build_refresh_index_payload(
+            project_path,
+            Path(directory) / "project_index.json",
+            reuse_index_path=reuse_index_path,
+        )
     projects = [Project.from_dict(item) for item in projects_payload["projects"]]
     refreshed_index = ProjectIndex(
     return projects_payload, index_payload, refreshed_dashboard
+def _build_refresh_index_payload(
+    project_path: Path,
+    index_path: Path,
+    *,
+    reuse_index_path: Path | None = None,
+) -> dict[str, Any]:
     command = [
         sys.executable,
         str(ROOT / "scripts" / "build_project_index.py"),
         "--builder",
         "app.py:/api/dashboard/refresh",
     ]
+    if reuse_index_path is not None:
+        command.extend(["--reuse-index", str(reuse_index_path)])
     model_path = os.environ.get("ADVISOR_EMBEDDING_MODEL_PATH", "").strip()
     if model_path:
         command.extend(["--model-path", model_path])

scripts/build_project_index.py CHANGED Viewed

@@ -2,6 +2,7 @@
 from __future__ import annotations
 import argparse
 import importlib.metadata
 import json
 from pathlib import Path
@@ -32,6 +33,7 @@ def main() -> None:
     parser.add_argument("--n-threads", type=int, default=0)
     parser.add_argument("--build-source", default="local")
     parser.add_argument("--builder", default="scripts/build_project_index.py")
     args = parser.parse_args()
     payload = build_payload(
@@ -43,6 +45,7 @@ def main() -> None:
         n_threads=args.n_threads or None,
         build_source=args.build_source,
         builder=args.builder,
     )
     output = Path(args.out)
     output.parent.mkdir(parents=True, exist_ok=True)
@@ -65,28 +68,47 @@ def build_payload(
     build_source: str,
     builder: str,
     modal_app: str = "",
 ) -> dict:
     data = json.loads(project_path.read_text(encoding="utf-8"))
     projects = [Project.from_dict(item) for item in data["projects"]]
     print(f"loaded {len(projects)} projects from {project_path}", flush=True)
-    embedder = LlamaCppEmbedder(
-        model_repo=model_repo,
-        model_file=model_file,
-        model_path=model_path,
-        n_ctx=n_ctx,
-        n_threads=n_threads,
-        verbose=False,
-    )
     print(
         "embedding projects with "
         f"{model_repo}/{model_file}; first vector may download and load the GGUF model",
         flush=True,
     )
     embeddings = []
     for index, project in enumerate(projects, start=1):
-        embeddings.append(embedder.embed(project.searchable_text))
         if index == 1 or index % 10 == 0 or index == len(projects):
-            print(f"embedded {index}/{len(projects)} projects", flush=True)
     metadata = {
         "model_repo": model_repo,
         "model_file": model_file,
@@ -106,5 +128,24 @@ def build_payload(
     )
 if __name__ == "__main__":
     main()

 from __future__ import annotations
 import argparse
+from hashlib import sha256
 import importlib.metadata
 import json
 from pathlib import Path
     parser.add_argument("--n-threads", type=int, default=0)
     parser.add_argument("--build-source", default="local")
     parser.add_argument("--builder", default="scripts/build_project_index.py")
+    parser.add_argument("--reuse-index", default="")
     args = parser.parse_args()
     payload = build_payload(
         n_threads=args.n_threads or None,
         build_source=args.build_source,
         builder=args.builder,
+        reuse_index_path=Path(args.reuse_index) if args.reuse_index else None,
     )
     output = Path(args.out)
     output.parent.mkdir(parents=True, exist_ok=True)
     build_source: str,
     builder: str,
     modal_app: str = "",
+    reuse_index_path: Path | None = None,
 ) -> dict:
     data = json.loads(project_path.read_text(encoding="utf-8"))
     projects = [Project.from_dict(item) for item in data["projects"]]
     print(f"loaded {len(projects)} projects from {project_path}", flush=True)
+    reusable_vectors = load_reusable_vectors(reuse_index_path)
+    if reusable_vectors:
+        print(f"loaded {len(reusable_vectors)} reusable vectors from {reuse_index_path}", flush=True)
     print(
         "embedding projects with "
         f"{model_repo}/{model_file}; first vector may download and load the GGUF model",
         flush=True,
     )
     embeddings = []
+    embedder = None
+    reused_count = 0
+    embedded_count = 0
     for index, project in enumerate(projects, start=1):
+        digest = sha256(project.searchable_text.encode("utf-8")).hexdigest()
+        reusable_vector = reusable_vectors.get((project.id, digest))
+        if reusable_vector is not None:
+            embeddings.append(reusable_vector)
+            reused_count += 1
+        else:
+            if embedder is None:
+                embedder = LlamaCppEmbedder(
+                    model_repo=model_repo,
+                    model_file=model_file,
+                    model_path=model_path,
+                    n_ctx=n_ctx,
+                    n_threads=n_threads,
+                    verbose=False,
+                )
+            embeddings.append(embedder.embed(project.searchable_text))
+            embedded_count += 1
         if index == 1 or index % 10 == 0 or index == len(projects):
+            print(
+                f"indexed {index}/{len(projects)} projects "
+                f"(reused={reused_count}, embedded={embedded_count})",
+                flush=True,
+            )
     metadata = {
         "model_repo": model_repo,
         "model_file": model_file,
     )
def load_reusable_vectors(reuse_index_path: Path | None) -> dict[tuple[str, str], list[float]]:
    """Load previously computed embedding vectors keyed by (project id, text digest).

    The reuse index is only a cache: any entry that cannot be used is skipped so
    the caller falls back to embedding that project again.

    Args:
        reuse_index_path: JSON file holding a ``"documents"`` list whose items
            carry ``"project_id"``, ``"text_digest"`` and ``"vector"``; ``None``
            disables reuse.

    Returns:
        Mapping from ``(project_id, text_digest)`` to the vector as a list of
        floats. Empty when no path is given or the payload has no usable
        ``"documents"`` list.

    Raises:
        OSError: If the file cannot be read.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    if reuse_index_path is None:
        return {}
    payload = json.loads(reuse_index_path.read_text(encoding="utf-8"))
    # A top-level list/null/scalar has no ``.get``; treat it like a payload
    # without documents instead of raising AttributeError.
    if not isinstance(payload, dict):
        return {}
    documents = payload.get("documents")
    if not isinstance(documents, list):
        return {}
    reusable: dict[tuple[str, str], list[float]] = {}
    for document in documents:
        if not isinstance(document, dict):
            continue
        project_id = str(document.get("project_id") or "")
        text_digest = str(document.get("text_digest") or "")
        vector = document.get("vector")
        if not (project_id and text_digest and isinstance(vector, list) and vector):
            continue
        try:
            reusable[(project_id, text_digest)] = [float(value) for value in vector]
        except (TypeError, ValueError, OverflowError):
            # One malformed vector must not abort the whole build; skipping it
            # simply means this project is embedded again.
            continue
    return reusable
 if __name__ == "__main__":
     main()

tests/test_app.py CHANGED Viewed

@@ -164,10 +164,12 @@ def test_dashboard_refresh_rejects_concurrent_run(monkeypatch, tmp_path) -> None
 def test_dashboard_refresh_embedding_build_runs_in_subprocess(monkeypatch, tmp_path) -> None:
     project_path = tmp_path / "projects.json"
     index_path = tmp_path / "project_index.json"
     project_path.write_text(
         json.dumps({"generated_at": "2026-06-08T00:00:00+00:00", "source": "test", "projects": []}),
         encoding="utf-8",
     )
     monkeypatch.setenv("ADVISOR_EMBEDDING_MODEL_REPO", "test/repo")
     monkeypatch.setenv("ADVISOR_EMBEDDING_MODEL_FILE", "model.gguf")
     monkeypatch.setenv("ADVISOR_EMBEDDING_MODEL_PATH", "/tmp/model.gguf")
@@ -179,7 +181,7 @@ def test_dashboard_refresh_embedding_build_runs_in_subprocess(monkeypatch, tmp_p
     monkeypatch.setattr(app_module, "_run_refresh_index_command", fake_run_refresh_index_command)
-    payload = app_module._build_refresh_index_payload(project_path, index_path)
     command = captured["command"]
     assert payload == {"schema": "ok"}
@@ -187,6 +189,7 @@ def test_dashboard_refresh_embedding_build_runs_in_subprocess(monkeypatch, tmp_p
     assert command[command.index("--model-repo") + 1] == "test/repo"
     assert command[command.index("--model-file") + 1] == "model.gguf"
     assert command[command.index("--model-path") + 1] == "/tmp/model.gguf"
     assert command[command.index("--build-source") + 1] == "space dashboard refresh"
     assert command[command.index("--builder") + 1] == "app.py:/api/dashboard/refresh"

 def test_dashboard_refresh_embedding_build_runs_in_subprocess(monkeypatch, tmp_path) -> None:
     project_path = tmp_path / "projects.json"
     index_path = tmp_path / "project_index.json"
+    reuse_index_path = tmp_path / "reuse_project_index.json"
     project_path.write_text(
         json.dumps({"generated_at": "2026-06-08T00:00:00+00:00", "source": "test", "projects": []}),
         encoding="utf-8",
     )
+    reuse_index_path.write_text(json.dumps({"documents": []}), encoding="utf-8")
     monkeypatch.setenv("ADVISOR_EMBEDDING_MODEL_REPO", "test/repo")
     monkeypatch.setenv("ADVISOR_EMBEDDING_MODEL_FILE", "model.gguf")
     monkeypatch.setenv("ADVISOR_EMBEDDING_MODEL_PATH", "/tmp/model.gguf")
     monkeypatch.setattr(app_module, "_run_refresh_index_command", fake_run_refresh_index_command)
+    payload = app_module._build_refresh_index_payload(project_path, index_path, reuse_index_path=reuse_index_path)
     command = captured["command"]
     assert payload == {"schema": "ok"}
     assert command[command.index("--model-repo") + 1] == "test/repo"
     assert command[command.index("--model-file") + 1] == "model.gguf"
     assert command[command.index("--model-path") + 1] == "/tmp/model.gguf"
+    assert command[command.index("--reuse-index") + 1] == str(reuse_index_path)
     assert command[command.index("--build-source") + 1] == "space dashboard refresh"
     assert command[command.index("--builder") + 1] == "app.py:/api/dashboard/refresh"

tests/test_build_project_index.py ADDED Viewed

	@@ -0,0 +1,57 @@

+from __future__ import annotations
+from hashlib import sha256
+import json
+from pathlib import Path
+from hackathon_advisor.data import Project
+from scripts import build_project_index
def test_build_project_index_reuses_matching_digest_vectors(monkeypatch, tmp_path: Path) -> None:
    """A project whose id and text digest match the reuse index keeps its stored vector.

    The embedder factory is replaced with one that raises, so the test fails if
    build_payload tries to construct an embedder for the matching project.
    """
    row = {
        "id": "build-small-hackathon/reused-project",
        "title": "Reused Project",
        "summary": "compact local model demo",
        "tags": ["gradio"],
        "models": [],
        "datasets": [],
        "likes": 0,
        "sdk": "gradio",
        "license": "",
        "created_at": "",
        "last_modified": "",
        "host": "",
        "url": "https://example.test",
    }
    project = Project.from_dict(row)
    digest = sha256(project.searchable_text.encode("utf-8")).hexdigest()

    # Input project list consumed by build_payload.
    projects_file = tmp_path / "projects.json"
    projects_payload = {
        "generated_at": "2026-06-08T00:00:00+00:00",
        "source": "test",
        "projects": [row],
    }
    projects_file.write_text(json.dumps(projects_payload), encoding="utf-8")

    # Existing index holding one vector for exactly this project and digest.
    reuse_file = tmp_path / "reuse.json"
    cached_document = {
        "project_id": project.id,
        "text_digest": digest,
        "vector": [1.0, 0.0, 0.0],
    }
    reuse_file.write_text(json.dumps({"documents": [cached_document]}), encoding="utf-8")

    def fail_embedder(**_kwargs):
        raise AssertionError("matching digest vectors should not initialize llama.cpp")

    monkeypatch.setattr(build_project_index, "LlamaCppEmbedder", fail_embedder)

    payload = build_project_index.build_payload(
        projects_file,
        model_repo="test/repo",
        model_file="model.gguf",
        build_source="test",
        builder="test",
        reuse_index_path=reuse_file,
    )

    assert payload["document_count"] == 1
    indexed = payload["documents"][0]
    assert indexed["project_id"] == project.id
    assert indexed["text_digest"] == digest
    assert indexed["vector"] == [1.0, 0.0, 0.0]