Spaces:
Running on Zero
Running on Zero
| #!/usr/bin/env python3 | |
| from __future__ import annotations | |
| import argparse | |
| from hashlib import sha256 | |
| import importlib.metadata | |
| import json | |
| from pathlib import Path | |
| import sys | |
| ROOT = Path(__file__).resolve().parents[1] | |
| sys.path.insert(0, str(ROOT)) | |
| from hackathon_advisor.data import ( | |
| DEFAULT_EMBEDDING_MODEL_FILE, | |
| DEFAULT_EMBEDDING_MODEL_REPO, | |
| Project, | |
| build_index_payload, | |
| ) | |
| from hackathon_advisor.llama_embedding import DEFAULT_N_CTX, LlamaCppEmbedder | |
def main() -> None:
    """Parse the command line, build the retrieval index, and write it out.

    With ``--location local`` (the default) the payload is produced by
    ``build_payload`` in this process. With ``--location modal`` it is
    produced by ``run_remote_build``; that path does not accept
    ``--reuse-index`` and the combination is rejected with a parser error.
    Either way the resulting payload is written to ``--out``.
    """
    cli = argparse.ArgumentParser(
        description="Build the offline project retrieval index with llama.cpp embeddings."
    )
    cli.add_argument(
        "--location",
        choices=("local", "modal"),
        default="local",
        help="Where to run the embedding build (default: local).",
    )
    cli.add_argument("--projects", default="data/projects.json")
    cli.add_argument("--out", default="data/project_index.json")
    cli.add_argument("--model-repo", default=DEFAULT_EMBEDDING_MODEL_REPO)
    cli.add_argument("--model-file", default=DEFAULT_EMBEDDING_MODEL_FILE)
    cli.add_argument("--model-path", default="")
    cli.add_argument("--n-ctx", type=int, default=DEFAULT_N_CTX)
    cli.add_argument("--n-threads", type=int, default=0)
    cli.add_argument("--build-source", default="local")
    cli.add_argument("--builder", default="scripts/build_project_index.py")
    cli.add_argument("--reuse-index", default="")
    options = cli.parse_args()

    run_on_modal = options.location == "modal"
    if run_on_modal and options.reuse_index:
        cli.error("--reuse-index is not supported with --location modal")

    # Keyword arguments accepted by both build paths. A thread count of 0
    # (the CLI default) is forwarded as None.
    shared_settings = {
        "model_repo": options.model_repo,
        "model_file": options.model_file,
        "model_path": options.model_path,
        "n_ctx": options.n_ctx,
        "n_threads": options.n_threads or None,
    }
    projects_file = Path(options.projects)

    if run_on_modal:
        # Imported lazily so the local path never requires the `modal` package.
        from scripts.modal_build_project_index import run_remote_build

        payload = run_remote_build(projects_file, **shared_settings)
    else:
        reuse_from = Path(options.reuse_index) if options.reuse_index else None
        payload = build_payload(
            projects_file,
            **shared_settings,
            build_source=options.build_source,
            builder=options.builder,
            reuse_index_path=reuse_from,
        )

    write_payload(Path(options.out), payload)
def write_payload(output: Path, payload: dict) -> None:
    """Serialize ``payload`` as indented UTF-8 JSON at ``output`` and report it.

    Missing parent directories of ``output`` are created first. After the file
    is written, a one-line summary with the document count and embedding
    dimensions (read from ``payload``) is printed.

    Args:
        output: Destination file path.
        payload: Index payload; must contain ``document_count`` and
            ``embedding["dimensions"]`` for the summary line.

    Raises:
        KeyError: If the summary keys are missing (raised after the file has
            already been written).
    """
    destination_dir = output.parent
    destination_dir.mkdir(parents=True, exist_ok=True)

    serialized = json.dumps(payload, indent=2, ensure_ascii=False)
    output.write_text(serialized + "\n", encoding="utf-8")

    # Looked up only after the write so the write itself never depends on them.
    doc_count = payload["document_count"]
    dimensions = payload["embedding"]["dimensions"]
    print(f"wrote {doc_count} docs, {dimensions} dims to {output}")
def build_payload(
    project_path: Path,
    *,
    model_repo: str,
    model_file: str,
    model_path: str = "",
    n_ctx: int = DEFAULT_N_CTX,
    n_threads: int | None = None,
    build_source: str,
    builder: str,
    modal_app: str = "",
    reuse_index_path: Path | None = None,
) -> dict:
    """Embed every project listed in ``project_path`` and build the index payload.

    Projects are read from the ``"projects"`` list of the JSON file. For each
    project, a vector is taken from the reusable index when one exists for the
    same project id and SHA-256 digest of its searchable text; otherwise the
    text is embedded with a ``LlamaCppEmbedder`` that is created only when the
    first such project is encountered. Progress is printed for the first
    project, every tenth project, and the last one.

    Args:
        project_path: JSON snapshot containing a ``"projects"`` list.
        model_repo: Embedding model repository, recorded in the metadata.
        model_file: Embedding model file name, recorded in the metadata.
        model_path: Forwarded to the embedder unchanged.
        n_ctx: Context size forwarded to the embedder and recorded in metadata.
        n_threads: Thread count forwarded to the embedder.
        build_source: Free-form label recorded in the metadata.
        builder: Free-form label recorded in the metadata.
        modal_app: Recorded in the metadata only when non-empty.
        reuse_index_path: Optional existing index to reuse vectors from.

    Returns:
        Whatever ``build_index_payload`` returns for the projects, their
        vectors, and the embedding metadata.

    Raises:
        KeyError: If the snapshot has no ``"projects"`` key.
        importlib.metadata.PackageNotFoundError: If ``llama-cpp-python`` is
            not installed when its version is looked up.
    """
    snapshot = json.loads(project_path.read_text(encoding="utf-8"))
    projects = [Project.from_dict(entry) for entry in snapshot["projects"]]
    print(f"loaded {len(projects)} projects from {project_path}", flush=True)

    cached_vectors = load_reusable_vectors(
        reuse_index_path,
        model_repo=model_repo,
        model_file=model_file,
        n_ctx=n_ctx,
    )
    if cached_vectors:
        print(f"loaded {len(cached_vectors)} reusable vectors from {reuse_index_path}", flush=True)

    print(
        "embedding projects with "
        f"{model_repo}/{model_file}; first vector may download and load the GGUF model",
        flush=True,
    )

    total = len(projects)
    vectors = []
    embedder = None  # created on demand, so a fully reused build never constructs it
    reused_count = 0
    embedded_count = 0
    for position, project in enumerate(projects, start=1):
        digest = sha256(project.searchable_text.encode("utf-8")).hexdigest()
        vector = cached_vectors.get((project.id, digest))
        if vector is None:
            if embedder is None:
                embedder = LlamaCppEmbedder(
                    model_repo=model_repo,
                    model_file=model_file,
                    model_path=model_path,
                    n_ctx=n_ctx,
                    n_threads=n_threads,
                    verbose=False,
                )
            vector = embedder.embed(project.searchable_text)
            embedded_count += 1
        else:
            reused_count += 1
        vectors.append(vector)

        # Report on the first project, every tenth, and the final one.
        if position in (1, total) or position % 10 == 0:
            print(
                f"indexed {position}/{total} projects "
                f"(reused={reused_count}, embedded={embedded_count})",
                flush=True,
            )

    embedding_metadata = {
        "model_repo": model_repo,
        "model_file": model_file,
        "build_source": build_source,
        "builder": builder,
        "llama_cpp_python_version": importlib.metadata.version("llama-cpp-python"),
        "n_ctx": n_ctx,
    }
    if modal_app:
        embedding_metadata["modal_app"] = modal_app

    return build_index_payload(
        projects=projects,
        snapshot_generated_at=str(snapshot.get("generated_at") or ""),
        source=str(snapshot.get("source") or ""),
        embeddings=vectors,
        embedding_metadata=embedding_metadata,
    )
def load_reusable_vectors(
    reuse_index_path: Path | None,
    *,
    model_repo: str,
    model_file: str,
    n_ctx: int,
) -> dict[tuple[str, str], list[float]]:
    """Load vectors from an existing index that are safe to reuse.

    Vectors are reused only when the existing index records the same
    ``model_repo``, ``model_file`` and ``n_ctx`` as the current build;
    otherwise a message explaining the skip is printed and nothing is reused.
    Malformed content inside a readable JSON file (wrong top-level type,
    non-dict documents, non-numeric vector entries) is skipped rather than
    aborting the build, because reuse is only an optimization.

    Args:
        reuse_index_path: Path to a previously written index, or ``None`` to
            disable reuse.
        model_repo: Embedding model repository of the current build.
        model_file: Embedding model file of the current build.
        n_ctx: Context size of the current build.

    Returns:
        Mapping of ``(project_id, text_digest)`` to the stored vector as a
        list of floats. Empty when reuse is disabled or not applicable.

    Raises:
        OSError: If ``reuse_index_path`` cannot be read.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    if reuse_index_path is None:
        return {}

    payload = json.loads(reuse_index_path.read_text(encoding="utf-8"))
    # A top-level list/str/number has no metadata; treat it like a missing block
    # instead of crashing on `.get`.
    embedding = payload.get("embedding") if isinstance(payload, dict) else None
    if not isinstance(embedding, dict):
        print(f"skipping reusable vectors from {reuse_index_path}: missing embedding metadata", flush=True)
        return {}

    expected = {
        "model_repo": model_repo,
        "model_file": model_file,
        "n_ctx": n_ctx,
    }
    try:
        actual_n_ctx = int(embedding.get("n_ctx") or 0)
    except (TypeError, ValueError, OverflowError):
        # OverflowError covers a JSON `Infinity`, which int() cannot convert.
        actual_n_ctx = 0
    actual = {
        "model_repo": str(embedding.get("model_repo") or ""),
        "model_file": str(embedding.get("model_file") or ""),
        "n_ctx": actual_n_ctx,
    }
    if actual != expected:
        print(
            f"skipping reusable vectors from {reuse_index_path}: "
            f"embedding config changed from {actual} to {expected}",
            flush=True,
        )
        return {}

    documents = payload.get("documents")
    if not isinstance(documents, list):
        return {}

    reusable: dict[tuple[str, str], list[float]] = {}
    for document in documents:
        if not isinstance(document, dict):
            continue
        project_id = str(document.get("project_id") or "")
        text_digest = str(document.get("text_digest") or "")
        vector = document.get("vector")
        if not (project_id and text_digest and isinstance(vector, list) and vector):
            continue
        try:
            converted = [float(value) for value in vector]
        except (TypeError, ValueError, OverflowError):
            # One corrupt vector only means that project gets embedded again.
            continue
        reusable[(project_id, text_digest)] = converted
    return reusable
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()