hackathon-advisor / scripts /build_project_index.py
JacobLinCool's picture
deploy: sync GitHub main de5dbf9
13fe947 verified
#!/usr/bin/env python3
from __future__ import annotations
import argparse
from hashlib import sha256
import importlib.metadata
import json
from pathlib import Path
import sys
# Directory one level above the folder that contains this script.
ROOT = Path(__file__).resolve().parents[1]
# Put that directory first on sys.path; presumably so the `hackathon_advisor`
# imports that follow resolve when this file is run directly as a script
# (NOTE(review): confirm against the repository layout).
sys.path.insert(0, str(ROOT))
from hackathon_advisor.data import (
DEFAULT_EMBEDDING_MODEL_FILE,
DEFAULT_EMBEDDING_MODEL_REPO,
Project,
build_index_payload,
)
from hackathon_advisor.llama_embedding import DEFAULT_N_CTX, LlamaCppEmbedder
def main() -> None:
    """Parse the command line, build the project index, and write it out.

    With ``--location local`` (the default) the payload comes from
    ``build_payload`` in this process.  With ``--location modal`` it comes
    from ``run_remote_build`` in ``scripts.modal_build_project_index``;
    ``--reuse-index`` is rejected in that mode.  Either way the payload is
    handed to ``write_payload`` together with the ``--out`` path.
    """
    parser = argparse.ArgumentParser(
        description="Build the offline project retrieval index with llama.cpp embeddings."
    )
    parser.add_argument(
        "--location",
        choices=("local", "modal"),
        default="local",
        help="Where to run the embedding build (default: local).",
    )
    # The remaining options carry no help text; they are registered from a
    # table, in the same order as they appear in the generated usage output.
    plain_options = (
        ("--projects", {"default": "data/projects.json"}),
        ("--out", {"default": "data/project_index.json"}),
        ("--model-repo", {"default": DEFAULT_EMBEDDING_MODEL_REPO}),
        ("--model-file", {"default": DEFAULT_EMBEDDING_MODEL_FILE}),
        ("--model-path", {"default": ""}),
        ("--n-ctx", {"type": int, "default": DEFAULT_N_CTX}),
        ("--n-threads", {"type": int, "default": 0}),
        ("--build-source", {"default": "local"}),
        ("--builder", {"default": "scripts/build_project_index.py"}),
        ("--reuse-index", {"default": ""}),
    )
    for flag, settings in plain_options:
        parser.add_argument(flag, **settings)
    args = parser.parse_args()

    use_modal = args.location == "modal"
    if use_modal and args.reuse_index:
        parser.error("--reuse-index is not supported with --location modal")

    projects_file = Path(args.projects)
    # Keyword arguments accepted by both the local and the remote builder.
    embedding_options = {
        "model_repo": args.model_repo,
        "model_file": args.model_file,
        "model_path": args.model_path,
        "n_ctx": args.n_ctx,
        # 0 on the command line means "no explicit thread count".
        "n_threads": args.n_threads or None,
    }

    if use_modal:
        # Imported lazily so the local path never requires the `modal` package.
        from scripts.modal_build_project_index import run_remote_build

        payload = run_remote_build(projects_file, **embedding_options)
    else:
        previous_index = Path(args.reuse_index) if args.reuse_index else None
        payload = build_payload(
            projects_file,
            **embedding_options,
            build_source=args.build_source,
            builder=args.builder,
            reuse_index_path=previous_index,
        )

    write_payload(Path(args.out), payload)
def write_payload(output: Path, payload: dict) -> None:
    """Serialize *payload* as indented UTF-8 JSON at *output* and print a summary.

    Missing parent directories of *output* are created first.  The file ends
    with a single trailing newline and non-ASCII characters are written
    unescaped.  The summary line reads ``payload["document_count"]`` and
    ``payload["embedding"]["dimensions"]``, so both keys must be present.
    """
    output.parent.mkdir(parents=True, exist_ok=True)

    serialized = json.dumps(payload, indent=2, ensure_ascii=False)
    output.write_text(f"{serialized}\n", encoding="utf-8")

    document_total = payload["document_count"]
    dimensions = payload["embedding"]["dimensions"]
    print(f"wrote {document_total} docs, {dimensions} dims to {output}")
def build_payload(
    project_path: Path,
    *,
    model_repo: str,
    model_file: str,
    model_path: str = "",
    n_ctx: int = DEFAULT_N_CTX,
    n_threads: int | None = None,
    build_source: str,
    builder: str,
    modal_app: str = "",
    reuse_index_path: Path | None = None,
) -> dict:
    """Embed every project in a snapshot file and return the index payload.

    Args:
        project_path: JSON snapshot whose ``"projects"`` list is turned into
            ``Project`` objects via ``Project.from_dict``.
        model_repo, model_file, model_path, n_ctx, n_threads: Forwarded to
            ``LlamaCppEmbedder`` when at least one project needs embedding.
        build_source, builder: Recorded verbatim in the embedding metadata.
        modal_app: Recorded in the metadata only when non-empty.
        reuse_index_path: Optional earlier index; vectors whose
            ``(project id, sha256 of searchable text)`` key matches are
            reused instead of being recomputed.

    Returns:
        Whatever ``build_index_payload`` returns for the projects, their
        vectors, and the assembled metadata.
    """
    snapshot = json.loads(project_path.read_text(encoding="utf-8"))
    projects = [Project.from_dict(entry) for entry in snapshot["projects"]]
    total = len(projects)
    print(f"loaded {total} projects from {project_path}", flush=True)

    cached_vectors = load_reusable_vectors(
        reuse_index_path,
        model_repo=model_repo,
        model_file=model_file,
        n_ctx=n_ctx,
    )
    if cached_vectors:
        print(f"loaded {len(cached_vectors)} reusable vectors from {reuse_index_path}", flush=True)

    print(
        f"embedding projects with {model_repo}/{model_file}; "
        "first vector may download and load the GGUF model",
        flush=True,
    )

    vectors = []
    embedder = None  # built on first cache miss so a fully cached run never loads the model
    reused = 0
    embedded = 0
    for position, project in enumerate(projects, start=1):
        text_digest = sha256(project.searchable_text.encode("utf-8")).hexdigest()
        cached = cached_vectors.get((project.id, text_digest))
        if cached is None:
            if embedder is None:
                embedder = LlamaCppEmbedder(
                    model_repo=model_repo,
                    model_file=model_file,
                    model_path=model_path,
                    n_ctx=n_ctx,
                    n_threads=n_threads,
                    verbose=False,
                )
            vectors.append(embedder.embed(project.searchable_text))
            embedded += 1
        else:
            vectors.append(cached)
            reused += 1

        # Progress is reported for the first item, every tenth, and the last.
        if position in (1, total) or position % 10 == 0:
            print(
                f"indexed {position}/{total} projects "
                f"(reused={reused}, embedded={embedded})",
                flush=True,
            )

    embedding_metadata = {
        "model_repo": model_repo,
        "model_file": model_file,
        "build_source": build_source,
        "builder": builder,
        "llama_cpp_python_version": importlib.metadata.version("llama-cpp-python"),
        "n_ctx": n_ctx,
        **({"modal_app": modal_app} if modal_app else {}),
    }

    return build_index_payload(
        projects=projects,
        snapshot_generated_at=str(snapshot.get("generated_at") or ""),
        source=str(snapshot.get("source") or ""),
        embeddings=vectors,
        embedding_metadata=embedding_metadata,
    )
def load_reusable_vectors(
reuse_index_path: Path | None,
*,
model_repo: str,
model_file: str,
n_ctx: int,
) -> dict[tuple[str, str], list[float]]:
if reuse_index_path is None:
return {}
payload = json.loads(reuse_index_path.read_text(encoding="utf-8"))
embedding = payload.get("embedding")
if not isinstance(embedding, dict):
print(f"skipping reusable vectors from {reuse_index_path}: missing embedding metadata", flush=True)
return {}
expected = {
"model_repo": model_repo,
"model_file": model_file,
"n_ctx": n_ctx,
}
try:
actual_n_ctx = int(embedding.get("n_ctx") or 0)
except (TypeError, ValueError):
actual_n_ctx = 0
actual = {
"model_repo": str(embedding.get("model_repo") or ""),
"model_file": str(embedding.get("model_file") or ""),
"n_ctx": actual_n_ctx,
}
if actual != expected:
print(
f"skipping reusable vectors from {reuse_index_path}: "
f"embedding config changed from {actual} to {expected}",
flush=True,
)
return {}
documents = payload.get("documents")
if not isinstance(documents, list):
return {}
reusable: dict[tuple[str, str], list[float]] = {}
for document in documents:
if not isinstance(document, dict):
continue
project_id = str(document.get("project_id") or "")
text_digest = str(document.get("text_digest") or "")
vector = document.get("vector")
if project_id and text_digest and isinstance(vector, list) and vector:
reusable[(project_id, text_digest)] = [float(value) for value in vector]
return reusable
# Run the command-line entry point only when this file is executed as a script.
if __name__ == "__main__":
    main()