#!/usr/bin/env python3
from __future__ import annotations
import argparse
from hashlib import sha256
import importlib.metadata
import json
from pathlib import Path
import sys
# ROOT is the parent of the directory that contains this file (presumably the
# repository root -- confirm against the checkout layout). It is pushed to the
# front of sys.path so the `hackathon_advisor` package imported right after
# this resolves when the file is executed directly as a script.
ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(ROOT))
from hackathon_advisor.data import (
DEFAULT_EMBEDDING_MODEL_FILE,
DEFAULT_EMBEDDING_MODEL_REPO,
Project,
build_index_payload,
)
from hackathon_advisor.llama_embedding import DEFAULT_N_CTX, LlamaCppEmbedder
def main() -> None:
    """Command-line entry point.

    Parses the options, produces the index payload either in-process via
    ``build_payload`` (``--location local``, the default) or through
    ``run_remote_build`` (``--location modal``), and hands the result to
    ``write_payload`` together with the ``--out`` path.

    ``--reuse-index`` is rejected in combination with ``--location modal``;
    ``parser.error`` terminates the process in that case.
    """
    parser = argparse.ArgumentParser(
        description="Build the offline project retrieval index with llama.cpp embeddings."
    )
    parser.add_argument(
        "--location",
        choices=("local", "modal"),
        default="local",
        help="Where to run the embedding build (default: local).",
    )
    # Plain string options, registered in the order they should appear in --help.
    for flag, default in (
        ("--projects", "data/projects.json"),
        ("--out", "data/project_index.json"),
        ("--model-repo", DEFAULT_EMBEDDING_MODEL_REPO),
        ("--model-file", DEFAULT_EMBEDDING_MODEL_FILE),
        ("--model-path", ""),
    ):
        parser.add_argument(flag, default=default)
    parser.add_argument("--n-ctx", type=int, default=DEFAULT_N_CTX)
    parser.add_argument("--n-threads", type=int, default=0)
    for flag, default in (
        ("--build-source", "local"),
        ("--builder", "scripts/build_project_index.py"),
        ("--reuse-index", ""),
    ):
        parser.add_argument(flag, default=default)
    options = parser.parse_args()

    projects_file = Path(options.projects)
    # Keyword arguments common to the local and the remote build.
    embedding_options = {
        "model_repo": options.model_repo,
        "model_file": options.model_file,
        "model_path": options.model_path,
        "n_ctx": options.n_ctx,
        # 0 (the CLI default) is forwarded as None.
        "n_threads": options.n_threads or None,
    }

    if options.location != "modal":
        reuse_from = Path(options.reuse_index) if options.reuse_index else None
        payload = build_payload(
            projects_file,
            **embedding_options,
            build_source=options.build_source,
            builder=options.builder,
            reuse_index_path=reuse_from,
        )
    else:
        if options.reuse_index:
            parser.error("--reuse-index is not supported with --location modal")
        # Imported lazily so the local path never requires the `modal` package.
        from scripts.modal_build_project_index import run_remote_build

        payload = run_remote_build(projects_file, **embedding_options)

    write_payload(Path(options.out), payload)
def write_payload(output: Path, payload: dict) -> None:
    """Write *payload* to *output* as indented UTF-8 JSON and print a summary.

    Missing parent directories of *output* are created first. The file ends
    with a single trailing newline and non-ASCII characters are written
    verbatim rather than escaped.

    Raises:
        KeyError: if *payload* lacks ``document_count`` or
            ``embedding.dimensions``; the file has already been written by
            then because the summary is built afterwards.
    """
    output.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(payload, indent=2, ensure_ascii=False)
    output.write_text(f"{serialized}\n", encoding="utf-8")

    doc_count = payload["document_count"]
    dimensions = payload["embedding"]["dimensions"]
    print(f"wrote {doc_count} docs, {dimensions} dims to {output}")
def build_payload(
    project_path: Path,
    *,
    model_repo: str,
    model_file: str,
    model_path: str = "",
    n_ctx: int = DEFAULT_N_CTX,
    n_threads: int | None = None,
    build_source: str,
    builder: str,
    modal_app: str = "",
    reuse_index_path: Path | None = None,
) -> dict:
    """Embed every project in *project_path* and assemble the index payload.

    The snapshot file is read as JSON and each entry under ``"projects"`` is
    turned into a ``Project``. For each project the SHA-256 of its
    ``searchable_text`` is looked up, together with the project id, in the
    vectors returned by ``load_reusable_vectors``; a hit is reused as-is and a
    miss is embedded with a ``LlamaCppEmbedder``. The embedder is created only
    when the first miss occurs, so a fully cached build never constructs it.

    Progress is printed for the first project, every tenth one and the last.

    Args:
        project_path: JSON snapshot with a ``"projects"`` list.
        model_repo, model_file, model_path, n_ctx, n_threads: forwarded to
            ``LlamaCppEmbedder``; repo, file and ``n_ctx`` are also used to
            validate the reuse index and are recorded in the metadata.
        build_source, builder: recorded verbatim in the embedding metadata.
        modal_app: recorded in the metadata only when non-empty.
        reuse_index_path: optional earlier index to take vectors from.

    Returns:
        The value produced by ``build_index_payload``.
    """
    snapshot = json.loads(project_path.read_text(encoding="utf-8"))
    projects = [Project.from_dict(entry) for entry in snapshot["projects"]]
    total = len(projects)
    print(f"loaded {total} projects from {project_path}", flush=True)

    cached_vectors = load_reusable_vectors(
        reuse_index_path,
        model_repo=model_repo,
        model_file=model_file,
        n_ctx=n_ctx,
    )
    if cached_vectors:
        print(f"loaded {len(cached_vectors)} reusable vectors from {reuse_index_path}", flush=True)
    print(
        "embedding projects with "
        f"{model_repo}/{model_file}; first vector may download and load the GGUF model",
        flush=True,
    )

    vectors = []
    embedder = None
    reused_count = embedded_count = 0
    for position, project in enumerate(projects, start=1):
        cache_key = (
            project.id,
            sha256(project.searchable_text.encode("utf-8")).hexdigest(),
        )
        cached = cached_vectors.get(cache_key)
        if cached is None:
            # Defer model loading until a project actually needs embedding.
            if embedder is None:
                embedder = LlamaCppEmbedder(
                    model_repo=model_repo,
                    model_file=model_file,
                    model_path=model_path,
                    n_ctx=n_ctx,
                    n_threads=n_threads,
                    verbose=False,
                )
            vectors.append(embedder.embed(project.searchable_text))
            embedded_count += 1
        else:
            vectors.append(cached)
            reused_count += 1

        if position in (1, total) or position % 10 == 0:
            print(
                f"indexed {position}/{total} projects "
                f"(reused={reused_count}, embedded={embedded_count})",
                flush=True,
            )

    embedding_metadata = {
        "model_repo": model_repo,
        "model_file": model_file,
        "build_source": build_source,
        "builder": builder,
        "llama_cpp_python_version": importlib.metadata.version("llama-cpp-python"),
        "n_ctx": n_ctx,
    }
    if modal_app:
        embedding_metadata["modal_app"] = modal_app

    generated_at = str(snapshot.get("generated_at") or "")
    source = str(snapshot.get("source") or "")
    return build_index_payload(
        projects=projects,
        snapshot_generated_at=generated_at,
        source=source,
        embeddings=vectors,
        embedding_metadata=embedding_metadata,
    )
def load_reusable_vectors(
    reuse_index_path: Path | None,
    *,
    model_repo: str,
    model_file: str,
    n_ctx: int,
) -> dict[tuple[str, str], list[float]]:
    """Load vectors from a previously written index so they can be reused.

    Loading is best-effort: the reuse index is only an optimisation, so any
    problem with it results in an empty mapping (and a printed explanation)
    instead of an exception, and the caller simply re-embeds everything.

    Args:
        reuse_index_path: path of the earlier index, or ``None`` to disable
            reuse.
        model_repo, model_file, n_ctx: embedding configuration of the current
            build. Vectors are reused only when the index's ``embedding``
            metadata records exactly the same three values.

    Returns:
        A mapping from ``(project_id, text_digest)`` to the stored vector as
        a list of floats. Empty when reuse is disabled, the file cannot be
        read or parsed, the metadata is missing or differs, or the index has
        no ``documents`` list. Individual documents that are malformed
        (not a mapping, missing id/digest, empty or non-numeric vector) are
        skipped without affecting the others.
    """
    if reuse_index_path is None:
        return {}

    def _skip(reason: str) -> dict[tuple[str, str], list[float]]:
        # Single place for the "fall back to a full rebuild" message.
        print(f"skipping reusable vectors from {reuse_index_path}: {reason}", flush=True)
        return {}

    try:
        payload = json.loads(reuse_index_path.read_text(encoding="utf-8"))
    except (OSError, ValueError) as error:
        # OSError: missing/unreadable file. ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError (e.g. a truncated file).
        return _skip(f"could not read index ({error})")

    # A top-level JSON array/scalar has no .get(); treat it like missing metadata.
    embedding = payload.get("embedding") if isinstance(payload, dict) else None
    if not isinstance(embedding, dict):
        return _skip("missing embedding metadata")

    expected = {
        "model_repo": model_repo,
        "model_file": model_file,
        "n_ctx": n_ctx,
    }
    try:
        actual_n_ctx = int(embedding.get("n_ctx") or 0)
    except (TypeError, ValueError):
        actual_n_ctx = 0
    actual = {
        "model_repo": str(embedding.get("model_repo") or ""),
        "model_file": str(embedding.get("model_file") or ""),
        "n_ctx": actual_n_ctx,
    }
    if actual != expected:
        return _skip(f"embedding config changed from {actual} to {expected}")

    documents = payload.get("documents")
    if not isinstance(documents, list):
        return _skip("missing documents list")

    reusable: dict[tuple[str, str], list[float]] = {}
    for document in documents:
        if not isinstance(document, dict):
            continue
        project_id = str(document.get("project_id") or "")
        text_digest = str(document.get("text_digest") or "")
        vector = document.get("vector")
        if not (project_id and text_digest and isinstance(vector, list) and vector):
            continue
        try:
            values = [float(value) for value in vector]
        except (TypeError, ValueError):
            # One corrupt vector must not discard every other reusable vector.
            continue
        reusable[(project_id, text_digest)] = values
    return reusable
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|