Enhance Docker configuration by adding HUGGING_FACE_HUB_TOKEN as a build argument in docker-compose.yml and Dockerfile. Update Dockerfile to install CPU-only PyTorch and adjust thread counts for reduced memory usage during builds. Modify docker_build_assets.py to support both HF_TOKEN and HUGGING_FACE_HUB_TOKEN for model caching. Update README.md to clarify token usage and build instructions.
be705e8 | #!/usr/bin/env python3 | |
| """Docker build: HF auth, cache model files on disk (low RAM), produce data/*.jsonl.""" | |
| from __future__ import annotations | |
| import hashlib | |
| import json | |
| import os | |
| import subprocess | |
| import sys | |
| from pathlib import Path | |
| from typing import Any | |
def root() -> Path:
    """Return the directory one level above the folder containing this script.

    The script path is resolved to an absolute path first. Callers join
    ``scripts`` and ``data`` onto the result, so this is presumably the
    repository root (the script itself living in a sub-folder of it).
    """
    this_file = Path(__file__).resolve()
    # parents[0] is the script's own folder; parents[1] is its parent.
    return this_file.parents[1]
def hf_token() -> str | None:
    """Return the Hugging Face token found in the environment, or ``None``.

    ``HF_TOKEN`` is consulted first and ``HUGGING_FACE_HUB_TOKEN`` second.
    Values are stripped of surrounding whitespace, and a variable that is
    unset or blank is treated as absent.
    """
    for variable in ("HF_TOKEN", "HUGGING_FACE_HUB_TOKEN"):
        candidate = os.environ.get(variable, "").strip()
        if candidate:
            return candidate
    return None
def hf_login() -> None:
    """Log in to the Hugging Face Hub when a token is available.

    Without a token a notice about anonymous access is printed and nothing
    else happens. With a token, ``huggingface_hub.login`` is called without
    touching git credentials; if ``huggingface_hub`` cannot be imported the
    login is skipped with a message. Errors raised by ``login`` itself are
    not caught here.
    """
    token = hf_token()
    if token:
        try:
            from huggingface_hub import login  # type: ignore[import-untyped]
        except ImportError:
            print("docker_build_assets: huggingface_hub not installed; skipping login.")
        else:
            login(token=token, add_to_git_credential=False)
            print("docker_build_assets: Hugging Face Hub login OK.")
        return

    print(
        "docker_build_assets: No HF_TOKEN / HUGGING_FACE_HUB_TOKEN — anonymous Hub access "
        "(rate limits). On HF Spaces, pass token into the *Docker build* (not only runtime)."
    )
def embedding_model_repo(hub_name: str) -> str:
    """Return the Hub repo id for an embedding model name.

    Surrounding whitespace is removed. A name that already contains ``/`` is
    returned as is; a bare name is prefixed with ``sentence-transformers/``.
    """
    repo = hub_name.strip()
    return repo if "/" in repo else f"sentence-transformers/{repo}"
def prefetch_hub_files_only() -> None:
    """Download weights into HF cache without loading full LLM into RAM (avoids build OOM).

    Model names come from ``TASK_B_LOCAL_EMBEDDING_MODEL`` and
    ``TASK_B_LOCAL_LLM_MODEL``. A variable that is unset *or blank* falls back
    to the built-in default, matching how ``hf_token`` and
    ``yelp_business_path`` treat blank values. The embedding name is mapped to
    a repo id with ``embedding_model_repo``; the LLM name is used as the repo
    id directly. If ``huggingface_hub`` is not importable the prefetch is
    skipped with a message. Download errors are not caught.
    """
    # Docker build args / Compose interpolation hand over unset values as "",
    # which would otherwise bypass the default and produce an invalid repo id.
    emb_name = os.environ.get("TASK_B_LOCAL_EMBEDDING_MODEL", "").strip() or "all-MiniLM-L6-v2"
    llm_name = (
        os.environ.get("TASK_B_LOCAL_LLM_MODEL", "").strip() or "Qwen/Qwen2.5-1.5B-Instruct"
    )
    tok = hf_token()
    try:
        from huggingface_hub import snapshot_download  # type: ignore[import-untyped]
    except ImportError:
        print("docker_build_assets: huggingface_hub missing; skipping prefetch.")
        return

    # Only pass ``token`` when we actually have one (anonymous access otherwise).
    kw: dict[str, Any] = {"token": tok} if tok else {}

    # Embedding model first, then the LLM; each is announced before its download.
    for repo_id in (embedding_model_repo(emb_name), llm_name):
        print(f"docker_build_assets: snapshot_download (disk cache) -> {repo_id}")
        snapshot_download(repo_id=repo_id, local_files_only=False, **kw)
    print("docker_build_assets: Hub snapshots cached (LLM not loaded into RAM).")
def yelp_business_path(rt: Path) -> Path:
    """Return the location of the Yelp business JSON file.

    A non-blank ``YELP_BUSINESS_JSON`` environment variable (whitespace
    stripped) takes precedence; otherwise the path defaults to
    ``yelp_dataset/extracted/yelp_academic_dataset_business.json`` under *rt*.
    The file is not checked for existence here.
    """
    override = os.environ.get("YELP_BUSINESS_JSON", "").strip()
    if not override:
        return rt.joinpath(
            "yelp_dataset", "extracted", "yelp_academic_dataset_business.json"
        )
    return Path(override)
def build_from_yelp(rt: Path, yelp: Path) -> None:
    """Build the business catalog and its embedded variant from Yelp data.

    Runs two project scripts with the current interpreter, in order:

    1. ``scripts/build_business_catalog.py`` reads *yelp* and writes
       ``data/business_catalog.jsonl`` (``--only-open``, row cap from
       ``DOCKER_CATALOG_MAX_ROWS``, default ``15000``).
    2. ``scripts/embed_catalog_azure_openai.py`` is invoked with
       ``--backend local`` and batch size 32 to write
       ``data/business_catalog_embedded.jsonl``.

    Args:
        rt: Base directory containing ``scripts/`` and ``data/``.
        yelp: Path to the Yelp business JSON file.

    Raises:
        subprocess.CalledProcessError: If either script exits non-zero.
    """
    # A blank value (typical for an unset Docker build arg) must not be passed
    # through as an empty ``--max-rows`` argument; fall back to the default.
    max_rows = os.environ.get("DOCKER_CATALOG_MAX_ROWS", "").strip() or "15000"
    data_dir = rt / "data"
    out_cat = data_dir / "business_catalog.jsonl"
    out_embedded = data_dir / "business_catalog_embedded.jsonl"
    # Make the function usable on its own, like build_stub_embedded does.
    data_dir.mkdir(parents=True, exist_ok=True)

    py = sys.executable
    scripts_dir = rt / "scripts"
    subprocess.check_call(
        [
            py,
            str(scripts_dir / "build_business_catalog.py"),
            "--business-json",
            str(yelp),
            "--output",
            str(out_cat),
            "--max-rows",
            max_rows,
            "--only-open",
        ]
    )
    subprocess.check_call(
        [
            py,
            str(scripts_dir / "embed_catalog_azure_openai.py"),
            "--backend",
            "local",
            "--input",
            str(out_cat),
            "--output",
            str(out_embedded),
            "--batch-size",
            "32",
        ]
    )
def stub_catalog_rows(n: int = 48) -> list[dict[str, Any]]:
    """Generate *n* deterministic placeholder business rows.

    Rows cycle through ten fixed (name, categories, city, state) seeds. After
    the first pass the name gets a ``#<pass>`` suffix. ``business_id`` is the
    first 22 hex characters of the SHA-256 of ``"<index>-<display name>"``.
    Stars and review counts are simple functions of the row index, and
    ``text_for_embedding`` is a multi-line summary of the same fields.
    """
    seeds = (
        ("Riverfront Ramen", "Restaurants, Japanese, Ramen", "Portland", "OR"),
        ("Oak Street Bakery", "Food, Bakeries, Coffee & Tea", "Austin", "TX"),
        ("Queen Vietnamese", "Restaurants, Vietnamese", "Philadelphia", "PA"),
        ("Campus Espresso", "Coffee & Tea, Cafes", "Seattle", "WA"),
        ("Park Yoga Studio", "Active Life, Yoga", "Denver", "CO"),
        ("Midtown Books", "Shopping, Books", "Chicago", "IL"),
        ("East Side Brewpub", "Nightlife, Breweries", "Milwaukee", "WI"),
        ("Family Thai Kitchen", "Restaurants, Thai", "Tempe", "AZ"),
        ("Uptown Nail Spa", "Beauty & Spas, Nail Salons", "Miami", "FL"),
        ("Lakeside Pizza", "Restaurants, Pizza", "Minneapolis", "MN"),
    )
    catalog: list[dict[str, Any]] = []
    for idx in range(n):
        # ``cycle`` counts completed passes over the seeds; ``slot`` picks the seed.
        cycle, slot = divmod(idx, len(seeds))
        base_name, categories, city, state = seeds[slot]
        label = base_name if cycle == 0 else f"{base_name} #{cycle}"
        digest = hashlib.sha256(f"{idx}-{label}".encode()).hexdigest()
        stars = 3.5 + (idx % 15) / 10
        reviews = 20 + idx * 7
        embed_text = "\n".join(
            (
                f"name: {label}",
                f"categories: {categories}",
                f"location: {city}, {state}",
                f"address: {100 + idx} Main St",
                f"business_avg_stars: {stars:.1f}",
                f"business_review_count: {reviews}",
                "is_open: 1",
            )
        )
        catalog.append(
            {
                "business_id": digest[:22],
                "name": label,
                "categories": categories,
                "city": city,
                "state": state,
                "stars": float(stars),
                "review_count": int(reviews),
                "is_open": 1,
                "text_for_embedding": embed_text,
            }
        )
    return catalog
def build_stub_embedded(rt: Path) -> None:
    """Write a stub catalog and its embedded variant under ``rt / "data"``.

    Encodes the ``text_for_embedding`` of every row from
    ``stub_catalog_rows()`` with a SentenceTransformer model and writes two
    JSONL files in lockstep: ``business_catalog.jsonl`` (plain rows) and
    ``business_catalog_embedded.jsonl`` (rows plus an ``embedding`` list).

    The model name comes from ``TASK_B_LOCAL_EMBEDDING_MODEL``; an unset or
    blank value falls back to ``all-MiniLM-L6-v2``.

    Args:
        rt: Base directory; the ``data`` sub-directory is created if missing.
    """
    from sentence_transformers import SentenceTransformer  # type: ignore[import-untyped]

    # Blank env values (e.g. an unset Docker build arg forwarded as "") must
    # not be handed to SentenceTransformer as an empty model name.
    emb_name = os.environ.get("TASK_B_LOCAL_EMBEDDING_MODEL", "").strip() or "all-MiniLM-L6-v2"
    model = SentenceTransformer(emb_name)
    rows = stub_catalog_rows()
    texts = [r["text_for_embedding"] for r in rows]
    # Small batches keep peak RAM low on HF builders.
    mat = model.encode(texts, batch_size=8, convert_to_numpy=True, normalize_embeddings=False)

    out_path = rt / "data" / "business_catalog_embedded.jsonl"
    cat_path = rt / "data" / "business_catalog.jsonl"
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open("w", encoding="utf-8") as fe, cat_path.open("w", encoding="utf-8") as fc:
        # strict=True fails loudly if the encoder returns a different row count.
        for row, vec in zip(rows, mat, strict=True):
            fc.write(json.dumps(row, ensure_ascii=False) + "\n")
            # Cast to Python floats so the vector is JSON-serialisable.
            emb_row = {**row, "embedding": vec.astype(float).tolist()}
            fe.write(json.dumps(emb_row, ensure_ascii=False) + "\n")
    print(f"docker_build_assets: wrote stub catalog -> {cat_path} and {out_path}")
def main() -> None:
    """Run the build steps: Hub login, model prefetch, then catalog creation.

    The ``data`` directory under the project root is created first. If the
    Yelp business JSON exists, the catalog is built from it; otherwise a stub
    catalog is written instead.
    """
    project_dir = root()
    project_dir.joinpath("data").mkdir(parents=True, exist_ok=True)

    hf_login()
    prefetch_hub_files_only()

    business_json = yelp_business_path(project_dir)
    if not business_json.is_file():
        # No real dataset in the build context: fall back to the stub catalog.
        print(
            "docker_build_assets: Yelp business JSON not found; "
            "writing stub JSONL (mount real data at runtime or bake yelp_dataset into build context)."
        )
        build_stub_embedded(project_dir)
        return

    print(f"docker_build_assets: building catalog from {business_json}")
    build_from_yelp(project_dir, business_json)
# Script entry point: run the build steps only when executed directly,
# never as a side effect of importing this module.
if __name__ == "__main__":
    main()