Spaces:
Running
Running
| """Build refresh_data.json from the canonical Google Sheet client source. | |
| Client source of truth: | |
| Google Sheet 1RYmxN855LhpKU9B3ZEwFcGAkg3auR6adhJN9pQZOgDM, tab gid=1047572149 | |
| Columns: LookupCode, Name, __PowerAppsId__ (auto-synced from PowerApps/Dataverse) | |
| Producers are NOT sourced from the Sheet (its team-mapping tab is empty). The 75 | |
| producer rows are curated and managed by start.py migrations (locked pairs, Bella | |
| Santana, House Standard), so we PRESERVE them from the existing refresh_data.json. | |
| Output: refresh_data.json = {"producers": [...], "clients": [...]} β the exact | |
| shape start.py and /admin/data-refresh already consume. | |
| Run locally: | |
| python scripts/build_refresh_data.py | |
| Run in CI (GitHub Actions): same command; no secrets needed to read the Sheet | |
| (public CSV export). HF_TOKEN is only needed by the separate upload step. | |
| Dependency-free: stdlib only (urllib, csv, json), so the HF Space requirements | |
| are untouched. | |
| """ | |
| from __future__ import annotations | |
| import csv | |
| import io | |
| import json | |
| import os | |
| import sys | |
| import time | |
| import urllib.error | |
| import urllib.request | |
| from typing import Any | |
| # ββ Config βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| SHEET_ID = "1RYmxN855LhpKU9B3ZEwFcGAkg3auR6adhJN9pQZOgDM" | |
| CLIENTS_GID = "1047572149" | |
| CSV_URL = ( | |
| f"https://docs.google.com/spreadsheets/d/{SHEET_ID}" | |
| f"/export?format=csv&gid={CLIENTS_GID}" | |
| ) | |
| # Resolve paths relative to the repo root (this file lives in scripts/) | |
| REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) | |
| REFRESH_FILE = os.path.join(REPO_ROOT, "refresh_data.json") | |
| # Tracked, committed producer roster (names/codes only, no client PII). This is | |
| # the durable source for producers in CI, since refresh_data.json is gitignored | |
| # (it carries client records) and is therefore ABSENT on a fresh CI checkout β | |
| # which previously caused the daily job to upload an EMPTY producer list. | |
| PRODUCERS_SEED_FILE = os.path.join(REPO_ROOT, "producers_seed.json") | |
| MAX_FETCH_ATTEMPTS = 5 # explicit upper bound on retry loop | |
| FETCH_TIMEOUT_SECONDS = 60 | |
| MIN_EXPECTED_CLIENTS = 15000 # guard: Sheet has ~18.6K; refuse to truncate | |
| def fetch_sheet_csv(url: str = CSV_URL) -> str: | |
| """Download the Sheet tab as CSV text with exponential backoff.""" | |
| last_error: Exception | None = None | |
| for attempt in range(MAX_FETCH_ATTEMPTS): | |
| try: | |
| req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"}) | |
| with urllib.request.urlopen(req, timeout=FETCH_TIMEOUT_SECONDS) as resp: | |
| return resp.read().decode("utf-8", "replace") | |
| except (urllib.error.URLError, TimeoutError) as exc: | |
| last_error = exc | |
| backoff = 2 ** attempt | |
| print(f" fetch attempt {attempt + 1} failed: {exc}; retrying in {backoff}s") | |
| time.sleep(backoff) | |
| raise RuntimeError(f"Failed to fetch Sheet CSV after {MAX_FETCH_ATTEMPTS} attempts: {last_error}") | |
| def parse_clients(csv_text: str) -> list[dict[str, Any]]: | |
| """Parse the LookupCode/Name CSV into client records, deduped by code.""" | |
| rows = list(csv.reader(io.StringIO(csv_text))) | |
| if not rows or rows[0][0].strip() != "LookupCode": | |
| raise ValueError(f"Unexpected header in Sheet CSV: {rows[0] if rows else 'EMPTY'}") | |
| clients: list[dict[str, Any]] = [] | |
| seen: set[str] = set() | |
| for row in rows[1:]: # bounded by the CSV row count | |
| if len(row) < 2: | |
| continue | |
| code = row[0].strip() | |
| name = row[1].strip() | |
| if not code or not name: # skip the blank/whitespace rows | |
| continue | |
| if code in seen: | |
| continue | |
| seen.add(code) | |
| clients.append({"lookup_code": code, "name": name}) | |
| if len(clients) < MIN_EXPECTED_CLIENTS: | |
| raise ValueError( | |
| f"Only parsed {len(clients)} clients (< {MIN_EXPECTED_CLIENTS}); " | |
| "refusing to write a truncated refresh_data.json." | |
| ) | |
| return clients | |
| def load_existing_producers(path: str = REFRESH_FILE) -> list[dict[str, Any]]: | |
| """Load the curated producer roster. | |
| Prefers the tracked PRODUCERS_SEED_FILE (always present in CI). Falls back to | |
| the gitignored refresh_data.json for local runs that predate the seed file. | |
| """ | |
| if os.path.exists(PRODUCERS_SEED_FILE): | |
| with open(PRODUCERS_SEED_FILE, encoding="utf-8") as fh: | |
| producers = json.load(fh).get("producers", []) | |
| if not isinstance(producers, list): | |
| raise ValueError("producers_seed.json has a non-list 'producers' field.") | |
| print(f" Loaded {len(producers)} producers from producers_seed.json (tracked).") | |
| return producers | |
| if not os.path.exists(path): | |
| print(" WARNING: no producers_seed.json and no refresh_data.json; emitting empty producer list.") | |
| return [] | |
| with open(path, encoding="utf-8") as fh: | |
| data = json.load(fh) | |
| producers = data.get("producers", []) | |
| if not isinstance(producers, list): | |
| raise ValueError("Existing refresh_data.json has a non-list 'producers' field.") | |
| return producers | |
| def build(path: str = REFRESH_FILE) -> dict[str, int]: | |
| """Fetch + parse + write refresh_data.json. Returns counts.""" | |
| print(f"Fetching client list from Sheet (gid={CLIENTS_GID})...") | |
| csv_text = fetch_sheet_csv() | |
| clients = parse_clients(csv_text) | |
| producers = load_existing_producers(path) | |
| print(f" Parsed {len(clients)} clients; preserving {len(producers)} producers.") | |
| payload = {"producers": producers, "clients": clients} | |
| tmp_path = f"{path}.tmp" | |
| with open(tmp_path, "w", encoding="utf-8") as fh: | |
| json.dump(payload, fh, ensure_ascii=False, indent=0) | |
| os.replace(tmp_path, path) # atomic write | |
| print(f"Wrote {path}") | |
| return {"producers": len(producers), "clients": len(clients)} | |
| if __name__ == "__main__": | |
| try: | |
| counts = build() | |
| except Exception as exc: # surface failure to CI (non-zero exit), no silent swallow | |
| print(f"ERROR: {exc}", file=sys.stderr) | |
| sys.exit(1) | |
| print(f"Done. producers={counts['producers']} clients={counts['clients']}") | |