"""Build refresh_data.json from the canonical Google Sheet client source. Client source of truth: Google Sheet 1RYmxN855LhpKU9B3ZEwFcGAkg3auR6adhJN9pQZOgDM, tab gid=1047572149 Columns: LookupCode, Name, __PowerAppsId__ (auto-synced from PowerApps/Dataverse) Producers are NOT sourced from the Sheet (its team-mapping tab is empty). The 75 producer rows are curated and managed by start.py migrations (locked pairs, Bella Santana, House Standard), so we PRESERVE them from the existing refresh_data.json. Output: refresh_data.json = {"producers": [...], "clients": [...]} — the exact shape start.py and /admin/data-refresh already consume. Run locally: python scripts/build_refresh_data.py Run in CI (GitHub Actions): same command; no secrets needed to read the Sheet (public CSV export). HF_TOKEN is only needed by the separate upload step. Dependency-free: stdlib only (urllib, csv, json), so the HF Space requirements are untouched. """ from __future__ import annotations import csv import io import json import os import sys import time import urllib.error import urllib.request from typing import Any # ── Config ─────────────────────────────────────────────────────────── SHEET_ID = "1RYmxN855LhpKU9B3ZEwFcGAkg3auR6adhJN9pQZOgDM" CLIENTS_GID = "1047572149" CSV_URL = ( f"https://docs.google.com/spreadsheets/d/{SHEET_ID}" f"/export?format=csv&gid={CLIENTS_GID}" ) # Resolve paths relative to the repo root (this file lives in scripts/) REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) REFRESH_FILE = os.path.join(REPO_ROOT, "refresh_data.json") # Tracked, committed producer roster (names/codes only, no client PII). This is # the durable source for producers in CI, since refresh_data.json is gitignored # (it carries client records) and is therefore ABSENT on a fresh CI checkout — # which previously caused the daily job to upload an EMPTY producer list. PRODUCERS_SEED_FILE = os.path.join(REPO_ROOT, "producers_seed.json") MAX_FETCH_ATTEMPTS = 5 # explicit upper bound on retry loop FETCH_TIMEOUT_SECONDS = 60 MIN_EXPECTED_CLIENTS = 15000 # guard: Sheet has ~18.6K; refuse to truncate def fetch_sheet_csv(url: str = CSV_URL) -> str: """Download the Sheet tab as CSV text with exponential backoff.""" last_error: Exception | None = None for attempt in range(MAX_FETCH_ATTEMPTS): try: req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"}) with urllib.request.urlopen(req, timeout=FETCH_TIMEOUT_SECONDS) as resp: return resp.read().decode("utf-8", "replace") except (urllib.error.URLError, TimeoutError) as exc: last_error = exc backoff = 2 ** attempt print(f" fetch attempt {attempt + 1} failed: {exc}; retrying in {backoff}s") time.sleep(backoff) raise RuntimeError(f"Failed to fetch Sheet CSV after {MAX_FETCH_ATTEMPTS} attempts: {last_error}") def parse_clients(csv_text: str) -> list[dict[str, Any]]: """Parse the LookupCode/Name CSV into client records, deduped by code.""" rows = list(csv.reader(io.StringIO(csv_text))) if not rows or rows[0][0].strip() != "LookupCode": raise ValueError(f"Unexpected header in Sheet CSV: {rows[0] if rows else 'EMPTY'}") clients: list[dict[str, Any]] = [] seen: set[str] = set() for row in rows[1:]: # bounded by the CSV row count if len(row) < 2: continue code = row[0].strip() name = row[1].strip() if not code or not name: # skip the blank/whitespace rows continue if code in seen: continue seen.add(code) clients.append({"lookup_code": code, "name": name}) if len(clients) < MIN_EXPECTED_CLIENTS: raise ValueError( f"Only parsed {len(clients)} clients (< {MIN_EXPECTED_CLIENTS}); " "refusing to write a truncated refresh_data.json." ) return clients def load_existing_producers(path: str = REFRESH_FILE) -> list[dict[str, Any]]: """Load the curated producer roster. Prefers the tracked PRODUCERS_SEED_FILE (always present in CI). Falls back to the gitignored refresh_data.json for local runs that predate the seed file. """ if os.path.exists(PRODUCERS_SEED_FILE): with open(PRODUCERS_SEED_FILE, encoding="utf-8") as fh: producers = json.load(fh).get("producers", []) if not isinstance(producers, list): raise ValueError("producers_seed.json has a non-list 'producers' field.") print(f" Loaded {len(producers)} producers from producers_seed.json (tracked).") return producers if not os.path.exists(path): print(" WARNING: no producers_seed.json and no refresh_data.json; emitting empty producer list.") return [] with open(path, encoding="utf-8") as fh: data = json.load(fh) producers = data.get("producers", []) if not isinstance(producers, list): raise ValueError("Existing refresh_data.json has a non-list 'producers' field.") return producers def build(path: str = REFRESH_FILE) -> dict[str, int]: """Fetch + parse + write refresh_data.json. Returns counts.""" print(f"Fetching client list from Sheet (gid={CLIENTS_GID})...") csv_text = fetch_sheet_csv() clients = parse_clients(csv_text) producers = load_existing_producers(path) print(f" Parsed {len(clients)} clients; preserving {len(producers)} producers.") payload = {"producers": producers, "clients": clients} tmp_path = f"{path}.tmp" with open(tmp_path, "w", encoding="utf-8") as fh: json.dump(payload, fh, ensure_ascii=False, indent=0) os.replace(tmp_path, path) # atomic write print(f"Wrote {path}") return {"producers": len(producers), "clients": len(clients)} if __name__ == "__main__": try: counts = build() except Exception as exc: # surface failure to CI (non-zero exit), no silent swallow print(f"ERROR: {exc}", file=sys.stderr) sys.exit(1) print(f"Done. producers={counts['producers']} clients={counts['clients']}")