pif / scripts /build_refresh_data.py
pramodmisra's picture
Fix: producers wiped from refresh by gitignored data file in CI
dfe063a
Raw
History Blame Contribute Delete
6.33 kB
"""Build refresh_data.json from the canonical Google Sheet client source.
Client source of truth:
Google Sheet 1RYmxN855LhpKU9B3ZEwFcGAkg3auR6adhJN9pQZOgDM, tab gid=1047572149
Columns: LookupCode, Name, __PowerAppsId__ (auto-synced from PowerApps/Dataverse)
Producers are NOT sourced from the Sheet (its team-mapping tab is empty). The 75
producer rows are curated and managed by start.py migrations (locked pairs, Bella
Santana, House Standard), so we PRESERVE them from the existing refresh_data.json.
Output: refresh_data.json = {"producers": [...], "clients": [...]} β€” the exact
shape start.py and /admin/data-refresh already consume.
Run locally:
python scripts/build_refresh_data.py
Run in CI (GitHub Actions): same command; no secrets needed to read the Sheet
(public CSV export). HF_TOKEN is only needed by the separate upload step.
Dependency-free: stdlib only (urllib, csv, json), so the HF Space requirements
are untouched.
"""
from __future__ import annotations
import csv
import io
import json
import os
import sys
import time
import urllib.error
import urllib.request
from typing import Any
# ── Config ───────────────────────────────────────────────────────────
SHEET_ID = "1RYmxN855LhpKU9B3ZEwFcGAkg3auR6adhJN9pQZOgDM"
CLIENTS_GID = "1047572149"
CSV_URL = (
f"https://docs.google.com/spreadsheets/d/{SHEET_ID}"
f"/export?format=csv&gid={CLIENTS_GID}"
)
# Resolve paths relative to the repo root (this file lives in scripts/)
REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
REFRESH_FILE = os.path.join(REPO_ROOT, "refresh_data.json")
# Tracked, committed producer roster (names/codes only, no client PII). This is
# the durable source for producers in CI, since refresh_data.json is gitignored
# (it carries client records) and is therefore ABSENT on a fresh CI checkout β€”
# which previously caused the daily job to upload an EMPTY producer list.
PRODUCERS_SEED_FILE = os.path.join(REPO_ROOT, "producers_seed.json")
MAX_FETCH_ATTEMPTS = 5 # explicit upper bound on retry loop
FETCH_TIMEOUT_SECONDS = 60
MIN_EXPECTED_CLIENTS = 15000 # guard: Sheet has ~18.6K; refuse to truncate
def fetch_sheet_csv(url: str = CSV_URL) -> str:
"""Download the Sheet tab as CSV text with exponential backoff."""
last_error: Exception | None = None
for attempt in range(MAX_FETCH_ATTEMPTS):
try:
req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
with urllib.request.urlopen(req, timeout=FETCH_TIMEOUT_SECONDS) as resp:
return resp.read().decode("utf-8", "replace")
except (urllib.error.URLError, TimeoutError) as exc:
last_error = exc
backoff = 2 ** attempt
print(f" fetch attempt {attempt + 1} failed: {exc}; retrying in {backoff}s")
time.sleep(backoff)
raise RuntimeError(f"Failed to fetch Sheet CSV after {MAX_FETCH_ATTEMPTS} attempts: {last_error}")
def parse_clients(csv_text: str) -> list[dict[str, Any]]:
"""Parse the LookupCode/Name CSV into client records, deduped by code."""
rows = list(csv.reader(io.StringIO(csv_text)))
if not rows or rows[0][0].strip() != "LookupCode":
raise ValueError(f"Unexpected header in Sheet CSV: {rows[0] if rows else 'EMPTY'}")
clients: list[dict[str, Any]] = []
seen: set[str] = set()
for row in rows[1:]: # bounded by the CSV row count
if len(row) < 2:
continue
code = row[0].strip()
name = row[1].strip()
if not code or not name: # skip the blank/whitespace rows
continue
if code in seen:
continue
seen.add(code)
clients.append({"lookup_code": code, "name": name})
if len(clients) < MIN_EXPECTED_CLIENTS:
raise ValueError(
f"Only parsed {len(clients)} clients (< {MIN_EXPECTED_CLIENTS}); "
"refusing to write a truncated refresh_data.json."
)
return clients
def load_existing_producers(path: str = REFRESH_FILE) -> list[dict[str, Any]]:
"""Load the curated producer roster.
Prefers the tracked PRODUCERS_SEED_FILE (always present in CI). Falls back to
the gitignored refresh_data.json for local runs that predate the seed file.
"""
if os.path.exists(PRODUCERS_SEED_FILE):
with open(PRODUCERS_SEED_FILE, encoding="utf-8") as fh:
producers = json.load(fh).get("producers", [])
if not isinstance(producers, list):
raise ValueError("producers_seed.json has a non-list 'producers' field.")
print(f" Loaded {len(producers)} producers from producers_seed.json (tracked).")
return producers
if not os.path.exists(path):
print(" WARNING: no producers_seed.json and no refresh_data.json; emitting empty producer list.")
return []
with open(path, encoding="utf-8") as fh:
data = json.load(fh)
producers = data.get("producers", [])
if not isinstance(producers, list):
raise ValueError("Existing refresh_data.json has a non-list 'producers' field.")
return producers
def build(path: str = REFRESH_FILE) -> dict[str, int]:
"""Fetch + parse + write refresh_data.json. Returns counts."""
print(f"Fetching client list from Sheet (gid={CLIENTS_GID})...")
csv_text = fetch_sheet_csv()
clients = parse_clients(csv_text)
producers = load_existing_producers(path)
print(f" Parsed {len(clients)} clients; preserving {len(producers)} producers.")
payload = {"producers": producers, "clients": clients}
tmp_path = f"{path}.tmp"
with open(tmp_path, "w", encoding="utf-8") as fh:
json.dump(payload, fh, ensure_ascii=False, indent=0)
os.replace(tmp_path, path) # atomic write
print(f"Wrote {path}")
return {"producers": len(producers), "clients": len(clients)}
if __name__ == "__main__":
try:
counts = build()
except Exception as exc: # surface failure to CI (non-zero exit), no silent swallow
print(f"ERROR: {exc}", file=sys.stderr)
sys.exit(1)
print(f"Done. producers={counts['producers']} clients={counts['clients']}")