Spaces:

pramodmisra
/

pif

Running

App Files Files Community

pif / scripts /build_refresh_data.py

pramodmisra

Fix: producers wiped from refresh by gitignored data file in CI

dfe063a about 1 month ago

Raw

History Blame Contribute Delete

6.33 kB

	"""Build refresh_data.json from the canonical Google Sheet client source.

	Client source of truth:
	Google Sheet 1RYmxN855LhpKU9B3ZEwFcGAkg3auR6adhJN9pQZOgDM, tab gid=1047572149
	Columns: LookupCode, Name, __PowerAppsId__ (auto-synced from PowerApps/Dataverse)

	Producers are NOT sourced from the Sheet (its team-mapping tab is empty). The 75
	producer rows are curated and managed by start.py migrations (locked pairs, Bella
	Santana, House Standard), so we PRESERVE them from the existing refresh_data.json.

	Output: refresh_data.json = {"producers": [...], "clients": [...]} — the exact
	shape start.py and /admin/data-refresh already consume.

	Run locally:
	python scripts/build_refresh_data.py
	Run in CI (GitHub Actions): same command; no secrets needed to read the Sheet
	(public CSV export). HF_TOKEN is only needed by the separate upload step.

	Dependency-free: stdlib only (urllib, csv, json), so the HF Space requirements
	are untouched.
	"""
	from __future__ import annotations

	import csv
	import io
	import json
	import os
	import sys
	import time
	import urllib.error
	import urllib.request
	from typing import Any

	# ── Config ───────────────────────────────────────────────────────────
	SHEET_ID = "1RYmxN855LhpKU9B3ZEwFcGAkg3auR6adhJN9pQZOgDM"
	CLIENTS_GID = "1047572149"
	CSV_URL = (
	f"https://docs.google.com/spreadsheets/d/{SHEET_ID}"
	f"/export?format=csv&gid={CLIENTS_GID}"
	)

	# Resolve paths relative to the repo root (this file lives in scripts/)
	REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
	REFRESH_FILE = os.path.join(REPO_ROOT, "refresh_data.json")
	# Tracked, committed producer roster (names/codes only, no client PII). This is
	# the durable source for producers in CI, since refresh_data.json is gitignored
	# (it carries client records) and is therefore ABSENT on a fresh CI checkout —
	# which previously caused the daily job to upload an EMPTY producer list.
	PRODUCERS_SEED_FILE = os.path.join(REPO_ROOT, "producers_seed.json")

	MAX_FETCH_ATTEMPTS = 5 # explicit upper bound on retry loop
	FETCH_TIMEOUT_SECONDS = 60
	MIN_EXPECTED_CLIENTS = 15000 # guard: Sheet has ~18.6K; refuse to truncate


	def fetch_sheet_csv(url: str = CSV_URL) -> str:
	"""Download the Sheet tab as CSV text with exponential backoff."""
	last_error: Exception \| None = None
	for attempt in range(MAX_FETCH_ATTEMPTS):
	try:
	req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
	with urllib.request.urlopen(req, timeout=FETCH_TIMEOUT_SECONDS) as resp:
	return resp.read().decode("utf-8", "replace")
	except (urllib.error.URLError, TimeoutError) as exc:
	last_error = exc
	backoff = 2 ** attempt
	print(f" fetch attempt {attempt + 1} failed: {exc}; retrying in {backoff}s")
	time.sleep(backoff)
	raise RuntimeError(f"Failed to fetch Sheet CSV after {MAX_FETCH_ATTEMPTS} attempts: {last_error}")


	def parse_clients(csv_text: str) -> list[dict[str, Any]]:
	"""Parse the LookupCode/Name CSV into client records, deduped by code."""
	rows = list(csv.reader(io.StringIO(csv_text)))
	if not rows or rows[0][0].strip() != "LookupCode":
	raise ValueError(f"Unexpected header in Sheet CSV: {rows[0] if rows else 'EMPTY'}")

	clients: list[dict[str, Any]] = []
	seen: set[str] = set()
	for row in rows[1:]: # bounded by the CSV row count
	if len(row) < 2:
	continue
	code = row[0].strip()
	name = row[1].strip()
	if not code or not name: # skip the blank/whitespace rows
	continue
	if code in seen:
	continue
	seen.add(code)
	clients.append({"lookup_code": code, "name": name})

	if len(clients) < MIN_EXPECTED_CLIENTS:
	raise ValueError(
	f"Only parsed {len(clients)} clients (< {MIN_EXPECTED_CLIENTS}); "
	"refusing to write a truncated refresh_data.json."
	)
	return clients


	def load_existing_producers(path: str = REFRESH_FILE) -> list[dict[str, Any]]:
	"""Load the curated producer roster.

	Prefers the tracked PRODUCERS_SEED_FILE (always present in CI). Falls back to
	the gitignored refresh_data.json for local runs that predate the seed file.
	"""
	if os.path.exists(PRODUCERS_SEED_FILE):
	with open(PRODUCERS_SEED_FILE, encoding="utf-8") as fh:
	producers = json.load(fh).get("producers", [])
	if not isinstance(producers, list):
	raise ValueError("producers_seed.json has a non-list 'producers' field.")
	print(f" Loaded {len(producers)} producers from producers_seed.json (tracked).")
	return producers
	if not os.path.exists(path):
	print(" WARNING: no producers_seed.json and no refresh_data.json; emitting empty producer list.")
	return []
	with open(path, encoding="utf-8") as fh:
	data = json.load(fh)
	producers = data.get("producers", [])
	if not isinstance(producers, list):
	raise ValueError("Existing refresh_data.json has a non-list 'producers' field.")
	return producers


	def build(path: str = REFRESH_FILE) -> dict[str, int]:
	"""Fetch + parse + write refresh_data.json. Returns counts."""
	print(f"Fetching client list from Sheet (gid={CLIENTS_GID})...")
	csv_text = fetch_sheet_csv()
	clients = parse_clients(csv_text)
	producers = load_existing_producers(path)
	print(f" Parsed {len(clients)} clients; preserving {len(producers)} producers.")

	payload = {"producers": producers, "clients": clients}
	tmp_path = f"{path}.tmp"
	with open(tmp_path, "w", encoding="utf-8") as fh:
	json.dump(payload, fh, ensure_ascii=False, indent=0)
	os.replace(tmp_path, path) # atomic write
	print(f"Wrote {path}")
	return {"producers": len(producers), "clients": len(clients)}


	if __name__ == "__main__":
	try:
	counts = build()
	except Exception as exc: # surface failure to CI (non-zero exit), no silent swallow
	print(f"ERROR: {exc}", file=sys.stderr)
	sys.exit(1)
	print(f"Done. producers={counts['producers']} clients={counts['clients']}")