"""Build refresh_data.json from the canonical Google Sheet client source.

Client source of truth:
  Google Sheet 1RYmxN855LhpKU9B3ZEwFcGAkg3auR6adhJN9pQZOgDM, tab gid=1047572149
  Columns: LookupCode, Name, __PowerAppsId__  (auto-synced from PowerApps/Dataverse)

Producers are NOT sourced from the Sheet (its team-mapping tab is empty). The 75
producer rows are curated and managed by start.py migrations (locked pairs, Bella
Santana, House Standard), so we PRESERVE them from the existing refresh_data.json.

Output: refresh_data.json = {"producers": [...], "clients": [...]} — the exact
shape start.py and /admin/data-refresh already consume.

Run locally:
    python scripts/build_refresh_data.py
Run in CI (GitHub Actions): same command; no secrets needed to read the Sheet
(public CSV export). HF_TOKEN is only needed by the separate upload step.

Dependency-free: stdlib only (urllib, csv, json), so the HF Space requirements
are untouched.
"""
from __future__ import annotations

import csv
import io
import json
import os
import sys
import time
import urllib.error
import urllib.request
from typing import Any

# ── Config ───────────────────────────────────────────────────────────
SHEET_ID = "1RYmxN855LhpKU9B3ZEwFcGAkg3auR6adhJN9pQZOgDM"
CLIENTS_GID = "1047572149"
CSV_URL = (
    f"https://docs.google.com/spreadsheets/d/{SHEET_ID}"
    f"/export?format=csv&gid={CLIENTS_GID}"
)

# Resolve paths relative to the repo root (this file lives in scripts/)
REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
REFRESH_FILE = os.path.join(REPO_ROOT, "refresh_data.json")
# Tracked, committed producer roster (names/codes only, no client PII). This is
# the durable source for producers in CI, since refresh_data.json is gitignored
# (it carries client records) and is therefore ABSENT on a fresh CI checkout —
# which previously caused the daily job to upload an EMPTY producer list.
PRODUCERS_SEED_FILE = os.path.join(REPO_ROOT, "producers_seed.json")

MAX_FETCH_ATTEMPTS = 5          # explicit upper bound on retry loop
FETCH_TIMEOUT_SECONDS = 60
MIN_EXPECTED_CLIENTS = 15000    # guard: Sheet has ~18.6K; refuse to truncate


def fetch_sheet_csv(url: str = CSV_URL) -> str:
    """Download the Sheet tab as CSV text with exponential backoff."""
    last_error: Exception | None = None
    for attempt in range(MAX_FETCH_ATTEMPTS):
        try:
            req = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
            with urllib.request.urlopen(req, timeout=FETCH_TIMEOUT_SECONDS) as resp:
                return resp.read().decode("utf-8", "replace")
        except (urllib.error.URLError, TimeoutError) as exc:
            last_error = exc
            backoff = 2 ** attempt
            print(f"  fetch attempt {attempt + 1} failed: {exc}; retrying in {backoff}s")
            time.sleep(backoff)
    raise RuntimeError(f"Failed to fetch Sheet CSV after {MAX_FETCH_ATTEMPTS} attempts: {last_error}")


def parse_clients(csv_text: str) -> list[dict[str, Any]]:
    """Parse the LookupCode/Name CSV into client records, deduped by code."""
    rows = list(csv.reader(io.StringIO(csv_text)))
    if not rows or rows[0][0].strip() != "LookupCode":
        raise ValueError(f"Unexpected header in Sheet CSV: {rows[0] if rows else 'EMPTY'}")

    clients: list[dict[str, Any]] = []
    seen: set[str] = set()
    for row in rows[1:]:                       # bounded by the CSV row count
        if len(row) < 2:
            continue
        code = row[0].strip()
        name = row[1].strip()
        if not code or not name:               # skip the blank/whitespace rows
            continue
        if code in seen:
            continue
        seen.add(code)
        clients.append({"lookup_code": code, "name": name})

    if len(clients) < MIN_EXPECTED_CLIENTS:
        raise ValueError(
            f"Only parsed {len(clients)} clients (< {MIN_EXPECTED_CLIENTS}); "
            "refusing to write a truncated refresh_data.json."
        )
    return clients


def load_existing_producers(path: str = REFRESH_FILE) -> list[dict[str, Any]]:
    """Load the curated producer roster.

    Prefers the tracked PRODUCERS_SEED_FILE (always present in CI). Falls back to
    the gitignored refresh_data.json for local runs that predate the seed file.
    """
    if os.path.exists(PRODUCERS_SEED_FILE):
        with open(PRODUCERS_SEED_FILE, encoding="utf-8") as fh:
            producers = json.load(fh).get("producers", [])
        if not isinstance(producers, list):
            raise ValueError("producers_seed.json has a non-list 'producers' field.")
        print(f"  Loaded {len(producers)} producers from producers_seed.json (tracked).")
        return producers
    if not os.path.exists(path):
        print("  WARNING: no producers_seed.json and no refresh_data.json; emitting empty producer list.")
        return []
    with open(path, encoding="utf-8") as fh:
        data = json.load(fh)
    producers = data.get("producers", [])
    if not isinstance(producers, list):
        raise ValueError("Existing refresh_data.json has a non-list 'producers' field.")
    return producers


def build(path: str = REFRESH_FILE) -> dict[str, int]:
    """Fetch + parse + write refresh_data.json. Returns counts."""
    print(f"Fetching client list from Sheet (gid={CLIENTS_GID})...")
    csv_text = fetch_sheet_csv()
    clients = parse_clients(csv_text)
    producers = load_existing_producers(path)
    print(f"  Parsed {len(clients)} clients; preserving {len(producers)} producers.")

    payload = {"producers": producers, "clients": clients}
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as fh:
        json.dump(payload, fh, ensure_ascii=False, indent=0)
    os.replace(tmp_path, path)                 # atomic write
    print(f"Wrote {path}")
    return {"producers": len(producers), "clients": len(clients)}


if __name__ == "__main__":
    try:
        counts = build()
    except Exception as exc:        # surface failure to CI (non-zero exit), no silent swallow
        print(f"ERROR: {exc}", file=sys.stderr)
        sys.exit(1)
    print(f"Done. producers={counts['producers']} clients={counts['clients']}")