fugee / data /scripts /refresh_data.py
helmo's picture
Finish Refuge->Fugee rebrand (DESIGN, mockup, scripts, SKILL)
237e510 verified
Raw
History Blame Contribute Delete
3.45 kB
"""
UNHCR Data Refresh Script
==========================
Run this to update the local UNHCR data cache with the latest figures.
Schedule: run once after the hackathon, then annually (UNHCR updates mid-year).
For nowcasting: run monthly if needed.
Usage
-----
python3 refresh_data.py # refresh all annual endpoints
python3 refresh_data.py --full # re-download everything including raw
python3 refresh_data.py --nowcast # refresh only nowcasting (monthly data)
python3 refresh_data.py --check # check what data exists and how old it is
Output
------
Prints a table of each endpoint, record count, last download date, staleness.
Updates data/processed/*.json and data/processed/metadata.json.
Co-authored-by: Codex <noreply@openai.com>
"""
import argparse
import json
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path
# Filesystem layout: the downloader lives next to this script, and the
# processed-data directory is a sibling of the folder holding both scripts.
_SCRIPT_DIR = Path(__file__).parent
PROC_DIR = _SCRIPT_DIR.parent / "processed"
DOWNLOADER = _SCRIPT_DIR / "unhcr_downloader.py"

# Maximum age in days, keyed by refresh frequency, before data counts as stale.
STALE_DAYS = {
    "annual": 366,
    "monthly": 31,
}
def load_manifest(path: "Path | None" = None) -> dict:
    """Load the download manifest stored with the processed data.

    Args:
        path: Manifest file to read. When omitted, ``metadata.json`` inside
            ``PROC_DIR`` is used.

    Returns:
        The parsed manifest, or an empty dict when the file does not exist.

    Raises:
        json.JSONDecodeError: If the file exists but does not contain
            valid JSON.
    """
    manifest_path = PROC_DIR / "metadata.json" if path is None else path
    try:
        # Read as UTF-8 explicitly instead of relying on the locale default,
        # which is not UTF-8 on every platform.
        text = manifest_path.read_text(encoding="utf-8")
    except FileNotFoundError:
        # No manifest yet simply means nothing has been downloaded.
        return {}
    return json.loads(text)
def check_staleness():
    """Print a freshness table covering every endpoint the downloader defines.

    For each entry in ``unhcr_downloader.ENDPOINTS`` the table shows the
    record count and download timestamp recorded in the manifest, plus a
    status: fresh, stale (with the age in days), or missing when the
    manifest has no entry for that endpoint.

    Incomplete manifest entries are tolerated rather than aborting the
    report: a missing or ``null`` record count is shown as ``?``, a
    timestamp without a UTC offset is interpreted as UTC, and a timestamp
    that cannot be parsed is reported as unreadable.
    """
    # Local import: the endpoint registry is only needed by this report.
    from unhcr_downloader import ENDPOINTS  # noqa: E402

    manifest = load_manifest()
    # Index manifest entries by endpoint name; skip entries without one.
    downloads = {
        entry["endpoint"]: entry
        for entry in manifest.get("downloads", [])
        if "endpoint" in entry
    }

    print(f"\n{'Endpoint':<25} {'Records':>8} {'Downloaded':<22} {'Status'}")
    print("-" * 75)

    now = datetime.now(timezone.utc)
    for name, cfg in ENDPOINTS.items():
        entry = downloads.get(name)
        if entry is None:
            print(f"{name:<25} {'—':>8} {'not downloaded':<22} ❌ missing")
            continue

        # Stringify so that null / non-scalar values cannot break the
        # alignment format spec below.
        records = entry.get("records")
        records_text = "?" if records is None else str(records)
        stamp = str(entry.get("downloaded_at", ""))

        try:
            # fromisoformat() only accepts a trailing "Z" from Python 3.11 on,
            # so normalise it to an explicit offset first.
            ts = datetime.fromisoformat(stamp.replace("Z", "+00:00"))
        except ValueError:
            print(f"{name:<25} {records_text:>8} {stamp:<22} ⚠️ unreadable timestamp")
            continue
        if ts.tzinfo is None:
            # Subtracting a naive datetime from an aware one raises TypeError.
            ts = ts.replace(tzinfo=timezone.utc)

        age_days = (now - ts).days
        # Unknown refresh frequencies fall back to the annual threshold.
        threshold = STALE_DAYS.get(cfg["refresh_frequency"], 366)
        status = "✅ fresh" if age_days <= threshold else f"⚠️ stale ({age_days}d)"
        print(f"{name:<25} {records_text:>8} {stamp:<22} {status}")
    print()
def run_downloader(args: list[str]):
    """Run the downloader script as a child process and report its exit code.

    Args:
        args: Command-line arguments forwarded to the downloader script.

    Returns:
        The return code of the finished child process.
    """
    # Launch with the interpreter that is running this script.
    command = [sys.executable, str(DOWNLOADER), *args]
    print("Running: " + " ".join(command) + "\n")
    return subprocess.run(command).returncode
def main(argv=None):
    """Parse command-line flags and run the requested refresh action.

    Flags are checked in the order ``--check``, ``--nowcast``, ``--full``;
    the first one present wins. With no flag, the staleness table is
    printed, the annual endpoints are refreshed, and the table is printed
    again if the refresh succeeded.

    Args:
        argv: Argument list to parse. ``None`` makes argparse read
            ``sys.argv[1:]``.

    Returns:
        Process exit code: 0 on success, otherwise the downloader's
        non-zero return code, so schedulers can detect a failed refresh.
    """
    parser = argparse.ArgumentParser(description="Refresh UNHCR data for Fugee.")
    parser.add_argument("--full", action="store_true", help="Re-download all endpoints")
    parser.add_argument("--nowcast", action="store_true", help="Refresh nowcasting only")
    parser.add_argument("--check", action="store_true", help="Check data freshness and exit")
    args = parser.parse_args(argv)

    if args.check:
        check_staleness()
        return 0

    if args.nowcast:
        return run_downloader(["--endpoint", "nowcasting"])

    if args.full:
        return run_downloader(["--all"])

    # Default: refresh annual endpoints + check staleness after
    check_staleness()
    print("Refreshing annual endpoints (this may take several minutes)...\n")
    rc = run_downloader(["--refresh"])
    if rc == 0:
        print("\nRefresh complete. Updated manifest:\n")
        check_staleness()
    else:
        print(f"\nRefresh exited with code {rc}. Check logs above.")
    return rc


if __name__ == "__main__":
    # Propagate the downloader's status as this script's exit code.
    sys.exit(main())