"""
sync_data.py
------------
Downloads the latest data files from the HuggingFace dataset repo
(VAILL/Legislation-Tracker-Data) into the local data/ directory.
Runs at container startup before Streamlit launches.
"""

import os
from pathlib import Path
from typing import List

from huggingface_hub import hf_hub_download

# Dataset repository on the HuggingFace Hub that holds the data files.
REPO_ID = "VAILL/Legislation-Tracker-Data"

# Local destination directory (relative to the current working directory).
DATA_DIR = Path("data")

# Files fetched from REPO_ID on every sync, in download order.
FILES_TO_SYNC = [
    "known_bills_visualize.json",
    "known_bills.json",
    "bill_summaries.json",
    "bill_suggested_questions.json",
    "bill_reports.json",
    "bill_cache.json",
    "users.json",
]


def sync_data() -> List[str]:
    """Download every file in FILES_TO_SYNC from REPO_ID into DATA_DIR.

    The sync is best-effort: a failure on one file is reported on stdout
    and the remaining files are still attempted, so this function does not
    raise on download errors.

    The access token is read from the ``HUGGINGFACE_HUB_TOKEN`` environment
    variable. An unset or empty variable results in ``token=None`` being
    passed to ``hf_hub_download``.

    Returns:
        The filenames that could not be synced, in FILES_TO_SYNC order.
        The list is empty when every file was downloaded.
    """
    # `or None` turns an exported-but-empty variable into "no token";
    # otherwise the empty string would be handed over as an explicit token.
    token = os.getenv("HUGGINGFACE_HUB_TOKEN") or None
    DATA_DIR.mkdir(parents=True, exist_ok=True)

    failed: List[str] = []
    for filename in FILES_TO_SYNC:
        try:
            print(f"[sync_data] Downloading {filename} from {REPO_ID}...")
            local_path = hf_hub_download(
                repo_id=REPO_ID,
                filename=filename,
                repo_type="dataset",
                token=token,
                local_dir=str(DATA_DIR),
            )
            # Measure the file at the path the library reports rather than
            # re-deriving it from DATA_DIR and the filename.
            size_mb = Path(local_path).stat().st_size / 1024 / 1024
            print(f"[sync_data] OK: {filename} ({size_mb:.1f} MB)")
        except Exception as e:
            # Deliberately broad: any failure for one file must not stop
            # the remaining downloads or abort the startup sequence.
            print(f"[sync_data] WARN: Could not download {filename}: {e}")
            failed.append(filename)

    if failed:
        print(
            f"[sync_data] WARN: {len(failed)} of {len(FILES_TO_SYNC)} "
            f"file(s) not synced: {', '.join(failed)}"
        )
    print("[sync_data] Data sync complete.")
    return failed


if __name__ == "__main__":
    sync_data()