legislation-tracker / sync_data.py
ramanna's picture
Add known_bills.json and bill_cache.json to data sync, fix deprecation warning
9444c45
"""
sync_data.py
------------
Downloads the latest data files from the HuggingFace dataset repo
(VAILL/Legislation-Tracker-Data) into the local data/ directory.
Runs at container startup before Streamlit launches.
"""
import os
from pathlib import Path
from huggingface_hub import hf_hub_download
# Dataset repository on the HuggingFace Hub that holds the data files.
REPO_ID = "VAILL/Legislation-Tracker-Data"

# Local destination directory; relative, so it resolves against the
# process's current working directory.
DATA_DIR = Path("data")

# Names of the files to fetch. Each name is used both as the path inside
# the dataset repo and as the file name under DATA_DIR.
FILES_TO_SYNC = [
    "known_bills_visualize.json",
    "known_bills.json",
    "bill_summaries.json",
    "bill_suggested_questions.json",
    "bill_reports.json",
    "bill_cache.json",
    "users.json",
]
def sync_data():
    """Download every file in FILES_TO_SYNC from REPO_ID into DATA_DIR.

    The access token is read from the ``HUGGINGFACE_HUB_TOKEN`` environment
    variable and passed to ``hf_hub_download`` as-is (``None`` when the
    variable is unset). DATA_DIR is created if it does not exist yet.

    The sync is best-effort: any exception raised while fetching or
    inspecting a single file is caught, reported as a ``WARN`` line on
    stdout, and the loop moves on to the next file. The function therefore
    does not raise for per-file failures and always prints the final
    "complete" line.

    Returns:
        None. Progress is reported via ``print`` only.
    """
    hub_token = os.getenv("HUGGINGFACE_HUB_TOKEN")
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    target_dir = str(DATA_DIR)

    for name in FILES_TO_SYNC:
        try:
            print(f"[sync_data] Downloading {name} from {REPO_ID}...")
            hf_hub_download(
                repo_id=REPO_ID,
                filename=name,
                repo_type="dataset",
                token=hub_token,
                local_dir=target_dir,
            )
            # Stat the expected destination to report the size; if the file
            # is not there, the resulting error is reported as a WARN below.
            n_bytes = (DATA_DIR / name).stat().st_size
            print(f"[sync_data] OK: {name} ({n_bytes / 1024 / 1024:.1f} MB)")
        except Exception as err:
            # Deliberately broad: one failed file must not abort the sync.
            print(f"[sync_data] WARN: Could not download {name}: {err}")

    print("[sync_data] Data sync complete.")
# Script entry point: perform the sync only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    sync_data()