File size: 1,392 Bytes
d91eea0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9444c45
d91eea0
 
 
9444c45
d91eea0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
"""
sync_data.py
------------
Downloads the latest data files from the HuggingFace dataset repo
(VAILL/Legislation-Tracker-Data) into the local data/ directory.
Runs at container startup before Streamlit launches.
"""

import os
from pathlib import Path
from huggingface_hub import hf_hub_download

# HuggingFace repo the files are pulled from; sync_data() requests it with
# repo_type="dataset".
REPO_ID = "VAILL/Legislation-Tracker-Data"
# Local destination directory. This is a relative path, so it resolves
# against the process's current working directory.
DATA_DIR = Path("data")

# Repo-relative filenames that sync_data() downloads, one at a time, in this
# order.
# NOTE(review): users.json presumably holds account data -- confirm the
# dataset repo is private before relying on this sync.
FILES_TO_SYNC = [
    "known_bills_visualize.json",
    "known_bills.json",
    "bill_summaries.json",
    "bill_suggested_questions.json",
    "bill_reports.json",
    "bill_cache.json",
    "users.json",
]


def sync_data():
    """Download every file in FILES_TO_SYNC from REPO_ID into DATA_DIR.

    The sync is best-effort: a file that cannot be downloaded is logged with
    a WARN line and skipped, so one missing file never aborts the rest of
    the sync.

    The access token is read from the HUGGINGFACE_HUB_TOKEN environment
    variable; ``None`` is passed to ``hf_hub_download`` when it is unset.

    Returns:
        list[str]: The filenames that could not be downloaded, in the order
        they were attempted. Empty when every file was synced.
    """
    token = os.getenv("HUGGINGFACE_HUB_TOKEN")

    DATA_DIR.mkdir(parents=True, exist_ok=True)

    failed = []
    for filename in FILES_TO_SYNC:
        print(f"[sync_data] Downloading {filename} from {REPO_ID}...")
        # Keep the try body to the download alone so that only genuine
        # download errors are reported as download failures.
        try:
            local_path = hf_hub_download(
                repo_id=REPO_ID,
                filename=filename,
                repo_type="dataset",
                token=token,
                local_dir=str(DATA_DIR),
            )
        except Exception as e:
            # Deliberately broad: startup must continue whatever the hub
            # client raises (network, auth, missing file, ...).
            print(f"[sync_data] WARN: Could not download {filename}: {e}")
            failed.append(filename)
            continue

        # Report the size of the file the library actually wrote, using the
        # path it returned rather than re-deriving the location by hand.
        try:
            size_mb = Path(local_path).stat().st_size / 1024 / 1024
        except OSError:
            # The download itself succeeded; only the size is unavailable.
            print(f"[sync_data] OK: {filename}")
        else:
            print(f"[sync_data] OK: {filename} ({size_mb:.1f} MB)")

    if failed:
        print(
            f"[sync_data] WARN: {len(failed)} of {len(FILES_TO_SYNC)} file(s) "
            f"failed to sync: {', '.join(failed)}"
        )
    print("[sync_data] Data sync complete.")
    return failed


# Script entry point: run the sync only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    sync_data()