# legislation-tracker / data / huggingface_upload.py
# ramanna's picture
# Upload 30 files
# 0e39328 verified
"""
HuggingFace Dataset Upload Module
- Tests HF connection
- Uploads known_bills_visualize.json (legacy function)
- Uploads ALL core data JSONs (new function) to HuggingFace Datasets Hub
Works with the Admin panel HuggingFace tab
"""
from huggingface_hub import HfApi, create_repo
import streamlit as st
import os
import json
from pathlib import Path
from typing import Dict, List, Tuple, Optional
# Core JSON files published to the HuggingFace dataset repo.
# Keys are the local relative paths, values are the file names used inside the
# repo. Every file lives under data/ locally and keeps its base name remotely,
# so the mapping is derived from the list of base names (order is preserved).
FILES_TO_UPLOAD = {
    f"data/{json_name}": json_name
    for json_name in (
        "known_bills_visualize.json",
        "bill_summaries.json",
        "bill_suggested_questions.json",
        "bill_reports.json",
        "bill_cache.json",
        "known_bills.json",
        "known_bills_fixed.json",
    )
}
def _get_hf_token_and_repo() -> Tuple[str, str]:
    """
    Resolve the HuggingFace access token and the dataset repo id.

    Lookup order, applied independently to each of the two values:
      1. Streamlit secrets: [huggingface] token / dataset_repo (Admin UI)
      2. Environment variables: HUGGINGFACE_HUB_TOKEN / HF_REPO_ID
         (CLI scripts like update_data.py)

    Returns:
        tuple: (token, repo_id)

    Raises:
        KeyError: If either value is still missing after both lookups.
    """
    secret_token = None
    secret_repo = None
    try:
        hf_section = st.secrets["huggingface"]
        secret_token = hf_section["token"]
        secret_repo = hf_section["dataset_repo"]
    except Exception:
        # Best effort: any failure reading secrets just means we fall back to
        # the environment. A token read before the failure is kept.
        pass

    token = secret_token or os.getenv("HUGGINGFACE_HUB_TOKEN")
    repo_id = secret_repo or os.getenv("HF_REPO_ID")

    if token and repo_id:
        return token, repo_id

    raise KeyError(
        "HuggingFace configuration missing. "
        "Provide either Streamlit secrets "
        "[huggingface.token] and [huggingface.dataset_repo] "
        "or environment variables HUGGINGFACE_HUB_TOKEN and HF_REPO_ID."
    )
def test_hf_connection() -> Tuple[bool, str]:
    """
    Check that the configured token is accepted by the HuggingFace API.

    Returns:
        tuple: (success, message). On success the message names the account
        the token belongs to; on failure it describes what went wrong.
        This function does not raise: every error is reported via the tuple.
    """
    try:
        hf_token, _unused_repo = _get_hf_token_and_repo()
        account = HfApi().whoami(token=hf_token)

        # Use the first non-empty identifier, in order of preference.
        display_name = "User"
        for field in ("name", "fullname", "id"):
            candidate = account.get(field)
            if candidate:
                display_name = candidate
                break

        return True, f"Connected as: {display_name}"
    except KeyError:
        return False, "HuggingFace token or dataset_repo not found in secrets"
    except Exception as err:
        return False, f"Connection failed: {str(err)}"
def get_dataset_url(filename: str = "known_bills_visualize.json") -> Optional[str]:
    """
    Get the public URL of a file inside the dataset.

    The dataset repo id is resolved from Streamlit secrets
    ([huggingface] dataset_repo) first and from the HF_REPO_ID environment
    variable second, so the URL is also available to CLI runs that have no
    secrets file.

    Args:
        filename: Name of the file in the HF dataset repo.

    Returns:
        str | None: URL to the dataset file, or None if no repo is configured.
    """
    repo = None
    try:
        repo = st.secrets["huggingface"]["dataset_repo"]
    except Exception:
        # Reading secrets can fail with errors other than KeyError (e.g. when
        # no secrets file exists); treat every failure as "not configured
        # here" and fall through to the environment variable.
        repo = None

    repo = repo or os.getenv("HF_REPO_ID")
    if not repo:
        return None
    return f"https://huggingface.co/datasets/{repo}/resolve/main/{filename}"
def _find_and_validate_json(possible_paths: List[Path]) -> Path:
    """
    Return the first candidate path that is an existing regular file, after
    checking that it parses as JSON with a dict or list at the top level.

    Args:
        possible_paths: Candidate locations, checked in order.

    Returns:
        Path: The first candidate that is a file.

    Raises:
        FileNotFoundError: If none of the candidates is an existing file.
        ValueError: If the file is not valid JSON, or its top-level value is
            not a dict or list. (A file that is not valid UTF-8 raises
            UnicodeDecodeError, which is itself a ValueError.)
    """
    # is_file() rather than exists(): a directory that happens to share the
    # name must not be selected, since open() on it fails with an OSError
    # that callers do not handle.
    file_path = next((p for p in possible_paths if p.is_file()), None)
    if file_path is None:
        raise FileNotFoundError(
            "File not found.\n"
            "Checked locations:\n" + "\n".join(f" - {p}" for p in possible_paths)
        )

    try:
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        # Chain the original error so the parse position stays in the traceback.
        raise ValueError(f"Invalid JSON file: {e}") from e

    if not isinstance(data, (dict, list)):
        raise ValueError("JSON file must contain a dict or list")
    return file_path
def _ensure_dataset_exists(api: HfApi, repo_id: str, token: str) -> None:
    """
    Create the public dataset repo if it does not already exist (best effort).

    Args:
        api: Unused here; kept so existing callers keep working.
        repo_id: Dataset repo id to create.
        token: HuggingFace access token used for the request.

    This function never raises. If the create call fails, the problem is
    logged as a warning and the caller's subsequent upload is left to report
    the real error.
    """
    import logging  # local import: logging is not imported at file level

    try:
        create_repo(
            repo_id=repo_id,
            repo_type="dataset",
            token=token,
            exist_ok=True,  # an already-existing repo is not an error
            private=False,
        )
    except Exception as exc:
        # Stay best-effort, but leave a trace instead of swallowing silently.
        logging.getLogger(__name__).warning(
            "Could not create or verify dataset repo %s: %s", repo_id, exc
        )
def upload_to_huggingface() -> str:
    """
    Legacy function: Upload ONLY known_bills_visualize.json to HuggingFace Datasets Hub.
    Used by existing Admin panel code. New code should prefer upload_all_to_huggingface().

    The file is looked up at data/known_bills_visualize.json first, then at
    known_bills_visualize.json in the current directory.

    Returns:
        str: Public URL to the uploaded file

    Raises:
        FileNotFoundError: If JSON file doesn't exist
        Exception: If configuration is missing or the upload fails; the
            original error is attached as the cause.
    """
    dest_name = "known_bills_visualize.json"
    try:
        token, repo_id = _get_hf_token_and_repo()
        api = HfApi()
        _ensure_dataset_exists(api, repo_id, token)

        file_path = _find_and_validate_json(
            [Path("data") / dest_name, Path(dest_name)]
        )
        file_size_mb = os.path.getsize(file_path) / (1024 * 1024)

        api.upload_file(
            path_or_fileobj=str(file_path),
            path_in_repo=dest_name,
            repo_id=repo_id,
            repo_type="dataset",
            token=token,
            commit_message=f"Update AI legislation data ({file_size_mb:.2f}MB)",
        )

        # Build the URL from the repo id that was just uploaded to, so the
        # result is always a str even when the configuration came from
        # environment variables rather than Streamlit secrets.
        return f"https://huggingface.co/datasets/{repo_id}/resolve/main/{dest_name}"
    except FileNotFoundError:
        # Let callers distinguish "nothing to upload" from other failures.
        raise
    except KeyError as e:
        raise Exception(f"Missing configuration in secrets.toml: {e}") from e
    except Exception as e:
        raise Exception(f"Upload failed: {str(e)}") from e
def upload_all_to_huggingface() -> Dict[str, str]:
    """
    NEW: Upload ALL core JSON files to HuggingFace Datasets Hub.

    Each entry of FILES_TO_UPLOAD is looked up at its listed local path first
    and at its bare file name in the current directory second. Files that are
    missing or do not contain valid JSON are reported and skipped.

    Returns:
        dict: mapping from dataset filename -> public URL (for successfully uploaded files)

    Raises:
        KeyError: If the HuggingFace token or repo id is not configured.
        Exception: Whatever the upload call raises; the first failing upload
            aborts the remaining files.
    """
    token, repo_id = _get_hf_token_and_repo()
    api = HfApi()
    _ensure_dataset_exists(api, repo_id, token)

    uploaded_urls: Dict[str, str] = {}
    for local_path, dest_name in FILES_TO_UPLOAD.items():
        try:
            file_path = _find_and_validate_json([Path(local_path), Path(dest_name)])
        except FileNotFoundError:
            msg = f"Skipping missing file: {local_path}"
            print(msg)
            st.write(msg)
            continue
        except ValueError as e:
            msg = f"Skipping invalid JSON in {local_path}: {e}"
            print(msg)
            st.write(msg)
            continue

        file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
        commit_msg = f"Update {dest_name} ({file_size_mb:.2f}MB)"
        # Separator added: source path and destination were printed run together.
        print(f"Uploading {file_path} -> {repo_id}/{dest_name} ...")

        api.upload_file(
            path_or_fileobj=str(file_path),
            path_in_repo=dest_name,
            repo_id=repo_id,
            repo_type="dataset",
            token=token,
            commit_message=commit_msg,
        )

        # Derive the URL from the repo id actually uploaded to, so uploaded
        # files are reported even when the configuration came from
        # environment variables rather than Streamlit secrets.
        uploaded_urls[dest_name] = (
            f"https://huggingface.co/datasets/{repo_id}/resolve/main/{dest_name}"
        )

    return uploaded_urls
if __name__ == "__main__":
    # Manual smoke test: check the credentials first, then push every core file.
    print("Testing HuggingFace connection...")
    connected, status_line = test_hf_connection()
    print(status_line)

    if connected:
        print("\nAttempting upload of ALL files...")
        try:
            published = upload_all_to_huggingface()
            print("\nUpload successful!")
            for remote_name, public_url in published.items():
                print(f"- {remote_name}: {public_url}")
        except Exception as e:
            # Top-level boundary of the script: report the failure and exit normally.
            print(f"\nUpload failed: {e}")