""" HuggingFace Dataset Upload Module - Tests HF connection - Uploads known_bills_visualize.json (legacy function) - Uploads ALL core data JSONs (new function) to HuggingFace Datasets Hub Works with the Admin panel HuggingFace tab """ from huggingface_hub import HfApi, create_repo import streamlit as st import os import json from pathlib import Path from typing import Dict, List, Tuple, Optional FILES_TO_UPLOAD = { "data/known_bills_visualize.json": "known_bills_visualize.json", "data/bill_summaries.json": "bill_summaries.json", "data/bill_suggested_questions.json": "bill_suggested_questions.json", "data/bill_reports.json": "bill_reports.json", "data/bill_cache.json": "bill_cache.json", "data/known_bills.json": "known_bills.json", "data/known_bills_fixed.json": "known_bills_fixed.json", } def _get_hf_token_and_repo() -> Tuple[str, str]: """ Get HF token + dataset repo. Priority: 1. Streamlit secrets (for the Admin UI) 2. Environment variables (for CLI scripts like update_data.py) - HUGGINGFACE_HUB_TOKEN - HF_REPO_ID """ token = None repo_id = None try: token = st.secrets["huggingface"]["token"] repo_id = st.secrets["huggingface"]["dataset_repo"] except Exception: pass if not token: token = os.getenv("HUGGINGFACE_HUB_TOKEN") if not repo_id: repo_id = os.getenv("HF_REPO_ID") if not token or not repo_id: raise KeyError( "HuggingFace configuration missing. " "Provide either Streamlit secrets " "[huggingface.token] and [huggingface.dataset_repo] " "or environment variables HUGGINGFACE_HUB_TOKEN and HF_REPO_ID." ) return token, repo_id def test_hf_connection() -> Tuple[bool, str]: """ Test connection to HuggingFace API Returns: tuple: (success: bool, message: str) """ try: token, _ = _get_hf_token_and_repo() api = HfApi() user = api.whoami(token=token) username = user.get("name") or user.get("fullname") or user.get("id") or "User" return True, f"Connected as: {username}" except KeyError: return False, "HuggingFace token or dataset_repo not found in secrets" except Exception as e: return False, f"Connection failed: {str(e)}" def get_dataset_url(filename: str = "known_bills_visualize.json") -> Optional[str]: """ Get the public URL of a file inside the dataset. Args: filename: Name of the file in the HF dataset repo. Returns: str | None: URL to the dataset file, or None if config missing """ try: repo = st.secrets["huggingface"]["dataset_repo"] return f"https://huggingface.co/datasets/{repo}/resolve/main/{filename}" except KeyError: return None def _find_and_validate_json(possible_paths: List[Path]) -> Path: """ Given a list of possible paths, return the first that exists, and validate that it is valid JSON. """ file_path = None for path in possible_paths: if path.exists(): file_path = path break if file_path is None: raise FileNotFoundError( "File not found.\n" "Checked locations:\n" + "\n".join(f" - {p}" for p in possible_paths) ) try: with open(file_path, "r", encoding="utf-8") as f: data = json.load(f) if not isinstance(data, (dict, list)): raise ValueError("JSON file must contain a dict or list") except json.JSONDecodeError as e: raise ValueError(f"Invalid JSON file: {str(e)}") return file_path def _ensure_dataset_exists(api: HfApi, repo_id: str, token: str) -> None: """Create the dataset repo if it does not already exist.""" try: create_repo( repo_id=repo_id, repo_type="dataset", token=token, exist_ok=True, private=False, ) except Exception: pass def upload_to_huggingface() -> str: """ Legacy function: Upload ONLY known_bills_visualize.json to HuggingFace Datasets Hub. Used by existing Admin panel code. New code should prefer upload_all_to_huggingface(). Returns: str: Public URL to the uploaded file Raises: FileNotFoundError: If JSON file doesn't exist Exception: If upload fails """ try: token, repo_id = _get_hf_token_and_repo() api = HfApi() _ensure_dataset_exists(api, repo_id, token) possible_paths = [ Path("data/known_bills_visualize.json"), Path("known_bills_visualize.json"), ] file_path = _find_and_validate_json(possible_paths) file_size_mb = os.path.getsize(file_path) / (1024 * 1024) api.upload_file( path_or_fileobj=str(file_path), path_in_repo="known_bills_visualize.json", repo_id=repo_id, repo_type="dataset", token=token, commit_message=f"Update AI legislation data ({file_size_mb:.2f}MB)", ) url = get_dataset_url("known_bills_visualize.json") return url except FileNotFoundError as e: raise e except KeyError as e: raise Exception(f"Missing configuration in secrets.toml: {e}") except Exception as e: raise Exception(f"Upload failed: {str(e)}") def upload_all_to_huggingface() -> Dict[str, str]: """ NEW: Upload ALL core JSON files to HuggingFace Datasets Hub. Returns: dict: mapping from dataset filename -> public URL (for successfully uploaded files) """ token, repo_id = _get_hf_token_and_repo() api = HfApi() _ensure_dataset_exists(api, repo_id, token) uploaded_urls: Dict[str, str] = {} for local_path, dest_name in FILES_TO_UPLOAD.items(): possible_paths = [Path(local_path), Path(dest_name)] try: file_path = _find_and_validate_json(possible_paths) except FileNotFoundError: msg = f"Skipping missing file: {local_path}" print(msg) st.write(msg) continue except ValueError as e: msg = f"Skipping invalid JSON in {local_path}: {e}" print(msg) st.write(msg) continue file_size_mb = os.path.getsize(file_path) / (1024 * 1024) commit_msg = f"Update {dest_name} ({file_size_mb:.2f}MB)" print(f"Uploading {file_path} → {repo_id}/{dest_name} ...") api.upload_file( path_or_fileobj=str(file_path), path_in_repo=dest_name, repo_id=repo_id, repo_type="dataset", token=token, commit_message=commit_msg, ) url = get_dataset_url(dest_name) if url: uploaded_urls[dest_name] = url return uploaded_urls if __name__ == "__main__": print("Testing HuggingFace connection...") success, msg = test_hf_connection() print(msg) if success: print("\nAttempting upload of ALL files...") try: urls = upload_all_to_huggingface() print("\nUpload successful!") for name, url in urls.items(): print(f"- {name}: {url}") except Exception as e: print(f"\nUpload failed: {e}")