Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
| """ | |
| HuggingFace Dataset Upload Module | |
| - Tests HF connection | |
| - Uploads known_bills_visualize.json (legacy function) | |
| - Uploads ALL core data JSONs (new function) to HuggingFace Datasets Hub | |
| Works with the Admin panel HuggingFace tab | |
| """ | |
import json
import logging
import os
from pathlib import Path
from typing import Dict, List, Tuple, Optional

import streamlit as st
from huggingface_hub import HfApi, create_repo
# Maps each local JSON path (under data/) to the filename it is stored under
# in the HuggingFace dataset repo. Insertion order is the upload order.
FILES_TO_UPLOAD = {
    f"data/{json_name}": json_name
    for json_name in (
        "known_bills_visualize.json",
        "bill_summaries.json",
        "bill_suggested_questions.json",
        "bill_reports.json",
        "bill_cache.json",
        "known_bills.json",
        "known_bills_fixed.json",
    )
}
def _get_hf_token_and_repo() -> Tuple[str, str]:
    """
    Resolve the HuggingFace access token and the dataset repo id.

    Lookup order:
        1. Streamlit secrets: st.secrets["huggingface"]["token"] and
           st.secrets["huggingface"]["dataset_repo"] (Admin UI).
        2. Environment variables HUGGINGFACE_HUB_TOKEN and HF_REPO_ID
           (CLI scripts like update_data.py), used for whichever value
           is still empty after step 1.

    Returns:
        tuple: (token, repo_id)

    Raises:
        KeyError: If either value is still missing after both lookups.
    """
    hf_token = None
    dataset_repo = None

    try:
        hf_token = st.secrets["huggingface"]["token"]
        dataset_repo = st.secrets["huggingface"]["dataset_repo"]
    except Exception:
        # Secrets are optional: any failure here simply means we fall
        # through to the environment variables below.
        pass

    hf_token = hf_token or os.getenv("HUGGINGFACE_HUB_TOKEN")
    dataset_repo = dataset_repo or os.getenv("HF_REPO_ID")

    if hf_token and dataset_repo:
        return hf_token, dataset_repo

    raise KeyError(
        "HuggingFace configuration missing. "
        "Provide either Streamlit secrets "
        "[huggingface.token] and [huggingface.dataset_repo] "
        "or environment variables HUGGINGFACE_HUB_TOKEN and HF_REPO_ID."
    )
def test_hf_connection() -> Tuple[bool, str]:
    """
    Test the connection to the HuggingFace API.

    Resolves the configuration via _get_hf_token_and_repo() and asks the
    API who the token belongs to.

    Returns:
        tuple: (success: bool, message: str). Never raises; every failure is
        reported through the message.
    """
    try:
        token, _ = _get_hf_token_and_repo()
        api = HfApi()
        user = api.whoami(token=token)
        username = user.get("name") or user.get("fullname") or user.get("id") or "User"
        return True, f"Connected as: {username}"
    except KeyError:
        # The configuration may come from Streamlit secrets OR environment
        # variables, so the message names both sources.
        return False, (
            "HuggingFace token or dataset_repo not found in secrets "
            "or environment variables (HUGGINGFACE_HUB_TOKEN, HF_REPO_ID)"
        )
    except Exception as e:
        return False, f"Connection failed: {str(e)}"
def get_dataset_url(filename: str = "known_bills_visualize.json") -> Optional[str]:
    """
    Get the public URL of a file inside the dataset.

    The dataset repo id is looked up in the same order used by
    _get_hf_token_and_repo(): Streamlit secrets first, then the
    HF_REPO_ID environment variable.

    Args:
        filename: Name of the file in the HF dataset repo.

    Returns:
        str | None: URL to the dataset file, or None if no repo id is
        configured in either place.
    """
    repo = None
    try:
        repo = st.secrets["huggingface"]["dataset_repo"]
    except Exception:
        # Best-effort, as in _get_hf_token_and_repo(): the secrets lookup may
        # fail with more than KeyError (e.g. no secrets file when run from a
        # CLI script), and none of those cases should crash URL building.
        pass

    if not repo:
        repo = os.getenv("HF_REPO_ID")
    if not repo:
        return None
    return f"https://huggingface.co/datasets/{repo}/resolve/main/{filename}"
def _find_and_validate_json(possible_paths: List[Path]) -> Path:
    """
    Return the first candidate path that is an existing file, after
    checking that it contains valid JSON.

    Only the first existing file is validated; if it is invalid, later
    candidates are NOT tried.

    Args:
        possible_paths: Candidate locations, checked in order.

    Returns:
        Path: The first candidate that is an existing file.

    Raises:
        FileNotFoundError: If none of the candidates is an existing file.
        ValueError: If the file is not valid JSON, or its top-level value
            is not a dict or list.
    """
    # is_file() instead of exists(): a directory that happens to share the
    # name must be skipped, otherwise open() below would fail on it.
    file_path = next((p for p in possible_paths if p.is_file()), None)
    if file_path is None:
        raise FileNotFoundError(
            "File not found.\n"
            "Checked locations:\n" + "\n".join(f" - {p}" for p in possible_paths)
        )

    try:
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except json.JSONDecodeError as e:
        # Chain the original error so the parse position is kept in tracebacks.
        raise ValueError(f"Invalid JSON file: {str(e)}") from e

    if not isinstance(data, (dict, list)):
        raise ValueError("JSON file must contain a dict or list")
    return file_path
def _ensure_dataset_exists(api: HfApi, repo_id: str, token: str) -> None:
    """
    Create the public dataset repo if it does not already exist (best-effort).

    Args:
        api: Accepted for call-site compatibility; not used by this function.
        repo_id: Dataset repo id to create.
        token: HuggingFace access token used for the request.

    Returns:
        None. Never raises: a failure here is logged as a warning and left
        for the subsequent upload call to surface.
    """
    try:
        create_repo(
            repo_id=repo_id,
            repo_type="dataset",
            token=token,
            exist_ok=True,
            private=False,
        )
    except Exception as e:
        # Still best-effort, but no longer silent: record why creation
        # failed so a later upload error can be diagnosed.
        logging.getLogger(__name__).warning(
            "Could not create or verify dataset repo %s: %s", repo_id, e
        )
def upload_to_huggingface() -> str:
    """
    Legacy function: Upload ONLY known_bills_visualize.json to HuggingFace Datasets Hub.

    Used by existing Admin panel code. New code should prefer upload_all_to_huggingface().

    Returns:
        str: Public URL to the uploaded file.

    Raises:
        FileNotFoundError: If the JSON file doesn't exist in any checked location.
        Exception: If configuration is missing, the JSON is invalid, or the upload fails.
    """
    dest_name = "known_bills_visualize.json"
    try:
        token, repo_id = _get_hf_token_and_repo()
        api = HfApi()
        _ensure_dataset_exists(api, repo_id, token)

        possible_paths = [
            Path("data/known_bills_visualize.json"),
            Path("known_bills_visualize.json"),
        ]
        file_path = _find_and_validate_json(possible_paths)
        file_size_mb = os.path.getsize(file_path) / (1024 * 1024)

        api.upload_file(
            path_or_fileobj=str(file_path),
            path_in_repo=dest_name,
            repo_id=repo_id,
            repo_type="dataset",
            token=token,
            commit_message=f"Update AI legislation data ({file_size_mb:.2f}MB)",
        )

        # Build the URL from the repo_id that was just uploaded to, so the
        # link is correct (and never None) whether the configuration came
        # from Streamlit secrets or from environment variables.
        return f"https://huggingface.co/datasets/{repo_id}/resolve/main/{dest_name}"
    except FileNotFoundError:
        # Propagate unchanged so callers can distinguish a missing file.
        raise
    except KeyError as e:
        raise Exception(f"Missing configuration in secrets.toml: {e}") from e
    except Exception as e:
        raise Exception(f"Upload failed: {str(e)}") from e
def upload_all_to_huggingface() -> Dict[str, str]:
    """
    NEW: Upload ALL core JSON files to HuggingFace Datasets Hub.

    Iterates FILES_TO_UPLOAD. For each entry the local path is tried first,
    then the bare destination filename. Missing or invalid files are reported
    (stdout and Streamlit) and skipped.

    Returns:
        dict: mapping from dataset filename -> public URL, one entry for
        every file that was uploaded.

    Raises:
        KeyError: If the HuggingFace configuration is missing.
        Exception: Any error raised by the upload call itself propagates and
            aborts the remaining uploads.
    """
    token, repo_id = _get_hf_token_and_repo()
    api = HfApi()
    _ensure_dataset_exists(api, repo_id, token)

    uploaded_urls: Dict[str, str] = {}
    for local_path, dest_name in FILES_TO_UPLOAD.items():
        possible_paths = [Path(local_path), Path(dest_name)]
        try:
            file_path = _find_and_validate_json(possible_paths)
        except FileNotFoundError:
            skip_msg = f"Skipping missing file: {local_path}"
        except ValueError as e:
            skip_msg = f"Skipping invalid JSON in {local_path}: {e}"
        else:
            skip_msg = None

        if skip_msg is not None:
            print(skip_msg)
            st.write(skip_msg)
            continue

        file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
        commit_msg = f"Update {dest_name} ({file_size_mb:.2f}MB)"
        print(f"Uploading {file_path} → {repo_id}/{dest_name} ...")
        api.upload_file(
            path_or_fileobj=str(file_path),
            path_in_repo=dest_name,
            repo_id=repo_id,
            repo_type="dataset",
            token=token,
            commit_message=commit_msg,
        )

        # Build the URL from the repo_id that was just uploaded to, so every
        # uploaded file is reported whether the configuration came from
        # Streamlit secrets or from environment variables.
        uploaded_urls[dest_name] = (
            f"https://huggingface.co/datasets/{repo_id}/resolve/main/{dest_name}"
        )
    return uploaded_urls
def _run_cli() -> None:
    """Script entry point: check the HF connection, then upload every core file."""
    print("Testing HuggingFace connection...")
    connected, status = test_hf_connection()
    print(status)
    if not connected:
        return

    print("\nAttempting upload of ALL files...")
    try:
        results = upload_all_to_huggingface()
        print("\nUpload successful!")
        for dataset_name, public_url in results.items():
            print(f"- {dataset_name}: {public_url}")
    except Exception as err:
        # Report the failure on stdout without a traceback.
        print(f"\nUpload failed: {err}")


if __name__ == "__main__":
    _run_cli()