Spaces:

VAILL
/

legislation-tracker

Running on CPU Upgrade

App Files Files Community

legislation-tracker / data /huggingface_upload.py

ramanna

Upload 30 files

0e39328 verified about 1 month ago

raw

history blame contribute delete

7.44 kB

	"""
	HuggingFace Dataset Upload Module
	- Tests HF connection
	- Uploads known_bills_visualize.json (legacy function)
	- Uploads ALL core data JSONs (new function) to HuggingFace Datasets Hub
	Works with the Admin panel HuggingFace tab
	"""

	from huggingface_hub import HfApi, create_repo
	import streamlit as st
	import os
	import json
	from pathlib import Path
	from typing import Dict, List, Tuple, Optional


	# Core data files published to the dataset repo, as
	# {local path: filename inside the dataset repo}. Each file lives under
	# data/ locally and is stored in the repo under the same basename.
	FILES_TO_UPLOAD: Dict[str, str] = {
	    f"data/{json_name}": json_name
	    for json_name in (
	        "known_bills_visualize.json",
	        "bill_summaries.json",
	        "bill_suggested_questions.json",
	        "bill_reports.json",
	        "bill_cache.json",
	        "known_bills.json",
	        "known_bills_fixed.json",
	    )
	}



	def _get_hf_token_and_repo() -> Tuple[str, str]:
	    """
	    Resolve the HuggingFace token and the dataset repo id.

	    Each value is looked up on its own, in this order:
	        1. Streamlit secrets ``[huggingface] token`` / ``dataset_repo``
	           (used by the Admin UI)
	        2. Environment variables ``HUGGINGFACE_HUB_TOKEN`` / ``HF_REPO_ID``
	           (used by CLI scripts like update_data.py)

	    Returns:
	        tuple: (token, repo_id)

	    Raises:
	        KeyError: If either value is missing from both sources.
	    """

	    def _secret(key: str) -> Optional[str]:
	        # Deliberately best-effort: any failure to read Streamlit secrets
	        # (for example when running as a CLI script with no secrets
	        # configured) just means "not set here, try the environment".
	        try:
	            return st.secrets["huggingface"][key]
	        except Exception:
	            return None

	    # Read the two secrets separately so that a missing token secret does
	    # not prevent a configured dataset_repo secret from being used (and
	    # vice versa).
	    token = _secret("token") or os.getenv("HUGGINGFACE_HUB_TOKEN")
	    repo_id = _secret("dataset_repo") or os.getenv("HF_REPO_ID")

	    if not token or not repo_id:
	        raise KeyError(
	            "HuggingFace configuration missing. "
	            "Provide either Streamlit secrets "
	            "[huggingface.token] and [huggingface.dataset_repo] "
	            "or environment variables HUGGINGFACE_HUB_TOKEN and HF_REPO_ID."
	        )

	    return token, repo_id



	def test_hf_connection() -> Tuple[bool, str]:
	    """
	    Test the connection to the HuggingFace API.

	    Resolves the configured token and asks the Hub who it belongs to.
	    Never raises: every failure is reported through the return value.

	    Returns:
	        tuple: (success: bool, message: str)
	    """
	    try:
	        token, _ = _get_hf_token_and_repo()
	        api = HfApi()
	        user = api.whoami(token=token)
	        # Use the first identifying field that is present and non-empty.
	        username = user.get("name") or user.get("fullname") or user.get("id") or "User"
	        return True, f"Connected as: {username}"
	    except KeyError:
	        # Configuration may come from Streamlit secrets or from environment
	        # variables, so mention both places.
	        return False, (
	            "HuggingFace token or dataset_repo not found in secrets "
	            "or environment variables"
	        )
	    except Exception as e:
	        return False, f"Connection failed: {e}"


	# Despite its name this is an application function, not a unit test; keep
	# pytest from collecting it when it is imported into a test module.
	test_hf_connection.__test__ = False


	def get_dataset_url(filename: str = "known_bills_visualize.json") -> Optional[str]:
	    """
	    Get the public URL of a file inside the dataset.

	    The dataset repo id is taken from Streamlit secrets
	    (``[huggingface] dataset_repo``) and, if that is unavailable, from the
	    ``HF_REPO_ID`` environment variable.

	    Args:
	        filename: Name of the file in the HF dataset repo.

	    Returns:
	        str | None: URL to the dataset file, or None if no repo is configured
	    """
	    repo = None
	    try:
	        repo = st.secrets["huggingface"]["dataset_repo"]
	    except Exception:
	        # Best-effort: secrets may be missing entirely (e.g. CLI usage);
	        # fall through to the environment variable.
	        pass

	    repo = repo or os.getenv("HF_REPO_ID")
	    if not repo:
	        return None

	    return f"https://huggingface.co/datasets/{repo}/resolve/main/{filename}"


	def _find_and_validate_json(possible_paths: List[Path]) -> Path:
	    """
	    Return the first candidate path that is an existing file, after
	    checking that it parses as JSON with a dict or list at the top level.

	    Args:
	        possible_paths: Candidate locations, checked in order.

	    Returns:
	        Path: The first candidate that is an existing regular file.

	    Raises:
	        FileNotFoundError: If none of the candidates is an existing file.
	        ValueError: If the file is not valid JSON (or not decodable as
	            UTF-8), or its top-level value is not a dict or list.
	    """
	    # is_file() rather than exists(): a directory with a matching name
	    # must not be picked and then fail on open().
	    file_path = next((p for p in possible_paths if p.is_file()), None)

	    if file_path is None:
	        raise FileNotFoundError(
	            "File not found.\n"
	            "Checked locations:\n" + "\n".join(f" - {p}" for p in possible_paths)
	        )

	    try:
	        with open(file_path, "r", encoding="utf-8") as f:
	            data = json.load(f)
	    except (json.JSONDecodeError, UnicodeDecodeError) as e:
	        raise ValueError(f"Invalid JSON file: {e}") from e

	    if not isinstance(data, (dict, list)):
	        raise ValueError("JSON file must contain a dict or list")

	    return file_path


	def _ensure_dataset_exists(api: HfApi, repo_id: str, token: str) -> None:
	    """
	    Create the public dataset repo if it does not already exist.

	    This is best-effort: a failure is logged as a warning and not raised,
	    so the upload that follows is still attempted.

	    Args:
	        api: HfApi client (currently unused; the module-level
	            ``create_repo`` helper is called instead).
	        repo_id: Dataset repo id to create.
	        token: HuggingFace access token.
	    """
	    import logging  # local import so the block does not depend on file-level imports

	    try:
	        create_repo(
	            repo_id=repo_id,
	            repo_type="dataset",
	            token=token,
	            exist_ok=True,  # an already-existing repo is not an error
	            private=False,
	        )
	    except Exception as e:
	        # Do not abort here, but leave a trace instead of hiding the failure.
	        logging.getLogger(__name__).warning(
	            "Could not create or verify dataset repo %s: %s", repo_id, e
	        )


	def upload_to_huggingface() -> str:
	    """
	    Legacy function: Upload ONLY known_bills_visualize.json to HuggingFace Datasets Hub.
	    Used by existing Admin panel code. New code should prefer upload_all_to_huggingface().

	    The file is looked up at data/known_bills_visualize.json first, then at
	    known_bills_visualize.json in the current working directory.

	    Returns:
	        str: Public URL to the uploaded file

	    Raises:
	        FileNotFoundError: If JSON file doesn't exist
	        Exception: If configuration is missing or the upload fails
	    """
	    dest_name = "known_bills_visualize.json"

	    try:
	        token, repo_id = _get_hf_token_and_repo()
	        api = HfApi()

	        _ensure_dataset_exists(api, repo_id, token)

	        file_path = _find_and_validate_json(
	            [Path("data") / dest_name, Path(dest_name)]
	        )
	        file_size_mb = file_path.stat().st_size / (1024 * 1024)

	        api.upload_file(
	            path_or_fileobj=str(file_path),
	            path_in_repo=dest_name,
	            repo_id=repo_id,
	            repo_type="dataset",
	            token=token,
	            commit_message=f"Update AI legislation data ({file_size_mb:.2f}MB)",
	        )

	        # Build the URL from the repo id that was actually uploaded to, so
	        # a string is always returned even when the repo id came from the
	        # environment instead of Streamlit secrets.
	        return f"https://huggingface.co/datasets/{repo_id}/resolve/main/{dest_name}"

	    except FileNotFoundError:
	        # Callers handle a missing data file specifically; pass it through.
	        raise
	    except KeyError as e:
	        raise Exception(f"Missing configuration in secrets.toml: {e}") from e
	    except Exception as e:
	        raise Exception(f"Upload failed: {e}") from e


	def upload_all_to_huggingface() -> Dict[str, str]:
	    """
	    NEW: Upload ALL core JSON files to HuggingFace Datasets Hub.

	    Every entry of FILES_TO_UPLOAD is looked up at its local path first and
	    then by bare filename in the current working directory. Missing or
	    invalid files are reported (stdout and Streamlit) and skipped; an error
	    raised by the upload itself propagates to the caller.

	    Returns:
	        dict: mapping from dataset filename -> public URL (for successfully uploaded files)

	    Raises:
	        KeyError: If the HuggingFace token or repo id is not configured.
	    """
	    token, repo_id = _get_hf_token_and_repo()
	    api = HfApi()
	    _ensure_dataset_exists(api, repo_id, token)

	    uploaded_urls: Dict[str, str] = {}

	    for local_path, dest_name in FILES_TO_UPLOAD.items():
	        try:
	            file_path = _find_and_validate_json([Path(local_path), Path(dest_name)])
	        except FileNotFoundError:
	            msg = f"Skipping missing file: {local_path}"
	            print(msg)
	            st.write(msg)
	            continue
	        except ValueError as e:
	            msg = f"Skipping invalid JSON in {local_path}: {e}"
	            print(msg)
	            st.write(msg)
	            continue

	        file_size_mb = file_path.stat().st_size / (1024 * 1024)
	        commit_msg = f"Update {dest_name} ({file_size_mb:.2f}MB)"

	        print(f"Uploading {file_path} → {repo_id}/{dest_name} ...")
	        api.upload_file(
	            path_or_fileobj=str(file_path),
	            path_in_repo=dest_name,
	            repo_id=repo_id,
	            repo_type="dataset",
	            token=token,
	            commit_message=commit_msg,
	        )

	        # Build the URL from the repo id that was actually uploaded to, so
	        # every uploaded file is reported even when the repo id came from
	        # the environment instead of Streamlit secrets.
	        uploaded_urls[dest_name] = (
	            f"https://huggingface.co/datasets/{repo_id}/resolve/main/{dest_name}"
	        )

	    return uploaded_urls


	if __name__ == "__main__":
	    # CLI entry point: check the connection, then upload every core file.
	    # Exits with status 1 on failure so calling scripts can detect it.
	    print("Testing HuggingFace connection...")
	    success, msg = test_hf_connection()
	    print(msg)

	    if not success:
	        raise SystemExit(1)

	    print("\nAttempting upload of ALL files...")
	    try:
	        urls = upload_all_to_huggingface()
	    except Exception as e:
	        print(f"\nUpload failed: {e}")
	        raise SystemExit(1)

	    print("\nUpload successful!")
	    for name, url in urls.items():
	        print(f"- {name}: {url}")