Spaces:

CommunityOne
/

open-navigator

Running on CPU Upgrade

App Files Files Community

open-navigator / scripts /huggingface /publish_gold_datasets.py

jcbowyer

Clean HuggingFace deployment without binary files

61d29fc 28 days ago

raw

history blame contribute delete

4.64 kB

	#!/usr/bin/env python3
	"""
	Publish Gold Layer Parquet Files to HuggingFace

	Publishes national-level gold datasets to HuggingFace for public sharing.
	"""
	import os
	from pathlib import Path
	from datetime import datetime
	import pandas as pd
	from huggingface_hub import HfApi, login, create_repo
	from datasets import Dataset
	from loguru import logger
	from dotenv import load_dotenv

	# Load environment variables
	load_dotenv()

	# Configuration
	HUGGINGFACE_TOKEN = os.getenv('HUGGINGFACE_TOKEN')
	HF_ORGANIZATION = os.getenv('HF_ORGANIZATION', 'CommunityOne')
	HF_DATASET_PREFIX = os.getenv('HF_DATASET_PREFIX', 'one')

	# Paths
	GOLD_DIR = Path("data/gold/national")

	# Dataset mappings (file -> HuggingFace dataset name)
	DATASETS = {
	"meetings_calendar.parquet": "meetings-calendar",
	"nonprofits_organizations.parquet": "nonprofits-organizations",
	"nonprofits_financials.parquet": "nonprofits-financials",
	"nonprofits_programs.parquet": "nonprofits-programs",
	"nonprofits_locations.parquet": "nonprofits-locations",
	}


	def publish_dataset(file_path: Path, dataset_name: str, api: HfApi, private: bool = False) -> dict:
	"""Publish a single parquet file to HuggingFace."""

	if not file_path.exists():
	logger.warning(f"⚠️ Skipping {file_path.name} - file not found")
	return {"error": "File not found"}

	# Create repo ID
	repo_id = f"{HF_ORGANIZATION}/{HF_DATASET_PREFIX}-{dataset_name}"

	logger.info(f"📤 Publishing {file_path.name} to {repo_id}...")

	try:
	# Load parquet file
	df = pd.read_parquet(file_path)
	logger.info(f" Loaded {len(df):,} records")

	# Create HuggingFace dataset
	dataset = Dataset.from_pandas(df)

	# Create repo if it doesn't exist
	try:
	create_repo(
	repo_id=repo_id,
	repo_type="dataset",
	private=private,
	exist_ok=True,
	token=HUGGINGFACE_TOKEN
	)
	except Exception as e:
	logger.debug(f" Repo may already exist: {e}")

	# Push to hub
	dataset.push_to_hub(
	repo_id=repo_id,
	private=private,
	commit_message=f"Update {dataset_name} - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
	token=HUGGINGFACE_TOKEN
	)

	url = f"https://huggingface.co/datasets/{repo_id}"
	logger.success(f" ✅ Published {len(df):,} records to {url}")

	return {
	"repo_id": repo_id,
	"url": url,
	"records": len(df),
	"columns": list(df.columns)
	}

	except Exception as e:
	logger.error(f" ❌ Failed: {e}")
	return {"error": str(e)}


	def main():
	"""Publish all gold datasets to HuggingFace."""

	if not HUGGINGFACE_TOKEN:
	logger.error("❌ HUGGINGFACE_TOKEN not set in environment")
	logger.error(" Set it in .env file or export it")
	return

	# Login to HuggingFace
	login(token=HUGGINGFACE_TOKEN)
	api = HfApi(token=HUGGINGFACE_TOKEN)

	# Get user info
	user_info = api.whoami(token=HUGGINGFACE_TOKEN)
	username = user_info['name']

	logger.info("=" * 70)
	logger.info("🚀 Publishing Gold Datasets to HuggingFace")
	logger.info("=" * 70)
	logger.info(f"👤 User: {username}")
	logger.info(f"🏢 Organization: {HF_ORGANIZATION}")
	logger.info(f"📂 Source: {GOLD_DIR}")
	logger.info("")

	results = {}

	for filename, dataset_name in DATASETS.items():
	file_path = GOLD_DIR / filename
	result = publish_dataset(file_path, dataset_name, api, private=False)
	results[dataset_name] = result
	print()

	# Summary
	logger.info("=" * 70)
	logger.info("📊 PUBLICATION SUMMARY")
	logger.info("=" * 70)

	successful = 0
	failed = 0
	total_records = 0

	for name, info in results.items():
	if "url" in info:
	logger.success(f"✅ {name}: {info['records']:,} records")
	logger.info(f" {info['url']}")
	successful += 1
	total_records += info['records']
	else:
	logger.error(f"❌ {name}: {info.get('error', 'Unknown error')}")
	failed += 1

	logger.info("")
	logger.info(f"📈 Published {successful} dataset(s) with {total_records:,} total records")

	if failed > 0:
	logger.warning(f"⚠️ Failed to publish {failed} dataset(s)")

	logger.success("🎉 Done!")


	if __name__ == "__main__":
	main()