open-navigator / scripts /huggingface /publish_gold_datasets.py
jcbowyer's picture
Clean HuggingFace deployment without binary files
61d29fc
#!/usr/bin/env python3
"""
Publish Gold Layer Parquet Files to HuggingFace
Publishes national-level gold datasets to HuggingFace for public sharing.
"""
import os
from pathlib import Path
from datetime import datetime
import pandas as pd
from huggingface_hub import HfApi, login, create_repo
from datasets import Dataset
from loguru import logger
from dotenv import load_dotenv
# Load environment variables.
# NOTE(review): presumably this reads a local .env file (python-dotenv) into
# the process environment, so it has to run before the environment-variable
# lookups that follow - confirm against the python-dotenv docs.
load_dotenv()
# Configuration (resolved once, at import time)
# Access token for the Hub; None when the variable is not set.
HUGGINGFACE_TOKEN = os.environ.get("HUGGINGFACE_TOKEN")
# Namespace and name prefix used to build each repo ID.
HF_ORGANIZATION = os.environ.get("HF_ORGANIZATION", "CommunityOne")
HF_DATASET_PREFIX = os.environ.get("HF_DATASET_PREFIX", "one")

# Paths
# Directory the gold parquet files are read from (relative to the CWD).
GOLD_DIR = Path("data") / "gold" / "national"

# Dataset mappings (file -> HuggingFace dataset name).
# Each parquet file "<table>.parquet" maps to the table name with
# underscores turned into hyphens; insertion order is the publish order.
DATASETS = {
    f"{table}.parquet": table.replace("_", "-")
    for table in (
        "meetings_calendar",
        "nonprofits_organizations",
        "nonprofits_financials",
        "nonprofits_programs",
        "nonprofits_locations",
    )
}
def publish_dataset(file_path: Path, dataset_name: str, api: HfApi, private: bool = False) -> dict:
    """Publish a single parquet file to HuggingFace.

    The file is read with pandas, converted to a ``Dataset`` and pushed to
    the repo ``{HF_ORGANIZATION}/{HF_DATASET_PREFIX}-{dataset_name}``.

    Args:
        file_path: Parquet file to publish.
        dataset_name: Suffix used to build the repo ID and the commit message.
        api: Hub client supplied by the caller. Currently unused here: repo
            creation and the push both authenticate with ``HUGGINGFACE_TOKEN``.
        private: Forwarded to both repo creation and the push.

    Returns:
        On success, a dict with the keys ``repo_id``, ``url``, ``records``
        and ``columns``. If the file is missing, or an exception is raised
        while loading or uploading, a dict with a single ``error`` key.
    """
    if not file_path.exists():
        logger.warning(f"⚠️ Skipping {file_path.name} - file not found")
        return {"error": "File not found"}

    # Create repo ID
    repo_id = f"{HF_ORGANIZATION}/{HF_DATASET_PREFIX}-{dataset_name}"
    logger.info(f"πŸ“€ Publishing {file_path.name} to {repo_id}...")

    try:
        # Load parquet file
        df = pd.read_parquet(file_path)
        record_count = len(df)
        logger.info(f" Loaded {record_count:,} records")

        # Create HuggingFace dataset
        dataset = Dataset.from_pandas(df)

        # Create repo if it doesn't exist. This stays best-effort: a failure
        # here does not abort the publish, the push below decides the outcome.
        try:
            create_repo(
                repo_id=repo_id,
                repo_type="dataset",
                private=private,
                exist_ok=True,
                token=HUGGINGFACE_TOKEN,
            )
        except Exception as e:
            # exist_ok=True already makes "repo exists" a non-error, so an
            # exception here is a genuine problem (credentials, permissions,
            # connectivity, ...). Report it visibly instead of hiding it at
            # debug level behind a "may already exist" message.
            logger.warning(f" Could not create or verify repo {repo_id}: {e}")

        # Push to hub
        dataset.push_to_hub(
            repo_id=repo_id,
            private=private,
            commit_message=f"Update {dataset_name} - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
            token=HUGGINGFACE_TOKEN,
        )

        url = f"https://huggingface.co/datasets/{repo_id}"
        logger.success(f" βœ… Published {record_count:,} records to {url}")
        return {
            "repo_id": repo_id,
            "url": url,
            "records": record_count,
            "columns": list(df.columns),
        }
    except Exception as e:
        # Report the failure to the caller through the result dict so that
        # one bad dataset does not stop the remaining ones.
        logger.error(f" ❌ Failed: {e}")
        return {"error": str(e)}
def main():
    """Publish all gold datasets to HuggingFace.

    Checks that a token is configured, authenticates, publishes every entry
    of ``DATASETS`` from ``GOLD_DIR`` via ``publish_dataset`` and finally
    logs a per-dataset summary with totals. Returns ``None`` in all cases.
    """
    # Guard clause: without a token nothing can be uploaded.
    if not HUGGINGFACE_TOKEN:
        logger.error("❌ HUGGINGFACE_TOKEN not set in environment")
        logger.error(" Set it in .env file or export it")
        return

    # Login to HuggingFace and look up the account behind the token.
    login(token=HUGGINGFACE_TOKEN)
    api = HfApi(token=HUGGINGFACE_TOKEN)
    username = api.whoami(token=HUGGINGFACE_TOKEN)['name']

    divider = "=" * 70

    # Header banner
    banner = (
        divider,
        "πŸš€ Publishing Gold Datasets to HuggingFace",
        divider,
        f"πŸ‘€ User: {username}",
        f"🏒 Organization: {HF_ORGANIZATION}",
        f"πŸ“‚ Source: {GOLD_DIR}",
        "",
    )
    for line in banner:
        logger.info(line)

    # Publish each dataset; the outcome dict is keyed by dataset name.
    outcomes = {}
    for parquet_name, hub_name in DATASETS.items():
        outcomes[hub_name] = publish_dataset(GOLD_DIR / parquet_name, hub_name, api, private=False)
        print()

    # Summary
    for line in (divider, "πŸ“Š PUBLICATION SUMMARY", divider):
        logger.info(line)

    # A result counts as published when it carries a "url" key.
    for hub_name, outcome in outcomes.items():
        if "url" not in outcome:
            logger.error(f"❌ {hub_name}: {outcome.get('error', 'Unknown error')}")
            continue
        logger.success(f"βœ… {hub_name}: {outcome['records']:,} records")
        logger.info(f" {outcome['url']}")

    published = [outcome for outcome in outcomes.values() if "url" in outcome]
    failed_count = len(outcomes) - len(published)
    record_total = sum(outcome['records'] for outcome in published)

    logger.info("")
    logger.info(f"πŸ“ˆ Published {len(published)} dataset(s) with {record_total:,} total records")
    if failed_count:
        logger.warning(f"⚠️ Failed to publish {failed_count} dataset(s)")
    logger.success("πŸŽ‰ Done!")
# Run the publisher only when this file is executed directly as a script;
# importing the module does not trigger main().
if __name__ == "__main__":
    main()