Spaces:

CommunityOne
/

open-navigator

Running on CPU Upgrade

File size: 4,638 Bytes

61d29fc

#!/usr/bin/env python3
"""
Publish Gold Layer Parquet Files to HuggingFace

Publishes national-level gold datasets to HuggingFace for public sharing.
"""
import os
from pathlib import Path
from datetime import datetime
import pandas as pd
from huggingface_hub import HfApi, login, create_repo
from datasets import Dataset
from loguru import logger
from dotenv import load_dotenv

# Load variables from a .env file into the process environment. This must run
# before the configuration values below are read from the environment.
load_dotenv()

# Settings read from the environment (the token has no default; the
# organization and dataset prefix fall back to project defaults)
HUGGINGFACE_TOKEN = os.environ.get('HUGGINGFACE_TOKEN')
HF_ORGANIZATION = os.environ.get('HF_ORGANIZATION', 'CommunityOne')
HF_DATASET_PREFIX = os.environ.get('HF_DATASET_PREFIX', 'one')

# Directory the gold-layer parquet files are read from
GOLD_DIR = Path("data") / "gold" / "national"

# Parquet filename -> HuggingFace dataset name. Each dataset name is the
# file stem with underscores replaced by hyphens.
DATASETS = {
    f"{stem}.parquet": stem.replace("_", "-")
    for stem in (
        "meetings_calendar",
        "nonprofits_organizations",
        "nonprofits_financials",
        "nonprofits_programs",
        "nonprofits_locations",
    )
}


def publish_dataset(file_path: Path, dataset_name: str, api: HfApi, private: bool = False) -> dict:
    """Publish a single parquet file to HuggingFace as a dataset repo.

    The file is loaded with pandas, converted to a ``datasets.Dataset`` and
    pushed to ``{HF_ORGANIZATION}/{HF_DATASET_PREFIX}-{dataset_name}``.

    Args:
        file_path: Location of the parquet file to publish.
        dataset_name: Suffix of the target repo name (appended to the
            configured organization and prefix).
        api: HuggingFace API client. Currently unused: every Hub call in
            this function authenticates with ``HUGGINGFACE_TOKEN`` directly.
        private: Passed through to both repo creation and the push.

    Returns:
        On success, a dict with ``repo_id``, ``url``, ``records`` and
        ``columns``. If the file is missing, or an exception is raised
        while loading or uploading, a dict with a single ``error`` key;
        those exceptions are logged rather than propagated.
    """
    if not file_path.exists():
        logger.warning(f"⚠️  Skipping {file_path.name} - file not found")
        return {"error": "File not found"}

    repo_id = f"{HF_ORGANIZATION}/{HF_DATASET_PREFIX}-{dataset_name}"

    logger.info(f"📤 Publishing {file_path.name} to {repo_id}...")

    try:
        # Load parquet file
        df = pd.read_parquet(file_path)
        record_count = len(df)
        logger.info(f"   Loaded {record_count:,} records")

        # Create HuggingFace dataset
        dataset = Dataset.from_pandas(df)

        # Make sure the target repo exists. exist_ok=True means an existing
        # repo does not raise, so any exception here is a genuine problem
        # (auth, permissions, network) and is reported as a warning. The
        # push below is still attempted; if it fails too, the outer handler
        # turns that into the returned error.
        try:
            create_repo(
                repo_id=repo_id,
                repo_type="dataset",
                private=private,
                exist_ok=True,
                token=HUGGINGFACE_TOKEN
            )
        except Exception as e:
            logger.warning(f"   Could not create or verify repo {repo_id}: {e}")

        # Push to hub
        dataset.push_to_hub(
            repo_id=repo_id,
            private=private,
            commit_message=f"Update {dataset_name} - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
            token=HUGGINGFACE_TOKEN
        )

        url = f"https://huggingface.co/datasets/{repo_id}"
        logger.success(f"   ✅ Published {record_count:,} records to {url}")

        return {
            "repo_id": repo_id,
            "url": url,
            "records": record_count,
            "columns": list(df.columns)
        }

    except Exception as e:
        # Per-dataset boundary: report the failure to the caller instead of
        # aborting the whole publishing run.
        logger.error(f"   ❌ Failed: {e}")
        return {"error": str(e)}


def main() -> int:
    """Publish all gold datasets to HuggingFace and log a summary.

    Every entry in ``DATASETS`` is published from ``GOLD_DIR`` via
    ``publish_dataset``; per-dataset results are then summarized in the log.

    Returns:
        A process exit status: 0 when every dataset was published, 1 when
        ``HUGGINGFACE_TOKEN`` is not set or at least one dataset failed
        (including files that were skipped because they were not found).
    """
    if not HUGGINGFACE_TOKEN:
        logger.error("❌ HUGGINGFACE_TOKEN not set in environment")
        logger.error("   Set it in .env file or export it")
        return 1

    # Login to HuggingFace
    login(token=HUGGINGFACE_TOKEN)
    api = HfApi(token=HUGGINGFACE_TOKEN)

    # Get user info
    user_info = api.whoami(token=HUGGINGFACE_TOKEN)
    username = user_info['name']

    logger.info("=" * 70)
    logger.info("🚀 Publishing Gold Datasets to HuggingFace")
    logger.info("=" * 70)
    logger.info(f"👤 User: {username}")
    logger.info(f"🏢 Organization: {HF_ORGANIZATION}")
    logger.info(f"📂 Source: {GOLD_DIR}")
    logger.info("")

    results = {}

    for filename, dataset_name in DATASETS.items():
        file_path = GOLD_DIR / filename
        results[dataset_name] = publish_dataset(file_path, dataset_name, api, private=False)
        # Blank separator goes through the logger, like the other spacers in
        # this function, so it stays in the same stream as the log output.
        logger.info("")

    # Summary
    logger.info("=" * 70)
    logger.info("📊 PUBLICATION SUMMARY")
    logger.info("=" * 70)

    successful = 0
    failed = 0
    total_records = 0

    for name, info in results.items():
        # A result carries "url" only when the publish succeeded.
        if "url" in info:
            logger.success(f"✅ {name}: {info['records']:,} records")
            logger.info(f"   {info['url']}")
            successful += 1
            total_records += info['records']
        else:
            logger.error(f"❌ {name}: {info.get('error', 'Unknown error')}")
            failed += 1

    logger.info("")
    logger.info(f"📈 Published {successful} dataset(s) with {total_records:,} total records")

    if failed > 0:
        # Do not announce success, and signal the failure to the caller/shell.
        logger.warning(f"⚠️  Failed to publish {failed} dataset(s)")
        return 1

    logger.success("🎉 Done!")
    return 0


if __name__ == "__main__":
    # Propagate main()'s status so shells and CI can detect failed runs.
    raise SystemExit(main())