Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
File size: 4,638 Bytes
#!/usr/bin/env python3
"""
Publish Gold Layer Parquet Files to HuggingFace
Publishes national-level gold datasets to HuggingFace for public sharing.
"""
import os
from pathlib import Path
from datetime import datetime
import pandas as pd
from huggingface_hub import HfApi, login, create_repo
from datasets import Dataset
from loguru import logger
from dotenv import load_dotenv
# Load environment variables (e.g. from a .env file) before the environment
# lookups below, so that values defined there are visible to them.
load_dotenv()
# Configuration: read from the process environment. The token has no default
# (None when unset); organization and prefix fall back to project defaults.
HUGGINGFACE_TOKEN = os.environ.get('HUGGINGFACE_TOKEN')
HF_ORGANIZATION = os.environ.get('HF_ORGANIZATION', 'CommunityOne')
HF_DATASET_PREFIX = os.environ.get('HF_DATASET_PREFIX', 'one')
# Paths: directory holding the national-level gold parquet files.
GOLD_DIR = Path("data/gold/national")
# Dataset mappings (parquet file name -> HuggingFace dataset name).
# Each dataset name is the file stem with underscores turned into hyphens.
DATASETS = {
    f"{stem}.parquet": stem.replace("_", "-")
    for stem in (
        "meetings_calendar",
        "nonprofits_organizations",
        "nonprofits_financials",
        "nonprofits_programs",
        "nonprofits_locations",
    )
}
def publish_dataset(file_path: Path, dataset_name: str, api: HfApi, private: bool = False) -> dict:
    """Publish a single parquet file to HuggingFace as a dataset repo.

    Args:
        file_path: Location of the parquet file to upload.
        dataset_name: Short dataset name; the target repo ID is
            ``{HF_ORGANIZATION}/{HF_DATASET_PREFIX}-{dataset_name}``.
        api: ``HfApi`` client. Not used by this function at present (the
            hub calls authenticate with ``HUGGINGFACE_TOKEN``); kept so
            existing callers continue to work.
        private: Whether the repo is created and pushed as private.

    Returns:
        On success, a dict with ``repo_id``, ``url``, ``records`` (row
        count) and ``columns`` (list of column names). On failure, a dict
        with a single ``error`` key. Failures while loading or uploading
        are logged and reported through that key rather than raised.
    """
    if not file_path.exists():
        logger.warning(f"⚠️ Skipping {file_path.name} - file not found")
        return {"error": "File not found"}

    # Create repo ID
    repo_id = f"{HF_ORGANIZATION}/{HF_DATASET_PREFIX}-{dataset_name}"
    logger.info(f"📤 Publishing {file_path.name} to {repo_id}...")

    try:
        # Load parquet file
        df = pd.read_parquet(file_path)
        record_count = len(df)
        logger.info(f" Loaded {record_count:,} records")

        # Create HuggingFace dataset
        dataset = Dataset.from_pandas(df)

        # Create repo if it doesn't exist. exist_ok=True already covers the
        # "repo exists" case, so an exception here is a genuine problem
        # (auth, permissions, network). Stay best-effort and let the push
        # below surface a hard failure, but make the cause visible.
        try:
            create_repo(
                repo_id=repo_id,
                repo_type="dataset",
                private=private,
                exist_ok=True,
                token=HUGGINGFACE_TOKEN,
            )
        except Exception as e:
            logger.warning(f" Could not create or verify repo {repo_id}: {e}")

        # Push to hub
        dataset.push_to_hub(
            repo_id=repo_id,
            private=private,
            commit_message=f"Update {dataset_name} - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
            token=HUGGINGFACE_TOKEN,
        )

        url = f"https://huggingface.co/datasets/{repo_id}"
        logger.success(f" ✅ Published {record_count:,} records to {url}")
        return {
            "repo_id": repo_id,
            "url": url,
            "records": record_count,
            "columns": list(df.columns),
        }
    except Exception as e:
        # Boundary handler: one failed dataset must not abort the whole run.
        logger.error(f" ❌ Failed: {e}")
        return {"error": str(e)}
def main():
    """Publish all gold datasets to HuggingFace and log a summary.

    Requires ``HUGGINGFACE_TOKEN``; when it is unset an error is logged and
    the function returns without publishing anything. Otherwise every entry
    in ``DATASETS`` is published from ``GOLD_DIR`` as a public dataset.
    Per-dataset failures are collected and reported in the summary instead
    of aborting the run.
    """
    if not HUGGINGFACE_TOKEN:
        logger.error("❌ HUGGINGFACE_TOKEN not set in environment")
        logger.error(" Set it in .env file or export it")
        return

    # Login to HuggingFace
    login(token=HUGGINGFACE_TOKEN)
    api = HfApi(token=HUGGINGFACE_TOKEN)

    # Get user info
    user_info = api.whoami(token=HUGGINGFACE_TOKEN)
    username = user_info['name']

    logger.info("=" * 70)
    logger.info("🚀 Publishing Gold Datasets to HuggingFace")
    logger.info("=" * 70)
    logger.info(f"👤 User: {username}")
    logger.info(f"🏢 Organization: {HF_ORGANIZATION}")
    logger.info(f"📁 Source: {GOLD_DIR}")
    logger.info("")

    results = {}
    for filename, dataset_name in DATASETS.items():
        file_path = GOLD_DIR / filename
        results[dataset_name] = publish_dataset(file_path, dataset_name, api, private=False)
        # Blank spacer between datasets, emitted through the logger like the
        # other spacer lines so all output stays on one stream.
        logger.info("")

    # Summary
    logger.info("=" * 70)
    logger.info("📊 PUBLICATION SUMMARY")
    logger.info("=" * 70)

    successful = 0
    failed = 0
    total_records = 0
    for name, info in results.items():
        # A result carries "url" only when the publish succeeded.
        if "url" in info:
            logger.success(f"✅ {name}: {info['records']:,} records")
            logger.info(f" {info['url']}")
            successful += 1
            total_records += info['records']
        else:
            logger.error(f"❌ {name}: {info.get('error', 'Unknown error')}")
            failed += 1

    logger.info("")
    logger.info(f"📈 Published {successful} dataset(s) with {total_records:,} total records")
    if failed > 0:
        logger.warning(f"⚠️ Failed to publish {failed} dataset(s)")
    logger.success("🎉 Done!")


# Run the publisher only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|