Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
| #!/usr/bin/env python3 | |
| """ | |
| Publish Gold Layer Parquet Files to HuggingFace | |
| Publishes national-level gold datasets to HuggingFace for public sharing. | |
| """ | |
| import os | |
| from pathlib import Path | |
| from datetime import datetime | |
| import pandas as pd | |
| from huggingface_hub import HfApi, login, create_repo | |
| from datasets import Dataset | |
| from loguru import logger | |
| from dotenv import load_dotenv | |
| # Load environment variables | |
| load_dotenv() | |
| # Configuration | |
| HUGGINGFACE_TOKEN = os.getenv('HUGGINGFACE_TOKEN') | |
| HF_ORGANIZATION = os.getenv('HF_ORGANIZATION', 'CommunityOne') | |
| HF_DATASET_PREFIX = os.getenv('HF_DATASET_PREFIX', 'one') | |
| # Paths | |
| GOLD_DIR = Path("data/gold/national") | |
| # Dataset mappings (file -> HuggingFace dataset name) | |
| DATASETS = { | |
| "meetings_calendar.parquet": "meetings-calendar", | |
| "nonprofits_organizations.parquet": "nonprofits-organizations", | |
| "nonprofits_financials.parquet": "nonprofits-financials", | |
| "nonprofits_programs.parquet": "nonprofits-programs", | |
| "nonprofits_locations.parquet": "nonprofits-locations", | |
| } | |
| def publish_dataset(file_path: Path, dataset_name: str, api: HfApi, private: bool = False) -> dict: | |
| """Publish a single parquet file to HuggingFace.""" | |
| if not file_path.exists(): | |
| logger.warning(f"β οΈ Skipping {file_path.name} - file not found") | |
| return {"error": "File not found"} | |
| # Create repo ID | |
| repo_id = f"{HF_ORGANIZATION}/{HF_DATASET_PREFIX}-{dataset_name}" | |
| logger.info(f"π€ Publishing {file_path.name} to {repo_id}...") | |
| try: | |
| # Load parquet file | |
| df = pd.read_parquet(file_path) | |
| logger.info(f" Loaded {len(df):,} records") | |
| # Create HuggingFace dataset | |
| dataset = Dataset.from_pandas(df) | |
| # Create repo if it doesn't exist | |
| try: | |
| create_repo( | |
| repo_id=repo_id, | |
| repo_type="dataset", | |
| private=private, | |
| exist_ok=True, | |
| token=HUGGINGFACE_TOKEN | |
| ) | |
| except Exception as e: | |
| logger.debug(f" Repo may already exist: {e}") | |
| # Push to hub | |
| dataset.push_to_hub( | |
| repo_id=repo_id, | |
| private=private, | |
| commit_message=f"Update {dataset_name} - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", | |
| token=HUGGINGFACE_TOKEN | |
| ) | |
| url = f"https://huggingface.co/datasets/{repo_id}" | |
| logger.success(f" β Published {len(df):,} records to {url}") | |
| return { | |
| "repo_id": repo_id, | |
| "url": url, | |
| "records": len(df), | |
| "columns": list(df.columns) | |
| } | |
| except Exception as e: | |
| logger.error(f" β Failed: {e}") | |
| return {"error": str(e)} | |
| def main(): | |
| """Publish all gold datasets to HuggingFace.""" | |
| if not HUGGINGFACE_TOKEN: | |
| logger.error("β HUGGINGFACE_TOKEN not set in environment") | |
| logger.error(" Set it in .env file or export it") | |
| return | |
| # Login to HuggingFace | |
| login(token=HUGGINGFACE_TOKEN) | |
| api = HfApi(token=HUGGINGFACE_TOKEN) | |
| # Get user info | |
| user_info = api.whoami(token=HUGGINGFACE_TOKEN) | |
| username = user_info['name'] | |
| logger.info("=" * 70) | |
| logger.info("π Publishing Gold Datasets to HuggingFace") | |
| logger.info("=" * 70) | |
| logger.info(f"π€ User: {username}") | |
| logger.info(f"π’ Organization: {HF_ORGANIZATION}") | |
| logger.info(f"π Source: {GOLD_DIR}") | |
| logger.info("") | |
| results = {} | |
| for filename, dataset_name in DATASETS.items(): | |
| file_path = GOLD_DIR / filename | |
| result = publish_dataset(file_path, dataset_name, api, private=False) | |
| results[dataset_name] = result | |
| print() | |
| # Summary | |
| logger.info("=" * 70) | |
| logger.info("π PUBLICATION SUMMARY") | |
| logger.info("=" * 70) | |
| successful = 0 | |
| failed = 0 | |
| total_records = 0 | |
| for name, info in results.items(): | |
| if "url" in info: | |
| logger.success(f"β {name}: {info['records']:,} records") | |
| logger.info(f" {info['url']}") | |
| successful += 1 | |
| total_records += info['records'] | |
| else: | |
| logger.error(f"β {name}: {info.get('error', 'Unknown error')}") | |
| failed += 1 | |
| logger.info("") | |
| logger.info(f"π Published {successful} dataset(s) with {total_records:,} total records") | |
| if failed > 0: | |
| logger.warning(f"β οΈ Failed to publish {failed} dataset(s)") | |
| logger.success("π Done!") | |
| if __name__ == "__main__": | |
| main() | |