Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
| #!/usr/bin/env python3 | |
| """ | |
| Fix and publish the 5 failed datasets | |
| """ | |
| import os | |
| from pathlib import Path | |
| from datetime import datetime | |
| import pandas as pd | |
| from huggingface_hub import HfApi, create_repo | |
| from datasets import Dataset | |
| from loguru import logger | |
| from dotenv import load_dotenv | |
# Let python-dotenv populate the process environment before the settings
# below are looked up.
load_dotenv()

# Hub access token; None when the variable is absent (main() refuses to run
# without it).
HUGGINGFACE_TOKEN = os.environ.get("HUGGINGFACE_TOKEN")

# Hub namespace that owns the published datasets.
HF_ORGANIZATION = os.environ.get("HF_ORGANIZATION", "CommunityOne")

# Root of the gold-layer parquet outputs, relative to the working directory.
GOLD_DIR = Path("data", "gold")
def _stringify_preserving_nulls(column):
    """Return *column* with every non-missing value converted to ``str``.

    A plain ``astype(str)`` turns missing entries into the literal text
    ``"nan"`` / ``"None"``, which would then be published as real values.
    Here missing entries are masked back out after the conversion so they
    stay missing.
    """
    as_text = column.astype(str).astype(object)
    return as_text.where(column.notna(), None)


def fix_and_publish_jurisdictions():
    """Fix and publish the 4 jurisdiction reference files.

    For each parquet file under ``GOLD_DIR / "reference"`` the frame is
    loaded, its categorical and object columns are converted to plain
    strings (the Arrow dictionary/categorical fix), and the result is pushed
    to ``{HF_ORGANIZATION}/reference-<file-stem-with-dashes>`` as a public
    dataset.

    A failure on one file is logged and does not stop the remaining files.

    Returns:
        list: Repo ids that could not be published; empty when every file
        was published.
    """
    # NOTE(review): the emoji prefixes in the log messages below look
    # mis-encoded in the copy this was reviewed from - confirm against the
    # original file before shipping.
    reference_dir = GOLD_DIR / "reference"
    jurisdiction_files = [
        reference_dir / "jurisdictions_cities.parquet",
        reference_dir / "jurisdictions_counties.parquet",
        reference_dir / "jurisdictions_school_districts.parquet",
        reference_dir / "jurisdictions_townships.parquet",
    ]
    failed = []

    for file_path in jurisdiction_files:
        dataset_name = f"reference-{file_path.stem.replace('_', '-')}"
        repo_id = f"{HF_ORGANIZATION}/{dataset_name}"
        logger.info(f"π€ Processing {file_path.name}...")
        try:
            # Load file
            df = pd.read_parquet(file_path)
            logger.info(f" Loaded {len(df):,} records, {len(df.columns)} columns")

            # FIX: convert categorical and object columns to plain strings.
            # This avoids the Arrow dictionary/categorical issue while
            # keeping missing values missing.
            for col in df.columns:
                dtype = df[col].dtype
                if dtype.name == 'category':
                    logger.info(f" Converting categorical column: {col}")
                elif dtype != 'object':
                    continue
                df[col] = _stringify_preserving_nulls(df[col])

            # Reset index so no index column leaks into the dataset.
            df = df.reset_index(drop=True)

            logger.info(" Creating HuggingFace dataset...")
            dataset = Dataset.from_pandas(df, preserve_index=False)

            # Create repo. Best-effort: a failure here is only logged and
            # the push below is still attempted.
            try:
                create_repo(
                    repo_id=repo_id,
                    repo_type="dataset",
                    private=False,
                    exist_ok=True,
                    token=HUGGINGFACE_TOKEN,
                )
            except Exception as e:
                logger.debug(f" Repo may already exist: {e}")

            # Push to hub
            logger.info(f" Pushing to {repo_id}...")
            dataset.push_to_hub(
                repo_id=repo_id,
                private=False,
                commit_message=f"Update {dataset_name} - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
                token=HUGGINGFACE_TOKEN,
            )
            url = f"https://huggingface.co/datasets/{repo_id}"
            logger.success(f" β Published {len(df):,} records to {url}\n")
        except Exception as e:
            # Keep going with the remaining files, but remember the failure
            # so the caller can report it.
            logger.error(f" β Failed: {e}\n")
            failed.append(repo_id)

    return failed
def _log_old_file_row_count(path, *, show_size_on_error=False):
    """Log how many records the legacy parquet file at *path* holds.

    Does nothing when the file does not exist. When the file cannot be read
    an error is logged instead, optionally followed by the file size in MB.
    """
    if not path.exists():
        return
    try:
        df = pd.read_parquet(path)
    except Exception as e:
        logger.error(f"β {path.name} is corrupted: {e}")
        if show_size_on_error:
            logger.info(f" File size: {path.stat().st_size / 1024 / 1024:.2f} MB")
    else:
        logger.info(f"π Old {path.name} has {len(df):,} records")


def check_old_meeting_files():
    """Check if we have old meetings.parquet files that should be replaced.

    When ``events_events.parquet`` (the new naming) exists, its record count
    is logged and any legacy ``meetings*.parquet`` files still present are
    listed as safe to delete. When it does not exist, a warning is logged
    and the record counts of whichever legacy files exist are reported.

    Only logs; returns nothing.
    """
    # NOTE(review): the nesting here was reconstructed from a copy that had
    # lost its indentation - confirm that the legacy row counts are meant to
    # run only when events_events.parquet is missing, and that the file-size
    # line belongs to the meetings.parquet error path. The emoji prefixes in
    # the log messages also look mis-encoded; confirm against the original.
    logger.info("π Checking for old meeting file naming...")

    national_dir = GOLD_DIR / "national"
    events_events = national_dir / "events_events.parquet"
    old_meetings_calendar = national_dir / "meetings_calendar.parquet"
    old_meetings = national_dir / "meetings.parquet"

    if events_events.exists():
        try:
            df = pd.read_parquet(events_events)
        except Exception as e:
            logger.error(f"β events_events.parquet error: {e}")
        else:
            logger.success(f"β Found events_events.parquet with {len(df):,} records (new naming)")
            # Evaluate each existence check once and reuse the result.
            leftovers = [
                path
                for path in (old_meetings_calendar, old_meetings)
                if path.exists()
            ]
            if leftovers:
                logger.warning("β οΈ Old meeting files still exist - these can be deleted:")
                for path in leftovers:
                    logger.info(f" - {path.name}")
                logger.info(" Run migration to rename old files to events_* naming\n")
    else:
        logger.warning("β οΈ events_events.parquet not found - run pipeline to generate")
        # Check if old files exist
        _log_old_file_row_count(old_meetings_calendar)
        _log_old_file_row_count(old_meetings, show_size_on_error=True)

    logger.info("")
def main():
    """Fix and publish failed datasets.

    Aborts with an error log when ``HUGGINGFACE_TOKEN`` is not set.
    Otherwise runs the legacy meeting-file check, publishes the jurisdiction
    reference datasets, and logs a closing summary that points at the
    configured ``HF_ORGANIZATION`` namespace.

    Returns nothing.
    """
    # NOTE(review): the emoji prefixes in the log messages below look
    # mis-encoded in the copy this was reviewed from - confirm against the
    # original file before shipping.
    if not HUGGINGFACE_TOKEN:
        logger.error("β HUGGINGFACE_TOKEN not set")
        return

    banner = "=" * 80
    logger.info(banner)
    logger.info("π§ Fixing and Publishing Failed Datasets")
    logger.info(banner)
    print()

    # Check for old meeting file naming
    check_old_meeting_files()

    # Fix and publish jurisdiction files
    logger.info("π Publishing 4 jurisdiction reference datasets...")
    print()
    # The publish step may report the repos it could not publish; a None
    # result is treated as "nothing to report".
    failed = fix_and_publish_jurisdictions() or []

    # Summary
    logger.info(banner)
    if failed:
        logger.error(
            f"{len(failed)} dataset(s) failed to publish: "
            f"{', '.join(str(item) for item in failed)}"
        )
    else:
        # Point at the configured namespace rather than a hard-coded one.
        logger.success(f"β Done! Check your datasets at: https://huggingface.co/{HF_ORGANIZATION}")
    logger.info(banner)
# Script entry point: run the workflow only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()