Spaces:

CommunityOne
/

open-navigator

Running on CPU Upgrade

File size: 5,805 Bytes

61d29fc

#!/usr/bin/env python3
"""
Fix and publish the 5 failed datasets
"""
import os
from pathlib import Path
from datetime import datetime
import pandas as pd
from huggingface_hub import HfApi, create_repo
from datasets import Dataset
from loguru import logger
from dotenv import load_dotenv

# load_dotenv() runs first so that anything it adds to the process
# environment is visible to the lookups below.
load_dotenv()

# Hub access token; None when the variable is unset (main() checks for this).
HUGGINGFACE_TOKEN = os.environ.get('HUGGINGFACE_TOKEN')
# Hub namespace the datasets are published under.
HF_ORGANIZATION = os.environ.get('HF_ORGANIZATION', 'CommunityOne')
# Root directory of the gold-layer data files.
GOLD_DIR = Path("data/gold")


def _stringify_preserving_nulls(series):
    """Cast the non-null values of *series* to ``str``; keep nulls as nulls.

    A plain ``astype(str)`` turns missing values into the literal strings
    ``"None"`` / ``"nan"``, which would then be published as real data.
    The result is an object-dtype Series sharing the input's index.
    """
    as_object = series.astype(object)
    # ``where`` keeps the entries for which the mask is True (the nulls) and
    # takes the stringified value everywhere else.
    return as_object.where(as_object.isna(), as_object.astype(str))


def fix_and_publish_jurisdictions():
    """Normalize and publish the four jurisdiction reference parquet files.

    For each file this loads the parquet with pandas, converts categorical
    and object columns to strings (missing values stay missing), builds a
    ``datasets.Dataset`` and pushes it to
    ``{HF_ORGANIZATION}/reference-<file stem with '_' replaced by '-'>``.

    A failure on one file is logged and does not stop the remaining files
    from being processed. Returns ``None``.
    """

    jurisdiction_files = [
        'data/gold/reference/jurisdictions_cities.parquet',
        'data/gold/reference/jurisdictions_counties.parquet',
        'data/gold/reference/jurisdictions_school_districts.parquet',
        'data/gold/reference/jurisdictions_townships.parquet',
    ]

    for file_str in jurisdiction_files:
        file_path = Path(file_str)
        dataset_name = f"reference-{file_path.stem.replace('_', '-')}"
        repo_id = f"{HF_ORGANIZATION}/{dataset_name}"

        logger.info(f"📤 Processing {file_path.name}...")

        try:
            # Load file
            df = pd.read_parquet(file_path)
            logger.info(f"   Loaded {len(df):,} records, {len(df.columns)} columns")

            # FIX: convert categorical and object columns to plain strings.
            # This avoids the Arrow dictionary/categorical issue; nulls are
            # preserved instead of becoming the text "nan" / "None".
            for col in df.columns:
                dtype = df[col].dtype
                if dtype.name == 'category':
                    logger.info(f"   Converting categorical column: {col}")
                    df[col] = _stringify_preserving_nulls(df[col])
                elif dtype == 'object':
                    # Ensure all non-null values in object columns are strings
                    df[col] = _stringify_preserving_nulls(df[col])

            # Reset index so no index column is carried into the dataset
            df = df.reset_index(drop=True)

            logger.info("   Creating HuggingFace dataset...")
            dataset = Dataset.from_pandas(df, preserve_index=False)

            # Create repo. Best effort: an error here is only logged at debug
            # level and the push below is still attempted.
            try:
                create_repo(
                    repo_id=repo_id,
                    repo_type="dataset",
                    private=False,
                    exist_ok=True,
                    token=HUGGINGFACE_TOKEN,
                )
            except Exception as e:
                logger.debug(f"   Repo may already exist: {e}")

            # Push to hub
            logger.info(f"   Pushing to {repo_id}...")
            dataset.push_to_hub(
                repo_id=repo_id,
                private=False,
                commit_message=f"Update {dataset_name} - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
                token=HUGGINGFACE_TOKEN,
            )

            url = f"https://huggingface.co/datasets/{repo_id}"
            logger.success(f"   ✅ Published {len(df):,} records to {url}\n")

        except Exception as e:
            # Top-level boundary for one file: report and move on to the next.
            logger.error(f"   ❌ Failed: {e}\n")


def check_old_meeting_files():
    """Report on the events parquet file and any legacy meetings files.

    Logs whether ``events_events.parquet`` exists and can be read, lists the
    legacy ``meetings*.parquet`` files that are still present, and then logs
    the record count (or the read error) for each legacy file. Nothing on
    disk is modified.
    """

    logger.info("🔍 Checking for old meeting file naming...")

    current_events = Path('data/gold/national/events_events.parquet')
    legacy_calendar = Path('data/gold/national/meetings_calendar.parquet')
    legacy_meetings = Path('data/gold/national/meetings.parquet')

    if not current_events.exists():
        logger.warning("⚠️  events_events.parquet not found - run pipeline to generate")
    else:
        try:
            record_count = len(pd.read_parquet(current_events))
            logger.success(f"✅ Found events_events.parquet with {record_count:,} records (new naming)")

            # Legacy files that are still around, in the order they are reported.
            leftovers = [
                candidate
                for candidate in (legacy_calendar, legacy_meetings)
                if candidate.exists()
            ]
            if leftovers:
                logger.warning("⚠️  Old meeting files still exist - these can be deleted:")
                for leftover in leftovers:
                    logger.info(f"   - {leftover.name}")
                logger.info("   Run migration to rename old files to events_* naming\n")
        except Exception as exc:
            logger.error(f"❌ events_events.parquet error: {exc}")

    # Inspect each legacy file that exists, regardless of the result above.
    for legacy in (legacy_calendar, legacy_meetings):
        if not legacy.exists():
            continue
        try:
            record_count = len(pd.read_parquet(legacy))
            logger.info(f"📋 Old {legacy.name} has {record_count:,} records")
        except Exception as exc:
            logger.error(f"❌ {legacy.name} is corrupted: {exc}")
            # Only meetings.parquet gets the extra on-disk size line.
            if legacy is legacy_meetings:
                logger.info(f"   File size: {legacy.stat().st_size / 1024 / 1024:.2f} MB")

    logger.info("")


def main():
    """Run the legacy-file check, then publish the jurisdiction datasets.

    Returns early, after logging an error, when ``HUGGINGFACE_TOKEN`` is
    empty or unset.
    """

    # Nothing can be published without a token.
    if not HUGGINGFACE_TOKEN:
        logger.error("❌ HUGGINGFACE_TOKEN not set")
        return

    rule = "=" * 80

    # Opening banner
    for banner_line in (rule, "🔧 Fixing and Publishing Failed Datasets", rule):
        logger.info(banner_line)
    print()

    # Step 1: report on old meeting file naming
    check_old_meeting_files()

    # Step 2: fix and publish the jurisdiction files
    logger.info("📋 Publishing 4 jurisdiction reference datasets...")
    print()
    fix_and_publish_jurisdictions()

    # Closing summary
    logger.info(rule)
    logger.success("✅ Done! Check your datasets at: https://huggingface.co/CommunityOne")
    logger.info(rule)


# Run the workflow only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()