Spaces:

CommunityOne
/

open-navigator

Running on CPU Upgrade

File size: 5,147 Bytes

61d29fc

#!/usr/bin/env python3
"""
Retry publishing just the failed datasets
"""
import os
from pathlib import Path
from datetime import datetime
import pandas as pd
from huggingface_hub import HfApi, create_repo
from datasets import Dataset
from loguru import logger
from dotenv import load_dotenv
import traceback

# Load environment variables
load_dotenv()

# Configuration
HUGGINGFACE_TOKEN = os.getenv('HUGGINGFACE_TOKEN')
HF_ORGANIZATION = os.getenv('HF_ORGANIZATION', 'CommunityOne')

# Failed datasets to retry
FAILED_FILES = [
    "data/gold/national/meetings.parquet",
    "data/gold/reference/jurisdictions_cities.parquet",
    "data/gold/reference/jurisdictions_counties.parquet",
    "data/gold/reference/jurisdictions_school_districts.parquet",
    "data/gold/reference/jurisdictions_townships.parquet",
]

GOLD_DIR = Path("data/gold")


def get_dataset_name(file_path: Path, gold_dir: Path) -> str:
    """Generate HuggingFace dataset name from file path."""
    rel_path = file_path.relative_to(gold_dir)
    parts = list(rel_path.parts)
    filename = parts[-1].replace('.parquet', '')
    
    if parts[0] == 'national':
        name = f"national-{filename}"
    elif parts[0] == 'reference':
        name = f"reference-{filename.replace('_', '-')}"
    elif parts[0] == 'states':
        state_code = parts[1].lower()
        name = f"states-{state_code}-{filename.replace('_', '-')}"
    else:
        name = '-'.join(parts).replace('.parquet', '').replace('_', '-')
    
    return name


def publish_dataset(file_path: Path, api: HfApi, private: bool = False) -> dict:
    """Publish a single parquet file to HuggingFace."""
    
    if not file_path.exists():
        logger.warning(f"⚠️  Skipping {file_path} - file not found")
        return {"error": "File not found"}
    
    dataset_name = get_dataset_name(file_path, GOLD_DIR)
    repo_id = f"{HF_ORGANIZATION}/{dataset_name}"
    
    logger.info(f"📤 Publishing {file_path.relative_to(GOLD_DIR)} to {repo_id}...")
    
    try:
        # Load parquet file
        df = pd.read_parquet(file_path)
        logger.info(f"   Loaded {len(df):,} records, {len(df.columns)} columns")
        logger.info(f"   Columns: {list(df.columns)}")
        
        # Reset index and ensure clean data
        df = df.reset_index(drop=True)
        
        # Convert any complex types to strings if needed
        for col in df.columns:
            if df[col].dtype == 'object':
                # Check if it contains complex objects
                try:
                    first_val = df[col].dropna().iloc[0] if len(df[col].dropna()) > 0 else None
                    if first_val is not None and not isinstance(first_val, (str, int, float, bool)):
                        logger.warning(f"   Converting complex column {col} to string")
                        df[col] = df[col].astype(str)
                except:
                    pass
        
        # Create HuggingFace dataset
        logger.info(f"   Creating dataset...")
        dataset = Dataset.from_pandas(df, preserve_index=False)
        
        # Create repo if it doesn't exist
        try:
            create_repo(
                repo_id=repo_id,
                repo_type="dataset",
                private=private,
                exist_ok=True,
                token=HUGGINGFACE_TOKEN
            )
        except Exception as e:
            logger.debug(f"   Repo may already exist: {e}")
        
        # Push to hub
        logger.info(f"   Pushing to hub...")
        dataset.push_to_hub(
            repo_id=repo_id,
            private=private,
            commit_message=f"Update {dataset_name} - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
            token=HUGGINGFACE_TOKEN
        )
        
        url = f"https://huggingface.co/datasets/{repo_id}"
        logger.success(f"   ✅ Published {len(df):,} records to {url}")
        
        return {
            "repo_id": repo_id,
            "url": url,
            "records": len(df),
        }
        
    except Exception as e:
        logger.error(f"   ❌ Failed: {e}")
        logger.error(f"   Full traceback:\n{traceback.format_exc()}")
        return {"error": str(e), "file": str(file_path)}


def main():
    """Retry publishing failed datasets."""
    
    if not HUGGINGFACE_TOKEN:
        logger.error("❌ HUGGINGFACE_TOKEN not set in environment")
        return
    
    api = HfApi(token=HUGGINGFACE_TOKEN)
    
    logger.info("=" * 80)
    logger.info(f"♻️  Retrying {len(FAILED_FILES)} failed datasets")
    logger.info("=" * 80)
    print()
    
    successful = 0
    failed = 0
    
    for file_str in FAILED_FILES:
        file_path = Path(file_str)
        logger.info(f"Processing {file_path.relative_to(GOLD_DIR)}")
        result = publish_dataset(file_path, api, private=False)
        
        if "error" in result:
            failed += 1
        else:
            successful += 1
        
        print()
    
    logger.info("=" * 80)
    logger.success(f"✅ Successful: {successful}")
    logger.error(f"❌ Failed: {failed}")
    logger.info("=" * 80)


if __name__ == "__main__":
    main()