#!/usr/bin/env python3 """ Retry publishing just the failed datasets """ import os from pathlib import Path from datetime import datetime import pandas as pd from huggingface_hub import HfApi, create_repo from datasets import Dataset from loguru import logger from dotenv import load_dotenv import traceback # Load environment variables load_dotenv() # Configuration HUGGINGFACE_TOKEN = os.getenv('HUGGINGFACE_TOKEN') HF_ORGANIZATION = os.getenv('HF_ORGANIZATION', 'CommunityOne') # Failed datasets to retry FAILED_FILES = [ "data/gold/national/meetings.parquet", "data/gold/reference/jurisdictions_cities.parquet", "data/gold/reference/jurisdictions_counties.parquet", "data/gold/reference/jurisdictions_school_districts.parquet", "data/gold/reference/jurisdictions_townships.parquet", ] GOLD_DIR = Path("data/gold") def get_dataset_name(file_path: Path, gold_dir: Path) -> str: """Generate HuggingFace dataset name from file path.""" rel_path = file_path.relative_to(gold_dir) parts = list(rel_path.parts) filename = parts[-1].replace('.parquet', '') if parts[0] == 'national': name = f"national-{filename}" elif parts[0] == 'reference': name = f"reference-{filename.replace('_', '-')}" elif parts[0] == 'states': state_code = parts[1].lower() name = f"states-{state_code}-{filename.replace('_', '-')}" else: name = '-'.join(parts).replace('.parquet', '').replace('_', '-') return name def publish_dataset(file_path: Path, api: HfApi, private: bool = False) -> dict: """Publish a single parquet file to HuggingFace.""" if not file_path.exists(): logger.warning(f"⚠️ Skipping {file_path} - file not found") return {"error": "File not found"} dataset_name = get_dataset_name(file_path, GOLD_DIR) repo_id = f"{HF_ORGANIZATION}/{dataset_name}" logger.info(f"📤 Publishing {file_path.relative_to(GOLD_DIR)} to {repo_id}...") try: # Load parquet file df = pd.read_parquet(file_path) logger.info(f" Loaded {len(df):,} records, {len(df.columns)} columns") logger.info(f" Columns: {list(df.columns)}") # Reset index and ensure clean data df = df.reset_index(drop=True) # Convert any complex types to strings if needed for col in df.columns: if df[col].dtype == 'object': # Check if it contains complex objects try: first_val = df[col].dropna().iloc[0] if len(df[col].dropna()) > 0 else None if first_val is not None and not isinstance(first_val, (str, int, float, bool)): logger.warning(f" Converting complex column {col} to string") df[col] = df[col].astype(str) except: pass # Create HuggingFace dataset logger.info(f" Creating dataset...") dataset = Dataset.from_pandas(df, preserve_index=False) # Create repo if it doesn't exist try: create_repo( repo_id=repo_id, repo_type="dataset", private=private, exist_ok=True, token=HUGGINGFACE_TOKEN ) except Exception as e: logger.debug(f" Repo may already exist: {e}") # Push to hub logger.info(f" Pushing to hub...") dataset.push_to_hub( repo_id=repo_id, private=private, commit_message=f"Update {dataset_name} - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", token=HUGGINGFACE_TOKEN ) url = f"https://huggingface.co/datasets/{repo_id}" logger.success(f" ✅ Published {len(df):,} records to {url}") return { "repo_id": repo_id, "url": url, "records": len(df), } except Exception as e: logger.error(f" ❌ Failed: {e}") logger.error(f" Full traceback:\n{traceback.format_exc()}") return {"error": str(e), "file": str(file_path)} def main(): """Retry publishing failed datasets.""" if not HUGGINGFACE_TOKEN: logger.error("❌ HUGGINGFACE_TOKEN not set in environment") return api = HfApi(token=HUGGINGFACE_TOKEN) logger.info("=" * 80) logger.info(f"♻️ Retrying {len(FAILED_FILES)} failed datasets") logger.info("=" * 80) print() successful = 0 failed = 0 for file_str in FAILED_FILES: file_path = Path(file_str) logger.info(f"Processing {file_path.relative_to(GOLD_DIR)}") result = publish_dataset(file_path, api, private=False) if "error" in result: failed += 1 else: successful += 1 print() logger.info("=" * 80) logger.success(f"✅ Successful: {successful}") logger.error(f"❌ Failed: {failed}") logger.info("=" * 80) if __name__ == "__main__": main()