Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
File size: 5,147 Bytes
#!/usr/bin/env python3
"""
Retry publishing just the failed datasets
"""
import os
from pathlib import Path
from datetime import datetime
import pandas as pd
from huggingface_hub import HfApi, create_repo
from datasets import Dataset
from loguru import logger
from dotenv import load_dotenv
import traceback
# Load environment variables before any of the settings below are read
load_dotenv()

# Root directory that dataset names and log paths are computed relative to
GOLD_DIR = Path("data/gold")

# HuggingFace credentials and the organization the dataset repos live under
HUGGINGFACE_TOKEN = os.environ.get('HUGGINGFACE_TOKEN')
HF_ORGANIZATION = os.environ.get('HF_ORGANIZATION', 'CommunityOne')

# Parquet files to retry; kept as plain strings and wrapped in Path by main()
FAILED_FILES = [
    "data/gold/national/meetings.parquet",
    "data/gold/reference/jurisdictions_cities.parquet",
    "data/gold/reference/jurisdictions_counties.parquet",
    "data/gold/reference/jurisdictions_school_districts.parquet",
    "data/gold/reference/jurisdictions_townships.parquet",
]
def get_dataset_name(file_path: Path, gold_dir: Path) -> str:
    """Build the HuggingFace dataset name for a parquet file under ``gold_dir``.

    The name is derived from the path of ``file_path`` relative to
    ``gold_dir``:

    * ``national/<file>``        -> ``national-<file>`` (underscores kept)
    * ``reference/<file>``       -> ``reference-<file>`` (``_`` -> ``-``)
    * ``states/<code>/.../<file>`` -> ``states-<code lowercased>-<file>``
      (``_`` -> ``-`` in the file part)
    * anything else              -> all path parts joined with ``-``,
      with ``_`` -> ``-``

    In every case the ``.parquet`` text is removed from the file name.

    Raises:
        ValueError: if ``file_path`` is not located under ``gold_dir``.
        IndexError: if the relative path has no components.
    """
    segments = file_path.relative_to(gold_dir).parts
    category = segments[0]
    base_name = segments[-1].replace('.parquet', '')

    if category == 'national':
        # National datasets keep the file name verbatim, underscores included.
        return f"national-{base_name}"

    dashed_name = base_name.replace('_', '-')

    if category == 'reference':
        return f"reference-{dashed_name}"

    if category == 'states':
        # The component right after "states" is used as the state code.
        return f"states-{segments[1].lower()}-{dashed_name}"

    # Unknown layout: flatten the whole relative path into a single name.
    return '-'.join(segments).replace('.parquet', '').replace('_', '-')
def _stringify_complex_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Convert object columns that hold non-scalar values to strings, in place.

    A column is treated as "complex" when its first non-null value is not a
    ``str``, ``int``, ``float`` or ``bool``. Missing values are left missing
    rather than being turned into the literal strings "None"/"nan".
    Conversion is best-effort: a column that cannot be converted is logged
    and left untouched.
    """
    for col in df.columns:
        if df[col].dtype != 'object':
            continue
        try:
            non_null = df[col].dropna()
            if non_null.empty:
                continue
            first_val = non_null.iloc[0]
            if isinstance(first_val, (str, int, float, bool)):
                continue
            logger.warning(f" Converting complex column {col} to string")
            # na_action='ignore' skips nulls so they stay null in the
            # published dataset instead of becoming "None"/"nan" text.
            df[col] = df[col].map(str, na_action='ignore')
        except Exception as e:
            # Keep going with the remaining columns, but do not hide the problem.
            logger.warning(f" Could not convert column {col} to string: {e}")
    return df


def publish_dataset(file_path: Path, api: HfApi, private: bool = False) -> dict:
    """Publish a single parquet file to HuggingFace.

    Args:
        file_path: Parquet file to publish. Must be located under GOLD_DIR,
            since the dataset name is derived from its path relative to it.
        api: HfApi client. Not used by this function (uploads authenticate
            with HUGGINGFACE_TOKEN); kept so existing callers keep working.
        private: Passed through to repo creation and to the push.

    Returns:
        On success: ``{"repo_id": ..., "url": ..., "records": ...}``.
        On failure: ``{"error": ..., "file": ...}``.

    Raises:
        ValueError: if ``file_path`` exists but is not under GOLD_DIR.
    """
    if not file_path.exists():
        logger.warning(f"⚠️ Skipping {file_path} - file not found")
        return {"error": "File not found", "file": str(file_path)}

    dataset_name = get_dataset_name(file_path, GOLD_DIR)
    repo_id = f"{HF_ORGANIZATION}/{dataset_name}"
    logger.info(f"📤 Publishing {file_path.relative_to(GOLD_DIR)} to {repo_id}...")

    try:
        # Load parquet file
        df = pd.read_parquet(file_path)
        logger.info(f" Loaded {len(df):,} records, {len(df.columns)} columns")
        logger.info(f" Columns: {list(df.columns)}")

        # Reset index and ensure clean data
        df = df.reset_index(drop=True)

        # Convert any complex types to strings if needed
        df = _stringify_complex_columns(df)

        # Create HuggingFace dataset
        logger.info(" Creating dataset...")
        dataset = Dataset.from_pandas(df, preserve_index=False)

        # Create repo if it doesn't exist. With exist_ok=True an existing repo
        # is not an error, so a failure here is a real problem; it is still
        # best-effort, and the push below reports the definitive outcome.
        try:
            create_repo(
                repo_id=repo_id,
                repo_type="dataset",
                private=private,
                exist_ok=True,
                token=HUGGINGFACE_TOKEN,
            )
        except Exception as e:
            logger.warning(f" Could not create repo {repo_id}, attempting push anyway: {e}")

        # Push to hub
        logger.info(" Pushing to hub...")
        dataset.push_to_hub(
            repo_id=repo_id,
            private=private,
            commit_message=f"Update {dataset_name} - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
            token=HUGGINGFACE_TOKEN,
        )

        url = f"https://huggingface.co/datasets/{repo_id}"
        logger.success(f" ✅ Published {len(df):,} records to {url}")
        return {
            "repo_id": repo_id,
            "url": url,
            "records": len(df),
        }
    except Exception as e:
        # Boundary handler: one failed dataset must not abort the whole retry run.
        logger.error(f" ❌ Failed: {e}")
        logger.error(f" Full traceback:\n{traceback.format_exc()}")
        return {"error": str(e), "file": str(file_path)}
def main() -> int:
    """Retry publishing failed datasets.

    Publishes every path in FAILED_FILES and logs a success/failure summary.

    Returns:
        A process exit code: 0 when every dataset was published, 1 when
        HUGGINGFACE_TOKEN is missing or at least one publish failed.
    """
    if not HUGGINGFACE_TOKEN:
        logger.error("❌ HUGGINGFACE_TOKEN not set in environment")
        return 1

    api = HfApi(token=HUGGINGFACE_TOKEN)

    logger.info("=" * 80)
    logger.info(f"♻️ Retrying {len(FAILED_FILES)} failed datasets")
    logger.info("=" * 80)
    print()

    successful = 0
    failed = 0
    for file_str in FAILED_FILES:
        file_path = Path(file_str)
        logger.info(f"Processing {file_path.relative_to(GOLD_DIR)}")
        result = publish_dataset(file_path, api, private=False)
        if "error" in result:
            failed += 1
        else:
            successful += 1
        print()

    logger.info("=" * 80)
    logger.success(f"✅ Successful: {successful}")
    # Only raise an ERROR-level line when something actually failed, so a
    # clean run does not produce a false alarm in the logs.
    report_failed = logger.error if failed else logger.info
    report_failed(f"❌ Failed: {failed}")
    logger.info("=" * 80)

    return 1 if failed else 0


if __name__ == "__main__":
    # Propagate the outcome as the process exit status for shells / CI.
    raise SystemExit(main())
|