Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
File size: 5,805 Bytes
61d29fc | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 | #!/usr/bin/env python3
"""
Fix and publish the 5 failed datasets
"""
import os
from pathlib import Path
from datetime import datetime
import pandas as pd
from huggingface_hub import HfApi, create_repo
from datasets import Dataset
from loguru import logger
from dotenv import load_dotenv
# Load settings from a .env file (if one is present) before the os.getenv()
# lookups below, so values defined there are visible to them.
load_dotenv()
# Access token handed to the Hugging Face Hub calls in this script; main()
# refuses to run when it is unset.
HUGGINGFACE_TOKEN = os.getenv('HUGGINGFACE_TOKEN')
# Hub namespace under which dataset repo ids are built; falls back to
# 'CommunityOne' when the environment variable is not set.
HF_ORGANIZATION = os.getenv('HF_ORGANIZATION', 'CommunityOne')
# Base data directory. NOTE(review): nothing in the visible part of this file
# references GOLD_DIR; the functions below spell out "data/gold/..." directly.
GOLD_DIR = Path("data/gold")
def _stringify_preserving_nulls(column):
    """Return *column* converted to text, keeping missing values missing.

    A plain ``astype(str)`` turns ``None``/``NaN`` into the literal strings
    ``"None"``/``"nan"``, which would then be published as if they were data.
    """
    missing = column.isna()
    text = column.astype(str)
    if missing.any():
        # Go through object dtype so the nulls can be written back whatever
        # string representation ``astype(str)`` produced.
        text = text.astype(object)
        text.loc[missing.to_numpy()] = None
    return text


def fix_and_publish_jurisdictions():
    """Fix and publish the 4 jurisdiction files.

    For each parquet file the categorical and object columns are converted to
    plain text (missing values stay missing), the index is reset, and the
    frame is pushed to ``{HF_ORGANIZATION}/reference-<file-stem>`` as a public
    dataset. A failure on one file is logged and does not stop the others.
    """
    jurisdiction_files = [
        'data/gold/reference/jurisdictions_cities.parquet',
        'data/gold/reference/jurisdictions_counties.parquet',
        'data/gold/reference/jurisdictions_school_districts.parquet',
        'data/gold/reference/jurisdictions_townships.parquet',
    ]

    for file_str in jurisdiction_files:
        file_path = Path(file_str)
        dataset_name = f"reference-{file_path.stem.replace('_', '-')}"
        repo_id = f"{HF_ORGANIZATION}/{dataset_name}"
        logger.info(f"π€ Processing {file_path.name}...")

        try:
            # Load file
            df = pd.read_parquet(file_path)
            logger.info(f" Loaded {len(df):,} records, {len(df.columns)} columns")

            # FIX: convert categorical/object columns to plain text.
            # This fixes the Arrow dictionary/categorical issue.
            for col in df.columns:
                dtype = df[col].dtype
                if dtype.name == 'category':
                    logger.info(f" Converting categorical column: {col}")
                    df[col] = _stringify_preserving_nulls(df[col])
                elif dtype == 'object':
                    # Ensure all object columns are strings
                    df[col] = _stringify_preserving_nulls(df[col])

            # Reset index
            df = df.reset_index(drop=True)

            logger.info(" Creating HuggingFace dataset...")
            dataset = Dataset.from_pandas(df, preserve_index=False)

            # Create repo. Best effort: a failure here is only logged and the
            # push below is still attempted.
            try:
                create_repo(
                    repo_id=repo_id,
                    repo_type="dataset",
                    private=False,
                    exist_ok=True,
                    token=HUGGINGFACE_TOKEN,
                )
            except Exception as e:
                logger.debug(f" Repo may already exist: {e}")

            # Push to hub
            logger.info(f" Pushing to {repo_id}...")
            dataset.push_to_hub(
                repo_id=repo_id,
                private=False,
                commit_message=f"Update {dataset_name} - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
                token=HUGGINGFACE_TOKEN,
            )

            url = f"https://huggingface.co/datasets/{repo_id}"
            logger.success(f" β Published {len(df):,} records to {url}\n")
        except Exception as e:
            # Per-file boundary: report and move on to the next file.
            logger.error(f" β Failed: {e}\n")
def _report_old_meeting_file(path):
    """Log the row count of a legacy meetings file, or why it cannot be read."""
    if not path.exists():
        return
    try:
        df = pd.read_parquet(path)
    except Exception as e:
        logger.error(f"β {path.name} is corrupted: {e}")
        # The on-disk size helps tell a truncated file from a plain bad one.
        logger.info(f" File size: {path.stat().st_size / 1024 / 1024:.2f} MB")
    else:
        logger.info(f"π Old {path.name} has {len(df):,} records")


def check_old_meeting_files():
    """Check if we have old meetings.parquet files that should be replaced.

    When ``events_events.parquet`` exists, its record count is logged and any
    legacy ``meetings*.parquet`` files still present are listed as removable.
    When it does not exist, a warning is logged and each legacy file that is
    present is opened to report its record count (or the read error plus the
    file size).

    NOTE(review): the branch nesting was reconstructed from a source whose
    indentation had been lost - confirm against the original file.
    """
    logger.info("π Checking for old meeting file naming...")

    events_events = Path('data/gold/national/events_events.parquet')
    old_meetings_calendar = Path('data/gold/national/meetings_calendar.parquet')
    old_meetings = Path('data/gold/national/meetings.parquet')
    legacy_files = (old_meetings_calendar, old_meetings)

    if events_events.exists():
        try:
            df = pd.read_parquet(events_events)
        except Exception as e:
            logger.error(f"β events_events.parquet error: {e}")
        else:
            logger.success(f"β Found events_events.parquet with {len(df):,} records (new naming)")
            leftovers = [path for path in legacy_files if path.exists()]
            if leftovers:
                logger.warning("β οΈ Old meeting files still exist - these can be deleted:")
                for path in leftovers:
                    logger.info(f" - {path.name}")
                logger.info(" Run migration to rename old files to events_* naming\n")
    else:
        logger.warning("β οΈ events_events.parquet not found - run pipeline to generate")
        # Check if old files exist
        for path in legacy_files:
            _report_old_meeting_file(path)

    logger.info("")
def main():
    """Fix and publish failed datasets.

    Aborts with an error log when ``HUGGINGFACE_TOKEN`` is not set. Otherwise
    reports on legacy meeting files, publishes the jurisdiction reference
    datasets, and logs where the published datasets can be found.
    """
    if not HUGGINGFACE_TOKEN:
        logger.error("β HUGGINGFACE_TOKEN not set")
        return

    banner = "=" * 80

    logger.info(banner)
    logger.info("π§ Fixing and Publishing Failed Datasets")
    logger.info(banner)
    print()

    # Check for old meeting file naming
    check_old_meeting_files()

    # Fix and publish jurisdiction files
    logger.info("π Publishing 4 jurisdiction reference datasets...")
    print()
    fix_and_publish_jurisdictions()

    # Summary. Point at the namespace the datasets were actually pushed to
    # rather than a hard-coded organization name.
    logger.info(banner)
    logger.success(f"β Done! Check your datasets at: https://huggingface.co/{HF_ORGANIZATION}")
    logger.info(banner)


if __name__ == "__main__":
    main()
|