Spaces:

CommunityOne
/

open-navigator

Running on CPU Upgrade

App Files Files Community

open-navigator / scripts / huggingface / retry_failed_datasets.py

jcbowyer

Clean HuggingFace deployment without binary files

61d29fc 28 days ago

raw

history blame contribute delete

5.15 kB

	#!/usr/bin/env python3
	"""
	Retry publishing just the failed datasets
	"""
	import os
	from pathlib import Path
	from datetime import datetime
	import pandas as pd
	from huggingface_hub import HfApi, create_repo
	from datasets import Dataset
	from loguru import logger
	from dotenv import load_dotenv
	import traceback

	# Load environment variables (python-dotenv; presumably from a local .env
	# file) before the os.getenv() lookups below run.
	load_dotenv()

	# Configuration
	# Token handed to every HuggingFace Hub call; main() refuses to run when it
	# is unset or empty.
	HUGGINGFACE_TOKEN = os.getenv('HUGGINGFACE_TOKEN')
	# Hub namespace used as the "<namespace>/" prefix of each repo id; falls
	# back to 'CommunityOne' when the variable is not set.
	HF_ORGANIZATION = os.getenv('HF_ORGANIZATION', 'CommunityOne')

	# Failed datasets to retry. These are relative paths (resolved against the
	# current working directory), and each one sits under GOLD_DIR.
	FAILED_FILES = [
	"data/gold/national/meetings.parquet",
	"data/gold/reference/jurisdictions_cities.parquet",
	"data/gold/reference/jurisdictions_counties.parquet",
	"data/gold/reference/jurisdictions_school_districts.parquet",
	"data/gold/reference/jurisdictions_townships.parquet",
	]

	# Root that file paths are made relative to when deriving dataset names
	# and when logging which file is being processed.
	GOLD_DIR = Path("data/gold")


	def get_dataset_name(file_path: Path, gold_dir: Path) -> str:
	    """Generate HuggingFace dataset name from file path.

	    The name is built from ``file_path`` taken relative to ``gold_dir``;
	    the first path component selects the naming scheme:

	    * ``national/.../<file>``       -> ``national-<file>`` (underscores kept)
	    * ``reference/.../<file>``      -> ``reference-<file>`` with ``_`` -> ``-``
	    * ``states/<code>/.../<file>``  -> ``states-<code lowercased>-<file>``
	      with ``_`` -> ``-`` in the file part
	    * ``states/<file>``             -> ``states-<file>`` (no state directory)
	    * anything else                 -> every path component joined with
	      ``-`` and ``_`` replaced by ``-``

	    Args:
	        file_path: Path of the parquet file; must be located under ``gold_dir``.
	        gold_dir: Root directory the dataset name is computed relative to.

	    Returns:
	        The dataset name, without the organization prefix.

	    Raises:
	        ValueError: If ``file_path`` is not under ``gold_dir`` (raised by
	            ``Path.relative_to``) or is ``gold_dir`` itself.
	    """
	    extension = '.parquet'

	    parts = list(file_path.relative_to(gold_dir).parts)
	    if not parts:
	        raise ValueError(f"{file_path} does not name a file inside {gold_dir}")

	    # Drop the extension only when it really is the trailing suffix; a blanket
	    # str.replace would also eat '.parquet' occurring in the middle of a name.
	    filename = parts[-1]
	    if filename.endswith(extension):
	        filename = filename[:-len(extension)]
	    dashed_filename = filename.replace('_', '-')

	    category = parts[0]
	    if category == 'national':
	        # National datasets keep their underscores.
	        return f"national-{filename}"
	    if category == 'reference':
	        return f"reference-{dashed_filename}"
	    if category == 'states':
	        # A state code exists only when a directory sits between 'states'
	        # and the file; otherwise parts[1] would be the file name itself.
	        if len(parts) >= 3:
	            return f"states-{parts[1].lower()}-{dashed_filename}"
	        return f"states-{dashed_filename}"

	    # Unknown category: flatten the whole relative path into the name.
	    return '-'.join(parts[:-1] + [filename]).replace('_', '-')


	def _stringify_complex_columns(df: pd.DataFrame) -> pd.DataFrame:
	    """Turn object columns that hold non-scalar values into string columns.

	    A column is treated as "complex" when its first non-null value is not a
	    str/int/float/bool. Only non-null cells are converted, so missing values
	    stay missing instead of becoming the literal text "None" or "nan".
	    The frame is modified in place and also returned.
	    """
	    for col in df.columns:
	        if df[col].dtype != 'object':
	            continue

	        # Best effort: a column that cannot be normalised is reported and left
	        # untouched rather than aborting the whole upload here.
	        try:
	            present = df[col].notna()
	            if not present.any():
	                continue

	            first_val = df[col][present].iloc[0]
	            if isinstance(first_val, (str, int, float, bool)):
	                continue

	            logger.warning(f" Converting complex column {col} to string")
	            converted = df[col].copy()
	            converted[present] = df[col][present].map(str)
	            df[col] = converted
	        except Exception as exc:
	            logger.warning(f" Could not convert column {col} to string: {exc}")

	    return df


	def publish_dataset(file_path: Path, api: HfApi, private: bool = False) -> dict:
	    """Publish a single parquet file to HuggingFace.

	    Args:
	        file_path: Parquet file to upload; expected to live under ``GOLD_DIR``.
	        api: Hub client supplied by the caller. It is accepted for interface
	            compatibility but not used here; the module-level helpers are
	            called with ``HUGGINGFACE_TOKEN`` instead.
	        private: Whether the dataset repo is created/pushed as private.

	    Returns:
	        On success, a dict with ``repo_id``, ``url`` and ``records``.
	        On failure, a dict containing an ``error`` message (plus ``file``
	        when the failure happened during load or upload).
	    """
	    if not file_path.exists():
	        logger.warning(f"⚠️ Skipping {file_path} - file not found")
	        return {"error": "File not found"}

	    dataset_name = get_dataset_name(file_path, GOLD_DIR)
	    repo_id = f"{HF_ORGANIZATION}/{dataset_name}"

	    logger.info(f"📤 Publishing {file_path.relative_to(GOLD_DIR)} to {repo_id}...")

	    try:
	        # Load parquet file
	        df = pd.read_parquet(file_path)
	        logger.info(f" Loaded {len(df):,} records, {len(df.columns)} columns")
	        logger.info(f" Columns: {list(df.columns)}")

	        # Start from a clean positional index, then flatten nested values
	        # into strings before building the dataset.
	        df = df.reset_index(drop=True)
	        df = _stringify_complex_columns(df)

	        # Create HuggingFace dataset
	        logger.info(" Creating dataset...")
	        dataset = Dataset.from_pandas(df, preserve_index=False)

	        # Create repo if it doesn't exist. A failure here is only logged:
	        # the push below will surface any real problem with the repo.
	        try:
	            create_repo(
	                repo_id=repo_id,
	                repo_type="dataset",
	                private=private,
	                exist_ok=True,
	                token=HUGGINGFACE_TOKEN,
	            )
	        except Exception as e:
	            logger.debug(f" Repo may already exist: {e}")

	        # Push to hub
	        logger.info(" Pushing to hub...")
	        dataset.push_to_hub(
	            repo_id=repo_id,
	            private=private,
	            commit_message=f"Update {dataset_name} - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
	            token=HUGGINGFACE_TOKEN,
	        )

	        url = f"https://huggingface.co/datasets/{repo_id}"
	        logger.success(f" ✅ Published {len(df):,} records to {url}")

	        return {
	            "repo_id": repo_id,
	            "url": url,
	            "records": len(df),
	        }

	    except Exception as e:
	        # Report the failure to the caller instead of raising, so one bad
	        # dataset does not stop the remaining uploads.
	        logger.error(f" ❌ Failed: {e}")
	        logger.error(f" Full traceback:\n{traceback.format_exc()}")
	        return {"error": str(e), "file": str(file_path)}


	def main():
	    """Retry publishing failed datasets.

	    Walks ``FAILED_FILES``, publishes each one with ``publish_dataset`` and
	    logs a success/failure summary.

	    Returns:
	        int: Process exit status - 0 when every dataset was published, 1 when
	        the token is missing or at least one dataset failed.
	    """
	    if not HUGGINGFACE_TOKEN:
	        logger.error("❌ HUGGINGFACE_TOKEN not set in environment")
	        return 1

	    api = HfApi(token=HUGGINGFACE_TOKEN)

	    logger.info("=" * 80)
	    logger.info(f"♻️ Retrying {len(FAILED_FILES)} failed datasets")
	    logger.info("=" * 80)
	    print()

	    successful = 0
	    failed = 0

	    for file_str in FAILED_FILES:
	        file_path = Path(file_str)
	        logger.info(f"Processing {file_path.relative_to(GOLD_DIR)}")
	        result = publish_dataset(file_path, api, private=False)

	        # publish_dataset signals failure through an "error" key.
	        if "error" in result:
	            failed += 1
	        else:
	            successful += 1

	        print()

	    logger.info("=" * 80)
	    logger.success(f"✅ Successful: {successful}")
	    # Only escalate to ERROR level when something actually failed, so a clean
	    # run does not leave error records in the log.
	    if failed:
	        logger.error(f"❌ Failed: {failed}")
	    else:
	        logger.info(f"❌ Failed: {failed}")
	    logger.info("=" * 80)

	    return 1 if failed else 0


	if __name__ == "__main__":
	    # Propagate the outcome to the shell so callers (cron, CI) can detect
	    # a failed retry run.
	    raise SystemExit(main())