open-navigator / scripts /huggingface /finalize_huggingface_structure.py
jcbowyer's picture
Clean HuggingFace deployment without binary files
61d29fc
#!/usr/bin/env python3
"""
Finalize HuggingFace dataset structure by organizing all files.
Structure:
- data/gold/national/ - Full national datasets (consolidated)
- data/gold/states/ - State-by-state datasets
- data/gold/reference/ - Lookup tables and reference data
"""
import shutil
from pathlib import Path
from loguru import logger
# Consolidated all-states parquet files that are moved into <gold_dir>/national/.
_NATIONAL_FILES = (
    "nonprofits_organizations.parquet",
    "nonprofits_locations.parquet",
    "nonprofits_financials.parquet",
    "nonprofits_programs.parquet",
)

# Lookup tables that are moved into <gold_dir>/reference/.
_REFERENCE_FILES = (
    "causes_everyorg_causes.parquet",
    "causes_ntee_codes.parquet",
    "domains_gsa_domains.parquet",
    "jurisdictions_cities.parquet",
    "jurisdictions_counties.parquet",
    "jurisdictions_school_districts.parquet",
    "jurisdictions_townships.parquet",
)

# README written to <gold_dir>/national/README.md. Blank lines separate the
# Markdown blocks so the closing note is not absorbed into the table.
_NATIONAL_README = """\
# National Nonprofit Datasets

These files contain **all U.S. nonprofit organizations** in single consolidated files.

## Files

- **nonprofits_organizations.parquet** (~134 MB) - 3.9M nonprofits with core data
- **nonprofits_locations.parquet** (~86 MB) - 1.9M location records
- **nonprofits_financials.parquet** (~77 MB) - Financial data from Form 990
- **nonprofits_programs.parquet** (~65 MB) - Programs and services

## When to Use

✅ **Use this** if you need:

- Complete national analysis
- All states in one dataset
- Maximum convenience (single file per dataset)

❌ **Use states/ instead** if you need:

- Only specific states
- Smaller downloads
- State-by-state analysis

## Example

```python
import pandas as pd

# Load all 3.9M nonprofits
df = pd.read_parquet('national/nonprofits_organizations.parquet')
print(f"Total nonprofits: {len(df):,}")

# Filter to your state of interest
ca_orgs = df[df['state'] == 'CA']
print(f"California nonprofits: {len(ca_orgs):,}")
```

## Comparison

| Approach | File Size | Use Case |
|----------|-----------|----------|
| `national/nonprofits_organizations.parquet` | 134 MB | All 3.9M nonprofits |
| `states/CA/nonprofits_organizations.parquet` | 15 MB | Just California (~400K) |
| `states/*/nonprofits_organizations.parquet` | 347 MB total | All states (organized) |

**Note:** The `states/` directory contains the same data split by state for easier discovery and partial downloads.
"""

# README written to <gold_dir>/reference/README.md.
_REFERENCE_README = """\
# Reference Data

Lookup tables and reference datasets for nonprofit analysis.

## Files

### Cause Codes

- **causes_ntee_codes.parquet** - National Taxonomy of Exempt Entities (NTEE) codes
- **causes_everyorg_causes.parquet** - Every.org cause categories

### Jurisdictions

- **jurisdictions_cities.parquet** - 19,495 incorporated cities
- **jurisdictions_counties.parquet** - 3,234 counties
- **jurisdictions_school_districts.parquet** - 13,362 school districts
- **jurisdictions_townships.parquet** - 16,360 townships

### Domains

- **domains_gsa_domains.parquet** - U.S. government domains from GSA

## Usage

These are small lookup tables (< 3 MB each) used to enrich nonprofit data.

```python
import pandas as pd

# Load NTEE codes
ntee = pd.read_parquet('reference/causes_ntee_codes.parquet')
print(ntee.head())

# Load cities
cities = pd.read_parquet('reference/jurisdictions_cities.parquet')
print(f"Total cities: {len(cities):,}")
```
"""

# Section spliced into <gold_dir>/states/README.md to point at national/.
_STATES_README_NOTE = """
## 🌎 National Datasets

Looking for **all states in one file**? See the [`national/`](../national/) directory for consolidated datasets containing all 3.9M nonprofits.
"""

# Directory tree printed in the final summary (sizes and counts are static text).
_FINAL_STRUCTURE = (
    " data/gold/",
    " ├── national/ # Full national datasets (362 MB)",
    " │   ├── nonprofits_organizations.parquet",
    " │   ├── nonprofits_locations.parquet",
    " │   ├── nonprofits_financials.parquet",
    " │   ├── nonprofits_programs.parquet",
    " │   └── README.md",
    " ├── states/ # State-by-state datasets (347 MB)",
    " │   ├── AL/",
    " │   ├── CA/",
    " │   ├── ... (62 states)",
    " │   └── README.md",
    " └── reference/ # Lookup tables (6 MB)",
    "     ├── causes_ntee_codes.parquet",
    "     ├── jurisdictions_cities.parquet",
    "     ├── ... (7 files)",
    "     └── README.md",
)


def _move_into(gold_dir, target_dir, filenames, size_decimals):
    """Move each listed file from *gold_dir* into *target_dir*, logging its size in MB."""
    # parents=True so a fresh checkout without data/gold does not crash here.
    target_dir.mkdir(parents=True, exist_ok=True)
    for filename in filenames:
        src = gold_dir / filename
        dst = target_dir / filename
        if not src.exists():
            # Already-moved files (re-runs) are skipped quietly; only a file
            # that is in neither place is worth flagging.
            if not dst.exists():
                logger.warning(f" {filename} not found in {gold_dir}; skipped")
            continue
        shutil.move(str(src), str(dst))
        size_mb = dst.stat().st_size / 1024 / 1024
        logger.info(
            f" ✅ {filename} → {target_dir.name}/ ({size_mb:.{size_decimals}f} MB)"
        )


def _insert_after_first_header(content, addition):
    """Return *content* with *addition* inserted after its first Markdown header."""
    lines = content.split("\n")

    # Skip a leading YAML front-matter block so the note never lands inside it.
    body_start = 0
    if lines[0].strip() == "---":
        for idx in range(1, len(lines)):
            if lines[idx].strip() == "---":
                body_start = idx + 1
                break

    header_idx = next(
        (i for i in range(body_start, len(lines)) if lines[i].startswith("#")),
        None,
    )
    if header_idx is None:
        # No header at all: fall back to the historical fixed position.
        insert_at = max(body_start, 2)
    else:
        insert_at = header_idx + 1
        # Keep the blank line that usually follows the title attached to it.
        if insert_at < len(lines) and not lines[insert_at].strip():
            insert_at += 1
    lines.insert(insert_at, addition)
    return "\n".join(lines)


def organize_files(gold_dir: "str | Path" = "data/gold") -> None:
    """Organize the gold-layer files into national/, reference/ and states/.

    Steps performed, in order:

    1. Move the consolidated nonprofit parquet files into ``national/``.
    2. Move the lookup-table parquet files into ``reference/``.
    3. (Re)write ``README.md`` in ``national/`` and ``reference/``.
    4. If ``states/README.md`` exists and does not mention ``national/`` yet,
       insert a pointer to the national datasets after its first header.
    5. Log a summary of the resulting layout.

    Files that are not present in ``gold_dir`` are skipped, so the function
    can be re-run after a previous (partial) run.

    Args:
        gold_dir: Root of the gold layer. Defaults to ``data/gold`` relative
            to the current working directory.
    """
    gold_dir = Path(gold_dir)
    banner = "=" * 70

    logger.info(banner)
    logger.info("🗂️ Finalizing HuggingFace dataset structure")
    logger.info(banner)

    # 1. Move consolidated nonprofit files to national/
    logger.info("\n📦 Moving consolidated datasets to national/")
    national_dir = gold_dir / "national"
    _move_into(gold_dir, national_dir, _NATIONAL_FILES, size_decimals=1)

    # 2. Move reference/lookup tables to reference/
    logger.info("\n📚 Moving reference data to reference/")
    reference_dir = gold_dir / "reference"
    _move_into(gold_dir, reference_dir, _REFERENCE_FILES, size_decimals=2)

    # 3. Create READMEs. Explicit UTF-8: the text contains emoji, which the
    # platform default encoding cannot always represent.
    logger.info("\n📝 Creating documentation")
    national_readme = national_dir / "README.md"
    national_readme.write_text(_NATIONAL_README, encoding="utf-8")
    logger.success(f" ✅ Created {national_readme.name}")

    reference_readme = reference_dir / "README.md"
    reference_readme.write_text(_REFERENCE_README, encoding="utf-8")
    logger.success(f" ✅ Created {reference_readme.name}")

    # 4. Update main states README (only once: skipped when it already
    # mentions national/).
    states_readme = gold_dir / "states" / "README.md"
    if states_readme.exists():
        content = states_readme.read_text(encoding="utf-8")
        if "national/" not in content:
            states_readme.write_text(
                _insert_after_first_header(content, _STATES_README_NOTE),
                encoding="utf-8",
            )
            logger.success(" ✅ Updated states/README.md")

    # Summary
    logger.info("\n" + banner)
    logger.success("✅ COMPLETE: HuggingFace dataset structure finalized")
    logger.info(banner)
    logger.info("\n📁 Final structure:")
    for tree_line in _FINAL_STRUCTURE:
        logger.info(tree_line)
    logger.info("\n💡 Users can choose:")
    logger.info(" - national/ → Complete datasets (best for national analysis)")
    logger.info(" - states/ → State-specific data (best for regional analysis)")
    logger.info(" - reference/ → Lookup tables (NTEE codes, jurisdictions, etc.)")
# Script entry point: reorganize the files only when this module is executed
# directly, not when it is imported.
if __name__ == "__main__":
    organize_files()