Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
File size: 7,200 Bytes
#!/usr/bin/env python3
"""
Finalize HuggingFace dataset structure by organizing all files.
Structure:
- data/gold/national/ - Full national datasets (consolidated)
- data/gold/states/ - State-by-state datasets
- data/gold/reference/ - Lookup tables and reference data
"""
import shutil
from pathlib import Path
from loguru import logger
# Consolidated (all-state) parquet files that are moved into national/.
_NATIONAL_FILES = (
    "nonprofits_organizations.parquet",
    "nonprofits_locations.parquet",
    "nonprofits_financials.parquet",
    "nonprofits_programs.parquet",
)

# Lookup/reference parquet files that are moved into reference/.
_REFERENCE_FILES = (
    "causes_everyorg_causes.parquet",
    "causes_ntee_codes.parquet",
    "domains_gsa_domains.parquet",
    "jurisdictions_cities.parquet",
    "jurisdictions_counties.parquet",
    "jurisdictions_school_districts.parquet",
    "jurisdictions_townships.parquet",
)

# Markdown written to national/README.md.
_NATIONAL_README = """\
# National Nonprofit Datasets

These files contain **all U.S. nonprofit organizations** in single consolidated files.

## Files

- **nonprofits_organizations.parquet** (~134 MB) - 3.9M nonprofits with core data
- **nonprofits_locations.parquet** (~86 MB) - 1.9M location records
- **nonprofits_financials.parquet** (~77 MB) - Financial data from Form 990
- **nonprofits_programs.parquet** (~65 MB) - Programs and services

## When to Use

✅ **Use this** if you need:
- Complete national analysis
- All states in one dataset
- Maximum convenience (single file per dataset)

❌ **Use states/ instead** if you need:
- Only specific states
- Smaller downloads
- State-by-state analysis

## Example

```python
import pandas as pd

# Load all 3.9M nonprofits
df = pd.read_parquet('national/nonprofits_organizations.parquet')
print(f"Total nonprofits: {len(df):,}")

# Filter to your state of interest
ca_orgs = df[df['state'] == 'CA']
print(f"California nonprofits: {len(ca_orgs):,}")
```

## Comparison

| Approach | File Size | Use Case |
|----------|-----------|----------|
| `national/nonprofits_organizations.parquet` | 134 MB | All 3.9M nonprofits |
| `states/CA/nonprofits_organizations.parquet` | 15 MB | Just California (~400K) |
| `states/*/nonprofits_organizations.parquet` | 347 MB total | All states (organized) |

**Note:** The `states/` directory contains the same data split by state for easier discovery and partial downloads.
"""

# Markdown written to reference/README.md.
_REFERENCE_README = """\
# Reference Data

Lookup tables and reference datasets for nonprofit analysis.

## Files

### Cause Codes
- **causes_ntee_codes.parquet** - National Taxonomy of Exempt Entities (NTEE) codes
- **causes_everyorg_causes.parquet** - Every.org cause categories

### Jurisdictions
- **jurisdictions_cities.parquet** - 19,495 incorporated cities
- **jurisdictions_counties.parquet** - 3,234 counties
- **jurisdictions_school_districts.parquet** - 13,362 school districts
- **jurisdictions_townships.parquet** - 16,360 townships

### Domains
- **domains_gsa_domains.parquet** - U.S. government domains from GSA

## Usage

These are small lookup tables (< 3 MB each) used to enrich nonprofit data.

```python
import pandas as pd

# Load NTEE codes
ntee = pd.read_parquet('reference/causes_ntee_codes.parquet')
print(ntee.head())

# Load cities
cities = pd.read_parquet('reference/jurisdictions_cities.parquet')
print(f"Total cities: {len(cities):,}")
```
"""

# Section added to states/README.md pointing readers at national/.
_STATES_NATIONAL_NOTE = """
## 🌎 National Datasets

Looking for **all states in one file**? See the [`national/`](../national/) directory for consolidated datasets containing all 3.9M nonprofits.
"""


def _move_files(filenames, source_dir, target_dir, size_spec):
    """Move each listed file that exists in source_dir into target_dir.

    Files missing from source_dir are skipped silently, so re-running after a
    previous successful move is a no-op. The size of every moved file is
    logged in MB using the format spec ``size_spec`` (e.g. ``".1f"``).
    """
    for filename in filenames:
        src = source_dir / filename
        if not src.exists():
            continue
        dst = target_dir / filename
        shutil.move(str(src), str(dst))
        size_mb = dst.stat().st_size / 1024 / 1024
        logger.info(
            f"  ✅ {filename} → {target_dir.name}/ ({size_mb:{size_spec}} MB)"
        )


def _insert_after_first_heading(content, addition):
    """Return content with addition inserted after its first Markdown heading.

    The insertion point is just past the first line starting with ``#`` and
    any blank lines directly below it. A leading ``---`` front-matter block is
    skipped so the addition never lands inside it. If no heading is found, the
    addition goes at line index 2 (clamped to the end of the text).
    """
    lines = content.split("\n")

    # Skip a leading front-matter block, if it is properly closed.
    search_from = 0
    if lines and lines[0].strip() == "---":
        for idx in range(1, len(lines)):
            if lines[idx].strip() == "---":
                search_from = idx + 1
                break

    position = 2  # fallback when the text has no heading
    for idx in range(search_from, len(lines)):
        if lines[idx].startswith("#"):
            position = idx + 1
            # Keep the blank separator lines attached to the heading.
            while position < len(lines) and not lines[position].strip():
                position += 1
            break

    lines.insert(position, addition)
    return "\n".join(lines)


def organize_files() -> None:
    """Organize the gold-layer files into national/, reference/ and states/.

    Works on ``data/gold`` relative to the current working directory:

    1. Moves the consolidated nonprofit parquet files into ``national/``.
    2. Moves the lookup-table parquet files into ``reference/``.
    3. Writes a ``README.md`` into ``national/`` and ``reference/``
       (overwriting any existing one).
    4. If ``states/README.md`` exists and does not yet mention ``national/``,
       inserts a pointer to the national datasets after its first heading.

    Progress and a summary of the final layout are logged.

    Raises:
        FileNotFoundError: if ``data/gold`` does not exist, because the
            target directories are created without ``parents=True``.
    """
    rule = "=" * 70
    logger.info(rule)
    logger.info("🏗️  Finalizing HuggingFace dataset structure")
    logger.info(rule)

    gold_dir = Path("data/gold")

    # 1. Move consolidated nonprofit files to national/
    logger.info("\n📦 Moving consolidated datasets to national/")
    national_dir = gold_dir / "national"
    national_dir.mkdir(exist_ok=True)
    _move_files(_NATIONAL_FILES, gold_dir, national_dir, ".1f")

    # 2. Move reference/lookup tables to reference/
    logger.info("\n📚 Moving reference data to reference/")
    reference_dir = gold_dir / "reference"
    reference_dir.mkdir(exist_ok=True)
    _move_files(_REFERENCE_FILES, gold_dir, reference_dir, ".2f")

    # 3. Create READMEs. The text contains emoji, so the encoding is pinned to
    # UTF-8 instead of relying on the platform's locale default.
    logger.info("\n📝 Creating documentation")
    national_readme = national_dir / "README.md"
    national_readme.write_text(_NATIONAL_README, encoding="utf-8")
    logger.success(f"  ✅ Created {national_readme.name}")

    reference_readme = reference_dir / "README.md"
    reference_readme.write_text(_REFERENCE_README, encoding="utf-8")
    logger.success(f"  ✅ Created {reference_readme.name}")

    # 4. Update main states README (only once: skipped when it already
    # mentions national/).
    states_readme = gold_dir / "states" / "README.md"
    if states_readme.exists():
        content = states_readme.read_text(encoding="utf-8")
        if "national/" not in content:
            states_readme.write_text(
                _insert_after_first_heading(content, _STATES_NATIONAL_NOTE),
                encoding="utf-8",
            )
            logger.success("  ✅ Updated states/README.md")

    # Summary
    logger.info("\n" + rule)
    logger.success("✅ COMPLETE: HuggingFace dataset structure finalized")
    logger.info(rule)
    logger.info("\n📁 Final structure:")
    logger.info("   data/gold/")
    logger.info("   ├── national/           # Full national datasets (362 MB)")
    logger.info("   │   ├── nonprofits_organizations.parquet")
    logger.info("   │   ├── nonprofits_locations.parquet")
    logger.info("   │   ├── nonprofits_financials.parquet")
    logger.info("   │   ├── nonprofits_programs.parquet")
    logger.info("   │   └── README.md")
    logger.info("   ├── states/             # State-by-state datasets (347 MB)")
    logger.info("   │   ├── AL/")
    logger.info("   │   ├── CA/")
    logger.info("   │   ├── ... (62 states)")
    logger.info("   │   └── README.md")
    logger.info("   └── reference/          # Lookup tables (6 MB)")
    logger.info("       ├── causes_ntee_codes.parquet")
    logger.info("       ├── jurisdictions_cities.parquet")
    logger.info("       ├── ... (7 files)")
    logger.info("       └── README.md")
    logger.info("\n💡 Users can choose:")
    logger.info("   - national/  → Complete datasets (best for national analysis)")
    logger.info("   - states/    → State-specific data (best for regional analysis)")
    logger.info("   - reference/ → Lookup tables (NTEE codes, jurisdictions, etc.)")
# Script entry point: reorganize the files only when this module is executed
# directly, not when it is imported.
if __name__ == "__main__":
    organize_files()
|