Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
| #!/usr/bin/env python3 | |
| """ | |
| Compare data completeness between states | |
| Shows what data is missing and what needs to be enriched. | |
| """ | |
| import pandas as pd | |
| from pathlib import Path | |
| from loguru import logger | |
| def analyze_state_data(state_code: str): | |
| """Analyze data completeness for a state""" | |
| state_dir = Path(f"data/gold/states/{state_code}") | |
| if not state_dir.exists(): | |
| logger.error(f"β State directory not found: {state_dir}") | |
| return | |
| print(f"\n{'='*80}") | |
| print(f"π STATE: {state_code}") | |
| print(f"{'='*80}\n") | |
| # Check nonprofit organizations file | |
| org_file = state_dir / "nonprofits_organizations.parquet" | |
| if org_file.exists(): | |
| df = pd.read_parquet(org_file) | |
| print(f"π’ NONPROFITS: {len(df):,} organizations") | |
| # Check for contact fields | |
| contact_fields = [c for c in df.columns if any(x in c.lower() | |
| for x in ['officer', 'phone', 'email', 'contact', 'address', 'street'])] | |
| if contact_fields: | |
| print(f" β Contact fields: {len(contact_fields)}") | |
| for field in contact_fields[:10]: # Show first 10 | |
| non_null = df[field].notna().sum() | |
| pct = (non_null / len(df)) * 100 | |
| print(f" - {field}: {non_null:,} ({pct:.1f}%)") | |
| else: | |
| print(f" β No contact fields found") | |
| print(f" π‘ Run: scripts/enrich_nonprofits_gt990.py") | |
| # Check for grant/financial fields | |
| grant_fields = [c for c in df.columns if 'grant' in c.lower()] | |
| if grant_fields: | |
| print(f"\n π° Grant fields: {len(grant_fields)}") | |
| else: | |
| print(f"\n β No grant fields in organizations file") | |
| else: | |
| print(f"β Nonprofits file not found: {org_file}") | |
| # Check grant tables | |
| print(f"\nπΈ GRANT TABLES:") | |
| grant_files = [ | |
| "grants_foundation_giving.parquet", | |
| "grants_nonprofit_to_nonprofit.parquet", | |
| "grants_revenue_sources.parquet" | |
| ] | |
| for filename in grant_files: | |
| filepath = state_dir / filename | |
| if filepath.exists(): | |
| df = pd.read_parquet(filepath) | |
| status = "β " if len(df) > 0 else "β οΈ EMPTY" | |
| print(f" {status} {filename}: {len(df):,} rows") | |
| else: | |
| print(f" β {filename}: NOT FOUND") | |
| # Check other data files | |
| print(f"\nπ OTHER DATA:") | |
| other_files = [ | |
| "nonprofits_financials.parquet", | |
| "nonprofits_programs.parquet", | |
| "nonprofits_locations.parquet", | |
| "contacts_local_officials.parquet", | |
| "meetings.parquet" | |
| ] | |
| for filename in other_files: | |
| filepath = state_dir / filename | |
| if filepath.exists(): | |
| df = pd.read_parquet(filepath) | |
| status = "β " if len(df) > 0 else "β οΈ EMPTY" | |
| file_size = filepath.stat().st_size / 1024 / 1024 # MB | |
| print(f" {status} {filename}: {len(df):,} rows ({file_size:.1f} MB)") | |
| else: | |
| print(f" β {filename}: NOT FOUND") | |
| def compare_states(): | |
| """Compare data across multiple states""" | |
| states = ['MA', 'AL', 'CA', 'NY', 'TX'] # Sample states | |
| print("\n" + "="*80) | |
| print("πΊοΈ STATE DATA COMPARISON") | |
| print("="*80) | |
| for state in states: | |
| state_dir = Path(f"data/gold/states/{state}") | |
| if state_dir.exists(): | |
| analyze_state_data(state) | |
| print("\n" + "="*80) | |
| print("π RECOMMENDATIONS") | |
| print("="*80) | |
| print(""" | |
| 1. To add contact information: | |
| ./enrich_alabama_nonprofits.sh | |
| 2. To add grant data (requires BigQuery setup): | |
| source .venv/bin/activate | |
| export GOOGLE_APPLICATION_CREDENTIALS=~/.gcp/bigquery-credentials.json | |
| python scripts/enrich_nonprofits_bigquery.py --state AL | |
| 3. To regenerate from IRS BMF: | |
| python -c "from discovery.irs_bmf_ingestion import IRSBMFIngestion; \\ | |
| bmf = IRSBMFIngestion(); \\ | |
| df = bmf.download_state_file('AL'); \\ | |
| df.to_parquet('data/gold/states/AL/nonprofits_organizations.parquet')" | |
| """) | |
| if __name__ == "__main__": | |
| import sys | |
| if len(sys.argv) > 1: | |
| # Analyze specific state | |
| state_code = sys.argv[1].upper() | |
| analyze_state_data(state_code) | |
| else: | |
| # Compare all states | |
| compare_states() | |