#!/usr/bin/env python3
"""
Compare data completeness between states
Shows what data is missing and what needs to be enriched.
"""
import pandas as pd
from pathlib import Path
from loguru import logger
# Substrings that mark a column as carrying contact information.
_CONTACT_KEYWORDS = ('officer', 'phone', 'email', 'contact', 'address', 'street')

# Grant tables: reported with row counts only.
_GRANT_FILES = (
    "grants_foundation_giving.parquet",
    "grants_nonprofit_to_nonprofit.parquet",
    "grants_revenue_sources.parquet",
)

# Remaining tables: reported with row counts and on-disk size.
_OTHER_FILES = (
    "nonprofits_financials.parquet",
    "nonprofits_programs.parquet",
    "nonprofits_locations.parquet",
    "contacts_local_officials.parquet",
    "meetings.parquet",
)


def _row_status(row_count: int) -> str:
    """Return the status marker for a table holding ``row_count`` rows."""
    return "✅" if row_count > 0 else "⚠️ EMPTY"


def _report_organizations(org_file: Path) -> None:
    """Print contact-field and grant-field coverage for the organizations file."""
    if not org_file.exists():
        print(f"❌ Nonprofits file not found: {org_file}")
        return

    df = pd.read_parquet(org_file)
    total = len(df)
    print(f"🏢 NONPROFITS: {total:,} organizations")

    # Contact-like columns are found by substring match on the column name.
    contact_fields = [
        col for col in df.columns
        if any(keyword in col.lower() for keyword in _CONTACT_KEYWORDS)
    ]
    if contact_fields:
        print(f" ✅ Contact fields: {len(contact_fields)}")
        for field in contact_fields[:10]:  # Show first 10
            non_null = df[field].notna().sum()
            # An empty table has no meaningful coverage: report 0% instead
            # of dividing by a zero row count.
            pct = (non_null / total) * 100 if total else 0.0
            print(f" - {field}: {non_null:,} ({pct:.1f}%)")
    else:
        print(" ❌ No contact fields found")
        print(" 💡 Run: scripts/enrich_nonprofits_gt990.py")

    # Grant/financial columns are found the same way.
    grant_fields = [col for col in df.columns if 'grant' in col.lower()]
    if grant_fields:
        print(f"\n 💰 Grant fields: {len(grant_fields)}")
    else:
        print("\n ❌ No grant fields in organizations file")


def _report_tables(state_dir: Path, filenames, *, show_size: bool = False) -> None:
    """Print one status line per parquet file in ``filenames`` under ``state_dir``."""
    for filename in filenames:
        filepath = state_dir / filename
        if not filepath.exists():
            print(f" ❌ {filename}: NOT FOUND")
            continue

        row_count = len(pd.read_parquet(filepath))
        line = f" {_row_status(row_count)} {filename}: {row_count:,} rows"
        if show_size:
            size_mb = filepath.stat().st_size / 1024 / 1024  # MB
            line += f" ({size_mb:.1f} MB)"
        print(line)


def analyze_state_data(state_code: str) -> None:
    """Print a data-completeness report for one state.

    The report covers the files under ``data/gold/states/<state_code>``
    (resolved relative to the current working directory): the nonprofit
    organizations file with its contact and grant columns, the grant
    tables, and the remaining data tables.

    Args:
        state_code: Name of the state's directory under ``data/gold/states``.

    Returns:
        None. If the state directory does not exist, an error is logged and
        nothing is printed.
    """
    state_dir = Path(f"data/gold/states/{state_code}")
    if not state_dir.exists():
        logger.error(f"❌ State directory not found: {state_dir}")
        return

    # NOTE(review): the emoji in this block were reconstructed from a
    # mis-decoded copy of the file. The 📍 (STATE) and 📁 (OTHER DATA)
    # header glyphs and the ❌ marker lost their distinguishing bytes and
    # are best-guess stand-ins; confirm against the repository copy.
    print(f"\n{'='*80}")
    print(f"📍 STATE: {state_code}")
    print(f"{'='*80}\n")

    _report_organizations(state_dir / "nonprofits_organizations.parquet")

    print("\n💸 GRANT TABLES:")
    _report_tables(state_dir, _GRANT_FILES)

    print("\n📁 OTHER DATA:")
    _report_tables(state_dir, _OTHER_FILES, show_size=True)
def compare_states(states=None) -> None:
    """Print completeness reports for several states, then next-step advice.

    Each state that has a directory under ``data/gold/states`` (relative to
    the current working directory) is passed to ``analyze_state_data``.
    States without a directory are not analyzed; they are listed in a single
    line after the reports so the omission is visible.

    Args:
        states: Iterable of state directory names to compare. Defaults to a
            fixed sample of five states when omitted.

    Returns:
        None. All output goes to stdout.
    """
    if states is None:
        states = ['MA', 'AL', 'CA', 'NY', 'TX']  # Sample states

    rule = "=" * 80
    print("\n" + rule)
    print("🗺️ STATE DATA COMPARISON")
    print(rule)

    skipped = []
    for state in states:
        if Path(f"data/gold/states/{state}").exists():
            analyze_state_data(state)
        else:
            skipped.append(state)
    if skipped:
        print(f"\n⚠️ No data directory found for: {', '.join(skipped)}")

    # NOTE(review): the header glyph below was reconstructed from a
    # mis-decoded copy of the file; 📋 is a best-guess stand-in, confirm
    # against the repository copy.
    print("\n" + rule)
    print("📋 RECOMMENDATIONS")
    print(rule)

    # The trailing "\\" sequences print a literal backslash so the python -c
    # command can be pasted into a shell as one continued line.
    recommendations = (
        "\n"
        "1. To add contact information:\n"
        "./enrich_alabama_nonprofits.sh\n"
        "2. To add grant data (requires BigQuery setup):\n"
        "source .venv/bin/activate\n"
        "export GOOGLE_APPLICATION_CREDENTIALS=~/.gcp/bigquery-credentials.json\n"
        "python scripts/enrich_nonprofits_bigquery.py --state AL\n"
        "3. To regenerate from IRS BMF:\n"
        "python -c \"from discovery.irs_bmf_ingestion import IRSBMFIngestion; \\\n"
        "bmf = IRSBMFIngestion(); \\\n"
        "df = bmf.download_state_file('AL'); \\\n"
        "df.to_parquet('data/gold/states/AL/nonprofits_organizations.parquet')\"\n"
    )
    print(recommendations)
if __name__ == "__main__":
    import sys

    # A state code on the command line selects the single-state report;
    # with no arguments the multi-state comparison runs instead.
    cli_args = sys.argv[1:]
    if cli_args:
        analyze_state_data(cli_args[0].upper())
    else:
        compare_states()
|