File size: 4,424 Bytes
896453f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#!/usr/bin/env python3
"""
Compare data completeness between states

Shows what data is missing and what needs to be enriched.
"""

import pandas as pd
from pathlib import Path
from loguru import logger


def _report_parquet_files(state_dir, filenames, *, show_size=False):
    """Print one status line per expected parquet file in *state_dir*.

    Args:
        state_dir: Directory (a ``Path``) that should contain the files.
        filenames: Iterable of parquet file names to look for.
        show_size: When true, append the on-disk size in MB to each line.
    """
    for filename in filenames:
        filepath = state_dir / filename
        if not filepath.exists():
            print(f"   ❌ {filename}: NOT FOUND")
            continue

        row_count = len(pd.read_parquet(filepath))
        status = "✅" if row_count > 0 else "⚠️  EMPTY"
        line = f"   {status} {filename}: {row_count:,} rows"
        if show_size:
            size_mb = filepath.stat().st_size / 1024 / 1024  # bytes -> MB
            line += f" ({size_mb:.1f} MB)"
        print(line)


def analyze_state_data(state_code: str) -> None:
    """Print a data-completeness report for one state.

    Looks under ``data/gold/states/<state_code>`` (relative to the current
    working directory) and reports:

    * the nonprofit organizations table: row count, up to ten
      contact-related columns with their non-null coverage, and whether any
      column name mentions "grant";
    * the three grant tables: presence and row count;
    * the other expected tables: presence, row count and file size.

    Args:
        state_code: Name of the state directory, used verbatim in the path.

    Returns:
        None. The report is written to stdout; a missing state directory is
        logged as an error and nothing else is printed.
    """
    state_dir = Path(f"data/gold/states/{state_code}")

    if not state_dir.exists():
        logger.error(f"❌ State directory not found: {state_dir}")
        return

    print(f"\n{'='*80}")
    print(f"📊 STATE: {state_code}")
    print(f"{'='*80}\n")

    # Check nonprofit organizations file
    org_file = state_dir / "nonprofits_organizations.parquet"
    if org_file.exists():
        df = pd.read_parquet(org_file)
        total_rows = len(df)
        print(f"🏢 NONPROFITS: {total_rows:,} organizations")

        # A column counts as a contact field when its name contains any of
        # these substrings (case-insensitive).
        contact_keywords = ('officer', 'phone', 'email', 'contact', 'address', 'street')
        contact_fields = [
            column for column in df.columns
            if any(keyword in column.lower() for keyword in contact_keywords)
        ]

        if contact_fields:
            print(f"   ✅ Contact fields: {len(contact_fields)}")
            for field in contact_fields[:10]:  # Show first 10
                non_null = int(df[field].notna().sum())
                # Guard the empty table: dividing by zero rows would print
                # "nan%" and emit a NumPy RuntimeWarning.
                pct = (non_null / total_rows) * 100 if total_rows else 0.0
                print(f"      - {field}: {non_null:,} ({pct:.1f}%)")
        else:
            print("   ❌ No contact fields found")
            print("   💡 Run: scripts/enrich_nonprofits_gt990.py")

        # Check for grant/financial fields
        grant_fields = [column for column in df.columns if 'grant' in column.lower()]
        if grant_fields:
            print(f"\n   💰 Grant fields: {len(grant_fields)}")
        else:
            print("\n   ❌ No grant fields in organizations file")
    else:
        print(f"❌ Nonprofits file not found: {org_file}")

    # Check grant tables (row counts only)
    print("\n💸 GRANT TABLES:")
    _report_parquet_files(
        state_dir,
        [
            "grants_foundation_giving.parquet",
            "grants_nonprofit_to_nonprofit.parquet",
            "grants_revenue_sources.parquet",
        ],
    )

    # Check other data files (row counts plus on-disk size)
    print("\n📋 OTHER DATA:")
    _report_parquet_files(
        state_dir,
        [
            "nonprofits_financials.parquet",
            "nonprofits_programs.parquet",
            "nonprofits_locations.parquet",
            "contacts_local_officials.parquet",
            "meetings.parquet",
        ],
        show_size=True,
    )


def compare_states(states=None):
    """Print the completeness report for several states, then next steps.

    Each state whose directory ``data/gold/states/<state>`` exists is passed
    to ``analyze_state_data``. States without a directory are not analyzed;
    they are listed in a single line so an empty comparison is explained
    instead of silently printing nothing.

    Args:
        states: Optional iterable of state codes to compare. Defaults to a
            built-in sample (MA, AL, CA, NY, TX) when omitted.

    Returns:
        None. Everything is written to stdout.
    """
    if states is None:
        states = ['MA', 'AL', 'CA', 'NY', 'TX']  # Sample states

    print("\n" + "="*80)
    print("🗺️  STATE DATA COMPARISON")
    print("="*80)

    missing = []
    for state in states:
        if Path(f"data/gold/states/{state}").exists():
            analyze_state_data(state)
        else:
            missing.append(state)

    if missing:
        print(f"\n⚠️  No data directory for: {', '.join(missing)}")

    print("\n" + "="*80)
    print("📌 RECOMMENDATIONS")
    print("="*80)
    # The commands below are static text and always reference Alabama (AL),
    # regardless of which states were compared.
    print("""
1. To add contact information:
   ./enrich_alabama_nonprofits.sh
   
2. To add grant data (requires BigQuery setup):
   source .venv/bin/activate
   export GOOGLE_APPLICATION_CREDENTIALS=~/.gcp/bigquery-credentials.json
   python scripts/enrich_nonprofits_bigquery.py --state AL
   
3. To regenerate from IRS BMF:
   python -c "from discovery.irs_bmf_ingestion import IRSBMFIngestion; \\
              bmf = IRSBMFIngestion(); \\
              df = bmf.download_state_file('AL'); \\
              df.to_parquet('data/gold/states/AL/nonprofits_organizations.parquet')"
""")


if __name__ == "__main__":
    import sys

    # Every positional argument is treated as a state code, so several
    # states can be analyzed in one run; codes are upper-cased before being
    # used as the directory name.
    requested_states = [arg.upper() for arg in sys.argv[1:]]

    if requested_states:
        # Analyze the specific state(s) named on the command line
        for state_code in requested_states:
            analyze_state_data(state_code)
    else:
        # No arguments: compare the built-in sample of states
        compare_states()