# File size: 10,934 Bytes
# 61d29fc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
"""
Google Data Commons Integration for Jurisdiction Enrichment

Uses Google Data Commons Knowledge Graph API to enrich jurisdiction data with:
- Demographics (population, age, gender, race/ethnicity)
- Economic indicators (income, employment, poverty)
- Education levels
- Health insurance coverage
- Housing characteristics

Installation:
    pip install datacommons datacommons-pandas

Documentation:
    https://docs.datacommons.org/api/
    https://datacommons.org/tools/statvar

Citation:
    Google LLC. Data Commons. https://datacommons.org/
"""
from typing import List, Dict, Any, Optional
import pandas as pd
from loguru import logger

try:
    import datacommons as dc
    import datacommons_pandas as dcpd
    DATACOMMONS_AVAILABLE = True
except ImportError:
    logger.warning("datacommons not installed. Run: pip install datacommons datacommons-pandas")
    DATACOMMONS_AVAILABLE = False


class DataCommonsClient:
    """
    Client for enriching jurisdiction data with Google Data Commons variables.

    Replaces manual U.S. Census API calls with simplified Data Commons API.

    Jurisdictions are addressed by FIPS code, which is turned into a Data
    Commons place ID of the form ``geoId/<fips>``. Every method that talks to
    the service is best-effort: a failure is logged and reported through an
    ``error`` entry/column (or an empty list for searches) rather than raised.
    """

    # Standard statistical variables for jurisdictions
    DEMOGRAPHIC_VARS = [
        "Count_Person",                                          # Total population
        "Count_Person_Male",                                     # Male population
        "Count_Person_Female",                                   # Female population
        "Median_Age_Person",                                     # Median age
        "Count_Person_WhiteAlone",                              # White population
        "Count_Person_BlackOrAfricanAmericanAlone",             # Black population
        "Count_Person_HispanicOrLatino",                        # Hispanic/Latino
        "Count_Person_AsianAlone",                              # Asian population
    ]

    ECONOMIC_VARS = [
        "Median_Income_Household",                              # Median household income
        "UnemploymentRate_Person",                              # Unemployment rate
        "Count_Person_BelowPovertyLevelInThePast12Months",     # Poverty count
        "Median_Earnings_Person",                               # Median earnings
    ]

    EDUCATION_VARS = [
        "Count_Person_EducationalAttainmentBachelorsDegreeOrHigher",  # College graduates
        "Count_Person_EducationalAttainmentHighSchoolGraduateOrHigher",  # HS graduates
    ]

    HEALTH_VARS = [
        "Count_Person_WithHealthInsurance",                     # Insured population
        "Count_Person_NoHealthInsurance",                       # Uninsured population
    ]

    HOUSING_VARS = [
        "Median_Price_SoldHome",                                # Median home price
        "Count_HousingUnit",                                    # Total housing units
        "Count_Household",                                      # Total households
    ]

    # Union of every category above, in category order
    ALL_VARS = (
        DEMOGRAPHIC_VARS +
        ECONOMIC_VARS +
        EDUCATION_VARS +
        HEALTH_VARS +
        HOUSING_VARS
    )

    def __init__(self):
        """
        Initialize the Data Commons client.

        Raises:
            ImportError: If the optional ``datacommons`` packages could not be
                imported when this module was loaded.
        """
        if not DATACOMMONS_AVAILABLE:
            raise ImportError(
                "datacommons package not installed. "
                "Install with: pip install datacommons datacommons-pandas"
            )

    def get_place_dcid(self, fips_code: str, place_type: str = "County") -> str:
        """
        Convert FIPS code to Data Commons ID (DCID).

        Surrounding whitespace is stripped, and a value that already carries
        the ``geoId/`` prefix is returned unchanged so the conversion is
        idempotent.

        Args:
            fips_code: 5-digit FIPS code (state+county) or 7-digit (state+place)
            place_type: "County" or "City". Accepted for API compatibility;
                both kinds map to the same ``geoId/<code>`` form, so the value
                is not used.

        Returns:
            DCID like "geoId/01073" for Jefferson County, AL

        Examples:
            >>> client = DataCommonsClient()
            >>> client.get_place_dcid("01073", "County")
            'geoId/01073'
            >>> client.get_place_dcid("0107000", "City")  # Birmingham, AL
            'geoId/0107000'
        """
        code = str(fips_code).strip()
        if code.startswith("geoId/"):
            return code
        return f"geoId/{code}"

    @staticmethod
    def _filter_years(df: pd.DataFrame, start_year: int, end_year: int) -> pd.DataFrame:
        """Keep rows whose index label starts with a year in [start_year, end_year]."""
        lo, hi = str(start_year), str(end_year)
        # Compare only the leading 4-character year so that sub-annual labels
        # such as "2023-06" are still counted as belonging to 2023.
        # Assumes date labels begin with a 4-digit year (ISO-8601 style).
        keep = [lo <= str(label)[:4] <= hi for label in df.index]
        return df.loc[keep]

    def enrich_jurisdiction(
        self,
        fips_code: str,
        variables: Optional[List[str]] = None,
        year: Optional[int] = None
    ) -> Dict[str, Any]:
        """
        Enrich a jurisdiction with Data Commons variables.

        Args:
            fips_code: 5-digit (county) or 7-digit (city) FIPS code
            variables: List of statistical variables (default: ALL_VARS)
            year: Optional year filter (default: most recent). It is passed to
                Data Commons as the observation date.
                NOTE(review): variables that are not published at yearly
                granularity may have no value for a bare year - confirm.

        Returns:
            Dictionary of {variable: value} plus ``fips_code``, ``dcid``,
            ``data_source`` and ``retrieval_date``. A variable with no data is
            mapped to ``None``. On failure the dictionary holds only
            ``fips_code`` and ``error``.

        Example:
            >>> client = DataCommonsClient()
            >>> data = client.enrich_jurisdiction("01073")  # Jefferson County, AL
            >>> print(data["Median_Income_Household"])
            65000
        """
        if variables is None:
            variables = self.ALL_VARS

        dcid = self.get_place_dcid(fips_code)
        date = None if year is None else str(year)

        try:
            result = {
                "fips_code": fips_code,
                "dcid": dcid,
                "data_source": "Google Data Commons",
                "retrieval_date": pd.Timestamp.now().isoformat(),
            }

            # get_stat_value takes ONE statistical variable per call and
            # returns a single number (NaN when there is no observation), so
            # query each variable separately.
            for var in variables:
                value = dc.get_stat_value(dcid, var, date=date)
                result[var] = None if pd.isna(value) else value

            return result

        except Exception as e:
            logger.error(f"Error enriching {fips_code}: {e}")
            return {"fips_code": fips_code, "error": str(e)}

    def enrich_jurisdictions_bulk(
        self,
        fips_codes: List[str],
        variables: Optional[List[str]] = None
    ) -> pd.DataFrame:
        """
        Enrich multiple jurisdictions in bulk.

        Args:
            fips_codes: List of FIPS codes
            variables: List of statistical variables (default: ALL_VARS)

        Returns:
            DataFrame with one row per requested jurisdiction, in request
            order, indexed by DCID. Columns are the requested variables
            followed by ``fips_code``, ``data_source`` and ``retrieval_date``.
            Jurisdictions or variables without data hold NaN. On failure a
            single-row DataFrame with an ``error`` column is returned.

        Example:
            >>> client = DataCommonsClient()
            >>> fips_codes = ["01073", "01089", "01097"]  # 3 AL counties
            >>> df = client.enrich_jurisdictions_bulk(fips_codes)
            >>> print(df[["fips_code", "Count_Person", "Median_Income_Household"]])
        """
        if variables is None:
            variables = self.ALL_VARS

        fips_codes = list(fips_codes)
        if not fips_codes:
            # Nothing to look up: skip the request and return an empty frame
            # with the usual columns.
            return pd.DataFrame(
                columns=[*variables, "fips_code", "data_source", "retrieval_date"]
            )

        dcids = [self.get_place_dcid(fips) for fips in fips_codes]

        try:
            # Use datacommons_pandas for efficient bulk retrieval
            df = dcpd.build_multivariate_dataframe(dcids, variables)

            # The returned frame is indexed by place and is not guaranteed to
            # follow the request order or to contain every place/variable.
            # Reindex so the rows line up positionally with fips_codes.
            df = df.reindex(index=dcids, columns=list(variables))

            df["fips_code"] = fips_codes
            df["data_source"] = "Google Data Commons"
            df["retrieval_date"] = pd.Timestamp.now().isoformat()

            return df

        except Exception as e:
            logger.error(f"Error enriching bulk jurisdictions: {e}")
            return pd.DataFrame({"error": [str(e)]})

    def get_time_series(
        self,
        fips_code: str,
        variables: Optional[List[str]] = None,
        start_year: int = 2010,
        end_year: int = 2023
    ) -> pd.DataFrame:
        """
        Get time series data for a jurisdiction.

        Args:
            fips_code: FIPS code
            variables: Statistical variables (default: economic indicators)
            start_year: Start year (inclusive)
            end_year: End year (inclusive)

        Returns:
            DataFrame with time series (date index), one column per variable.
            Dates where a variable has no observation hold NaN. On failure a
            single-row DataFrame with an ``error`` column is returned.

        Example:
            >>> client = DataCommonsClient()
            >>> df = client.get_time_series("01073", start_year=2015)
            >>> df.plot(y="Median_Income_Household")
        """
        if variables is None:
            variables = self.ECONOMIC_VARS

        dcid = self.get_place_dcid(fips_code)

        try:
            # build_time_series returns a Series for ONE variable, so fetch
            # each variable and let pandas align them on the union of dates.
            series_by_var = {
                var: dcpd.build_time_series(dcid, var) for var in variables
            }
            df = pd.DataFrame(series_by_var).sort_index()

            # Filter by year range
            return self._filter_years(df, start_year, end_year)

        except Exception as e:
            logger.error(f"Error getting time series for {fips_code}: {e}")
            return pd.DataFrame({"error": [str(e)]})

    def search_variables(self, query: str) -> List[Dict[str, str]]:
        """
        Search for available statistical variables.

        Args:
            query: Search query (e.g., "income", "education", "health")

        Returns:
            List of {dcid, name, description}. An empty list is returned when
            the search fails for any reason.

        Example:
            >>> client = DataCommonsClient()
            >>> vars = client.search_variables("dental health")
            >>> for v in vars:
            ...     print(v["dcid"], v["name"])
        """
        try:
            # NOTE(review): confirm that the installed datacommons release
            # exposes `search_statvar`; if it does not, the AttributeError is
            # caught below and this method always returns [].
            results = dc.search_statvar(query, max_results=50)
            return [
                {
                    "dcid": r.dcid,
                    # Fall back to the DCID / empty text when a result lacks
                    # the optional attributes.
                    "name": getattr(r, 'name', r.dcid),
                    "description": getattr(r, 'description', '')
                }
                for r in results
            ]
        except Exception as e:
            logger.error(f"Error searching variables: {e}")
            return []


def example_usage():
    """
    Example usage of Data Commons integration.

    Runs four demonstrations against the live service and prints the results:
    a single-county lookup, a bulk lookup, a time series and a variable
    search. The client reports failures through an ``error`` entry/column
    instead of raising, so each result is checked for one before it is used.
    """
    client = DataCommonsClient()

    # Example 1: Enrich a single county
    print("Example 1: Jefferson County, AL (FIPS 01073)")
    data = client.enrich_jurisdiction("01073")
    if "error" in data:
        print(f"Enrichment failed: {data['error']}")
    else:
        print(f"Population: {data.get('Count_Person')}")
        print(f"Median Income: ${data.get('Median_Income_Household')}")
        print(f"Unemployment Rate: {data.get('UnemploymentRate_Person')}%")
    print()

    # Example 2: Bulk enrich multiple counties
    print("Example 2: Top 3 AL counties by population")
    fips_codes = ["01073", "01089", "01097"]  # Jefferson, Madison, Mobile
    df = client.enrich_jurisdictions_bulk(fips_codes)
    if "error" in df.columns:
        # The failure frame has only an "error" column; selecting the data
        # columns from it would raise KeyError.
        print(f"Bulk enrichment failed: {df['error'].iloc[0]}")
    else:
        print(df[["fips_code", "Count_Person", "Median_Income_Household"]])
    print()

    # Example 3: Time series
    print("Example 3: Income trends for Birmingham, AL")
    df_ts = client.get_time_series(
        "0107000",  # Birmingham city
        variables=["Median_Income_Household"],
        start_year=2015
    )
    print(df_ts)
    print()

    # Example 4: Search for dental health variables
    print("Example 4: Search for dental health variables")
    # Named `matches` rather than `vars` to avoid shadowing the builtin.
    matches = client.search_variables("dental health")
    for match in matches[:5]:
        print(f"  - {match['dcid']}: {match['name']}")


if __name__ == "__main__":
    # Script entry point: without the optional datacommons packages there is
    # nothing to demonstrate, so print the install hint instead.
    if not DATACOMMONS_AVAILABLE:
        print("Install datacommons: pip install datacommons datacommons-pandas")
    else:
        example_usage()