Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
| """ | |
| Google Data Commons Integration for Jurisdiction Enrichment | |
| Uses Google Data Commons Knowledge Graph API to enrich jurisdiction data with: | |
| - Demographics (population, age, gender, race/ethnicity) | |
| - Economic indicators (income, employment, poverty) | |
| - Education levels | |
| - Health insurance coverage | |
| - Housing characteristics | |
| Installation: | |
| pip install datacommons datacommons-pandas | |
| Documentation: | |
| https://docs.datacommons.org/api/ | |
| https://datacommons.org/tools/statvar | |
| Citation: | |
| Google LLC. Data Commons. https://datacommons.org/ | |
| """ | |
| from typing import List, Dict, Any, Optional | |
| import pandas as pd | |
| from loguru import logger | |
| try: | |
| import datacommons as dc | |
| import datacommons_pandas as dcpd | |
| DATACOMMONS_AVAILABLE = True | |
| except ImportError: | |
| logger.warning("datacommons not installed. Run: pip install datacommons datacommons-pandas") | |
| DATACOMMONS_AVAILABLE = False | |
class DataCommonsClient:
    """
    Client for enriching jurisdiction data with Google Data Commons variables.

    Replaces manual U.S. Census API calls with the simplified Data Commons
    API. Every public method that talks to the service catches failures,
    logs them, and returns an "error" marker (dict, DataFrame or empty list)
    instead of raising, so batch pipelines keep running.
    """

    # Standard statistical variables for jurisdictions
    DEMOGRAPHIC_VARS = [
        "Count_Person",  # Total population
        "Count_Person_Male",  # Male population
        "Count_Person_Female",  # Female population
        "Median_Age_Person",  # Median age
        "Count_Person_WhiteAlone",  # White population
        "Count_Person_BlackOrAfricanAmericanAlone",  # Black population
        "Count_Person_HispanicOrLatino",  # Hispanic/Latino
        "Count_Person_AsianAlone",  # Asian population
    ]
    ECONOMIC_VARS = [
        "Median_Income_Household",  # Median household income
        "UnemploymentRate_Person",  # Unemployment rate
        "Count_Person_BelowPovertyLevelInThePast12Months",  # Poverty count
        "Median_Earnings_Person",  # Median earnings
    ]
    EDUCATION_VARS = [
        "Count_Person_EducationalAttainmentBachelorsDegreeOrHigher",  # College graduates
        "Count_Person_EducationalAttainmentHighSchoolGraduateOrHigher",  # HS graduates
    ]
    HEALTH_VARS = [
        "Count_Person_WithHealthInsurance",  # Insured population
        "Count_Person_NoHealthInsurance",  # Uninsured population
    ]
    HOUSING_VARS = [
        "Median_Price_SoldHome",  # Median home price
        "Count_HousingUnit",  # Total housing units
        "Count_Household",  # Total households
    ]
    ALL_VARS = (
        DEMOGRAPHIC_VARS
        + ECONOMIC_VARS
        + EDUCATION_VARS
        + HEALTH_VARS
        + HOUSING_VARS
    )

    def __init__(self):
        """
        Initialize the Data Commons client.

        Raises:
            ImportError: If the datacommons packages could not be imported.
        """
        if not DATACOMMONS_AVAILABLE:
            raise ImportError(
                "datacommons package not installed. "
                "Install with: pip install datacommons datacommons-pandas"
            )

    def get_place_dcid(self, fips_code: str, place_type: str = "County") -> str:
        """
        Convert FIPS code to Data Commons ID (DCID).

        Args:
            fips_code: 5-digit FIPS code (state+county) or 7-digit (state+place)
            place_type: "County" or "City". Currently unused: the DCID is
                built the same way for both.

        Returns:
            DCID like "geoId/01073" for Jefferson County, AL

        Examples:
            >>> client = DataCommonsClient()
            >>> client.get_place_dcid("01073", "County")
            'geoId/01073'
            >>> client.get_place_dcid("0107000", "City")  # Birmingham, AL
            'geoId/0107000'
        """
        return f"geoId/{fips_code}"

    @staticmethod
    def _filter_years(frame: pd.DataFrame, start_year: int, end_year: int) -> pd.DataFrame:
        """Keep rows whose date label falls in [start_year, end_year], inclusive."""
        # Labels are expected to be ISO-8601 strings ("2023", "2023-06", ...).
        # Comparing on the leading year keeps sub-annual observations of
        # end_year, which a plain string slice "2010":"2023" would drop
        # because "2023-06" sorts after "2023".
        positions = [
            pos
            for pos, label in enumerate(frame.index)
            if start_year <= int(str(label)[:4]) <= end_year
        ]
        return frame.iloc[positions]

    def enrich_jurisdiction(
        self,
        fips_code: str,
        variables: Optional[List[str]] = None,
        year: Optional[int] = None
    ) -> Dict[str, Any]:
        """
        Enrich a jurisdiction with Data Commons variables.

        Args:
            fips_code: 5-digit (county) or 7-digit (city) FIPS code
            variables: List of statistical variables (default: ALL_VARS)
            year: Optional year filter (default: most recent)

        Returns:
            Dictionary of {variable: value} plus the bookkeeping keys
            "fips_code", "dcid", "data_source" and "retrieval_date".
            Variables without an observation map to None. On failure the
            dictionary is {"fips_code": ..., "error": ...}.

        Example:
            >>> client = DataCommonsClient()
            >>> data = client.enrich_jurisdiction("01073")  # Jefferson County, AL
            >>> print(data["Median_Income_Household"])
        """
        if variables is None:
            variables = self.ALL_VARS
        dcid = self.get_place_dcid(fips_code)
        # get_stat_value takes the observation date as an ISO-8601 string;
        # None asks for the latest observation.
        date = None if year is None else str(year)
        try:
            values = {}
            for var in variables:
                # get_stat_value handles exactly one variable per call and
                # returns a bare number, so query each variable separately.
                value = dc.get_stat_value(dcid, var, date=date)
                # A missing observation comes back as NaN; expose it as None.
                values[var] = None if pd.isna(value) else value
        except Exception as e:
            logger.error(f"Error enriching {fips_code}: {e}")
            return {"fips_code": fips_code, "error": str(e)}

        result = {
            "fips_code": fips_code,
            "dcid": dcid,
            "data_source": "Google Data Commons",
            "retrieval_date": pd.Timestamp.now().isoformat(),
        }
        # Add statistical variables
        result.update(values)
        return result

    def enrich_jurisdictions_bulk(
        self,
        fips_codes: List[str],
        variables: Optional[List[str]] = None
    ) -> pd.DataFrame:
        """
        Enrich multiple jurisdictions in bulk.

        Args:
            fips_codes: List of FIPS codes
            variables: List of statistical variables (default: ALL_VARS)

        Returns:
            DataFrame with one row per requested jurisdiction, in request
            order, one column per variable plus "fips_code", "data_source"
            and "retrieval_date". Missing observations are NaN. On failure
            a single-row DataFrame with an "error" column is returned.

        Example:
            >>> client = DataCommonsClient()
            >>> fips_codes = ["01073", "01089", "01097"]  # 3 AL counties
            >>> df = client.enrich_jurisdictions_bulk(fips_codes)
            >>> print(df[["fips_code", "Count_Person", "Median_Income_Household"]])
        """
        if variables is None:
            variables = self.ALL_VARS
        variables = list(variables)
        fips_codes = list(fips_codes)
        if not fips_codes:
            # Nothing to look up: skip the network call, keep the schema.
            return pd.DataFrame(
                columns=[*variables, "fips_code", "data_source", "retrieval_date"]
            )

        dcids = [self.get_place_dcid(fips) for fips in fips_codes]
        try:
            # Use datacommons_pandas for efficient bulk retrieval
            df = dcpd.build_multivariate_dataframe(
                list(dict.fromkeys(dcids)),  # de-duplicated, order kept
                variables,
            )
            # The service decides row order and may omit places or variables
            # without data. Reindexing restores one row per requested code
            # (in request order) and every requested column, so the FIPS
            # codes below line up positionally.
            df = df.reindex(index=dcids, columns=variables)
            # Add FIPS codes
            df["fips_code"] = fips_codes
            df["data_source"] = "Google Data Commons"
            df["retrieval_date"] = pd.Timestamp.now().isoformat()
            return df
        except Exception as e:
            logger.error(f"Error enriching bulk jurisdictions: {e}")
            return pd.DataFrame({"error": [str(e)]})

    def get_time_series(
        self,
        fips_code: str,
        variables: Optional[List[str]] = None,
        start_year: int = 2010,
        end_year: int = 2023
    ) -> pd.DataFrame:
        """
        Get time series data for a jurisdiction.

        Args:
            fips_code: FIPS code
            variables: Statistical variables (default: economic indicators)
            start_year: Start year (inclusive)
            end_year: End year (inclusive)

        Returns:
            DataFrame with time series (date index, one column per
            variable). On failure a single-row DataFrame with an "error"
            column is returned.

        Example:
            >>> client = DataCommonsClient()
            >>> df = client.get_time_series("01073", start_year=2015)
            >>> df.plot(y="Median_Income_Household")
        """
        if variables is None:
            variables = self.ECONOMIC_VARS
        dcid = self.get_place_dcid(fips_code)
        try:
            # build_time_series returns one Series for one variable, so fetch
            # each variable and let pandas align them on the union of dates.
            series_by_var = {
                var: dcpd.build_time_series(dcid, var) for var in variables
            }
            df = pd.DataFrame(series_by_var).sort_index()
            # Filter by year range
            return self._filter_years(df, start_year, end_year)
        except Exception as e:
            logger.error(f"Error getting time series for {fips_code}: {e}")
            return pd.DataFrame({"error": [str(e)]})

    def search_variables(self, query: str) -> List[Dict[str, str]]:
        """
        Search for available statistical variables.

        Args:
            query: Search query (e.g., "income", "education", "health")

        Returns:
            List of {dcid, name, description}; empty when the search is
            unavailable or fails.

        Example:
            >>> client = DataCommonsClient()
            >>> matches = client.search_variables("dental health")
            >>> for v in matches:
            ...     print(v["dcid"], v["name"])
        """
        # NOTE(review): `search_statvar` does not appear to be part of the
        # legacy `datacommons` package API - confirm which release (if any)
        # provides it. Until then, report the gap explicitly instead of
        # hiding an AttributeError behind the generic handler below.
        search = getattr(dc, "search_statvar", None)
        if search is None:
            logger.warning(
                "datacommons.search_statvar is not available in the installed "
                "datacommons package; returning no results"
            )
            return []
        try:
            results = search(query, max_results=50)
            return [
                {
                    "dcid": r.dcid,
                    "name": getattr(r, 'name', r.dcid),
                    "description": getattr(r, 'description', '')
                }
                for r in results
            ]
        except Exception as e:
            logger.error(f"Error searching variables: {e}")
            return []
def example_usage():
    """
    Example usage of Data Commons integration.

    Runs four demonstrations against the Data Commons service and prints
    the results: a single-county lookup, a bulk lookup, a time series and
    a variable search. Each client call returns an error marker instead of
    raising, so every example checks for that marker before indexing into
    the result.
    """
    client = DataCommonsClient()

    # Example 1: Enrich a single county
    print("Example 1: Jefferson County, AL (FIPS 01073)")
    data = client.enrich_jurisdiction("01073")
    if "error" in data:
        print(f"Lookup failed: {data['error']}")
    else:
        print(f"Population: {data.get('Count_Person')}")
        print(f"Median Income: ${data.get('Median_Income_Household')}")
        print(f"Unemployment Rate: {data.get('UnemploymentRate_Person')}%")
    print()

    # Example 2: Bulk enrich multiple counties
    print("Example 2: Top 3 AL counties by population")
    fips_codes = ["01073", "01089", "01097"]  # Jefferson, Madison, Mobile
    df = client.enrich_jurisdictions_bulk(fips_codes)
    summary_columns = ["fips_code", "Count_Person", "Median_Income_Household"]
    if set(summary_columns).issubset(df.columns):
        print(df[summary_columns])
    else:
        # Error frame (or partial data): selecting the summary columns would
        # raise KeyError, so show whatever came back.
        print(df)
    print()

    # Example 3: Time series
    print("Example 3: Income trends for Birmingham, AL")
    df_ts = client.get_time_series(
        "0107000",  # Birmingham city
        variables=["Median_Income_Household"],
        start_year=2015
    )
    print(df_ts)
    print()

    # Example 4: Search for dental health variables
    print("Example 4: Search for dental health variables")
    # Named `matches` rather than `vars` to avoid shadowing the builtin.
    matches = client.search_variables("dental health")
    for match in matches[:5]:
        print(f" - {match['dcid']}: {match['name']}")
# Script entry point: run the demo only when the optional Data Commons
# packages were importable; otherwise print the install hint.
if __name__ == "__main__":
    if not DATACOMMONS_AVAILABLE:
        print("Install datacommons: pip install datacommons datacommons-pandas")
    else:
        example_usage()