# Source: open-navigator/discovery/google_data_commons.py
# Last commit by jcbowyer: "Clean HuggingFace deployment without binary files" (61d29fc)
"""
Google Data Commons Integration for Jurisdiction Enrichment
Uses Google Data Commons Knowledge Graph API to enrich jurisdiction data with:
- Demographics (population, age, gender, race/ethnicity)
- Economic indicators (income, employment, poverty)
- Education levels
- Health insurance coverage
- Housing characteristics
Installation:
pip install datacommons datacommons-pandas
Documentation:
https://docs.datacommons.org/api/
https://datacommons.org/tools/statvar
Citation:
Google LLC. Data Commons. https://datacommons.org/
"""
from typing import List, Dict, Any, Optional
import pandas as pd
from loguru import logger
# Optional dependency guard: the Data Commons packages are imported lazily-safe
# so this module can still be imported when they are missing.
# DATACOMMONS_AVAILABLE records whether both imports succeeded.
try:
    import datacommons as dc
    import datacommons_pandas as dcpd
except ImportError:
    logger.warning("datacommons not installed. Run: pip install datacommons datacommons-pandas")
    DATACOMMONS_AVAILABLE = False
else:
    DATACOMMONS_AVAILABLE = True
class DataCommonsClient:
    """
    Client for enriching jurisdiction data with Google Data Commons variables.

    Replaces manual U.S. Census API calls with simplified Data Commons API.

    Failure convention used by the public methods: errors are logged and
    reported in-band (an ``"error"`` key / column, or an empty list) rather
    than raised, so batch callers are not interrupted by a single bad lookup.
    """

    # Standard statistical variables for jurisdictions
    DEMOGRAPHIC_VARS = [
        "Count_Person",  # Total population
        "Count_Person_Male",  # Male population
        "Count_Person_Female",  # Female population
        "Median_Age_Person",  # Median age
        "Count_Person_WhiteAlone",  # White population
        "Count_Person_BlackOrAfricanAmericanAlone",  # Black population
        "Count_Person_HispanicOrLatino",  # Hispanic/Latino
        "Count_Person_AsianAlone",  # Asian population
    ]
    ECONOMIC_VARS = [
        "Median_Income_Household",  # Median household income
        "UnemploymentRate_Person",  # Unemployment rate
        "Count_Person_BelowPovertyLevelInThePast12Months",  # Poverty count
        "Median_Earnings_Person",  # Median earnings
    ]
    EDUCATION_VARS = [
        "Count_Person_EducationalAttainmentBachelorsDegreeOrHigher",  # College graduates
        "Count_Person_EducationalAttainmentHighSchoolGraduateOrHigher",  # HS graduates
    ]
    HEALTH_VARS = [
        "Count_Person_WithHealthInsurance",  # Insured population
        "Count_Person_NoHealthInsurance",  # Uninsured population
    ]
    HOUSING_VARS = [
        "Median_Price_SoldHome",  # Median home price
        "Count_HousingUnit",  # Total housing units
        "Count_Household",  # Total households
    ]
    # Default variable set: every category above, in declaration order.
    ALL_VARS = (
        DEMOGRAPHIC_VARS +
        ECONOMIC_VARS +
        EDUCATION_VARS +
        HEALTH_VARS +
        HOUSING_VARS
    )

    def __init__(self):
        """
        Initialize the Data Commons client.

        Raises:
            ImportError: If the optional ``datacommons`` packages could not be
                imported when this module was loaded.
        """
        if not DATACOMMONS_AVAILABLE:
            raise ImportError(
                "datacommons package not installed. "
                "Install with: pip install datacommons datacommons-pandas"
            )

    def get_place_dcid(self, fips_code: str, place_type: str = "County") -> str:
        """
        Convert FIPS code to Data Commons ID (DCID).

        Args:
            fips_code: 5-digit FIPS code (state+county) or 7-digit (state+place)
            place_type: "County" or "City". Currently not used: both kinds
                map to the same ``geoId/<fips_code>`` form.

        Returns:
            DCID like "geoId/01073" for Jefferson County, AL

        Examples:
            >>> client = DataCommonsClient()
            >>> client.get_place_dcid("01073", "County")
            'geoId/01073'
            >>> client.get_place_dcid("0107000", "City")  # Birmingham, AL
            'geoId/0107000'
        """
        return f"geoId/{fips_code}"

    def enrich_jurisdiction(
        self,
        fips_code: str,
        variables: Optional[List[str]] = None,
        year: Optional[int] = None
    ) -> Dict[str, Any]:
        """
        Enrich a jurisdiction with Data Commons variables.

        Args:
            fips_code: 5-digit (county) or 7-digit (city) FIPS code
            variables: List of statistical variables (default: ALL_VARS)
            year: Optional year filter (default: most recent)

        Returns:
            Dictionary of {variable: value} plus the metadata keys
            ``fips_code``, ``dcid``, ``data_source`` and ``retrieval_date``.
            A variable whose lookup failed is present with value ``None``.
            If no variable could be fetched at all, the result is
            ``{"fips_code": ..., "error": ...}`` instead.

        Example:
            >>> client = DataCommonsClient()
            >>> data = client.enrich_jurisdiction("01073")  # Jefferson County, AL
            >>> print(data["Median_Income_Household"])
            65000
        """
        if variables is None:
            variables = self.ALL_VARS
        dcid = self.get_place_dcid(fips_code)
        # get_stat_value takes the observation date as a string; None asks
        # for the latest observation.
        date = None if year is None else str(year)

        try:
            fetched: Dict[str, Any] = {}
            last_error = None
            # get_stat_value handles one statistical variable per call and
            # returns a scalar, so each variable is queried individually.
            for var in variables:
                try:
                    fetched[var] = dc.get_stat_value(dcid, var, date=date)
                except Exception as e:
                    # One missing variable must not discard the others.
                    logger.debug(f"No value for {var} at {dcid}: {e}")
                    last_error = e
            if last_error is not None and not fetched:
                # Every lookup failed: report it as an error, not as a row
                # made entirely of None values.
                raise last_error

            result = {
                "fips_code": fips_code,
                "dcid": dcid,
                "data_source": "Google Data Commons",
                "retrieval_date": pd.Timestamp.now().isoformat(),
            }
            # Add statistical variables
            for var in variables:
                result[var] = fetched.get(var)
            return result
        except Exception as e:
            logger.error(f"Error enriching {fips_code}: {e}")
            return {"fips_code": fips_code, "error": str(e)}

    @staticmethod
    def _align_bulk_frame(
        df: pd.DataFrame,
        dcids: List[str],
        fips_codes: List[str],
        variables: List[str]
    ) -> pd.DataFrame:
        """Return *df* with one row per requested DCID (request order), every
        requested variable as a column, and a matching ``fips_code`` column."""
        # Reindexing by DCID makes row i correspond to fips_codes[i] even when
        # the source frame omits places or lists them in a different order.
        aligned = df.reindex(index=list(dcids))
        for var in variables:
            if var not in aligned.columns:
                aligned[var] = float("nan")
        aligned["fips_code"] = list(fips_codes)
        return aligned

    def enrich_jurisdictions_bulk(
        self,
        fips_codes: List[str],
        variables: Optional[List[str]] = None
    ) -> pd.DataFrame:
        """
        Enrich multiple jurisdictions in bulk.

        Args:
            fips_codes: List of FIPS codes
            variables: List of statistical variables (default: ALL_VARS)

        Returns:
            DataFrame with one row per requested jurisdiction, in request
            order and indexed by DCID. Jurisdictions or variables without
            data hold NaN. On failure, a one-row DataFrame with a single
            ``error`` column is returned.

        Example:
            >>> client = DataCommonsClient()
            >>> fips_codes = ["01073", "01089", "01097"]  # 3 AL counties
            >>> df = client.enrich_jurisdictions_bulk(fips_codes)
            >>> print(df[["fips_code", "Count_Person", "Median_Income_Household"]])
        """
        if variables is None:
            variables = self.ALL_VARS
        if not fips_codes:
            # Nothing to look up: skip the API call and return an empty frame
            # that still has the columns callers select.
            return pd.DataFrame(
                columns=list(variables) + ["fips_code", "data_source", "retrieval_date"]
            )
        dcids = [self.get_place_dcid(fips) for fips in fips_codes]
        try:
            # Use datacommons_pandas for efficient bulk retrieval
            raw = dcpd.build_multivariate_dataframe(dcids, list(variables))
            df = self._align_bulk_frame(raw, dcids, fips_codes, variables)
            df["data_source"] = "Google Data Commons"
            df["retrieval_date"] = pd.Timestamp.now().isoformat()
            return df
        except Exception as e:
            logger.error(f"Error enriching bulk jurisdictions: {e}")
            return pd.DataFrame({"error": [str(e)]})

    @staticmethod
    def _filter_years(df: pd.DataFrame, start_year: int, end_year: int) -> pd.DataFrame:
        """Keep the rows of *df* whose index label starts with a year in
        [start_year, end_year] (both inclusive)."""
        if df.empty:
            return df
        # Compare on the leading 4-digit year instead of slicing by label:
        # a label slice ending at "2023" would drop "2023-06", which sorts
        # after "2023" as a string.
        years = pd.to_numeric(df.index.astype(str).str[:4], errors="coerce")
        in_range = (years >= start_year) & (years <= end_year)
        return df.loc[in_range]

    def get_time_series(
        self,
        fips_code: str,
        variables: Optional[List[str]] = None,
        start_year: int = 2010,
        end_year: int = 2023
    ) -> pd.DataFrame:
        """
        Get time series data for a jurisdiction.

        Args:
            fips_code: FIPS code
            variables: Statistical variables (default: economic indicators)
            start_year: Start year (inclusive)
            end_year: End year (inclusive)

        Returns:
            DataFrame with time series (date index), one column per requested
            variable. A variable whose lookup failed is an all-NaN column. If
            no variable could be fetched, a one-row DataFrame with a single
            ``error`` column is returned.

        Example:
            >>> client = DataCommonsClient()
            >>> df = client.get_time_series("01073", start_year=2015)
            >>> df.plot(y="Median_Income_Household")
        """
        if variables is None:
            variables = self.ECONOMIC_VARS
        dcid = self.get_place_dcid(fips_code)
        try:
            series_by_var: Dict[str, Any] = {}
            last_error = None
            # build_time_series returns a single-variable Series, so fetch
            # each variable separately and join them on the date index.
            for var in variables:
                try:
                    series_by_var[var] = dcpd.build_time_series(dcid, var)
                except Exception as e:
                    logger.debug(f"No time series for {var} at {dcid}: {e}")
                    last_error = e
            if last_error is not None and not series_by_var:
                raise last_error

            df = pd.DataFrame(series_by_var).reindex(columns=list(variables))
            # Filter by year range
            return self._filter_years(df, start_year, end_year)
        except Exception as e:
            logger.error(f"Error getting time series for {fips_code}: {e}")
            return pd.DataFrame({"error": [str(e)]})

    def search_variables(self, query: str) -> List[Dict[str, str]]:
        """
        Search for available statistical variables.

        Args:
            query: Search query (e.g., "income", "education", "health")

        Returns:
            List of {dcid, name, description}; an empty list when the search
            fails for any reason (the error is logged).

        Example:
            >>> client = DataCommonsClient()
            >>> vars = client.search_variables("dental health")
            >>> for v in vars:
            ...     print(v["dcid"], v["name"])
        """
        try:
            # NOTE(review): `search_statvar` does not appear in the public
            # `datacommons` Python API reference. If the installed package
            # lacks it, the AttributeError is caught below and this method
            # returns [] -- confirm which package/version provides it.
            results = dc.search_statvar(query, max_results=50)
            return [
                {
                    "dcid": r.dcid,
                    "name": getattr(r, 'name', r.dcid),
                    "description": getattr(r, 'description', '')
                }
                for r in results
            ]
        except Exception as e:
            logger.error(f"Error searching variables: {e}")
            return []
def example_usage():
    """
    Example usage of Data Commons integration.

    Runs four demonstrations against the live API and prints the results:
    single-county enrichment, bulk enrichment, a time series, and a variable
    search. The client reports failures in-band (an ``"error"`` key or
    column), so each example checks for that before reading result fields.
    """
    client = DataCommonsClient()

    # Example 1: Enrich a single county
    print("Example 1: Jefferson County, AL (FIPS 01073)")
    data = client.enrich_jurisdiction("01073")
    if "error" in data:
        print(f"Lookup failed: {data['error']}")
    else:
        print(f"Population: {data.get('Count_Person')}")
        print(f"Median Income: ${data.get('Median_Income_Household')}")
        print(f"Unemployment Rate: {data.get('UnemploymentRate_Person')}%")
    print()

    # Example 2: Bulk enrich multiple counties
    print("Example 2: Top 3 AL counties by population")
    fips_codes = ["01073", "01089", "01097"]  # Jefferson, Madison, Mobile
    df = client.enrich_jurisdictions_bulk(fips_codes)
    if "error" in df.columns:
        # The error frame has only an "error" column; selecting the data
        # columns from it would raise KeyError.
        print(f"Bulk lookup failed: {df['error'].iloc[0]}")
    else:
        # reindex (rather than df[[...]]) tolerates a variable that came
        # back without a column.
        print(df.reindex(columns=["fips_code", "Count_Person", "Median_Income_Household"]))
    print()

    # Example 3: Time series
    print("Example 3: Income trends for Birmingham, AL")
    df_ts = client.get_time_series(
        "0107000",  # Birmingham city
        variables=["Median_Income_Household"],
        start_year=2015
    )
    print(df_ts)
    print()

    # Example 4: Search for dental health variables
    print("Example 4: Search for dental health variables")
    # Named `matches` so the builtin `vars` is not shadowed.
    matches = client.search_variables("dental health")
    for match in matches[:5]:
        print(f" - {match['dcid']}: {match['name']}")
# Script entry point: run the demo only when the optional Data Commons
# packages were importable; otherwise print the install hint.
if __name__ == "__main__":
    if not DATACOMMONS_AVAILABLE:
        print("Install datacommons: pip install datacommons datacommons-pandas")
    else:
        example_usage()