Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
File size: 10,934 Bytes
"""
Google Data Commons Integration for Jurisdiction Enrichment
Uses Google Data Commons Knowledge Graph API to enrich jurisdiction data with:
- Demographics (population, age, gender, race/ethnicity)
- Economic indicators (income, employment, poverty)
- Education levels
- Health insurance coverage
- Housing characteristics
Installation:
pip install datacommons datacommons-pandas
Documentation:
https://docs.datacommons.org/api/
https://datacommons.org/tools/statvar
Citation:
Google LLC. Data Commons. https://datacommons.org/
"""
from typing import List, Dict, Any, Optional
import pandas as pd
from loguru import logger
try:
import datacommons as dc
import datacommons_pandas as dcpd
DATACOMMONS_AVAILABLE = True
except ImportError:
logger.warning("datacommons not installed. Run: pip install datacommons datacommons-pandas")
DATACOMMONS_AVAILABLE = False
class DataCommonsClient:
    """
    Client for enriching jurisdiction data with Google Data Commons variables.

    Replaces manual U.S. Census API calls with simplified Data Commons API.
    Every network-facing method logs failures and returns an error-shaped
    result (dict / DataFrame / empty list) instead of raising, so callers
    should check for an ``"error"`` key or column.
    """

    # Standard statistical variables for jurisdictions
    DEMOGRAPHIC_VARS = [
        "Count_Person",  # Total population
        "Count_Person_Male",  # Male population
        "Count_Person_Female",  # Female population
        "Median_Age_Person",  # Median age
        "Count_Person_WhiteAlone",  # White population
        "Count_Person_BlackOrAfricanAmericanAlone",  # Black population
        "Count_Person_HispanicOrLatino",  # Hispanic/Latino
        "Count_Person_AsianAlone",  # Asian population
    ]
    ECONOMIC_VARS = [
        "Median_Income_Household",  # Median household income
        "UnemploymentRate_Person",  # Unemployment rate
        "Count_Person_BelowPovertyLevelInThePast12Months",  # Poverty count
        "Median_Earnings_Person",  # Median earnings
    ]
    EDUCATION_VARS = [
        "Count_Person_EducationalAttainmentBachelorsDegreeOrHigher",  # College graduates
        "Count_Person_EducationalAttainmentHighSchoolGraduateOrHigher",  # HS graduates
    ]
    HEALTH_VARS = [
        "Count_Person_WithHealthInsurance",  # Insured population
        "Count_Person_NoHealthInsurance",  # Uninsured population
    ]
    HOUSING_VARS = [
        "Median_Price_SoldHome",  # Median home price
        "Count_HousingUnit",  # Total housing units
        "Count_Household",  # Total households
    ]
    # Union of every category above, in category order.
    ALL_VARS = (
        DEMOGRAPHIC_VARS +
        ECONOMIC_VARS +
        EDUCATION_VARS +
        HEALTH_VARS +
        HOUSING_VARS
    )

    def __init__(self):
        """
        Initialize the Data Commons client.

        Raises:
            ImportError: If the optional ``datacommons`` packages could not
                be imported at module load time.
        """
        if not DATACOMMONS_AVAILABLE:
            raise ImportError(
                "datacommons package not installed. "
                "Install with: pip install datacommons datacommons-pandas"
            )

    def get_place_dcid(self, fips_code: str, place_type: str = "County") -> str:
        """
        Convert FIPS code to Data Commons ID (DCID).

        Args:
            fips_code: 5-digit FIPS code (state+county) or 7-digit (state+place)
            place_type: "County" or "City". Currently unused: both kinds are
                formatted with the same ``geoId/`` prefix.

        Returns:
            DCID like "geoId/01073" for Jefferson County, AL

        Examples:
            >>> client = DataCommonsClient()
            >>> client.get_place_dcid("01073", "County")
            'geoId/01073'
            >>> client.get_place_dcid("0107000", "City")  # Birmingham, AL
            'geoId/0107000'
        """
        return f"geoId/{fips_code}"

    def enrich_jurisdiction(
        self,
        fips_code: str,
        variables: Optional[List[str]] = None,
        year: Optional[int] = None
    ) -> Dict[str, Any]:
        """
        Enrich a jurisdiction with Data Commons variables.

        Args:
            fips_code: 5-digit (county) or 7-digit (city) FIPS code
            variables: List of statistical variables (default: ALL_VARS)
            year: Optional year filter (default: most recent)

        Returns:
            Dictionary of {variable: value} plus ``fips_code``, ``dcid``,
            ``data_source`` and ``retrieval_date`` metadata. Variables with
            no observation map to ``None``. On failure the dictionary is
            ``{"fips_code": ..., "error": ...}`` instead.

        Example:
            >>> client = DataCommonsClient()
            >>> data = client.enrich_jurisdiction("01073")  # Jefferson County, AL
            >>> print(data["Median_Income_Household"])
            65000
        """
        if variables is None:
            variables = self.ALL_VARS
        dcid = self.get_place_dcid(fips_code)
        # Data Commons expects observation dates as strings; None means
        # "latest available observation".
        date = str(year) if year is not None else None
        try:
            result = {
                "fips_code": fips_code,
                "dcid": dcid,
                "data_source": "Google Data Commons",
                "retrieval_date": pd.Timestamp.now().isoformat(),
            }
            # get_stat_value handles exactly one statistical variable per
            # call and returns a bare number, so query each variable in turn.
            for var in variables:
                value = dc.get_stat_value(dcid, var, date=date)
                # The API signals "no data" with NaN; expose that as None so
                # callers can test for missing values uniformly.
                result[var] = None if pd.isna(value) else value
            return result
        except Exception as e:
            logger.error(f"Error enriching {fips_code}: {e}")
            return {"fips_code": fips_code, "error": str(e)}

    def enrich_jurisdictions_bulk(
        self,
        fips_codes: List[str],
        variables: Optional[List[str]] = None
    ) -> pd.DataFrame:
        """
        Enrich multiple jurisdictions in bulk.

        Args:
            fips_codes: List of FIPS codes
            variables: List of statistical variables (default: ALL_VARS)

        Returns:
            DataFrame with one row per requested jurisdiction, in request
            order, with one column per variable plus ``fips_code``,
            ``data_source`` and ``retrieval_date``. Places or variables
            without data are filled with NaN. An empty ``fips_codes`` yields
            an empty frame with those columns. On failure a single-row frame
            with an ``error`` column is returned.

        Example:
            >>> client = DataCommonsClient()
            >>> fips_codes = ["01073", "01089", "01097"]  # 3 AL counties
            >>> df = client.enrich_jurisdictions_bulk(fips_codes)
            >>> print(df[["fips_code", "Count_Person", "Median_Income_Household"]])
        """
        if variables is None:
            variables = self.ALL_VARS
        if not fips_codes:
            # Nothing to look up: skip the network round trip entirely.
            return pd.DataFrame(
                columns=[*variables, "fips_code", "data_source", "retrieval_date"]
            )
        dcids = [self.get_place_dcid(fips) for fips in fips_codes]
        try:
            # Use datacommons_pandas for efficient bulk retrieval
            df = dcpd.build_multivariate_dataframe(dcids, variables)
            # The returned frame is indexed by place DCID and may be
            # reordered or omit places/variables that have no data. Reindex
            # so rows line up with the request before attaching FIPS codes.
            df = df.reindex(index=dcids, columns=list(variables))
            df["fips_code"] = list(fips_codes)
            df["data_source"] = "Google Data Commons"
            df["retrieval_date"] = pd.Timestamp.now().isoformat()
            return df
        except Exception as e:
            logger.error(f"Error enriching bulk jurisdictions: {e}")
            return pd.DataFrame({"error": [str(e)]})

    def get_time_series(
        self,
        fips_code: str,
        variables: Optional[List[str]] = None,
        start_year: int = 2010,
        end_year: int = 2023
    ) -> pd.DataFrame:
        """
        Get time series data for a jurisdiction.

        Args:
            fips_code: FIPS code
            variables: Statistical variables (default: economic indicators)
            start_year: Start year (inclusive)
            end_year: End year (inclusive)

        Returns:
            DataFrame with time series (date index, one column per
            variable). On failure a single-row frame with an ``error``
            column is returned.

        Example:
            >>> client = DataCommonsClient()
            >>> df = client.get_time_series("01073", start_year=2015)
            >>> df.plot(y="Median_Income_Household")
        """
        if variables is None:
            variables = self.ECONOMIC_VARS
        dcid = self.get_place_dcid(fips_code)
        try:
            # build_time_series returns a Series for a single variable, so
            # fetch each one and let pandas align them on the date index.
            series_by_var = {
                var: dcpd.build_time_series(dcid, var) for var in variables
            }
            df = pd.DataFrame(series_by_var).sort_index()
            # Filter by year range. Dates are ISO-style strings ("2015",
            # "2015-06", ...), so lexicographic comparison is chronological;
            # the "-12-31" suffix keeps sub-year dates of the final year.
            dates = df.index.astype(str)
            in_range = (dates >= str(start_year)) & (dates <= f"{end_year}-12-31")
            return df[in_range]
        except Exception as e:
            logger.error(f"Error getting time series for {fips_code}: {e}")
            return pd.DataFrame({"error": [str(e)]})

    def search_variables(self, query: str) -> List[Dict[str, str]]:
        """
        Search for available statistical variables.

        Args:
            query: Search query (e.g., "income", "education", "health")

        Returns:
            List of {dcid, name, description}; an empty list if the search
            fails for any reason.

        Example:
            >>> client = DataCommonsClient()
            >>> stat_vars = client.search_variables("dental health")
            >>> for v in stat_vars:
            ...     print(v["dcid"], v["name"])
        """
        try:
            # NOTE(review): `search_statvar` does not appear in the
            # documented `datacommons` Python API - confirm it exists in the
            # installed version. If it is missing, the AttributeError is
            # caught below and an empty list is returned.
            results = dc.search_statvar(query, max_results=50)
            return [
                {
                    "dcid": r.dcid,
                    "name": getattr(r, 'name', r.dcid),
                    "description": getattr(r, 'description', '')
                }
                for r in results
            ]
        except Exception as e:
            logger.error(f"Error searching variables: {e}")
            return []
def example_usage():
    """
    Example usage of Data Commons integration.

    Runs four demonstrations against the live Data Commons service and
    prints the results: single-county enrichment, bulk enrichment, a time
    series, and a variable search. Each client call reports failures through
    its return value, so one failing example does not stop the rest.
    """
    client = DataCommonsClient()

    # Example 1: Enrich a single county
    print("Example 1: Jefferson County, AL (FIPS 01073)")
    data = client.enrich_jurisdiction("01073")
    print(f"Population: {data.get('Count_Person')}")
    print(f"Median Income: ${data.get('Median_Income_Household')}")
    print(f"Unemployment Rate: {data.get('UnemploymentRate_Person')}%")
    print()

    # Example 2: Bulk enrich multiple counties
    print("Example 2: Top 3 AL counties by population")
    fips_codes = ["01073", "01089", "01097"]  # Jefferson, Madison, Mobile
    df = client.enrich_jurisdictions_bulk(fips_codes)
    summary_cols = ["fips_code", "Count_Person", "Median_Income_Household"]
    if all(col in df.columns for col in summary_cols):
        print(df[summary_cols])
    else:
        # The bulk call returns an error frame on failure; show it as-is
        # instead of raising KeyError and aborting the remaining examples.
        print(df)
    print()

    # Example 3: Time series
    print("Example 3: Income trends for Birmingham, AL")
    df_ts = client.get_time_series(
        "0107000",  # Birmingham city
        variables=["Median_Income_Household"],
        start_year=2015
    )
    print(df_ts)
    print()

    # Example 4: Search for dental health variables
    print("Example 4: Search for dental health variables")
    # Named `matches` rather than `vars` to avoid shadowing the builtin.
    matches = client.search_variables("dental health")
    for match in matches[:5]:
        print(f" - {match['dcid']}: {match['name']}")
if __name__ == "__main__":
    # The demo needs the optional datacommons packages; without them, print
    # the install hint instead of running the examples.
    if not DATACOMMONS_AVAILABLE:
        print("Install datacommons: pip install datacommons datacommons-pandas")
    else:
        example_usage()
|