# geospatial-ai-query / data_utils.py
# Author: rifatSDAS
# Initial commit: Geospatial AI Query System (commit 2171c22)
"""
Enhanced data handlers for multiple geospatial data sources
"""
import pandas as pd
import requests
from typing import Dict, List, Optional
import json
class DataEnhancer:
    """
    Additional data sources and enrichment for geospatial queries.

    Ships small built-in sample datasets (stand-ins for live APIs such as
    the World Bank) plus helpers to merge them into a countries dataframe
    and to aggregate it by continent.
    """

    # Indicator columns contributed by each sample dataset. Enrichment adds
    # only the columns for the requested dataset (the previous version added
    # all six, leaving three permanently-None columns per call).
    _INDICATORS = {
        'economic': ('gdp_growth', 'unemployment', 'inflation'),
        'environmental': ('co2_per_capita', 'renewable_energy', 'forest_coverage'),
    }

    @staticmethod
    def get_sample_economic_data() -> Dict[str, Dict[str, float]]:
        """
        Sample economic indicators keyed by country name
        (in production, connect to the World Bank API).
        """
        return {
            'United States': {'gdp_growth': 2.1, 'unemployment': 3.7, 'inflation': 3.2},
            'China': {'gdp_growth': 5.2, 'unemployment': 5.0, 'inflation': 0.2},
            'Germany': {'gdp_growth': 0.1, 'unemployment': 3.0, 'inflation': 6.1},
            'India': {'gdp_growth': 7.2, 'unemployment': 8.0, 'inflation': 5.4},
            'Brazil': {'gdp_growth': 2.9, 'unemployment': 8.5, 'inflation': 4.6},
            'United Kingdom': {'gdp_growth': 0.5, 'unemployment': 3.9, 'inflation': 4.0},
            'France': {'gdp_growth': 0.9, 'unemployment': 7.2, 'inflation': 5.2},
            'Japan': {'gdp_growth': 1.9, 'unemployment': 2.6, 'inflation': 3.2},
            'South Korea': {'gdp_growth': 1.4, 'unemployment': 2.7, 'inflation': 3.6},
            'Canada': {'gdp_growth': 1.1, 'unemployment': 5.4, 'inflation': 3.9}
        }

    @staticmethod
    def get_sample_environmental_data() -> Dict[str, Dict[str, float]]:
        """
        Sample environmental indicators keyed by country name.
        """
        return {
            'United States': {'co2_per_capita': 15.5, 'renewable_energy': 12.6, 'forest_coverage': 33.9},
            'China': {'co2_per_capita': 7.4, 'renewable_energy': 12.4, 'forest_coverage': 23.0},
            'Germany': {'co2_per_capita': 8.4, 'renewable_energy': 19.3, 'forest_coverage': 32.7},
            'India': {'co2_per_capita': 1.9, 'renewable_energy': 17.5, 'forest_coverage': 24.4},
            'Brazil': {'co2_per_capita': 2.2, 'renewable_energy': 46.1, 'forest_coverage': 59.4},
            'Russia': {'co2_per_capita': 11.4, 'renewable_energy': 5.1, 'forest_coverage': 49.8},
            'Japan': {'co2_per_capita': 8.7, 'renewable_energy': 10.2, 'forest_coverage': 68.5},
            'Australia': {'co2_per_capita': 16.8, 'renewable_energy': 11.9, 'forest_coverage': 17.4}
        }

    @staticmethod
    def enrich_dataframe(df: pd.DataFrame, data_type: str = 'economic') -> pd.DataFrame:
        """
        Return a copy of ``df`` with the indicator columns for ``data_type``
        added.

        Each new column is filled by looking the row's 'name' value up in the
        matching sample dataset; countries absent from the dataset get None.
        An unrecognized ``data_type`` returns an unmodified copy.
        """
        enriched_df = df.copy()
        if data_type == 'economic':
            extra_data = DataEnhancer.get_sample_economic_data()
        elif data_type == 'environmental':
            extra_data = DataEnhancer.get_sample_environmental_data()
        else:
            return enriched_df
        # Only add the columns that this dataset can actually populate.
        for indicator in DataEnhancer._INDICATORS[data_type]:
            enriched_df[indicator] = enriched_df['name'].map(
                lambda country: extra_data.get(country, {}).get(indicator)
            )
        return enriched_df

    @staticmethod
    def get_regional_aggregates(df: pd.DataFrame) -> pd.DataFrame:
        """
        Aggregate a countries dataframe by continent.

        Expects 'continent', 'pop_est', 'gdp_md_est' and 'name' columns.
        Returns one row per continent with total_population, total_gdp,
        country_count and avg_gdp_per_capita (gdp_md_est is in millions of
        USD, hence the 1e6 scale factor).
        """
        regional_stats = df.groupby('continent').agg({
            'pop_est': 'sum',
            'gdp_md_est': 'sum',
            'name': 'count'
        }).reset_index()
        regional_stats.columns = ['continent', 'total_population', 'total_gdp', 'country_count']
        regional_stats['avg_gdp_per_capita'] = (
            regional_stats['total_gdp'] / regional_stats['total_population'] * 1000000
        )
        return regional_stats
class QueryEnhancer:
"""
Enhance and validate queries
"""
CONTINENT_MAP = {
'asia': 'Asia',
'europe': 'Europe',
'africa': 'Africa',
'north america': 'North America',
'south america': 'South America',
'oceania': 'Oceania',
'antarctica': 'Antarctica'
}
COUNTRY_GROUPS = {
'brics': ['Brazil', 'Russia', 'India', 'China', 'South Africa'],
'g7': ['United States of America', 'Japan', 'Germany', 'United Kingdom',
'France', 'Italy', 'Canada'],
'asean': ['Indonesia', 'Thailand', 'Philippines', 'Vietnam', 'Myanmar',
'Malaysia', 'Singapore', 'Cambodia', 'Laos', 'Brunei'],
'gcc': ['Saudi Arabia', 'United Arab Emirates', 'Kuwait', 'Qatar', 'Bahrain', 'Oman'],
'eu': ['Germany', 'France', 'Italy', 'Spain', 'Poland', 'Romania', 'Netherlands',
'Belgium', 'Greece', 'Portugal', 'Czech Republic', 'Hungary', 'Sweden',
'Austria', 'Bulgaria', 'Denmark', 'Finland', 'Slovakia', 'Ireland',
'Croatia', 'Lithuania', 'Slovenia', 'Latvia', 'Estonia', 'Cyprus',
'Luxembourg', 'Malta']
}
@classmethod
def expand_location(cls, location: str) -> List[str]:
"""
Expand location strings to actual country/region names
"""
location_lower = location.lower()
# Check if it's a continent
if location_lower in cls.CONTINENT_MAP:
return [cls.CONTINENT_MAP[location_lower]]
# Check if it's a country group
if location_lower in cls.COUNTRY_GROUPS:
return cls.COUNTRY_GROUPS[location_lower]
# Return as-is
return [location]
@classmethod
def validate_indicators(cls, indicators: List[str]) -> List[str]:
"""
Validate and normalize indicator names
"""
valid_indicators = []
indicator_mapping = {
'population': 'pop_est',
'gdp': 'gdp_md_est',
'density': 'pop_density',
'per capita': 'gdp_per_capita',
'co2': 'co2_per_capita',
'renewable': 'renewable_energy',
'forest': 'forest_coverage',
'growth': 'gdp_growth',
'unemployment': 'unemployment',
'inflation': 'inflation'
}
for indicator in indicators:
indicator_lower = indicator.lower()
for key, value in indicator_mapping.items():
if key in indicator_lower:
valid_indicators.append(value)
break
else:
valid_indicators.append('pop_est') # default
return list(set(valid_indicators)) # Remove duplicates
# Statistical analysis utilities
class GeoStats:
    """
    Statistical analysis helpers for geospatial dataframes.
    """

    @staticmethod
    def calculate_correlation(df: pd.DataFrame, col1: str, col2: str) -> float:
        """
        Pearson correlation between two columns of ``df``.

        Returns 0.0 when the correlation cannot be computed (e.g. a column
        is missing or non-numeric). May return NaN when pandas itself does
        (e.g. a constant column), matching the original behavior.
        """
        try:
            return df[[col1, col2]].corr().iloc[0, 1]
        except (KeyError, IndexError, TypeError, ValueError):
            # Narrowed from a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit; the 0.0 fallback is kept.
            return 0.0

    @staticmethod
    def get_outliers(df: pd.DataFrame, column: str) -> pd.DataFrame:
        """
        Return the rows of ``df`` whose ``column`` value is an outlier
        under the standard 1.5*IQR rule.
        """
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]
        return outliers

    @staticmethod
    def generate_summary_stats(df: pd.DataFrame, column: str) -> Dict:
        """
        Summary statistics (mean, median, std, min, max, non-null count)
        for one column of ``df``.
        """
        return {
            'mean': df[column].mean(),
            'median': df[column].median(),
            'std': df[column].std(),
            'min': df[column].min(),
            'max': df[column].max(),
            'count': df[column].count()
        }