# geospatial-ai-query / data_utils.py
# Author: rifatSDAS
# Initial commit: Geospatial AI Query System (commit 2171c22)
"""
Enhanced data handlers for multiple geospatial data sources
"""
import pandas as pd
import requests
from typing import Dict, List, Optional
import json
class DataEnhancer:
    """
    Additional data sources and enrichment for geospatial queries.

    Ships small built-in sample datasets (stand-ins for live APIs such as
    the World Bank) plus helpers to merge them into a countries dataframe
    and to aggregate it by continent.
    """

    # Indicator columns contributed by each sample dataset. Enrichment adds
    # only the columns for the requested dataset (the previous version added
    # all six, leaving three permanently-None columns per call).
    _INDICATORS = {
        'economic': ('gdp_growth', 'unemployment', 'inflation'),
        'environmental': ('co2_per_capita', 'renewable_energy', 'forest_coverage'),
    }

    @staticmethod
    def get_sample_economic_data() -> Dict[str, Dict[str, float]]:
        """
        Sample economic indicators keyed by country name
        (in production, connect to the World Bank API).
        """
        return {
            'United States': {'gdp_growth': 2.1, 'unemployment': 3.7, 'inflation': 3.2},
            'China': {'gdp_growth': 5.2, 'unemployment': 5.0, 'inflation': 0.2},
            'Germany': {'gdp_growth': 0.1, 'unemployment': 3.0, 'inflation': 6.1},
            'India': {'gdp_growth': 7.2, 'unemployment': 8.0, 'inflation': 5.4},
            'Brazil': {'gdp_growth': 2.9, 'unemployment': 8.5, 'inflation': 4.6},
            'United Kingdom': {'gdp_growth': 0.5, 'unemployment': 3.9, 'inflation': 4.0},
            'France': {'gdp_growth': 0.9, 'unemployment': 7.2, 'inflation': 5.2},
            'Japan': {'gdp_growth': 1.9, 'unemployment': 2.6, 'inflation': 3.2},
            'South Korea': {'gdp_growth': 1.4, 'unemployment': 2.7, 'inflation': 3.6},
            'Canada': {'gdp_growth': 1.1, 'unemployment': 5.4, 'inflation': 3.9}
        }

    @staticmethod
    def get_sample_environmental_data() -> Dict[str, Dict[str, float]]:
        """
        Sample environmental indicators keyed by country name.
        """
        return {
            'United States': {'co2_per_capita': 15.5, 'renewable_energy': 12.6, 'forest_coverage': 33.9},
            'China': {'co2_per_capita': 7.4, 'renewable_energy': 12.4, 'forest_coverage': 23.0},
            'Germany': {'co2_per_capita': 8.4, 'renewable_energy': 19.3, 'forest_coverage': 32.7},
            'India': {'co2_per_capita': 1.9, 'renewable_energy': 17.5, 'forest_coverage': 24.4},
            'Brazil': {'co2_per_capita': 2.2, 'renewable_energy': 46.1, 'forest_coverage': 59.4},
            'Russia': {'co2_per_capita': 11.4, 'renewable_energy': 5.1, 'forest_coverage': 49.8},
            'Japan': {'co2_per_capita': 8.7, 'renewable_energy': 10.2, 'forest_coverage': 68.5},
            'Australia': {'co2_per_capita': 16.8, 'renewable_energy': 11.9, 'forest_coverage': 17.4}
        }

    @staticmethod
    def enrich_dataframe(df: pd.DataFrame, data_type: str = 'economic') -> pd.DataFrame:
        """
        Return a copy of ``df`` with the indicator columns for ``data_type``
        added.

        Each new column is filled by looking the row's 'name' value up in the
        matching sample dataset; countries absent from the dataset get None.
        An unrecognized ``data_type`` returns an unmodified copy.
        """
        enriched_df = df.copy()
        if data_type == 'economic':
            extra_data = DataEnhancer.get_sample_economic_data()
        elif data_type == 'environmental':
            extra_data = DataEnhancer.get_sample_environmental_data()
        else:
            return enriched_df
        # Only add the columns that this dataset can actually populate.
        for indicator in DataEnhancer._INDICATORS[data_type]:
            enriched_df[indicator] = enriched_df['name'].map(
                lambda country: extra_data.get(country, {}).get(indicator)
            )
        return enriched_df

    @staticmethod
    def get_regional_aggregates(df: pd.DataFrame) -> pd.DataFrame:
        """
        Aggregate a countries dataframe by continent.

        Expects 'continent', 'pop_est', 'gdp_md_est' and 'name' columns.
        Returns one row per continent with total_population, total_gdp,
        country_count and avg_gdp_per_capita (gdp_md_est is in millions of
        USD, hence the 1e6 scale factor).
        """
        regional_stats = df.groupby('continent').agg({
            'pop_est': 'sum',
            'gdp_md_est': 'sum',
            'name': 'count'
        }).reset_index()
        regional_stats.columns = ['continent', 'total_population', 'total_gdp', 'country_count']
        regional_stats['avg_gdp_per_capita'] = (
            regional_stats['total_gdp'] / regional_stats['total_population'] * 1000000
        )
        return regional_stats
class QueryEnhancer:
"""
Enhance and validate queries
"""
CONTINENT_MAP = {
'asia': 'Asia',
'europe': 'Europe',
'africa': 'Africa',
'north america': 'North America',
'south america': 'South America',
'oceania': 'Oceania',
'antarctica': 'Antarctica'
}
COUNTRY_GROUPS = {
'brics': ['Brazil', 'Russia', 'India', 'China', 'South Africa'],
'g7': ['United States of America', 'Japan', 'Germany', 'United Kingdom',
'France', 'Italy', 'Canada'],
'asean': ['Indonesia', 'Thailand', 'Philippines', 'Vietnam', 'Myanmar',
'Malaysia', 'Singapore', 'Cambodia', 'Laos', 'Brunei'],
'gcc': ['Saudi Arabia', 'United Arab Emirates', 'Kuwait', 'Qatar', 'Bahrain', 'Oman'],
'eu': ['Germany', 'France', 'Italy', 'Spain', 'Poland', 'Romania', 'Netherlands',
'Belgium', 'Greece', 'Portugal', 'Czech Republic', 'Hungary', 'Sweden',
'Austria', 'Bulgaria', 'Denmark', 'Finland', 'Slovakia', 'Ireland',
'Croatia', 'Lithuania', 'Slovenia', 'Latvia', 'Estonia', 'Cyprus',
'Luxembourg', 'Malta']
}
@classmethod
def expand_location(cls, location: str) -> List[str]:
"""
Expand location strings to actual country/region names
"""
location_lower = location.lower()
# Check if it's a continent
if location_lower in cls.CONTINENT_MAP:
return [cls.CONTINENT_MAP[location_lower]]
# Check if it's a country group
if location_lower in cls.COUNTRY_GROUPS:
return cls.COUNTRY_GROUPS[location_lower]
# Return as-is
return [location]
@classmethod
def validate_indicators(cls, indicators: List[str]) -> List[str]:
"""
Validate and normalize indicator names
"""
valid_indicators = []
indicator_mapping = {
'population': 'pop_est',
'gdp': 'gdp_md_est',
'density': 'pop_density',
'per capita': 'gdp_per_capita',
'co2': 'co2_per_capita',
'renewable': 'renewable_energy',
'forest': 'forest_coverage',
'growth': 'gdp_growth',
'unemployment': 'unemployment',
'inflation': 'inflation'
}
for indicator in indicators:
indicator_lower = indicator.lower()
for key, value in indicator_mapping.items():
if key in indicator_lower:
valid_indicators.append(value)
break
else:
valid_indicators.append('pop_est') # default
return list(set(valid_indicators)) # Remove duplicates
# Statistical analysis utilities
class GeoStats:
    """
    Statistical analysis helpers for geospatial dataframes.
    """

    @staticmethod
    def calculate_correlation(df: pd.DataFrame, col1: str, col2: str) -> float:
        """
        Pearson correlation between two columns of ``df``.

        Returns 0.0 when the correlation cannot be computed (e.g. a column
        is missing or non-numeric). May return NaN when pandas itself does
        (e.g. a constant column), matching the original behavior.
        """
        try:
            return df[[col1, col2]].corr().iloc[0, 1]
        except (KeyError, IndexError, TypeError, ValueError):
            # Narrowed from a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit; the 0.0 fallback is kept.
            return 0.0

    @staticmethod
    def get_outliers(df: pd.DataFrame, column: str) -> pd.DataFrame:
        """
        Return the rows of ``df`` whose ``column`` value is an outlier
        under the standard 1.5*IQR rule.
        """
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]
        return outliers

    @staticmethod
    def generate_summary_stats(df: pd.DataFrame, column: str) -> Dict:
        """
        Summary statistics (mean, median, std, min, max, non-null count)
        for one column of ``df``.
        """
        return {
            'mean': df[column].mean(),
            'median': df[column].median(),
            'std': df[column].std(),
            'min': df[column].min(),
            'max': df[column].max(),
            'count': df[column].count()
        }