Spaces:
Running
Running
| """ | |
| Enhanced data handlers for multiple geospatial data sources | |
| """ | |
| import pandas as pd | |
| import requests | |
| from typing import Dict, List, Optional | |
| import json | |
class DataEnhancer:
    """
    Additional data sources and enrichment for geospatial queries.

    Every method is stateless and declared as a static method, so it can be
    called either on the class or on an instance.
    """

    # Indicator columns contributed by each supported ``data_type``.
    _INDICATORS = {
        'economic': ('gdp_growth', 'unemployment', 'inflation'),
        'environmental': ('co2_per_capita', 'renewable_energy', 'forest_coverage'),
    }

    # Alternate country spellings mapped to the keys of the sample tables.
    # NOTE(review): 'United States of America' is the spelling used by the
    # country-group lists elsewhere in this file; presumably it is also the
    # spelling found in the dataframe's 'name' column - confirm.
    _NAME_ALIASES = {
        'United States of America': 'United States',
    }

    @staticmethod
    def get_sample_economic_data() -> Dict[str, Dict[str, float]]:
        """
        Sample economic indicators (in production, connect to World Bank API).

        Returns:
            Mapping of country name to a dict with the keys ``gdp_growth``,
            ``unemployment`` and ``inflation``.
        """
        return {
            'United States': {'gdp_growth': 2.1, 'unemployment': 3.7, 'inflation': 3.2},
            'China': {'gdp_growth': 5.2, 'unemployment': 5.0, 'inflation': 0.2},
            'Germany': {'gdp_growth': 0.1, 'unemployment': 3.0, 'inflation': 6.1},
            'India': {'gdp_growth': 7.2, 'unemployment': 8.0, 'inflation': 5.4},
            'Brazil': {'gdp_growth': 2.9, 'unemployment': 8.5, 'inflation': 4.6},
            'United Kingdom': {'gdp_growth': 0.5, 'unemployment': 3.9, 'inflation': 4.0},
            'France': {'gdp_growth': 0.9, 'unemployment': 7.2, 'inflation': 5.2},
            'Japan': {'gdp_growth': 1.9, 'unemployment': 2.6, 'inflation': 3.2},
            'South Korea': {'gdp_growth': 1.4, 'unemployment': 2.7, 'inflation': 3.6},
            'Canada': {'gdp_growth': 1.1, 'unemployment': 5.4, 'inflation': 3.9},
        }

    @staticmethod
    def get_sample_environmental_data() -> Dict[str, Dict[str, float]]:
        """
        Sample environmental indicators.

        Returns:
            Mapping of country name to a dict with the keys
            ``co2_per_capita``, ``renewable_energy`` and ``forest_coverage``.
        """
        return {
            'United States': {'co2_per_capita': 15.5, 'renewable_energy': 12.6, 'forest_coverage': 33.9},
            'China': {'co2_per_capita': 7.4, 'renewable_energy': 12.4, 'forest_coverage': 23.0},
            'Germany': {'co2_per_capita': 8.4, 'renewable_energy': 19.3, 'forest_coverage': 32.7},
            'India': {'co2_per_capita': 1.9, 'renewable_energy': 17.5, 'forest_coverage': 24.4},
            'Brazil': {'co2_per_capita': 2.2, 'renewable_energy': 46.1, 'forest_coverage': 59.4},
            'Russia': {'co2_per_capita': 11.4, 'renewable_energy': 5.1, 'forest_coverage': 49.8},
            'Japan': {'co2_per_capita': 8.7, 'renewable_energy': 10.2, 'forest_coverage': 68.5},
            'Australia': {'co2_per_capita': 16.8, 'renewable_energy': 11.9, 'forest_coverage': 17.4},
        }

    @staticmethod
    def enrich_dataframe(df: pd.DataFrame, data_type: str = 'economic') -> pd.DataFrame:
        """
        Enrich existing dataframe with additional indicators.

        The input frame is never modified; a copy is returned.

        Args:
            df: Frame with a ``name`` column holding country names.
            data_type: ``'economic'`` or ``'environmental'``. Any other value
                returns an unchanged copy of ``df``.

        Returns:
            A copy of ``df`` with the indicator columns of ``data_type``
            filled from the sample tables (null where the country is not in
            the table). Indicator columns belonging to the other data type
            are created as all-null placeholders only when they are absent,
            so values written by an earlier enrichment pass are preserved.

        Raises:
            KeyError: If ``df`` has no ``name`` column and ``data_type`` is
                a supported value.
        """
        enriched_df = df.copy()
        if data_type == 'economic':
            extra_data = DataEnhancer.get_sample_economic_data()
        elif data_type == 'environmental':
            extra_data = DataEnhancer.get_sample_environmental_data()
        else:
            return enriched_df

        # Normalize alternate spellings once, before any per-indicator lookup.
        aliases = DataEnhancer._NAME_ALIASES
        canonical_names = enriched_df['name'].map(lambda name: aliases.get(name, name))

        for group, indicators in DataEnhancer._INDICATORS.items():
            for indicator in indicators:
                if group == data_type:
                    values = {
                        country: row.get(indicator)
                        for country, row in extra_data.items()
                    }
                    enriched_df[indicator] = canonical_names.map(values.get)
                elif indicator not in enriched_df.columns:
                    # Keep the six-column layout callers may rely on, but
                    # never overwrite data produced by a previous call.
                    enriched_df[indicator] = None
        return enriched_df

    @staticmethod
    def get_regional_aggregates(df: pd.DataFrame) -> pd.DataFrame:
        """
        Calculate regional aggregates.

        Args:
            df: Frame with ``continent``, ``pop_est``, ``gdp_md_est`` and
                ``name`` columns.

        Returns:
            One row per continent with the columns ``continent``,
            ``total_population``, ``total_gdp``, ``country_count`` and
            ``avg_gdp_per_capita``.
        """
        regional_stats = df.groupby('continent').agg(
            total_population=('pop_est', 'sum'),
            total_gdp=('gdp_md_est', 'sum'),
            country_count=('name', 'count'),
        ).reset_index()
        # NOTE(review): the 1,000,000 factor presumably converts a GDP figure
        # expressed in millions into currency units per person - confirm.
        regional_stats['avg_gdp_per_capita'] = (
            regional_stats['total_gdp'] / regional_stats['total_population'] * 1000000
        )
        return regional_stats
class QueryEnhancer:
    """
    Enhance and validate queries.

    Both public methods are class methods: they only read the class-level
    lookup tables below.
    """

    # Lower-cased continent name -> canonical spelling.
    CONTINENT_MAP = {
        'asia': 'Asia',
        'europe': 'Europe',
        'africa': 'Africa',
        'north america': 'North America',
        'south america': 'South America',
        'oceania': 'Oceania',
        'antarctica': 'Antarctica'
    }

    # Lower-cased group acronym -> member country names.
    COUNTRY_GROUPS = {
        'brics': ['Brazil', 'Russia', 'India', 'China', 'South Africa'],
        'g7': ['United States of America', 'Japan', 'Germany', 'United Kingdom',
               'France', 'Italy', 'Canada'],
        'asean': ['Indonesia', 'Thailand', 'Philippines', 'Vietnam', 'Myanmar',
                  'Malaysia', 'Singapore', 'Cambodia', 'Laos', 'Brunei'],
        'gcc': ['Saudi Arabia', 'United Arab Emirates', 'Kuwait', 'Qatar', 'Bahrain', 'Oman'],
        'eu': ['Germany', 'France', 'Italy', 'Spain', 'Poland', 'Romania', 'Netherlands',
               'Belgium', 'Greece', 'Portugal', 'Czech Republic', 'Hungary', 'Sweden',
               'Austria', 'Bulgaria', 'Denmark', 'Finland', 'Slovakia', 'Ireland',
               'Croatia', 'Lithuania', 'Slovenia', 'Latvia', 'Estonia', 'Cyprus',
               'Luxembourg', 'Malta']
    }

    # (keyword, column) pairs checked in order; the first keyword found in
    # the text wins. Specific keywords precede the generic 'population' and
    # 'gdp' so that e.g. "gdp per capita", "gdp growth", "co2 per capita"
    # and "population density" resolve to the specific column.
    _INDICATOR_KEYWORDS = (
        ('co2', 'co2_per_capita'),
        ('renewable', 'renewable_energy'),
        ('forest', 'forest_coverage'),
        ('unemployment', 'unemployment'),
        ('inflation', 'inflation'),
        ('density', 'pop_density'),
        ('per capita', 'gdp_per_capita'),
        ('population', 'pop_est'),
        ('growth', 'gdp_growth'),
        ('gdp', 'gdp_md_est'),
    )

    # Column used when no keyword matches.
    _DEFAULT_INDICATOR = 'pop_est'

    @classmethod
    def expand_location(cls, location: str) -> List[str]:
        """
        Expand location strings to actual country/region names.

        Matching ignores case and surrounding whitespace.

        Args:
            location: A continent name, a country-group acronym (a key of
                ``COUNTRY_GROUPS``), or any other place name.

        Returns:
            ``[canonical continent]`` for a continent, a fresh list of member
            countries for a group, otherwise ``[location]`` unchanged.
        """
        location_key = location.strip().lower()

        continent = cls.CONTINENT_MAP.get(location_key)
        if continent is not None:
            return [continent]

        members = cls.COUNTRY_GROUPS.get(location_key)
        if members is not None:
            # Copy so callers cannot mutate the shared class-level table.
            return list(members)

        return [location]

    @classmethod
    def validate_indicators(cls, indicators: List[str]) -> List[str]:
        """
        Validate and normalize indicator names.

        Each entry is lower-cased and stripped. An entry that already equals
        a known column name is kept; otherwise the first keyword of
        ``_INDICATOR_KEYWORDS`` contained in it selects the column, and
        entries matching no keyword fall back to ``'pop_est'``.

        Args:
            indicators: Free-text indicator names.

        Returns:
            Column names without duplicates, in order of first appearance.
        """
        known_columns = {column for _, column in cls._INDICATOR_KEYWORDS}
        resolved = []
        for indicator in indicators:
            text = indicator.strip().lower()
            if text in known_columns:
                # Already a canonical column name (e.g. 'gdp_per_capita').
                resolved.append(text)
                continue
            resolved.append(next(
                (column for keyword, column in cls._INDICATOR_KEYWORDS if keyword in text),
                cls._DEFAULT_INDICATOR,
            ))
        # dict.fromkeys removes duplicates while keeping a deterministic order.
        return list(dict.fromkeys(resolved))
| # Statistical analysis utilities | |
class GeoStats:
    """
    Statistical analysis for geospatial data.

    Every method is stateless and declared as a static method, so it can be
    called either on the class or on an instance.
    """

    @staticmethod
    def calculate_correlation(df: pd.DataFrame, col1: str, col2: str) -> float:
        """
        Calculate correlation between two indicators.

        Args:
            df: Frame containing both columns.
            col1: Name of the first column.
            col2: Name of the second column.

        Returns:
            The off-diagonal entry of ``df[[col1, col2]].corr()`` as a float,
            or ``0.0`` when the columns are missing or cannot be correlated.
        """
        try:
            return float(df[[col1, col2]].corr().iloc[0, 1])
        except (KeyError, IndexError, TypeError, ValueError):
            # Best-effort contract: missing or non-numeric columns yield 0.0
            # instead of propagating, while interrupts are no longer swallowed.
            return 0.0

    @staticmethod
    def get_outliers(df: pd.DataFrame, column: str) -> pd.DataFrame:
        """
        Identify outliers using IQR method.

        Args:
            df: Frame to filter.
            column: Name of the column to test.

        Returns:
            The rows of ``df`` whose ``column`` value lies strictly below
            ``Q1 - 1.5 * IQR`` or strictly above ``Q3 + 1.5 * IQR``.
        """
        values = df[column]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        return df[(values < lower_bound) | (values > upper_bound)]

    @staticmethod
    def generate_summary_stats(df: pd.DataFrame, column: str) -> Dict:
        """
        Generate summary statistics for a column.

        Args:
            df: Frame containing ``column``.
            column: Name of the column to summarize.

        Returns:
            Dict with the keys ``mean``, ``median``, ``std``, ``min``,
            ``max`` and ``count``, each computed by the pandas method of the
            same name.
        """
        values = df[column]
        return {
            'mean': values.mean(),
            'median': values.median(),
            'std': values.std(),
            'min': values.min(),
            'max': values.max(),
            'count': values.count()
        }