|
|
import pandas as pd |
|
|
import numpy as np |
|
|
import os |
|
|
import logging |
|
|
from datetime import datetime |
|
|
from src.config_manager import get_config |
|
|
|
|
|
|
|
|
# Configure root logging once at import time; per-module logger follows the
# standard "one logger per module" convention.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
# Canonical column-name mapping: every raw header variant observed in the
# source files -> internal snake_case column name.
# NOTE: the previous revision repeated several keys ('Country', 'Country Name',
# 'country'); a dict literal silently keeps only the last occurrence, so the
# duplicates were dead code and have been removed.
COLUMN_MAPPING = {
    # Country identifier variants
    'Country': 'country',
    'Country Name': 'country',
    'country_name': 'country',
    'nation': 'country',
    'Nation': 'country',
    'COUNTRY': 'country',
    'country': 'country',

    # Year variants
    'Year': 'year',
    'year_value': 'year',
    'year': 'year',

    # Overall SDG index score variants
    '2025 SDG Index Score': 'sdg_index_score',
    'SDG Index Score': 'sdg_index_score',
    'SDG Index': 'sdg_index_score',
    'sdg_score': 'sdg_index_score',
    'overall_score': 'sdg_index_score',
    'index score': 'sdg_index_score',
    'sdgi_s': 'sdg_index_score',
    'SDGI_S': 'sdg_index_score',
    'SDG_Score': 'sdg_index_score',
    'sdg_index_score': 'sdg_index_score',
    'SDG_Index_Score': 'sdg_index_score',
    'Overall_Score': 'sdg_index_score',
    'INDEX_SCORE': 'sdg_index_score',

    # ISO country-code variants
    'Country Code': 'country_code',
    'country_code_alpha_3': 'country_code',
    'iso3': 'country_code',
}

# Per-goal score variants ('Goal 1 Score', 'Goal 1', 'sdg_1_score', 'goal1',
# ...) are generated instead of hand-enumerated to avoid copy/paste drift.
for _goal_num in range(1, 18):
    COLUMN_MAPPING[f'Goal {_goal_num} Score'] = f'goal_{_goal_num}_score'
    COLUMN_MAPPING[f'Goal {_goal_num}'] = f'goal_{_goal_num}_score'
    COLUMN_MAPPING[f'sdg_{_goal_num}_score'] = f'goal_{_goal_num}_score'
    COLUMN_MAPPING[f'goal{_goal_num}'] = f'goal_{_goal_num}_score'
del _goal_num
|
|
|
|
|
|
|
|
# Alias -> canonical country name. Built from a canonical-name -> aliases
# table so each country's variants are grouped together in one place.
# Identity entries (e.g. 'Russia' -> 'Russia') are kept: they are harmless
# no-ops under Series.replace and document the canonical spelling.
_COUNTRY_ALIASES = {
    'United States': ['United States of America', 'USA', 'US'],
    'United Kingdom': ['United Kingdom of Great Britain and Northern Ireland',
                       'UK', 'Britain', 'Great Britain'],
    'Russia': ['Russian Federation', 'Russia'],
    'South Korea': ['Korea, Republic of', 'Republic of Korea',
                    'Korea (South)', 'South Korea'],
    'North Korea': ["Korea, Democratic People's Republic of", 'North Korea'],
    'Iran': ['Iran, Islamic Republic of', 'Islamic Republic of Iran'],
    'Venezuela': ['Venezuela, Bolivarian Republic of',
                  'Bolivarian Republic of Venezuela'],
    'Bolivia': ['Bolivia, Plurinational State of',
                'Plurinational State of Bolivia'],
    'Tanzania': ['United Republic of Tanzania', 'Tanzania, United Republic of'],
    'Vietnam': ['Viet Nam', 'Vietnam'],
    'Moldova': ['Republic of Moldova', 'Moldova, Republic of'],
    'North Macedonia': ['The former Yugoslav Republic of Macedonia',
                        'North Macedonia', 'Macedonia'],
    'Syria': ['Syrian Arab Republic', 'Syria'],
    'Ivory Coast': ["Côte d'Ivoire", 'Ivory Coast'],
    'Democratic Republic of the Congo': ['Congo, Democratic Republic of the'],
    'Republic of the Congo': ['Congo, Republic of the', 'Congo'],
    'Myanmar': ['Myanmar', 'Burma'],
    'Czech Republic': ['Czech Republic', 'Czechia'],
    'Eswatini': ['Eswatini', 'Swaziland'],
    'Cape Verde': ['Cabo Verde', 'Cape Verde'],
    'Taiwan': ['Chinese Taipei', 'Taipei, Chinese'],
}

COUNTRY_NAME_MAPPING = {
    alias: canonical
    for canonical, aliases in _COUNTRY_ALIASES.items()
    for alias in aliases
}
|
|
|
|
|
def load_sdg_data(file_path=None):
    """
    Load and clean SDG data (CSV or Excel), falling back to sample data.

    Resolution order for the source file:
      1. the explicit ``file_path`` argument,
      2. the configured ``data_sources.primary_data_path``,
      3. the bundled ``data/SDR2025-data.xlsx`` in the repository,
      4. generated sample data (clearly flagged as such).

    Returns a DataFrame with provenance metadata columns attached; never
    raises — any failure falls back to sample data.
    """
    try:
        # No explicit path: consult the configuration.
        if file_path is None:
            file_path = get_config().get('data_sources.primary_data_path')

        if file_path is None or not os.path.exists(file_path):
            # Try the workbook shipped with the repository.
            bundled = os.path.join(
                os.path.dirname(os.path.dirname(__file__)), 'data', 'SDR2025-data.xlsx')
            if os.path.exists(bundled):
                file_path = bundled
                logger.info(f"Using SDR2025 data at {bundled}")
            else:
                if file_path and not os.path.exists(file_path):
                    logger.warning(f"Configured file {file_path} not found. Falling back to sample data.")
                else:
                    logger.warning("No data file provided. Using sample data.")
                return create_sample_data()

        # Dispatch on extension: Excel workbooks vs. everything else (CSV).
        if file_path.endswith(('.xlsx', '.xls')):
            df = load_excel_data(file_path)
        else:
            df = load_csv_data(file_path)

        if df is None or df.empty:
            logger.warning(f"No valid data loaded from {file_path}. Using sample data.")
            return create_sample_data()

        logger.info(f"Successfully loaded data from {file_path}: {len(df)} rows, {len(df.columns)} columns")
        return add_data_metadata(df, file_path)
    except Exception as e:
        logger.error(f"Error loading data from {file_path}: {e}")
        logger.info("Falling back to sample data")
        return create_sample_data()
|
|
|
|
|
def load_csv_data(file_path):
    """Read a CSV file and normalize it into the internal schema.

    Handles UTF-8 BOMs, header whitespace, header aliases, duplicate
    columns, country-name normalization and basic type coercion, then runs
    missing-value imputation. Returns None when the file cannot be read.
    """
    try:
        frame = pd.read_csv(file_path, encoding='utf-8-sig')

        # Normalize headers: strip stray BOM characters and surrounding
        # whitespace, then translate known aliases to canonical names.
        frame.columns = [c.replace('\ufeff', '').strip() for c in frame.columns]
        frame = frame.rename(columns=COLUMN_MAPPING)

        # Alias collisions can yield duplicate columns; keep the first.
        if frame.columns.duplicated().any():
            frame = frame.loc[:, ~frame.columns.duplicated()]

        if 'country' in frame.columns:
            frame['country'] = frame['country'].replace(COUNTRY_NAME_MAPPING)

        def _first_series(obj):
            # Defensive: a duplicated label selects a DataFrame; use column 0.
            return obj.iloc[:, 0] if isinstance(obj, pd.DataFrame) else obj

        if 'year' in frame.columns:
            frame['year'] = pd.to_numeric(
                _first_series(frame['year']), errors='coerce').fillna(0).astype(int)

        if 'sdg_index_score' in frame.columns:
            frame['sdg_index_score'] = pd.to_numeric(
                _first_series(frame['sdg_index_score']), errors='coerce')

        if 'country' in frame.columns and 'year' in frame.columns:
            try:
                frame = frame.sort_values(['country', 'year'])
                frame = handle_missing_values(frame)
            except Exception as e:
                logger.warning(f"Error in missing value handling for CSV data: {e}")
                try:
                    # Minimal fallback: forward/backward fill per country.
                    frame = frame.sort_values(['country', 'year'])
                    for name in frame.columns:
                        if name.endswith('_score'):
                            frame[name] = frame.groupby('country')[name].ffill().bfill()
                except Exception as e2:
                    logger.warning(f"Fallback missing value handling also failed: {e2}")

        return frame
    except Exception as e:
        logger.error(f"Error loading CSV data: {e}")
        return None
|
|
|
|
|
def load_excel_data(file_path):
    """Load and process an SDR-style Excel workbook into the internal schema.

    Reads every recognized sheet that carries a country column and, when
    both the 'SDR2025 Data' and 'Backdated SDG Index' sheets are present,
    merges them into a single 2000-2025 panel. Returns None on failure.
    """
    try:
        logger.info(f"Loading Excel file: {file_path}")
        workbook = pd.ExcelFile(file_path)
        logger.info(f"Available sheets: {workbook.sheet_names}")

        df = None
        preferred_sheets = ['SDR2025 Data', 'Backdated SDG Index', 'Overview', 'Data']

        # Collect every known sheet that has a usable country column.
        sheets_found = {}
        for sheet_name in preferred_sheets:
            if sheet_name not in workbook.sheet_names:
                continue
            try:
                sheet_df = pd.read_excel(file_path, sheet_name=sheet_name)
                sheet_df.columns = [str(col).strip() for col in sheet_df.columns]
                sheet_df = apply_2025_excel_mapping(sheet_df)
                if 'country' in sheet_df.columns:
                    if 'year' not in sheet_df.columns:
                        # The SDR2025 sheet is a 2025 snapshot; other sheets
                        # without a year column are tagged as 2024.
                        sheet_df['year'] = 2025 if sheet_name == 'SDR2025 Data' else 2024
                    sheets_found[sheet_name] = sheet_df
                    logger.info(f"Loaded sheet '{sheet_name}' with {len(sheet_df)} rows")
            except Exception as e:
                logger.warning(f"Failed to read sheet '{sheet_name}': {e}")

        if not sheets_found:
            logger.error("Could not find any valid data sheets in Excel file")
            return None

        if 'SDR2025 Data' in sheets_found and 'Backdated SDG Index' in sheets_found:
            logger.info("Merging current data (2025) with backdated index (2000-2024)")
            df = pd.concat(
                [sheets_found['Backdated SDG Index'], sheets_found['SDR2025 Data']],
                ignore_index=True)
            # On (country, year) collisions prefer the newer SDR2025 rows.
            df = df.drop_duplicates(subset=['country', 'year'], keep='last')
        else:
            # Single-sheet case: take the best available sheet in priority order.
            for name in preferred_sheets:
                if name in sheets_found:
                    df = sheets_found[name]
                    break

        # Re-run header cleanup and mapping on the combined frame (idempotent).
        original_columns = list(df.columns)
        df.columns = [str(col).strip() for col in df.columns]
        logger.info(f"Cleaned column names from: {original_columns[:5]}{'...' if len(original_columns) > 5 else ''} to: {list(df.columns)[:5]}{'...' if len(df.columns) > 5 else ''}")

        df = apply_2025_excel_mapping(df)
        logger.info(f"After column mapping, available columns: {list(df.columns)[:10]}{'...' if len(df.columns) > 10 else ''}")

        if 'country' in df.columns:
            df['country'] = df['country'].replace(COUNTRY_NAME_MAPPING)

        def _as_series(obj):
            # Duplicated labels select a DataFrame; keep the first column.
            return obj.iloc[:, 0] if isinstance(obj, pd.DataFrame) else obj

        # Year: default to 2025 when absent, then force integer dtype.
        if 'year' not in df.columns:
            df['year'] = 2025
        elif isinstance(df['year'], pd.DataFrame):
            df['year'] = df['year'].iloc[:, 0]
        df['year'] = pd.to_numeric(df['year'], errors='coerce').fillna(2025).astype(int)

        # Overall index score: coerce, or locate a best-guess column.
        if 'sdg_index_score' in df.columns:
            df['sdg_index_score'] = pd.to_numeric(_as_series(df['sdg_index_score']), errors='coerce')
        else:
            candidates = [col for col in df.columns
                          if 'score' in col.lower() and 'index' in col.lower()]
            if candidates:
                df['sdg_index_score'] = pd.to_numeric(_as_series(df[candidates[0]]), errors='coerce')
            else:
                df['sdg_index_score'] = 0

        # Coerce every per-goal score column to numeric.
        goal_score_cols = [col for col in df.columns
                           if col.startswith('goal_') and col.endswith('_score')]
        for col in goal_score_cols:
            df[col] = pd.to_numeric(_as_series(df[col]), errors='coerce')

        logger.info(f"Successfully processed Excel data: {len(df)} rows, {len(goal_score_cols)} goal score columns")
        return df
    except Exception as e:
        logger.error(f"Error loading Excel data: {e}")
        return None
|
|
|
|
|
def apply_2025_excel_mapping(df):
    """Rename 2025 SDR Excel headers to the internal column schema.

    Combines an explicit alias table with pattern-based fallbacks
    ('Goal N Score', 'SDGn: <title>'), deduplicates the resulting columns
    and drops pandas' auto-generated 'Unnamed:' columns.
    """
    logger.info("Applying flexible 2025 Excel column mapping...")

    # Explicit alias table; the per-goal entries are appended below in the
    # same order as before (Goal-N-Score block, goalN block, SDG-title block).
    excel_mapping = {
        'Country': 'country',
        'country': 'country',
        'Country Name': 'country',
        'country_name': 'country',
        'Nation': 'country',
        'nation': 'country',
        'country code iso3': 'country',
        'ISO3': 'country',
        '2025 SDG Index Score': 'sdg_index_score',
        'SDG Index Score': 'sdg_index_score',
        'sdg_index_score': 'sdg_index_score',
        'SDG Index': 'sdg_index_score',
        'index score': 'sdg_index_score',
        'overall score': 'sdg_index_score',
        'sdgi_s': 'sdg_index_score',
        'SDGI_S': 'sdg_index_score',
        '2025 SDG Index Rank': 'global_rank',
        'SDG Index Rank': 'global_rank',
        'Region': 'region',
        'region': 'region',
    }
    sdg_titles = {
        1: 'No Poverty', 2: 'No Hunger', 3: 'Good Health and Well-Being',
        4: 'Quality Education', 5: 'Gender Equality',
        6: 'Clean Water and Sanitation', 7: 'Affordable and Clean Energy',
        8: 'Decent Work and Economic Growth',
        9: 'Industry, Innovation & Infrastructure', 10: 'Reduced Inequalities',
        11: 'Sustainable Cities and Communities',
        12: 'Responsible Consumption & Production', 13: 'Climate Action',
        14: 'Life Below Water', 15: 'Life on Land',
        16: 'Peace, Justice and Strong Institutions',
        17: 'Partnerships for the Goals',
    }
    for num in range(1, 18):
        excel_mapping[f'Goal {num} Score'] = f'goal_{num}_score'
    for num in range(1, 18):
        excel_mapping[f'goal{num}'] = f'goal_{num}_score'
    for num, title in sdg_titles.items():
        excel_mapping[f'SDG{num}: {title}'] = f'goal_{num}_score'

    # Apply only the aliases that actually occur in this frame.
    actual_mappings = {old: new for old, new in excel_mapping.items() if old in df.columns}
    if actual_mappings:
        df = df.rename(columns=actual_mappings)
        logger.info(f"Successfully mapped columns: {list(actual_mappings.keys())}")
    else:
        logger.info("No columns found to map")

    # Pattern-based fallbacks for goal columns the table did not cover.
    for col in list(df.columns):
        new_col_name = None
        if col.startswith('Goal ') and col.endswith(' Score'):
            try:
                new_col_name = f"goal_{col.split(' ')[1]}_score"
            except Exception as e:
                logger.warning(f"Failed to parse goal column '{col}': {e}")
        elif col.startswith('SDG') and ':' in col:
            try:
                new_col_name = f"goal_{col.split(':')[0].replace('SDG', '')}_score"
            except Exception as e:
                logger.warning(f"Failed to parse SDG column '{col}': {e}")
        if new_col_name and new_col_name != col:
            df = df.rename(columns={col: new_col_name})
            logger.info(f"Mapped: '{col}' -> '{new_col_name}'")

    # Several aliases can collapse onto the same target; keep the first.
    if df.columns.duplicated().any():
        logger.warning(f"Duplicate columns detected after mapping: {df.columns[df.columns.duplicated()].unique().tolist()}")
        df = df.loc[:, ~df.columns.duplicated()]
        logger.info("Successfully deduplicated columns by keeping the first occurrence.")

    # Drop pandas' placeholder columns for blank Excel headers.
    unnamed_cols = [col for col in df.columns if col.startswith('Unnamed:')]
    if unnamed_cols:
        df = df.drop(columns=unnamed_cols)
        logger.info(f"Removed unnamed columns: {unnamed_cols}")

    remaining_cols = list(df.columns)
    logger.info(f"Final columns after mapping: {remaining_cols[:10]}{'...' if len(remaining_cols) > 10 else ''}")

    return df
|
|
|
|
|
def validate_data_schema(df):
    """
    Check the frame against the expected schema.

    Hard failures (missing core columns) become errors; everything else
    (missing goal columns, odd year/score ranges, duplicate countries) is a
    warning. Returns a dict with 'valid', 'errors', 'warnings' and basic
    shape information.
    """
    core_required = ['country', 'sdg_index_score']
    goal_columns = [f'goal_{i}_score' for i in range(1, 18)]

    errors, warnings = [], []

    # Core columns are mandatory.
    missing_core = [c for c in core_required if c not in df.columns]
    if missing_core:
        errors.append(f"Missing core required columns: {missing_core}")

    # Goal columns are desirable but optional.
    missing_goals = [c for c in goal_columns if c not in df.columns]
    if missing_goals:
        warnings.append(f"Missing goal columns: {len(missing_goals)} out of 17 goals available")

    if 'year' not in df.columns:
        warnings.append("Year column not found - will use default year 2025")
    else:
        min_year, max_year = df['year'].min(), df['year'].max()
        if min_year < 2000 or max_year > 2030:
            warnings.append(f"Year range unusual: {min_year}-{max_year} (expected 2000-2030)")

    # Sanity-check every *_score column's value range.
    for col in (c for c in df.columns if c.endswith('_score') and c != 'country_score'):
        if df[col].isna().all():
            warnings.append(f"Score column {col} contains only NaN values")
            continue
        min_score, max_score = df[col].min(), df[col].max()
        if pd.notna(min_score) and pd.notna(max_score):
            if min_score < -10 or max_score > 110:
                warnings.append(f"Score column {col} has unusual range: {min_score:.1f}-{max_score:.1f} (expected 0-100)")

    if 'country' in df.columns:
        duplicate_countries = df['country'].duplicated().sum()
        if duplicate_countries > 0:
            warnings.append(f"Found {duplicate_countries} duplicate country entries")

    return {
        'valid': not errors,
        'errors': errors,
        'warnings': warnings,
        'column_count': len(df.columns),
        'row_count': len(df),
        'available_goals': len([c for c in goal_columns if c in df.columns]),
        'year_range': (int(df['year'].min()), int(df['year'].max())) if 'year' in df.columns else (2025, 2025),
    }
|
|
|
|
|
def normalize_country_names(df):
    """Collapse common alternate country spellings to one canonical form.

    Mutates the 'country' column of *df* and returns the frame.
    """
    canonical = {
        'Taiwan, Province of China': 'Taiwan',
        'Korea, Republic of': 'South Korea',
        'Korea (South)': 'South Korea',
        'United States of America': 'United States',
        'UK': 'United Kingdom',
        'Russian Federation': 'Russia',
        'Iran, Islamic Republic of': 'Iran',
        'Venezuela, Bolivarian Republic of': 'Venezuela',
    }
    # A single dict-based replace is equivalent to replacing pair by pair.
    df['country'] = df['country'].replace(canonical)
    return df
|
|
|
|
|
def handle_missing_values(df):
    """Impute missing values column-by-column.

    Strategy for each *_score column, in order: coerce to numeric,
    forward/backward fill within each country's time series, fill remaining
    gaps with the per-year median (falling back to the overall column
    median, then 0), and finally zero-fill anything still missing.
    'country_code' is filled within each country group.
    """
    def _to_series(obj):
        # A duplicated column label selects a DataFrame; keep the first column.
        return obj.iloc[:, 0] if isinstance(obj, pd.DataFrame) else obj

    for col in [c for c in df.columns if c.endswith('_score')]:
        try:
            df[col] = pd.to_numeric(_to_series(df[col]), errors='coerce')

            # Fill along each country's own timeline first.
            df[col] = df.groupby('country', group_keys=False)[col].apply(
                lambda s: s.ffill().bfill())

            def _median_fill(group):
                # Per-year median, then overall column median, then 0.
                try:
                    med = group.median()
                    if pd.notna(med) and isinstance(med, (int, float)):
                        return group.fillna(med)
                    overall = df[col].median()
                    if pd.notna(overall):
                        return group.fillna(overall)
                    return group.fillna(0)
                except Exception as e:
                    logger.warning(f"Error in fillna for column {col}: {e}")
                    return group.fillna(0)

            df[col] = df.groupby('year', group_keys=False)[col].apply(_median_fill)

            # Final coercion: anything still missing becomes 0.
            df[col] = pd.to_numeric(_to_series(df[col]), errors='coerce').fillna(0)
        except Exception as e:
            logger.warning(f"Error processing column {col}: {e}")

    if 'country_code' in df.columns:
        try:
            df['country_code'] = df.groupby('country', group_keys=False)['country_code'].apply(
                lambda s: s.ffill().bfill())
        except Exception as e:
            logger.warning(f"Error handling country_code missing values: {e}")

    return df
|
|
|
|
|
def apply_data_quality_checks(df):
    """Clean anomalous values in place.

    Clamps every *_score column to [0, 100] and forces 'year' to int.
    Returns the frame.
    """
    for col in (c for c in df.columns if c.endswith('_score')):
        df[col] = df[col].clip(lower=0, upper=100)

    if 'year' in df.columns:
        df['year'] = df['year'].astype(int)

    return df
|
|
|
|
|
def add_data_metadata(df, file_path):
    """Attach provenance columns to the frame.

    Adds 'data_source' (source file basename, or 'sample_data' when no
    truthy path is given), 'last_updated' (today's date) and
    'is_sample_data' (True only when *file_path* is None).
    """
    source_name = os.path.basename(file_path) if file_path else 'sample_data'
    df['data_source'] = source_name
    df['last_updated'] = datetime.now().strftime('%Y-%m-%d')
    df['is_sample_data'] = file_path is None
    return df
|
|
|
|
|
def create_sample_data():
    """
    Build clearly-labelled demonstration data (15 countries, 2000-2023).

    Index scores follow a simple upward trend with per-country multipliers;
    goal scores add random noise. Every row is flagged as sample data and
    the frame carries a prominent warning in its attrs.
    """
    countries = [
        "United States", "China", "Japan", "Germany", "United Kingdom",
        "France", "Taiwan", "South Korea", "Canada", "Australia",
        "Italy", "Spain", "Netherlands", "Sweden", "Norway",
    ]

    # Multiplier applied to the baseline trend for selected countries.
    boost = {
        "Norway": 1.15, "Sweden": 1.15, "Denmark": 1.15,
        "Taiwan": 1.1, "South Korea": 1.1,
        "China": 0.9,
    }

    rows = []
    for country in countries:
        factor = boost.get(country, 1.0)
        for year in range(2000, 2024):
            baseline = 65 + (year - 2000) * 0.8
            record = {
                "country": country,
                "country_code": country[:3].upper(),
                "year": year,
                "sdg_index_score": min(100, baseline * factor),
                "data_source": "sample_data",
                "last_updated": datetime.now().strftime('%Y-%m-%d'),
                "is_sample_data": True,
            }
            # Per-goal scores: upward trend plus Gaussian noise, clamped to [0, 100].
            for goal in range(1, 18):
                noisy = 50 + (year - 2000) * 1.2 + goal * 1.1 + np.random.normal(0, 5)
                record[f"goal_{goal}_score"] = max(0, min(100, noisy))
            rows.append(record)

    sample_df = pd.DataFrame(rows)
    sample_df.attrs['warning'] = "⚠️ SAMPLE DATA - This is demonstration data only and should not be used for actual analysis or decision-making."
    return sample_df
|
|
|
|
|
def get_country_list(df):
    """Return the sorted country names, preferring the latest year's rows."""
    if df is None or df.empty:
        return []
    try:
        latest = get_latest_data(df)
        if latest is not None and not latest.empty:
            return sorted(latest['country'].dropna().unique().tolist())
    except Exception:
        pass  # fall through to the full frame below
    return sorted(df['country'].dropna().unique().tolist())
|
|
|
|
|
def filter_data(df, country, year_range):
    """Return *country*'s rows within the inclusive year range, sorted by year."""
    if df is None or df.empty:
        return pd.DataFrame()

    start, end = year_range[0], year_range[1]
    # Series.between is inclusive at both ends, matching >= start and <= end.
    selected = df[(df['country'] == country) & df['year'].between(start, end)]
    return selected.sort_values('year')
|
|
|
|
|
def get_latest_data(df):
    """Return only the rows belonging to the most recent year in the frame."""
    if df is None or df.empty:
        return pd.DataFrame()
    return df.loc[df['year'] == df['year'].max()]
|
|
|
|
|
def get_data_summary(df):
    """Summarize the dataset: counts, year span, quality score and warnings."""
    if df is None or df.empty:
        return {
            'total_countries': 0,
            'total_records': 0,
            'year_range': None,
            'data_quality_score': 0,
            'has_sample_data': False,
            'warnings': ['No data available'],
        }

    has_sample = df['is_sample_data'].any() if 'is_sample_data' in df.columns else False
    summary = {
        'total_countries': df['country'].nunique(),
        'total_records': len(df),
        'year_range': (df['year'].min(), df['year'].max()),
        'data_quality_score': calculate_data_quality_score(df),
        'has_sample_data': has_sample,
        'warnings': [],
    }

    if has_sample:
        summary['warnings'].append('⚠️ This analysis uses sample data for demonstration purposes')

    # Flag datasets where more than 5% of all cells are missing.
    missing_pct = df.isnull().sum().sum() / (len(df) * len(df.columns)) * 100
    if missing_pct > 5:
        summary['warnings'].append(f'High missing data percentage: {missing_pct:.1f}%')

    return summary
|
|
|
|
|
def calculate_data_quality_score(df):
    """Compute a heuristic data-quality score in [0, 100].

    Penalties: 2 points per percent of missing cells, plus a flat 10-point
    penalty when the year span exceeds 30 years. Returns 0 for empty input.

    (Cleanup: the previous version carried a dead ``hasattr(score, 'iloc')``
    branch — ``score`` is always a plain float here — and a score-column scan
    that only emitted a debug log; both removed.)
    """
    if df is None or df.empty:
        return 0

    score = 100.0

    # Missing cells: each percent of missing data costs 2 points.
    total_cells = len(df) * len(df.columns)
    missing_pct = float(df.isnull().sum().sum() / total_cells * 100)
    score -= missing_pct * 2

    # Very long time spans get a flat penalty.
    if 'year' in df.columns:
        if int(df['year'].max() - df['year'].min()) > 30:
            score -= 10

    # NOTE: per-score-column outlier detection is intentionally not applied
    # here; score ranges are validated in validate_data_schema() instead.

    return max(0.0, min(100.0, float(score)))
|
|
|
|
|
def get_country_metrics(df, country, year):
    """
    Compute a metrics snapshot for one country in one year.

    Returns a dict with the country's score, global rank (ties share the
    best rank), the global average for that year, a 5-year trend, per-goal
    scores, and provenance fields — or None when the country/year is absent.
    """
    if df is None or df.empty:
        return None

    country_data = df[(df['country'] == country) & (df['year'] == year)]
    if country_data.empty:
        return None

    year_data = df[df['year'] == year]
    if year_data.empty:
        return None

    global_avg = year_data['sdg_index_score'].mean()

    # Rank within the requested year (method='min': ties get the best rank).
    ranks = year_data['sdg_index_score'].rank(ascending=False, method='min')
    country_rank = ranks[year_data['country'] == country].values[0]

    # 5-year trend: recent-window average minus the score at the START of the
    # window. Bug fix: the previous version subtracted the country's EARLIEST
    # record in the whole dataset (``trend_data...iloc[0]``), which made
    # 'trend_5yr' depend on how far back the data happened to go; it also
    # carried a dead ``len(recent_years) > 0`` check inside a branch that
    # already guaranteed ``len > 1``.
    country_history = df[df['country'] == country].sort_values('year')
    window = country_history[country_history['year'] >= (year - 5)]
    if len(window) > 1:
        trend = window['sdg_index_score'].mean() - window['sdg_index_score'].iloc[0]
    else:
        trend = 0

    # Per-goal scores, where the columns exist.
    goal_scores = {}
    for i in range(1, 18):
        goal_col = f'goal_{i}_score'
        if goal_col in country_data.columns:
            goal_scores[f'goal_{i}'] = country_data[goal_col].values[0]

    return {
        'score': country_data['sdg_index_score'].values[0],
        'rank': int(country_rank),
        'global_avg': global_avg,
        'country_count': len(year_data),
        'trend_5yr': trend,
        'goal_scores': goal_scores,
        'data_quality': 'high' if ('is_sample_data' in country_data.columns and not country_data['is_sample_data'].any()) else 'sample',
        'last_updated': country_data['last_updated'].values[0] if ('last_updated' in country_data.columns and len(country_data) > 0) else 'N/A',
    }
|
|
|