sdgToPic / src /data_loader.py
Song
chore: use git lfs for large data files
b88006b
import pandas as pd
import numpy as np
import os
import logging
from datetime import datetime
from src.config_manager import get_config
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
# Column mapping for standardizing column names across different data versions
COLUMN_MAPPING = {
# Country field variations - comprehensive case-insensitive matching
'Country': 'country',
'Country Name': 'country',
'country_name': 'country',
'nation': 'country',
'country': 'country',
'Country Name': 'country',
'Nation': 'country',
'Country Name': 'country',
'Country': 'country',
'COUNTRY': 'country',
'country': 'country',
# Year variations
'Year': 'year',
'year_value': 'year',
'year': 'year',
# SDG Index Score variations - flexible matching
'2025 SDG Index Score': 'sdg_index_score',
'SDG Index Score': 'sdg_index_score',
'SDG Index': 'sdg_index_score',
'sdg_score': 'sdg_index_score',
'overall_score': 'sdg_index_score',
'index score': 'sdg_index_score',
'sdgi_s': 'sdg_index_score',
'SDGI_S': 'sdg_index_score',
'SDG_Score': 'sdg_index_score',
'sdg_index_score': 'sdg_index_score',
'SDG_Index_Score': 'sdg_index_score',
'Overall_Score': 'sdg_index_score',
'INDEX_SCORE': 'sdg_index_score',
# Individual goal score mappings
'Goal 1 Score': 'goal_1_score',
'Goal 1': 'goal_1_score',
'sdg_1_score': 'goal_1_score',
'Goal 2 Score': 'goal_2_score',
'Goal 2': 'goal_2_score',
'sdg_2_score': 'goal_2_score',
'Goal 3 Score': 'goal_3_score',
'Goal 3': 'goal_3_score',
'sdg_3_score': 'goal_3_score',
'Goal 4 Score': 'goal_4_score',
'Goal 4': 'goal_4_score',
'sdg_4_score': 'goal_4_score',
'Goal 5 Score': 'goal_5_score',
'Goal 5': 'goal_5_score',
'sdg_5_score': 'goal_5_score',
'Goal 6 Score': 'goal_6_score',
'Goal 6': 'goal_6_score',
'sdg_6_score': 'goal_6_score',
'Goal 7 Score': 'goal_7_score',
'Goal 7': 'goal_7_score',
'sdg_7_score': 'goal_7_score',
'Goal 8 Score': 'goal_8_score',
'Goal 8': 'goal_8_score',
'sdg_8_score': 'goal_8_score',
'Goal 9 Score': 'goal_9_score',
'Goal 9': 'goal_9_score',
'sdg_9_score': 'goal_9_score',
'Goal 10 Score': 'goal_10_score',
'Goal 10': 'goal_10_score',
'sdg_10_score': 'goal_10_score',
'Goal 11 Score': 'goal_11_score',
'Goal 11': 'goal_11_score',
'sdg_11_score': 'goal_11_score',
'Goal 12 Score': 'goal_12_score',
'Goal 12': 'goal_12_score',
'sdg_12_score': 'goal_12_score',
'Goal 13 Score': 'goal_13_score',
'Goal 13': 'goal_13_score',
'sdg_13_score': 'goal_13_score',
'Goal 14 Score': 'goal_14_score',
'Goal 14': 'goal_14_score',
'sdg_14_score': 'goal_14_score',
'Goal 15 Score': 'goal_15_score',
'Goal 15': 'goal_15_score',
'sdg_15_score': 'goal_15_score',
'Goal 16 Score': 'goal_16_score',
'Goal 16': 'goal_16_score',
'sdg_16_score': 'goal_16_score',
'Goal 17 Score': 'goal_17_score',
'Goal 17': 'goal_17_score',
'sdg_17_score': 'goal_17_score',
# Short name variations from backdated index
'goal1': 'goal_1_score',
'goal2': 'goal_2_score',
'goal3': 'goal_3_score',
'goal4': 'goal_4_score',
'goal5': 'goal_5_score',
'goal6': 'goal_6_score',
'goal7': 'goal_7_score',
'goal8': 'goal_8_score',
'goal9': 'goal_9_score',
'goal10': 'goal_10_score',
'goal11': 'goal_11_score',
'goal12': 'goal_12_score',
'goal13': 'goal_13_score',
'goal14': 'goal_14_score',
'goal15': 'goal_15_score',
'goal16': 'goal_16_score',
'goal17': 'goal_17_score',
# Additional common mappings
'Country Code': 'country_code',
'country_code_alpha_3': 'country_code',
'iso3': 'country_code'
}
# Country name mapping for standardizing country names
COUNTRY_NAME_MAPPING = {
# Common variations and alternative names
'United States of America': 'United States',
'USA': 'United States',
'US': 'United States',
'United Kingdom of Great Britain and Northern Ireland': 'United Kingdom',
'UK': 'United Kingdom',
'Britain': 'United Kingdom',
'Great Britain': 'United Kingdom',
'Russian Federation': 'Russia',
'Russia': 'Russia',
'Korea, Republic of': 'South Korea',
'Republic of Korea': 'South Korea',
'Korea (South)': 'South Korea',
'South Korea': 'South Korea',
'Korea, Democratic People\'s Republic of': 'North Korea',
'North Korea': 'North Korea',
'Iran, Islamic Republic of': 'Iran',
'Islamic Republic of Iran': 'Iran',
'Venezuela, Bolivarian Republic of': 'Venezuela',
'Bolivarian Republic of Venezuela': 'Venezuela',
'Bolivia, Plurinational State of': 'Bolivia',
'Plurinational State of Bolivia': 'Bolivia',
'United Republic of Tanzania': 'Tanzania',
'Tanzania, United Republic of': 'Tanzania',
'Viet Nam': 'Vietnam',
'Vietnam': 'Vietnam',
'Republic of Moldova': 'Moldova',
'Moldova, Republic of': 'Moldova',
'The former Yugoslav Republic of Macedonia': 'North Macedonia',
'North Macedonia': 'North Macedonia',
'Macedonia': 'North Macedonia',
'Syrian Arab Republic': 'Syria',
'Syria': 'Syria',
'Côte d\'Ivoire': 'Ivory Coast',
'Ivory Coast': 'Ivory Coast',
'Congo, Democratic Republic of the': 'Democratic Republic of the Congo',
'Congo, Republic of the': 'Republic of the Congo',
'Congo': 'Republic of the Congo',
'Myanmar': 'Myanmar',
'Burma': 'Myanmar',
'Czech Republic': 'Czech Republic',
'Czechia': 'Czech Republic',
'Eswatini': 'Eswatini',
'Swaziland': 'Eswatini',
'Cabo Verde': 'Cape Verde',
'Cape Verde': 'Cape Verde'
,
# Common variant for Taiwan used in some datasets
'Chinese Taipei': 'Taiwan',
'Taipei, Chinese': 'Taiwan'
}
def load_sdg_data(file_path=None):
"""
Load and clean SDG data (CSV or Excel) with comprehensive validation and error handling.
Features:
- Supports both CSV and Excel formats
- BOM handling (encoding='utf-8-sig')
- Missing value imputation
- Schema validation
- Country name normalization
- Data source tracking
"""
try:
# Get file path from configuration if not provided
if file_path is None:
config = get_config()
file_path = config.get('data_sources.primary_data_path')
# If no file_path provided or file not found, prefer SDR2025 file in data folder
if file_path is None or not os.path.exists(file_path):
# Try repository-relative data/SDR2025-data.xlsx
repo_data_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'data', 'SDR2025-data.xlsx')
if os.path.exists(repo_data_path):
file_path = repo_data_path
logger.info(f"Using SDR2025 data at {repo_data_path}")
else:
# Fallback to any configured path or sample data
if file_path and not os.path.exists(file_path):
logger.warning(f"Configured file {file_path} not found. Falling back to sample data.")
else:
logger.warning("No data file provided. Using sample data.")
return create_sample_data()
df = None
# Detect file type and load accordingly
if file_path.endswith('.xlsx') or file_path.endswith('.xls'):
# Handle Excel files
df = load_excel_data(file_path)
else:
# Handle CSV files
df = load_csv_data(file_path)
if df is None or df.empty:
logger.warning(f"No valid data loaded from {file_path}. Using sample data.")
return create_sample_data()
logger.info(f"Successfully loaded data from {file_path}: {len(df)} rows, {len(df.columns)} columns")
# Add metadata columns
df = add_data_metadata(df, file_path)
return df
except Exception as e:
logger.error(f"Error loading data from {file_path}: {e}")
logger.info("Falling back to sample data")
return create_sample_data()
def load_csv_data(file_path):
"""Load CSV data with proper encoding handling."""
try:
# Load data with BOM handling
df = pd.read_csv(file_path, encoding='utf-8-sig')
# Remove BOM from column names if present
df.columns = [c.replace('\ufeff', '') for c in df.columns]
# Basic cleaning: remove whitespace
df.columns = [c.strip() for c in df.columns]
# Apply column mapping
df = df.rename(columns=COLUMN_MAPPING)
# Deduplicate columns if any mappings caused duplicates
if df.columns.duplicated().any():
df = df.loc[:, ~df.columns.duplicated()]
# Handle Country Name Normalization
if 'country' in df.columns:
df['country'] = df['country'].replace(COUNTRY_NAME_MAPPING)
# Ensure data types and handle potential duplicate columns
if 'year' in df.columns:
target_year = df['year']
if isinstance(target_year, pd.DataFrame):
target_year = target_year.iloc[:, 0]
df['year'] = pd.to_numeric(target_year, errors='coerce').fillna(0).astype(int)
if 'sdg_index_score' in df.columns:
target_score = df['sdg_index_score']
if isinstance(target_score, pd.DataFrame):
target_score = target_score.iloc[:, 0]
df['sdg_index_score'] = pd.to_numeric(target_score, errors='coerce')
# Missing value strategy: Forward fill by country with improved error handling
if 'country' in df.columns and 'year' in df.columns:
try:
df = df.sort_values(['country', 'year'])
# Use the improved handle_missing_values function
df = handle_missing_values(df)
except Exception as e:
logger.warning(f"Error in missing value handling for CSV data: {e}")
# Fallback to simple forward fill
try:
df = df.sort_values(['country', 'year'])
for col in df.columns:
if col.endswith('_score'):
df[col] = df.groupby('country')[col].ffill().bfill()
except Exception as e2:
logger.warning(f"Fallback missing value handling also failed: {e2}")
return df
except Exception as e:
logger.error(f"Error loading CSV data: {e}")
return None
def load_excel_data(file_path):
"""Load and process Excel data with 2025 SDR structure and enhanced column detection."""
try:
logger.info(f"Loading Excel file: {file_path}")
# Load Excel file and find the right sheet
xlsx_file = pd.ExcelFile(file_path)
logger.info(f"Available sheets: {xlsx_file.sheet_names}")
df = None
# Try to load both current data and historical data if available
sheets_found = {}
for sheet_name in ['SDR2025 Data', 'Backdated SDG Index', 'Overview', 'Data']:
if sheet_name in xlsx_file.sheet_names:
try:
sheet_df = pd.read_excel(file_path, sheet_name=sheet_name)
# Basic cleaning
sheet_df.columns = [str(col).strip() for col in sheet_df.columns]
# Map columns
sheet_df = apply_2025_excel_mapping(sheet_df)
if 'country' in sheet_df.columns:
# Ensure year
if 'year' not in sheet_df.columns:
# If it's the main data sheet, default to 2025
if sheet_name == 'SDR2025 Data':
sheet_df['year'] = 2025
else:
sheet_df['year'] = 2024 # Fallback
sheets_found[sheet_name] = sheet_df
logger.info(f"Loaded sheet '{sheet_name}' with {len(sheet_df)} rows")
except Exception as e:
logger.warning(f"Failed to read sheet '{sheet_name}': {e}")
if not sheets_found:
logger.error("Could not find any valid data sheets in Excel file")
return None
# Combine sheets if we have both current and history
if 'SDR2025 Data' in sheets_found and 'Backdated SDG Index' in sheets_found:
logger.info("Merging current data (2025) with backdated index (2000-2024)")
df = pd.concat([sheets_found['Backdated SDG Index'], sheets_found['SDR2025 Data']], ignore_index=True)
# Remove duplicated country-year entries (prefer 2025 sheet if overlap)
df = df.drop_duplicates(subset=['country', 'year'], keep='last')
else:
# Revert to whichever sheet we found first (priority order)
for name in ['SDR2025 Data', 'Backdated SDG Index', 'Overview', 'Data']:
if name in sheets_found:
df = sheets_found[name]
break
# Clean column names
original_columns = list(df.columns)
df.columns = [str(col).strip() for col in df.columns]
logger.info(f"Cleaned column names from: {original_columns[:5]}{'...' if len(original_columns) > 5 else ''} to: {list(df.columns)[:5]}{'...' if len(df.columns) > 5 else ''}")
# Apply 2025 Excel-specific column mapping
df = apply_2025_excel_mapping(df)
logger.info(f"After column mapping, available columns: {list(df.columns)[:10]}{'...' if len(df.columns) > 10 else ''}")
# Handle country name normalization
if 'country' in df.columns:
df['country'] = df['country'].replace(COUNTRY_NAME_MAPPING)
# Ensure required columns exist (add defaults if missing)
if 'year' not in df.columns:
df['year'] = 2025 # Default to 2025 if missing
elif isinstance(df['year'], pd.DataFrame):
df['year'] = df['year'].iloc[:, 0]
# Convert year to numeric
df['year'] = pd.to_numeric(df['year'], errors='coerce').fillna(2025).astype(int)
# Handle SDG Index Score
if 'sdg_index_score' not in df.columns:
# Try to find alternative score columns
score_cols = [col for col in df.columns if 'score' in col.lower() and 'index' in col.lower()]
if score_cols:
# Use first matching column, handle if it's a DataFrame (duplicate names)
target_col = df[score_cols[0]]
if isinstance(target_col, pd.DataFrame):
target_col = target_col.iloc[:, 0]
df['sdg_index_score'] = pd.to_numeric(target_col, errors='coerce')
else:
df['sdg_index_score'] = 0
else:
# Handle if it's a DataFrame
target_col = df['sdg_index_score']
if isinstance(target_col, pd.DataFrame):
target_col = target_col.iloc[:, 0]
df['sdg_index_score'] = pd.to_numeric(target_col, errors='coerce')
# Convert goal scores to numeric
goal_score_cols = [col for col in df.columns if col.startswith('goal_') and col.endswith('_score')]
for col in goal_score_cols:
target_col = df[col]
if isinstance(target_col, pd.DataFrame):
target_col = target_col.iloc[:, 0]
df[col] = pd.to_numeric(target_col, errors='coerce')
logger.info(f"Successfully processed Excel data: {len(df)} rows, {len(goal_score_cols)} goal score columns")
return df
except Exception as e:
logger.error(f"Error loading Excel data: {e}")
return None
def apply_2025_excel_mapping(df):
"""Apply flexible column mapping for 2025 Excel file structure with comprehensive logging."""
logger.info("Applying flexible 2025 Excel column mapping...")
# 2025 Excel specific mappings - comprehensive and flexible
excel_mapping = {
# Country field variations
'Country': 'country',
'country': 'country',
'Country Name': 'country',
'country_name': 'country',
'Nation': 'country',
'nation': 'country',
'country code iso3': 'country',
'ISO3': 'country',
# SDG Index Score variations - very flexible matching
'2025 SDG Index Score': 'sdg_index_score',
'SDG Index Score': 'sdg_index_score',
'sdg_index_score': 'sdg_index_score',
'SDG Index': 'sdg_index_score',
'index score': 'sdg_index_score',
'overall score': 'sdg_index_score',
'sdgi_s': 'sdg_index_score',
'SDGI_S': 'sdg_index_score',
# Additional metadata
'2025 SDG Index Rank': 'global_rank',
'SDG Index Rank': 'global_rank',
'Region': 'region',
'region': 'region',
# Goal mappings for SDR2025 Data sheet format (Goal X Score)
'Goal 1 Score': 'goal_1_score',
'Goal 2 Score': 'goal_2_score',
'Goal 3 Score': 'goal_3_score',
'Goal 4 Score': 'goal_4_score',
'Goal 5 Score': 'goal_5_score',
'Goal 6 Score': 'goal_6_score',
'Goal 7 Score': 'goal_7_score',
'Goal 8 Score': 'goal_8_score',
'Goal 9 Score': 'goal_9_score',
'Goal 10 Score': 'goal_10_score',
'Goal 11 Score': 'goal_11_score',
'Goal 12 Score': 'goal_12_score',
'Goal 13 Score': 'goal_13_score',
'Goal 14 Score': 'goal_14_score',
'Goal 15 Score': 'goal_15_score',
'Goal 16 Score': 'goal_16_score',
'Goal 17 Score': 'goal_17_score',
# Variations from backdated index
'goal1': 'goal_1_score',
'goal2': 'goal_2_score',
'goal3': 'goal_3_score',
'goal4': 'goal_4_score',
'goal5': 'goal_5_score',
'goal6': 'goal_6_score',
'goal7': 'goal_7_score',
'goal8': 'goal_8_score',
'goal9': 'goal_9_score',
'goal10': 'goal_10_score',
'goal11': 'goal_11_score',
'goal12': 'goal_12_score',
'goal13': 'goal_13_score',
'goal14': 'goal_14_score',
'goal15': 'goal_15_score',
'goal16': 'goal_16_score',
'goal17': 'goal_17_score',
# Goal mappings for Overview sheet format (SDGX: Name)
'SDG1: No Poverty': 'goal_1_score',
'SDG2: No Hunger': 'goal_2_score',
'SDG3: Good Health and Well-Being': 'goal_3_score',
'SDG4: Quality Education': 'goal_4_score',
'SDG5: Gender Equality': 'goal_5_score',
'SDG6: Clean Water and Sanitation': 'goal_6_score',
'SDG7: Affordable and Clean Energy': 'goal_7_score',
'SDG8: Decent Work and Economic Growth': 'goal_8_score',
'SDG9: Industry, Innovation & Infrastructure': 'goal_9_score',
'SDG10: Reduced Inequalities': 'goal_10_score',
'SDG11: Sustainable Cities and Communities': 'goal_11_score',
'SDG12: Responsible Consumption & Production': 'goal_12_score',
'SDG13: Climate Action': 'goal_13_score',
'SDG14: Life Below Water': 'goal_14_score',
'SDG15: Life on Land': 'goal_15_score',
'SDG16: Peace, Justice and Strong Institutions': 'goal_16_score',
'SDG17: Partnerships for the Goals': 'goal_17_score',
}
# Apply the mapping - only for columns that exist
actual_mappings = {old: new for old, new in excel_mapping.items() if old in df.columns}
if actual_mappings:
# Before renaming, check if any of the target names already exist
# and would cause duplicates that aren't intended
df = df.rename(columns=actual_mappings)
logger.info(f"Successfully mapped columns: {list(actual_mappings.keys())}")
else:
logger.info("No columns found to map")
# Flexible Goal Score mapping - handle 'Goal X Score' format dynamically
for col in list(df.columns):
new_col_name = None
# Match patterns like 'Goal 1 Score', 'Goal 2 Score', etc.
if col.startswith('Goal ') and col.endswith(' Score'):
try:
goal_num = col.split(' ')[1] # Extract number from 'Goal X Score'
new_col_name = f'goal_{goal_num}_score'
except Exception as e:
logger.warning(f"Failed to parse goal column '{col}': {e}")
# Match SDG naming patterns like 'SDG1: No Poverty'
elif col.startswith('SDG') and ':' in col:
try:
goal_part = col.split(':')[0] # Get 'SDG1' from 'SDG1: No Poverty'
goal_num = goal_part.replace('SDG', '') # Remove 'SDG' prefix
new_col_name = f'goal_{goal_num}_score'
except Exception as e:
logger.warning(f"Failed to parse SDG column '{col}': {e}")
if new_col_name and new_col_name != col:
# If target column already exists, we might want to consolidate or just drop this one
# For now, we rename and then we will deduplicate
df = df.rename(columns={col: new_col_name})
logger.info(f"Mapped: '{col}' -> '{new_col_name}'")
# CRITICAL: Handle duplicate column names that might have been created
if df.columns.duplicated().any():
logger.warning(f"Duplicate columns detected after mapping: {df.columns[df.columns.duplicated()].unique().tolist()}")
# Keep the first occurrence of each column name
df = df.loc[:, ~df.columns.duplicated()]
logger.info("Successfully deduplicated columns by keeping the first occurrence.")
# Remove unnamed columns (these are usually status indicators)
unnamed_cols = [col for col in df.columns if col.startswith('Unnamed:')]
if unnamed_cols:
df = df.drop(columns=unnamed_cols)
logger.info(f"Removed unnamed columns: {unnamed_cols}")
# Final column validation
remaining_cols = list(df.columns)
logger.info(f"Final columns after mapping: {remaining_cols[:10]}{'...' if len(remaining_cols) > 10 else ''}")
return df
def validate_data_schema(df):
"""
Validate the data schema and return validation results.
Updated to be more flexible for 2025 Excel data.
"""
# Core required columns (minimum needed for the application to work)
core_required = ['country', 'sdg_index_score']
# Optional goal columns (application can work without them)
goal_columns = [f'goal_{i}_score' for i in range(1, 18)]
errors = []
warnings = []
# Check core required columns
missing_core = [col for col in core_required if col not in df.columns]
if missing_core:
errors.append(f"Missing core required columns: {missing_core}")
# Check goal columns (warnings only, not errors)
missing_goals = [col for col in goal_columns if col not in df.columns]
if missing_goals:
warnings.append(f"Missing goal columns: {len(missing_goals)} out of 17 goals available")
# Check year column (optional but recommended)
if 'year' not in df.columns:
warnings.append("Year column not found - will use default year 2025")
else:
min_year = df['year'].min()
max_year = df['year'].max()
if min_year < 2000 or max_year > 2030:
warnings.append(f"Year range unusual: {min_year}-{max_year} (expected 2000-2030)")
# Check score ranges (0-100) for available score columns
score_columns = [col for col in df.columns if col.endswith('_score') and col != 'country_score']
for col in score_columns:
if col in df.columns:
# Skip if all values are NaN
if df[col].isna().all():
warnings.append(f"Score column {col} contains only NaN values")
continue
min_score = df[col].min()
max_score = df[col].max()
if pd.notna(min_score) and pd.notna(max_score):
if min_score < -10 or max_score > 110: # Allow some tolerance
warnings.append(f"Score column {col} has unusual range: {min_score:.1f}-{max_score:.1f} (expected 0-100)")
# Check data completeness
if 'country' in df.columns:
duplicate_countries = df['country'].duplicated().sum()
if duplicate_countries > 0:
warnings.append(f"Found {duplicate_countries} duplicate country entries")
# Overall validation result
is_valid = len(errors) == 0
return {
'valid': is_valid,
'errors': errors,
'warnings': warnings,
'column_count': len(df.columns),
'row_count': len(df),
'available_goals': len([col for col in goal_columns if col in df.columns]),
'year_range': (int(df['year'].min()), int(df['year'].max())) if 'year' in df.columns else (2025, 2025)
}
def normalize_country_names(df):
"""
Normalize country names to handle variations like "Taiwan" vs "Taiwan, Province of China".
"""
# Country name mapping for normalization
country_mappings = {
'Taiwan, Province of China': 'Taiwan',
'Korea, Republic of': 'South Korea',
'Korea (South)': 'South Korea',
'United States of America': 'United States',
'UK': 'United Kingdom',
'Russian Federation': 'Russia',
'Iran, Islamic Republic of': 'Iran',
'Venezuela, Bolivarian Republic of': 'Venezuela'
}
# Apply mappings
for old_name, new_name in country_mappings.items():
df['country'] = df['country'].replace(old_name, new_name)
return df
def handle_missing_values(df):
"""
Handle missing values with appropriate strategies for different column types.
"""
# For score columns, use forward fill then backward fill
score_columns = [col for col in df.columns if col.endswith('_score')]
for col in score_columns:
if col in df.columns:
try:
# Ensure column is numeric and handle if it's a DataFrame
target_col = df[col]
if isinstance(target_col, pd.DataFrame):
target_col = target_col.iloc[:, 0]
df[col] = pd.to_numeric(target_col, errors='coerce')
# Forward fill within country groups
df[col] = df.groupby('country', group_keys=False)[col].apply(lambda x: x.ffill().bfill())
# If still NaN, use global median for that year with better error handling
def safe_fillna(x):
try:
median_val = x.median()
# Check if median is valid (not NaN and is numeric)
if pd.notna(median_val) and isinstance(median_val, (int, float)):
return x.fillna(median_val)
else:
# If median is invalid, use overall column median
overall_median = df[col].median()
if pd.notna(overall_median):
return x.fillna(overall_median)
else:
# If still no valid median, use 0
return x.fillna(0)
except Exception as e:
logger.warning(f"Error in fillna for column {col}: {e}")
return x.fillna(0)
df[col] = df.groupby('year', group_keys=False)[col].apply(safe_fillna)
# Ensure column is numeric and handle if it's a DataFrame
target_col = df[col]
if isinstance(target_col, pd.DataFrame):
target_col = target_col.iloc[:, 0]
df[col] = pd.to_numeric(target_col, errors='coerce').fillna(0) # Fill final NaNs with 0
except Exception as e:
logger.warning(f"Error processing column {col}: {e}")
# For country_code, forward fill within country
if 'country_code' in df.columns:
try:
df['country_code'] = df.groupby('country', group_keys=False)['country_code'].apply(lambda x: x.ffill().bfill())
except Exception as e:
logger.warning(f"Error handling country_code missing values: {e}")
return df
def apply_data_quality_checks(df):
"""
Apply data quality checks and clean anomalous values.
"""
# Score columns should be between 0 and 100
score_columns = [col for col in df.columns if col.endswith('_score')]
for col in score_columns:
if col in df.columns:
# Cap scores at 0-100 range
df[col] = df[col].clip(lower=0, upper=100)
# Year should be integer
if 'year' in df.columns:
df['year'] = df['year'].astype(int)
return df
def add_data_metadata(df, file_path):
"""
Add metadata columns to track data source and quality.
"""
# Add data source information
df['data_source'] = os.path.basename(file_path) if file_path else 'sample_data'
df['last_updated'] = datetime.now().strftime('%Y-%m-%d')
df['is_sample_data'] = file_path is None
return df
def create_sample_data():
"""
Create sample data for demonstration when real data is not available.
This data is clearly marked as sample data.
"""
countries = [
"United States", "China", "Japan", "Germany", "United Kingdom",
"France", "Taiwan", "South Korea", "Canada", "Australia",
"Italy", "Spain", "Netherlands", "Sweden", "Norway"
]
data = []
for country in countries:
for year in range(2000, 2024):
# Create realistic but synthetic data
base_score = 65 + (year - 2000) * 0.8
# Add some country-specific variations
country_factor = 1.0
if country in ["Norway", "Sweden", "Denmark"]:
country_factor = 1.15 # Nordic countries perform better
elif country in ["Taiwan", "South Korea"]:
country_factor = 1.1 # East Asian countries perform well
elif country in ["China"]:
country_factor = 0.9 # Some variation
final_score = min(100, base_score * country_factor)
row = {
"country": country,
"country_code": f"{country[:3].upper()}", # Simple country code
"year": year,
"sdg_index_score": final_score,
"data_source": "sample_data",
"last_updated": datetime.now().strftime('%Y-%m-%d'),
"is_sample_data": True
}
# Generate individual goal scores
for i in range(1, 18):
goal_score = min(100, 50 + (year - 2000) * 1.2 + (i * 1.1) + np.random.normal(0, 5))
row[f"goal_{i}_score"] = max(0, goal_score)
data.append(row)
sample_df = pd.DataFrame(data)
# Add warning in the dataframe name
sample_df.attrs['warning'] = "⚠️ SAMPLE DATA - This is demonstration data only and should not be used for actual analysis or decision-making."
return sample_df
def get_country_list(df):
"""Get sorted list of countries with normalization."""
if df is None or df.empty:
return []
try:
# Return only countries present in the latest year to avoid listing countries without recent data
latest_df = get_latest_data(df)
if latest_df is not None and not latest_df.empty:
return sorted(latest_df['country'].dropna().unique().tolist())
except Exception:
# Fallback to all countries if something unexpected happens
pass
return sorted(df['country'].dropna().unique().tolist())
def filter_data(df, country, year_range):
"""Filter data by country and year range."""
if df is None or df.empty:
return pd.DataFrame()
mask = (df['country'] == country) & (df['year'] >= year_range[0]) & (df['year'] <= year_range[1])
return df[mask].sort_values('year')
def get_latest_data(df):
"""Returns the latest year's data for all countries."""
if df is None or df.empty:
return pd.DataFrame()
latest_year = df['year'].max()
return df[df['year'] == latest_year]
def get_data_summary(df):
"""Get comprehensive data summary including quality metrics."""
if df is None or df.empty:
return {
'total_countries': 0,
'total_records': 0,
'year_range': None,
'data_quality_score': 0,
'has_sample_data': False,
'warnings': ['No data available']
}
summary = {
'total_countries': df['country'].nunique(),
'total_records': len(df),
'year_range': (df['year'].min(), df['year'].max()),
'data_quality_score': calculate_data_quality_score(df),
'has_sample_data': df['is_sample_data'].any() if 'is_sample_data' in df.columns else False,
'warnings': []
}
# Add warnings for sample data
if summary['has_sample_data']:
summary['warnings'].append('⚠️ This analysis uses sample data for demonstration purposes')
# Check for data quality issues
missing_pct = df.isnull().sum().sum() / (len(df) * len(df.columns)) * 100
if missing_pct > 5:
summary['warnings'].append(f'High missing data percentage: {missing_pct:.1f}%')
return summary
def calculate_data_quality_score(df):
"""Calculate a data quality score (0-100)."""
if df is None or df.empty:
return 0
score = 100.0 # Ensure float
# Deduct for missing values
missing_pct = float(df.isnull().sum().sum() / (len(df) * len(df.columns)) * 100)
score -= missing_pct * 2 # 2 points per percent missing
# Deduct for unusual year ranges
if 'year' in df.columns:
year_range = int(df['year'].max() - df['year'].min())
if year_range > 30: # More than 30 years seems unusual
score -= 10
# Skip outlier detection for now due to duplicate column issues
# This will be fixed in a future update
# TODO: Implement robust outlier detection that handles duplicate columns
score_columns = [col for col in df.columns if col.endswith('_score')]
if score_columns:
logger.debug(f"Found {len(score_columns)} score columns, skipping outlier detection")
# Ensure score is a scalar value
if hasattr(score, 'iloc'): # If it's a pandas Series/DataFrame
score = float(score.iloc[0] if len(score) > 0 else 0)
return max(0.0, min(100.0, float(score)))
def get_country_metrics(df, country, year):
"""
Get comprehensive metrics for a specific country in a specific year.
"""
if df is None or df.empty:
return None
country_data = df[(df['country'] == country) & (df['year'] == year)]
if country_data.empty:
return None
# Calculate global average for that year
year_data = df[df['year'] == year]
if year_data.empty:
return None
global_avg = year_data['sdg_index_score'].mean()
# Get rank
year_data_with_rank = year_data.copy()
year_data_with_rank['rank'] = year_data_with_rank['sdg_index_score'].rank(ascending=False, method='min')
country_rank = year_data_with_rank[year_data_with_rank['country'] == country]['rank'].values[0]
# Calculate trend (improvement over last 5 years)
trend_data = df[df['country'] == country]
recent_years = trend_data[trend_data['year'] >= (year - 5)]
if len(recent_years) > 1:
recent_avg = recent_years['sdg_index_score'].mean()
trend = recent_avg - trend_data['sdg_index_score'].iloc[0] if len(recent_years) > 0 else 0
else:
trend = 0
# Goal performance breakdown
goal_scores = {}
for i in range(1, 18):
goal_col = f'goal_{i}_score'
if goal_col in country_data.columns:
goal_scores[f'goal_{i}'] = country_data[goal_col].values[0]
metrics = {
'score': country_data['sdg_index_score'].values[0],
'rank': int(country_rank),
'global_avg': global_avg,
'country_count': len(year_data),
'trend_5yr': trend,
'goal_scores': goal_scores,
'data_quality': 'high' if ('is_sample_data' in country_data.columns and not country_data['is_sample_data'].any()) else 'sample',
'last_updated': country_data['last_updated'].values[0] if ('last_updated' in country_data.columns and len(country_data) > 0) else 'N/A'
}
return metrics