Spaces:

pcreem
/

sdgToPic

Running

sdgToPic / src /data_loader.py

Song

chore: use git lfs for large data files

b88006b about 2 months ago

37.4 kB

	import pandas as pd
	import numpy as np
	import os
	import logging
	from datetime import datetime
	from src.config_manager import get_config

	# Configure logging.
	# basicConfig() runs when this module is imported and sets up the root logger
	# with INFO level and a timestamped "time - level - message" format; per the
	# logging documentation it does nothing if the root logger already has handlers.
	logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
	# Module-level logger used by the functions in this file.
	logger = logging.getLogger(__name__)

	# Column mapping for standardizing column names across different data versions.
	# Lookups are exact (case-sensitive) string matches, so every spelling variant
	# that should be recognised is listed explicitly. Each key appears exactly once.
	COLUMN_MAPPING = {
	    # Country field variations
	    'Country': 'country',
	    'Country Name': 'country',
	    'country_name': 'country',
	    'nation': 'country',
	    'country': 'country',
	    'Nation': 'country',
	    'COUNTRY': 'country',

	    # Year variations
	    'Year': 'year',
	    'year_value': 'year',
	    'year': 'year',

	    # SDG Index Score variations
	    '2025 SDG Index Score': 'sdg_index_score',
	    'SDG Index Score': 'sdg_index_score',
	    'SDG Index': 'sdg_index_score',
	    'sdg_score': 'sdg_index_score',
	    'overall_score': 'sdg_index_score',
	    'index score': 'sdg_index_score',
	    'sdgi_s': 'sdg_index_score',
	    'SDGI_S': 'sdg_index_score',
	    'SDG_Score': 'sdg_index_score',
	    'sdg_index_score': 'sdg_index_score',
	    'SDG_Index_Score': 'sdg_index_score',
	    'Overall_Score': 'sdg_index_score',
	    'INDEX_SCORE': 'sdg_index_score',

	    # Individual goal score mappings: three spellings for each of goals 1-17
	    # ('Goal N Score', 'Goal N', 'sdg_N_score').
	    **{
	        alias: f'goal_{goal}_score'
	        for goal in range(1, 18)
	        for alias in (f'Goal {goal} Score', f'Goal {goal}', f'sdg_{goal}_score')
	    },

	    # Short name variations from backdated index ('goalN')
	    **{f'goal{goal}': f'goal_{goal}_score' for goal in range(1, 18)},

	    # Additional common mappings
	    'Country Code': 'country_code',
	    'country_code_alpha_3': 'country_code',
	    'iso3': 'country_code',
	}

	# Country name standardization.
	# Each entry pairs the canonical display name with the spellings that should be
	# rewritten to it (the canonical name itself may appear among its aliases).
	_COUNTRY_NAME_ALIASES = (
	    ('United States', ('United States of America', 'USA', 'US')),
	    ('United Kingdom', (
	        'United Kingdom of Great Britain and Northern Ireland',
	        'UK',
	        'Britain',
	        'Great Britain',
	    )),
	    ('Russia', ('Russian Federation', 'Russia')),
	    ('South Korea', ('Korea, Republic of', 'Republic of Korea', 'Korea (South)', 'South Korea')),
	    ('North Korea', ('Korea, Democratic People\'s Republic of', 'North Korea')),
	    ('Iran', ('Iran, Islamic Republic of', 'Islamic Republic of Iran')),
	    ('Venezuela', ('Venezuela, Bolivarian Republic of', 'Bolivarian Republic of Venezuela')),
	    ('Bolivia', ('Bolivia, Plurinational State of', 'Plurinational State of Bolivia')),
	    ('Tanzania', ('United Republic of Tanzania', 'Tanzania, United Republic of')),
	    ('Vietnam', ('Viet Nam', 'Vietnam')),
	    ('Moldova', ('Republic of Moldova', 'Moldova, Republic of')),
	    ('North Macedonia', (
	        'The former Yugoslav Republic of Macedonia',
	        'North Macedonia',
	        'Macedonia',
	    )),
	    ('Syria', ('Syrian Arab Republic', 'Syria')),
	    ('Ivory Coast', ('Côte d\'Ivoire', 'Ivory Coast')),
	    ('Democratic Republic of the Congo', ('Congo, Democratic Republic of the',)),
	    ('Republic of the Congo', ('Congo, Republic of the', 'Congo')),
	    ('Myanmar', ('Myanmar', 'Burma')),
	    ('Czech Republic', ('Czech Republic', 'Czechia')),
	    ('Eswatini', ('Eswatini', 'Swaziland')),
	    ('Cape Verde', ('Cabo Verde', 'Cape Verde')),
	    # Common variant for Taiwan used in some datasets
	    ('Taiwan', ('Chinese Taipei', 'Taipei, Chinese')),
	)

	# Flat alias -> canonical lookup used when normalizing the 'country' column.
	COUNTRY_NAME_MAPPING = {
	    alias: canonical
	    for canonical, aliases in _COUNTRY_NAME_ALIASES
	    for alias in aliases
	}

	def load_sdg_data(file_path=None):
	    """
	    Load SDG data from a CSV or Excel file, falling back to sample data.

	    Path resolution order:
	    1. ``file_path`` if given, otherwise the ``data_sources.primary_data_path``
	       value obtained from ``get_config()``.
	    2. If that path is missing or does not exist, ``data/SDR2025-data.xlsx``
	       located one directory above this module's directory.
	    3. If neither exists, the result of ``create_sample_data()``.

	    Files ending in ``.xlsx`` / ``.xls`` (any letter case) are read with
	    ``load_excel_data``; everything else is read with ``load_csv_data``.
	    Metadata columns are added with ``add_data_metadata`` before returning.

	    Args:
	        file_path: Optional path to the data file, as ``str`` or any
	            ``os.PathLike`` object (e.g. ``pathlib.Path``).

	    Returns:
	        The loaded DataFrame, or the sample DataFrame when no file is found,
	        the loader returns nothing usable, or any exception is raised.
	    """
	    try:
	        # Get file path from configuration if not provided
	        if file_path is None:
	            config = get_config()
	            file_path = config.get('data_sources.primary_data_path')

	        # Normalize path-like objects to str so the string methods below work;
	        # without this a pathlib.Path would raise and silently yield sample data.
	        if file_path is not None:
	            file_path = os.fspath(file_path)

	        # If no file_path provided or file not found, prefer SDR2025 file in data folder
	        if file_path is None or not os.path.exists(file_path):
	            repo_data_path = os.path.join(
	                os.path.dirname(os.path.dirname(__file__)), 'data', 'SDR2025-data.xlsx'
	            )
	            if os.path.exists(repo_data_path):
	                file_path = repo_data_path
	                logger.info(f"Using SDR2025 data at {repo_data_path}")
	            else:
	                if file_path and not os.path.exists(file_path):
	                    logger.warning(f"Configured file {file_path} not found. Falling back to sample data.")
	                else:
	                    logger.warning("No data file provided. Using sample data.")
	                return create_sample_data()

	        # Detect file type (case-insensitively) and load accordingly
	        if file_path.lower().endswith(('.xlsx', '.xls')):
	            df = load_excel_data(file_path)
	        else:
	            df = load_csv_data(file_path)

	        if df is None or df.empty:
	            logger.warning(f"No valid data loaded from {file_path}. Using sample data.")
	            return create_sample_data()

	        logger.info(f"Successfully loaded data from {file_path}: {len(df)} rows, {len(df.columns)} columns")

	        # Add metadata columns
	        return add_data_metadata(df, file_path)
	    except Exception as e:
	        # Top-level boundary: any failure degrades to sample data rather than crashing.
	        logger.error(f"Error loading data from {file_path}: {e}")
	        logger.info("Falling back to sample data")
	        return create_sample_data()

	def load_csv_data(file_path):
	    """
	    Load a CSV file and standardize its columns, country names and scores.

	    Steps: read with ``utf-8-sig`` (drops a leading BOM), clean column labels,
	    rename via ``COLUMN_MAPPING``, drop duplicate column names (first kept),
	    normalize countries via ``COUNTRY_NAME_MAPPING``, coerce ``year`` to int
	    (unparseable values become 0) and ``sdg_index_score`` to numeric, then
	    fill missing scores with ``handle_missing_values``.

	    Args:
	        file_path: Path of the CSV file to read.

	    Returns:
	        The cleaned DataFrame, or ``None`` if reading/cleaning raised.
	    """
	    try:
	        # Load data with BOM handling
	        df = pd.read_csv(file_path, encoding='utf-8-sig')

	        # Remove any leftover BOM and surrounding whitespace from column names
	        df.columns = [str(c).replace('\ufeff', '').strip() for c in df.columns]

	        # Apply column mapping
	        df = df.rename(columns=COLUMN_MAPPING)

	        # Deduplicate columns if any mappings caused duplicates (first one wins).
	        # After this every label is unique, so df[...] below is always a Series.
	        if df.columns.duplicated().any():
	            df = df.loc[:, ~df.columns.duplicated()]

	        # Handle Country Name Normalization
	        if 'country' in df.columns:
	            df['country'] = df['country'].replace(COUNTRY_NAME_MAPPING)

	        # Ensure data types
	        if 'year' in df.columns:
	            df['year'] = pd.to_numeric(df['year'], errors='coerce').fillna(0).astype(int)

	        if 'sdg_index_score' in df.columns:
	            df['sdg_index_score'] = pd.to_numeric(df['sdg_index_score'], errors='coerce')

	        # Missing value strategy: fill within each country, ordered by year
	        if 'country' in df.columns and 'year' in df.columns:
	            try:
	                df = df.sort_values(['country', 'year'])
	                df = handle_missing_values(df)
	            except Exception as e:
	                logger.warning(f"Error in missing value handling for CSV data: {e}")
	                # Fallback: forward- then backward-fill score columns, both
	                # strictly inside each country so values never leak between countries.
	                try:
	                    df = df.sort_values(['country', 'year'])
	                    for col in df.columns:
	                        if col.endswith('_score'):
	                            df[col] = df.groupby('country', group_keys=False)[col].apply(
	                                lambda x: x.ffill().bfill()
	                            )
	                except Exception as e2:
	                    logger.warning(f"Fallback missing value handling also failed: {e2}")

	        return df
	    except Exception as e:
	        logger.error(f"Error loading CSV data: {e}")
	        return None

	def load_excel_data(file_path):
	    """
	    Load SDG data from an Excel workbook laid out like the 2025 SDR file.

	    The sheets 'SDR2025 Data', 'Backdated SDG Index', 'Overview' and 'Data'
	    are read when present; a sheet is kept only if it has a ``country`` column
	    after ``apply_2025_excel_mapping``. A kept sheet without a ``year`` column
	    gets 2025 ('SDR2025 Data') or 2024 (any other sheet). When both
	    'SDR2025 Data' and 'Backdated SDG Index' are kept they are concatenated
	    and duplicate (country, year) rows are dropped, keeping the 'SDR2025 Data'
	    row; otherwise the first kept sheet in the order above is used.

	    Args:
	        file_path: Path of the Excel workbook.

	    Returns:
	        A DataFrame with normalized country names, integer ``year``, numeric
	        ``sdg_index_score`` (0 when no score column exists) and numeric
	        ``goal_*_score`` columns, or ``None`` if no usable sheet was found or
	        an error occurred.
	    """
	    sheet_priority = ('SDR2025 Data', 'Backdated SDG Index', 'Overview', 'Data')
	    try:
	        logger.info(f"Loading Excel file: {file_path}")

	        sheets_found = {}
	        # Open the workbook once and close it deterministically; every sheet is
	        # parsed from this handle instead of re-opening the file per sheet.
	        with pd.ExcelFile(file_path) as xlsx_file:
	            logger.info(f"Available sheets: {xlsx_file.sheet_names}")

	            for sheet_name in sheet_priority:
	                if sheet_name not in xlsx_file.sheet_names:
	                    continue
	                try:
	                    sheet_df = pd.read_excel(xlsx_file, sheet_name=sheet_name)
	                    # Basic cleaning
	                    sheet_df.columns = [str(col).strip() for col in sheet_df.columns]
	                    # Map columns
	                    sheet_df = apply_2025_excel_mapping(sheet_df)

	                    if 'country' in sheet_df.columns:
	                        if 'year' not in sheet_df.columns:
	                            # The main data sheet defaults to 2025, anything else to 2024
	                            sheet_df['year'] = 2025 if sheet_name == 'SDR2025 Data' else 2024

	                        sheets_found[sheet_name] = sheet_df
	                        logger.info(f"Loaded sheet '{sheet_name}' with {len(sheet_df)} rows")
	                except Exception as e:
	                    logger.warning(f"Failed to read sheet '{sheet_name}': {e}")

	        if not sheets_found:
	            logger.error("Could not find any valid data sheets in Excel file")
	            return None

	        # Combine sheets if we have both current and history
	        if 'SDR2025 Data' in sheets_found and 'Backdated SDG Index' in sheets_found:
	            logger.info("Merging current data (2025) with backdated index (2000-2024)")
	            df = pd.concat(
	                [sheets_found['Backdated SDG Index'], sheets_found['SDR2025 Data']],
	                ignore_index=True,
	            )
	            # Remove duplicated country-year entries (prefer 2025 sheet if overlap)
	            df = df.drop_duplicates(subset=['country', 'year'], keep='last')
	        else:
	            # Use whichever sheet we found first (priority order)
	            df = next(sheets_found[name] for name in sheet_priority if name in sheets_found)

	        # Clean column names
	        original_columns = list(df.columns)
	        df.columns = [str(col).strip() for col in df.columns]
	        logger.info(f"Cleaned column names from: {original_columns[:5]}{'...' if len(original_columns) > 5 else ''} to: {list(df.columns)[:5]}{'...' if len(df.columns) > 5 else ''}")

	        # Each sheet was already mapped when it was read; this second pass is
	        # kept as a safety net over the combined frame.
	        df = apply_2025_excel_mapping(df)
	        logger.info(f"After column mapping, available columns: {list(df.columns)[:10]}{'...' if len(df.columns) > 10 else ''}")

	        # Handle country name normalization
	        if 'country' in df.columns:
	            df['country'] = df['country'].replace(COUNTRY_NAME_MAPPING)

	        # Ensure required columns exist (add defaults if missing)
	        if 'year' not in df.columns:
	            df['year'] = 2025  # Default to 2025 if missing
	        elif isinstance(df['year'], pd.DataFrame):
	            # Duplicate 'year' labels: keep the first column
	            df['year'] = df['year'].iloc[:, 0]

	        # Convert year to numeric; unparseable values become 2025
	        df['year'] = pd.to_numeric(df['year'], errors='coerce').fillna(2025).astype(int)

	        # Handle SDG Index Score
	        if 'sdg_index_score' not in df.columns:
	            # Try to find alternative score columns
	            score_cols = [col for col in df.columns if 'score' in col.lower() and 'index' in col.lower()]
	            if score_cols:
	                # Use first matching column, handle if it's a DataFrame (duplicate names)
	                target_col = df[score_cols[0]]
	                if isinstance(target_col, pd.DataFrame):
	                    target_col = target_col.iloc[:, 0]
	                df['sdg_index_score'] = pd.to_numeric(target_col, errors='coerce')
	            else:
	                df['sdg_index_score'] = 0
	        else:
	            target_col = df['sdg_index_score']
	            if isinstance(target_col, pd.DataFrame):
	                target_col = target_col.iloc[:, 0]
	            df['sdg_index_score'] = pd.to_numeric(target_col, errors='coerce')

	        # Convert goal scores to numeric
	        goal_score_cols = [col for col in df.columns if col.startswith('goal_') and col.endswith('_score')]
	        for col in goal_score_cols:
	            target_col = df[col]
	            if isinstance(target_col, pd.DataFrame):
	                target_col = target_col.iloc[:, 0]
	            df[col] = pd.to_numeric(target_col, errors='coerce')

	        logger.info(f"Successfully processed Excel data: {len(df)} rows, {len(goal_score_cols)} goal score columns")

	        return df
	    except Exception as e:
	        logger.error(f"Error loading Excel data: {e}")
	        return None

	def _goal_score_column(token):
	    """Return ``'goal_<n>_score'`` if *token* is the text of a goal number 1-17, else ``None``."""
	    text = str(token).strip()
	    if not text.isdigit():
	        return None
	    try:
	        goal_num = int(text)
	    except ValueError:
	        # isdigit() also accepts characters int() cannot parse
	        return None
	    if 1 <= goal_num <= 17:
	        return f'goal_{goal_num}_score'
	    return None


	def apply_2025_excel_mapping(df):
	    """
	    Rename columns of a 2025-style SDR sheet to the module's standard names.

	    First applies a fixed table of exact-match renames, then renames any
	    remaining 'Goal <n> Score' or 'SDG<n>: <title>' column to
	    ``goal_<n>_score``. The dynamic step only fires when ``<n>`` is an integer
	    from 1 to 17; other look-alike labels are left untouched and a warning is
	    logged. Afterwards duplicate column names are dropped (first occurrence
	    kept) and columns whose label starts with 'Unnamed:' are removed.

	    Args:
	        df: DataFrame read from an Excel sheet.

	    Returns:
	        A DataFrame with the renamed, de-duplicated columns.
	    """
	    logger.info("Applying flexible 2025 Excel column mapping...")

	    # 2025 Excel specific mappings (exact, case-sensitive matches)
	    excel_mapping = {
	        # Country field variations
	        'Country': 'country',
	        'country': 'country',
	        'Country Name': 'country',
	        'country_name': 'country',
	        'Nation': 'country',
	        'nation': 'country',
	        # NOTE(review): these two keys map an ISO-code column onto 'country'; if a
	        # sheet has both a code column and a name column, the de-duplication
	        # below keeps whichever comes first - confirm this is intended.
	        'country code iso3': 'country',
	        'ISO3': 'country',

	        # SDG Index Score variations
	        '2025 SDG Index Score': 'sdg_index_score',
	        'SDG Index Score': 'sdg_index_score',
	        'sdg_index_score': 'sdg_index_score',
	        'SDG Index': 'sdg_index_score',
	        'index score': 'sdg_index_score',
	        'overall score': 'sdg_index_score',
	        'sdgi_s': 'sdg_index_score',
	        'SDGI_S': 'sdg_index_score',

	        # Additional metadata
	        '2025 SDG Index Rank': 'global_rank',
	        'SDG Index Rank': 'global_rank',
	        'Region': 'region',
	        'region': 'region',

	        # Goal mappings for SDR2025 Data sheet format (Goal X Score)
	        **{f'Goal {goal} Score': f'goal_{goal}_score' for goal in range(1, 18)},

	        # Variations from backdated index (goalX)
	        **{f'goal{goal}': f'goal_{goal}_score' for goal in range(1, 18)},

	        # Goal mappings for Overview sheet format (SDGX: Name)
	        'SDG1: No Poverty': 'goal_1_score',
	        'SDG2: No Hunger': 'goal_2_score',
	        'SDG3: Good Health and Well-Being': 'goal_3_score',
	        'SDG4: Quality Education': 'goal_4_score',
	        'SDG5: Gender Equality': 'goal_5_score',
	        'SDG6: Clean Water and Sanitation': 'goal_6_score',
	        'SDG7: Affordable and Clean Energy': 'goal_7_score',
	        'SDG8: Decent Work and Economic Growth': 'goal_8_score',
	        'SDG9: Industry, Innovation & Infrastructure': 'goal_9_score',
	        'SDG10: Reduced Inequalities': 'goal_10_score',
	        'SDG11: Sustainable Cities and Communities': 'goal_11_score',
	        'SDG12: Responsible Consumption & Production': 'goal_12_score',
	        'SDG13: Climate Action': 'goal_13_score',
	        'SDG14: Life Below Water': 'goal_14_score',
	        'SDG15: Life on Land': 'goal_15_score',
	        'SDG16: Peace, Justice and Strong Institutions': 'goal_16_score',
	        'SDG17: Partnerships for the Goals': 'goal_17_score',
	    }

	    # Apply the mapping - only for columns that exist
	    actual_mappings = {old: new for old, new in excel_mapping.items() if old in df.columns}

	    if actual_mappings:
	        df = df.rename(columns=actual_mappings)
	        logger.info(f"Successfully mapped columns: {list(actual_mappings.keys())}")
	    else:
	        logger.info("No columns found to map")

	    # Flexible goal mapping for labels the fixed table did not cover
	    dynamic_mappings = {}
	    for col in df.columns:
	        if not isinstance(col, str):
	            continue  # non-string labels cannot match either pattern

	        if col.startswith('Goal ') and col.endswith(' Score'):
	            # Text between 'Goal ' and ' Score' must be the goal number
	            new_col_name = _goal_score_column(col[len('Goal '):-len(' Score')])
	            if new_col_name is None:
	                logger.warning(f"Failed to parse goal column '{col}'")
	        elif col.startswith('SDG') and ':' in col:
	            # 'SDG1: No Poverty' -> text between 'SDG' and the first ':'
	            new_col_name = _goal_score_column(col.split(':')[0][len('SDG'):])
	            if new_col_name is None:
	                logger.warning(f"Failed to parse SDG column '{col}'")
	        else:
	            continue

	        if new_col_name and new_col_name != col:
	            # A clash with an existing target name is resolved by the
	            # de-duplication step below.
	            dynamic_mappings[col] = new_col_name
	            logger.info(f"Mapped: '{col}' -> '{new_col_name}'")

	    if dynamic_mappings:
	        df = df.rename(columns=dynamic_mappings)

	    # CRITICAL: Handle duplicate column names that might have been created
	    if df.columns.duplicated().any():
	        logger.warning(f"Duplicate columns detected after mapping: {df.columns[df.columns.duplicated()].unique().tolist()}")
	        # Keep the first occurrence of each column name
	        df = df.loc[:, ~df.columns.duplicated()]
	        logger.info("Successfully deduplicated columns by keeping the first occurrence.")

	    # Remove columns pandas auto-named 'Unnamed: N'
	    unnamed_cols = [col for col in df.columns if isinstance(col, str) and col.startswith('Unnamed:')]
	    if unnamed_cols:
	        df = df.drop(columns=unnamed_cols)
	        logger.info(f"Removed unnamed columns: {unnamed_cols}")

	    # Final column validation
	    remaining_cols = list(df.columns)
	    logger.info(f"Final columns after mapping: {remaining_cols[:10]}{'...' if len(remaining_cols) > 10 else ''}")

	    return df

	def validate_data_schema(df):
	    """
	    Validate the data schema and return validation results.

	    Only a missing core column (``country``, ``sdg_index_score``) makes the
	    data invalid; everything else is reported as a warning.

	    Args:
	        df: DataFrame to inspect (not modified).

	    Returns:
	        dict with keys ``valid`` (bool), ``errors`` (list of str), ``warnings``
	        (list of str), ``column_count``, ``row_count``, ``available_goals``
	        (number of ``goal_<n>_score`` columns present, n = 1..17) and
	        ``year_range`` ((min, max) as ints; (2025, 2025) when there is no
	        year column or it holds no numeric value).
	    """
	    # Core required columns (minimum needed for the application to work)
	    core_required = ['country', 'sdg_index_score']

	    # Optional goal columns (application can work without them)
	    goal_columns = [f'goal_{i}_score' for i in range(1, 18)]

	    errors = []
	    warnings = []

	    # Check core required columns
	    missing_core = [col for col in core_required if col not in df.columns]
	    if missing_core:
	        errors.append(f"Missing core required columns: {missing_core}")

	    # Check goal columns (warnings only, not errors)
	    missing_goals = [col for col in goal_columns if col not in df.columns]
	    if missing_goals:
	        warnings.append(f"Missing goal columns: {len(missing_goals)} out of 17 goals available")

	    # Check year column (optional but recommended)
	    year_range = (2025, 2025)
	    if 'year' not in df.columns:
	        warnings.append("Year column not found - will use default year 2025")
	    else:
	        # Coerce so empty, all-NaN or text year columns cannot raise below
	        years = pd.to_numeric(df['year'], errors='coerce').dropna()
	        if years.empty:
	            warnings.append("Year column contains no valid values - will use default year 2025")
	        else:
	            min_year, max_year = int(years.min()), int(years.max())
	            year_range = (min_year, max_year)
	            if min_year < 2000 or max_year > 2030:
	                warnings.append(f"Year range unusual: {min_year}-{max_year} (expected 2000-2030)")

	    # Check score ranges (0-100) for available score columns
	    score_columns = [
	        col for col in df.columns
	        if isinstance(col, str) and col.endswith('_score') and col != 'country_score'
	    ]
	    for col in score_columns:
	        values = pd.to_numeric(df[col], errors='coerce')
	        if values.isna().all():
	            warnings.append(f"Score column {col} contains only NaN values")
	            continue

	        min_score = values.min()
	        max_score = values.max()
	        if min_score < -10 or max_score > 110:  # Allow some tolerance
	            warnings.append(f"Score column {col} has unusual range: {min_score:.1f}-{max_score:.1f} (expected 0-100)")

	    # Check for duplicated records. With a year column one row per
	    # (country, year) is expected, so a country repeating across years is not
	    # a duplicate; without it each country should appear once.
	    if 'country' in df.columns:
	        if 'year' in df.columns:
	            duplicate_count = int(df.duplicated(subset=['country', 'year']).sum())
	            duplicate_label = 'country-year'
	        else:
	            duplicate_count = int(df['country'].duplicated().sum())
	            duplicate_label = 'country'
	        if duplicate_count > 0:
	            warnings.append(f"Found {duplicate_count} duplicate {duplicate_label} entries")

	    return {
	        'valid': len(errors) == 0,
	        'errors': errors,
	        'warnings': warnings,
	        'column_count': len(df.columns),
	        'row_count': len(df),
	        'available_goals': len([col for col in goal_columns if col in df.columns]),
	        'year_range': year_range,
	    }

	def normalize_country_names(df):
	    """
	    Rewrite known long-form country names in ``df['country']`` to short names
	    (e.g. "Taiwan, Province of China" becomes "Taiwan").

	    The column is updated on the passed-in DataFrame, which is also returned.
	    """
	    short_names = {
	        'Taiwan, Province of China': 'Taiwan',
	        'Korea, Republic of': 'South Korea',
	        'Korea (South)': 'South Korea',
	        'United States of America': 'United States',
	        'UK': 'United Kingdom',
	        'Russian Federation': 'Russia',
	        'Iran, Islamic Republic of': 'Iran',
	        'Venezuela, Bolivarian Republic of': 'Venezuela',
	    }

	    # No replacement value is itself a key, so one dictionary-based replace
	    # gives the same result as substituting the names one at a time.
	    df['country'] = df['country'].replace(short_names)
	    return df

	def handle_missing_values(df):
	    """
	    Fill gaps in every ``*_score`` column and in ``country_code``.

	    For each score column: coerce to numeric, forward- then backward-fill
	    inside each country, fill what is still missing with that year's median
	    (falling back to the column median, then 0), and finally replace any
	    remaining NaN with 0. ``country_code`` is forward/backward-filled inside
	    each country. A failure on one column is logged and that column is skipped.

	    The DataFrame is modified in place and returned.
	    """
	    def first_column(values):
	        # Duplicate labels make df[name] a DataFrame; use its first column.
	        return values.iloc[:, 0] if isinstance(values, pd.DataFrame) else values

	    def fill_both_ways(group):
	        return group.ffill().bfill()

	    for col in [name for name in df.columns if name.endswith('_score')]:
	        try:
	            df[col] = pd.to_numeric(first_column(df[col]), errors='coerce')

	            # Fill gaps using the same country's neighbouring rows
	            df[col] = df.groupby('country', group_keys=False)[col].apply(fill_both_ways)

	            def safe_fillna(x):
	                # Fill one year's values: year median, else column median, else 0.
	                try:
	                    median_val = x.median()
	                    if pd.notna(median_val) and isinstance(median_val, (int, float)):
	                        return x.fillna(median_val)
	                    overall_median = df[col].median()
	                    if pd.notna(overall_median):
	                        return x.fillna(overall_median)
	                    return x.fillna(0)
	                except Exception as e:
	                    logger.warning(f"Error in fillna for column {col}: {e}")
	                    return x.fillna(0)

	            df[col] = df.groupby('year', group_keys=False)[col].apply(safe_fillna)

	            # Last pass: keep the column numeric and zero-fill anything left
	            df[col] = pd.to_numeric(first_column(df[col]), errors='coerce').fillna(0)
	        except Exception as e:
	            logger.warning(f"Error processing column {col}: {e}")

	    if 'country_code' not in df.columns:
	        return df

	    try:
	        df['country_code'] = df.groupby('country', group_keys=False)['country_code'].apply(fill_both_ways)
	    except Exception as e:
	        logger.warning(f"Error handling country_code missing values: {e}")

	    return df

	def apply_data_quality_checks(df):
	    """
	    Clamp every ``*_score`` column to the 0-100 range and cast ``year`` to int.

	    The DataFrame is modified in place and returned.
	    """
	    score_names = tuple(name for name in df.columns if name.endswith('_score'))
	    for name in score_names:
	        # Values below 0 become 0, values above 100 become 100
	        df[name] = df[name].clip(lower=0, upper=100)

	    if 'year' not in df.columns:
	        return df

	    df['year'] = df['year'].astype(int)
	    return df

	def add_data_metadata(df, file_path):
	    """
	    Stamp *df* with provenance columns and return it.

	    Adds ``data_source`` (base name of *file_path*, or 'sample_data' when the
	    path is empty/None), ``last_updated`` (today's date as YYYY-MM-DD) and
	    ``is_sample_data`` (True only when *file_path* is None). The DataFrame is
	    modified in place.
	    """
	    if file_path:
	        source_label = os.path.basename(file_path)
	    else:
	        source_label = 'sample_data'

	    df['data_source'] = source_label
	    df['last_updated'] = datetime.now().strftime('%Y-%m-%d')
	    df['is_sample_data'] = file_path is None
	    return df

	def create_sample_data():
	    """
	    Build a synthetic demonstration dataset for use when no real data exists.

	    One row is produced per country (15 countries) and year (2000-2023) with
	    an overall index score, 17 randomly perturbed goal scores and metadata
	    columns flagging the rows as sample data. The returned DataFrame carries
	    a warning message in ``attrs['warning']``.
	    """
	    countries = (
	        "United States", "China", "Japan", "Germany", "United Kingdom",
	        "France", "Taiwan", "South Korea", "Canada", "Australia",
	        "Italy", "Spain", "Netherlands", "Sweden", "Norway",
	    )

	    # Multipliers applied to the base score; countries not listed use 1.0
	    country_factors = {
	        "Norway": 1.15, "Sweden": 1.15, "Denmark": 1.15,  # Nordic countries perform better
	        "Taiwan": 1.1, "South Korea": 1.1,                # East Asian countries perform well
	        "China": 0.9,                                     # Some variation
	    }

	    records = []
	    for country in countries:
	        country_factor = country_factors.get(country, 1.0)
	        for year in range(2000, 2024):
	            # Realistic-looking but synthetic: the base rises 0.8 points per year
	            base_score = 65 + (year - 2000) * 0.8

	            record = {
	                "country": country,
	                "country_code": f"{country[:3].upper()}",  # Simple country code
	                "year": year,
	                "sdg_index_score": min(100, base_score * country_factor),
	                "data_source": "sample_data",
	                "last_updated": datetime.now().strftime('%Y-%m-%d'),
	                "is_sample_data": True,
	            }

	            # One noisy score per goal, kept inside 0-100
	            for i in range(1, 18):
	                noisy = min(100, 50 + (year - 2000) * 1.2 + (i * 1.1) + np.random.normal(0, 5))
	                record[f"goal_{i}_score"] = max(0, noisy)

	            records.append(record)

	    sample_df = pd.DataFrame(records)

	    # Attach the sample-data warning to the frame's metadata
	    sample_df.attrs['warning'] = "⚠️ SAMPLE DATA - This is demonstration data only and should not be used for actual analysis or decision-making."

	    return sample_df

	def get_country_list(df):
	    """
	    Return the sorted, de-duplicated country names found in *df*.

	    Only countries present in the latest year (per ``get_latest_data``) are
	    listed, to avoid offering countries without recent data. If that lookup
	    fails or yields nothing, all countries in *df* are listed instead.

	    Args:
	        df: DataFrame with a ``country`` column, or ``None``.

	    Returns:
	        A sorted list of country names; ``[]`` when *df* is ``None`` or empty.
	    """
	    if df is None or df.empty:
	        return []
	    try:
	        latest_df = get_latest_data(df)
	        if latest_df is not None and not latest_df.empty:
	            return sorted(latest_df['country'].dropna().unique().tolist())
	    except Exception as e:
	        # Best-effort narrowing: fall back to all countries, but leave a trace
	        # instead of swallowing the failure silently.
	        logger.warning(f"Could not restrict country list to the latest year; listing all countries: {e}")
	    return sorted(df['country'].dropna().unique().tolist())

	def filter_data(df, country, year_range):
	    """
	    Return the rows of *df* for one country within an inclusive year window.

	    ``year_range[0]`` and ``year_range[1]`` are the first and last year kept.
	    The result is sorted by ``year``; an empty DataFrame is returned when *df*
	    is ``None`` or empty.
	    """
	    if df is None or df.empty:
	        return pd.DataFrame()

	    first_year = year_range[0]
	    last_year = year_range[1]

	    is_country = df['country'] == country
	    in_window = (df['year'] >= first_year) & (df['year'] <= last_year)
	    selected = df[is_country & in_window]
	    return selected.sort_values('year')

	def get_latest_data(df):
	    """Return the rows whose ``year`` equals the largest year in *df* (empty frame if *df* is None/empty)."""
	    if df is None or df.empty:
	        return pd.DataFrame()

	    newest_year = df['year'].max()
	    is_newest = df['year'] == newest_year
	    return df[is_newest]

	def get_data_summary(df):
	    """
	    Summarize *df*: country and record counts, (min, max) year, quality score
	    from ``calculate_data_quality_score``, a sample-data flag and warnings.

	    For a ``None`` or empty DataFrame a zeroed summary carrying the warning
	    'No data available' is returned.
	    """
	    no_data = df is None or df.empty
	    if no_data:
	        return {
	            'total_countries': 0,
	            'total_records': 0,
	            'year_range': None,
	            'data_quality_score': 0,
	            'has_sample_data': False,
	            'warnings': ['No data available'],
	        }

	    summary = {}
	    summary['total_countries'] = df['country'].nunique()
	    summary['total_records'] = len(df)
	    summary['year_range'] = (df['year'].min(), df['year'].max())
	    summary['data_quality_score'] = calculate_data_quality_score(df)
	    if 'is_sample_data' in df.columns:
	        summary['has_sample_data'] = df['is_sample_data'].any()
	    else:
	        summary['has_sample_data'] = False
	    summary['warnings'] = []

	    # Flag demonstration data
	    if summary['has_sample_data']:
	        summary['warnings'].append('⚠️ This analysis uses sample data for demonstration purposes')

	    # Flag a high share of missing cells (more than 5% of all cells)
	    total_cells = len(df) * len(df.columns)
	    missing_pct = df.isnull().sum().sum() / total_cells * 100
	    if missing_pct > 5:
	        summary['warnings'].append(f'High missing data percentage: {missing_pct:.1f}%')

	    return summary

	def calculate_data_quality_score(df):
	    """
	    Calculate a data quality score (0-100).

	    Starts at 100, subtracts 2 points per percent of missing cells and a
	    further 10 points when the ``year`` column spans more than 30 years, then
	    clamps the result to [0, 100]. A year column without any numeric value is
	    ignored rather than raising.

	    Args:
	        df: DataFrame to score, or ``None``.

	    Returns:
	        The score as a float; 0 when *df* is ``None`` or empty.
	    """
	    if df is None or df.empty:
	        return 0

	    score = 100.0

	    # Deduct for missing values: 2 points per percent missing
	    missing_pct = float(df.isnull().sum().sum() / (len(df) * len(df.columns)) * 100)
	    score -= missing_pct * 2

	    # Deduct for unusual year ranges (more than 30 years)
	    if 'year' in df.columns:
	        years = pd.to_numeric(df['year'], errors='coerce')
	        year_span = years.max() - years.min()
	        # year_span is NaN when no year is numeric; skip the check in that case
	        if pd.notna(year_span) and int(year_span) > 30:
	            score -= 10

	    # Skip outlier detection for now due to duplicate column issues
	    # TODO: Implement robust outlier detection that handles duplicate columns
	    score_columns = [col for col in df.columns if col.endswith('_score')]
	    if score_columns:
	        logger.debug(f"Found {len(score_columns)} score columns, skipping outlier detection")

	    return max(0.0, min(100.0, score))

	def get_country_metrics(df, country, year):
	    """
	    Get comprehensive metrics for a specific country in a specific year.

	    Args:
	        df: DataFrame with ``country``, ``year`` and ``sdg_index_score``
	            columns (``goal_<n>_score``, ``is_sample_data`` and
	            ``last_updated`` are used when present), or ``None``.
	        country: Country name to look up.
	        year: Year to look up.

	    Returns:
	        ``None`` when *df* is ``None``/empty or has no row for that country
	        and year; otherwise a dict with:
	        - ``score``: the country's ``sdg_index_score`` for *year*;
	        - ``rank``: its rank among that year's rows (1 = highest score, ties
	          share the lowest rank), or ``None`` if its score is missing;
	        - ``global_avg``: mean score over that year's rows;
	        - ``country_count``: number of rows for that year;
	        - ``trend_5yr``: latest minus earliest available score within
	          ``[year - 5, year]`` (0 with fewer than two scored rows);
	        - ``goal_scores``: ``{'goal_<n>': value}`` for the goal columns present;
	        - ``data_quality``: 'high' when ``is_sample_data`` exists and is false
	          for the matched rows, otherwise 'sample';
	        - ``last_updated``: the row's ``last_updated`` value or 'N/A'.
	    """
	    if df is None or df.empty:
	        return None

	    country_data = df[(df['country'] == country) & (df['year'] == year)]
	    if country_data.empty:
	        return None

	    # All rows for that year; non-empty because it contains country_data
	    year_data = df[df['year'] == year]
	    global_avg = year_data['sdg_index_score'].mean()

	    # Rank within the year (highest score first)
	    ranks = year_data['sdg_index_score'].rank(ascending=False, method='min')
	    country_rank = ranks[year_data['country'] == country].values[0]
	    # A missing score has no rank; report None instead of failing on int(NaN)
	    rank = int(country_rank) if pd.notna(country_rank) else None

	    # Trend over the last 5 years: restrict to [year - 5, year] and order by
	    # year so the result does not depend on row order or on later years.
	    in_window = (df['country'] == country) & (df['year'] >= year - 5) & (df['year'] <= year)
	    window_scores = df[in_window].sort_values('year')['sdg_index_score'].dropna()
	    if len(window_scores) > 1:
	        trend = window_scores.iloc[-1] - window_scores.iloc[0]
	    else:
	        trend = 0

	    # Goal performance breakdown
	    goal_scores = {}
	    for i in range(1, 18):
	        goal_col = f'goal_{i}_score'
	        if goal_col in country_data.columns:
	            goal_scores[f'goal_{i}'] = country_data[goal_col].values[0]

	    has_sample_flag = 'is_sample_data' in country_data.columns
	    is_real_data = has_sample_flag and not country_data['is_sample_data'].any()

	    return {
	        'score': country_data['sdg_index_score'].values[0],
	        'rank': rank,
	        'global_avg': global_avg,
	        'country_count': len(year_data),
	        'trend_5yr': trend,
	        'goal_scores': goal_scores,
	        'data_quality': 'high' if is_real_data else 'sample',
	        'last_updated': country_data['last_updated'].values[0] if 'last_updated' in country_data.columns else 'N/A',
	    }