Spaces:

pcreem
/

sdgToPic

Running

File size: 37,447 Bytes

import pandas as pd
import numpy as np
import os
import logging
from datetime import datetime
from src.config_manager import get_config

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Column mapping for standardizing column names across different data versions
COLUMN_MAPPING = {
    # Country field variations - comprehensive case-insensitive matching
    'Country': 'country',
    'Country Name': 'country',
    'country_name': 'country',
    'nation': 'country',
    'country': 'country',
    'Country Name': 'country',
    'Nation': 'country',
    'Country Name': 'country',
    'Country': 'country',
    'COUNTRY': 'country',
    'country': 'country',
    
    # Year variations
    'Year': 'year',
    'year_value': 'year',
    'year': 'year',
    
    # SDG Index Score variations - flexible matching
    '2025 SDG Index Score': 'sdg_index_score',
    'SDG Index Score': 'sdg_index_score',
    'SDG Index': 'sdg_index_score',
    'sdg_score': 'sdg_index_score',
    'overall_score': 'sdg_index_score',
    'index score': 'sdg_index_score',
    'sdgi_s': 'sdg_index_score',
    'SDGI_S': 'sdg_index_score',
    'SDG_Score': 'sdg_index_score',
    'sdg_index_score': 'sdg_index_score',
    'SDG_Index_Score': 'sdg_index_score',
    'Overall_Score': 'sdg_index_score',
    'INDEX_SCORE': 'sdg_index_score',
    
    # Individual goal score mappings
    'Goal 1 Score': 'goal_1_score',
    'Goal 1': 'goal_1_score',
    'sdg_1_score': 'goal_1_score',
    
    'Goal 2 Score': 'goal_2_score',
    'Goal 2': 'goal_2_score',
    'sdg_2_score': 'goal_2_score',
    
    'Goal 3 Score': 'goal_3_score',
    'Goal 3': 'goal_3_score',
    'sdg_3_score': 'goal_3_score',
    
    'Goal 4 Score': 'goal_4_score',
    'Goal 4': 'goal_4_score',
    'sdg_4_score': 'goal_4_score',
    
    'Goal 5 Score': 'goal_5_score',
    'Goal 5': 'goal_5_score',
    'sdg_5_score': 'goal_5_score',
    
    'Goal 6 Score': 'goal_6_score',
    'Goal 6': 'goal_6_score',
    'sdg_6_score': 'goal_6_score',
    
    'Goal 7 Score': 'goal_7_score',
    'Goal 7': 'goal_7_score',
    'sdg_7_score': 'goal_7_score',
    
    'Goal 8 Score': 'goal_8_score',
    'Goal 8': 'goal_8_score',
    'sdg_8_score': 'goal_8_score',
    
    'Goal 9 Score': 'goal_9_score',
    'Goal 9': 'goal_9_score',
    'sdg_9_score': 'goal_9_score',
    
    'Goal 10 Score': 'goal_10_score',
    'Goal 10': 'goal_10_score',
    'sdg_10_score': 'goal_10_score',
    
    'Goal 11 Score': 'goal_11_score',
    'Goal 11': 'goal_11_score',
    'sdg_11_score': 'goal_11_score',
    
    'Goal 12 Score': 'goal_12_score',
    'Goal 12': 'goal_12_score',
    'sdg_12_score': 'goal_12_score',
    
    'Goal 13 Score': 'goal_13_score',
    'Goal 13': 'goal_13_score',
    'sdg_13_score': 'goal_13_score',
    
    'Goal 14 Score': 'goal_14_score',
    'Goal 14': 'goal_14_score',
    'sdg_14_score': 'goal_14_score',
    
    'Goal 15 Score': 'goal_15_score',
    'Goal 15': 'goal_15_score',
    'sdg_15_score': 'goal_15_score',
    
    'Goal 16 Score': 'goal_16_score',
    'Goal 16': 'goal_16_score',
    'sdg_16_score': 'goal_16_score',
    
    'Goal 17 Score': 'goal_17_score',
    'Goal 17': 'goal_17_score',
    'sdg_17_score': 'goal_17_score',
    
    # Short name variations from backdated index
    'goal1': 'goal_1_score',
    'goal2': 'goal_2_score',
    'goal3': 'goal_3_score',
    'goal4': 'goal_4_score',
    'goal5': 'goal_5_score',
    'goal6': 'goal_6_score',
    'goal7': 'goal_7_score',
    'goal8': 'goal_8_score',
    'goal9': 'goal_9_score',
    'goal10': 'goal_10_score',
    'goal11': 'goal_11_score',
    'goal12': 'goal_12_score',
    'goal13': 'goal_13_score',
    'goal14': 'goal_14_score',
    'goal15': 'goal_15_score',
    'goal16': 'goal_16_score',
    'goal17': 'goal_17_score',
    
    # Additional common mappings
    'Country Code': 'country_code',
    'country_code_alpha_3': 'country_code',
    'iso3': 'country_code'
}

# Country name mapping for standardizing country names
COUNTRY_NAME_MAPPING = {
    # Common variations and alternative names
    'United States of America': 'United States',
    'USA': 'United States',
    'US': 'United States',
    
    'United Kingdom of Great Britain and Northern Ireland': 'United Kingdom',
    'UK': 'United Kingdom',
    'Britain': 'United Kingdom',
    'Great Britain': 'United Kingdom',
    
    'Russian Federation': 'Russia',
    'Russia': 'Russia',
    
    'Korea, Republic of': 'South Korea',
    'Republic of Korea': 'South Korea',
    'Korea (South)': 'South Korea',
    'South Korea': 'South Korea',
    
    'Korea, Democratic People\'s Republic of': 'North Korea',
    'North Korea': 'North Korea',
    
    'Iran, Islamic Republic of': 'Iran',
    'Islamic Republic of Iran': 'Iran',
    
    'Venezuela, Bolivarian Republic of': 'Venezuela',
    'Bolivarian Republic of Venezuela': 'Venezuela',
    
    'Bolivia, Plurinational State of': 'Bolivia',
    'Plurinational State of Bolivia': 'Bolivia',
    
    'United Republic of Tanzania': 'Tanzania',
    'Tanzania, United Republic of': 'Tanzania',
    
    'Viet Nam': 'Vietnam',
    'Vietnam': 'Vietnam',
    
    'Republic of Moldova': 'Moldova',
    'Moldova, Republic of': 'Moldova',
    
    'The former Yugoslav Republic of Macedonia': 'North Macedonia',
    'North Macedonia': 'North Macedonia',
    'Macedonia': 'North Macedonia',
    
    'Syrian Arab Republic': 'Syria',
    'Syria': 'Syria',
    
    'Côte d\'Ivoire': 'Ivory Coast',
    'Ivory Coast': 'Ivory Coast',
    
    'Congo, Democratic Republic of the': 'Democratic Republic of the Congo',
    'Congo, Republic of the': 'Republic of the Congo',
    'Congo': 'Republic of the Congo',
    
    'Myanmar': 'Myanmar',
    'Burma': 'Myanmar',
    
    'Czech Republic': 'Czech Republic',
    'Czechia': 'Czech Republic',
    
    'Eswatini': 'Eswatini',
    'Swaziland': 'Eswatini',
    
    'Cabo Verde': 'Cape Verde',
    'Cape Verde': 'Cape Verde'
    ,
    # Common variant for Taiwan used in some datasets
    'Chinese Taipei': 'Taiwan',
    'Taipei, Chinese': 'Taiwan'
}

def load_sdg_data(file_path=None):
    """
    Load and clean SDG data (CSV or Excel) with comprehensive validation and error handling.
    
    Features:
    - Supports both CSV and Excel formats
    - BOM handling (encoding='utf-8-sig')
    - Missing value imputation
    - Schema validation
    - Country name normalization
    - Data source tracking
    """
    try:
        # Get file path from configuration if not provided
        if file_path is None:
            config = get_config()
            file_path = config.get('data_sources.primary_data_path')
        
        # If no file_path provided or file not found, prefer SDR2025 file in data folder
        if file_path is None or not os.path.exists(file_path):
            # Try repository-relative data/SDR2025-data.xlsx
            repo_data_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'data', 'SDR2025-data.xlsx')
            if os.path.exists(repo_data_path):
                file_path = repo_data_path
                logger.info(f"Using SDR2025 data at {repo_data_path}")
            else:
                # Fallback to any configured path or sample data
                if file_path and not os.path.exists(file_path):
                    logger.warning(f"Configured file {file_path} not found. Falling back to sample data.")
                else:
                    logger.warning("No data file provided. Using sample data.")
                return create_sample_data()
        
        df = None
        
        # Detect file type and load accordingly
        if file_path.endswith('.xlsx') or file_path.endswith('.xls'):
            # Handle Excel files
            df = load_excel_data(file_path)
        else:
            # Handle CSV files
            df = load_csv_data(file_path)
        
        if df is None or df.empty:
            logger.warning(f"No valid data loaded from {file_path}. Using sample data.")
            return create_sample_data()
        
        logger.info(f"Successfully loaded data from {file_path}: {len(df)} rows, {len(df.columns)} columns")
        
        # Add metadata columns
        df = add_data_metadata(df, file_path)
        
        return df
    except Exception as e:
        logger.error(f"Error loading data from {file_path}: {e}")
        logger.info("Falling back to sample data")
        return create_sample_data()

def load_csv_data(file_path):
    """Load CSV data with proper encoding handling."""
    try:
        # Load data with BOM handling
        df = pd.read_csv(file_path, encoding='utf-8-sig')
        
        # Remove BOM from column names if present
        df.columns = [c.replace('\ufeff', '') for c in df.columns]
        # Basic cleaning: remove whitespace
        df.columns = [c.strip() for c in df.columns]
        
        # Apply column mapping
        df = df.rename(columns=COLUMN_MAPPING)
        
        # Deduplicate columns if any mappings caused duplicates
        if df.columns.duplicated().any():
            df = df.loc[:, ~df.columns.duplicated()]
        
        # Handle Country Name Normalization
        if 'country' in df.columns:
            df['country'] = df['country'].replace(COUNTRY_NAME_MAPPING)
        
        # Ensure data types and handle potential duplicate columns
        if 'year' in df.columns:
            target_year = df['year']
            if isinstance(target_year, pd.DataFrame):
                target_year = target_year.iloc[:, 0]
            df['year'] = pd.to_numeric(target_year, errors='coerce').fillna(0).astype(int)
        
        if 'sdg_index_score' in df.columns:
            target_score = df['sdg_index_score']
            if isinstance(target_score, pd.DataFrame):
                target_score = target_score.iloc[:, 0]
            df['sdg_index_score'] = pd.to_numeric(target_score, errors='coerce')
        
        # Missing value strategy: Forward fill by country with improved error handling
        if 'country' in df.columns and 'year' in df.columns:
            try:
                df = df.sort_values(['country', 'year'])
                # Use the improved handle_missing_values function
                df = handle_missing_values(df)
            except Exception as e:
                logger.warning(f"Error in missing value handling for CSV data: {e}")
                # Fallback to simple forward fill
                try:
                    df = df.sort_values(['country', 'year'])
                    for col in df.columns:
                        if col.endswith('_score'):
                            df[col] = df.groupby('country')[col].ffill().bfill()
                except Exception as e2:
                    logger.warning(f"Fallback missing value handling also failed: {e2}")
        
        return df
    except Exception as e:
        logger.error(f"Error loading CSV data: {e}")
        return None

def load_excel_data(file_path):
    """Load and process Excel data with 2025 SDR structure and enhanced column detection."""
    try:
        logger.info(f"Loading Excel file: {file_path}")
        
        # Load Excel file and find the right sheet
        xlsx_file = pd.ExcelFile(file_path)
        logger.info(f"Available sheets: {xlsx_file.sheet_names}")
        
        df = None
        
        # Try to load both current data and historical data if available
        sheets_found = {}
        for sheet_name in ['SDR2025 Data', 'Backdated SDG Index', 'Overview', 'Data']:
            if sheet_name in xlsx_file.sheet_names:
                try:
                    sheet_df = pd.read_excel(file_path, sheet_name=sheet_name)
                    # Basic cleaning
                    sheet_df.columns = [str(col).strip() for col in sheet_df.columns]
                    # Map columns
                    sheet_df = apply_2025_excel_mapping(sheet_df)
                    
                    if 'country' in sheet_df.columns:
                        # Ensure year
                        if 'year' not in sheet_df.columns:
                            # If it's the main data sheet, default to 2025
                            if sheet_name == 'SDR2025 Data':
                                sheet_df['year'] = 2025
                            else:
                                sheet_df['year'] = 2024 # Fallback
                        
                        sheets_found[sheet_name] = sheet_df
                        logger.info(f"Loaded sheet '{sheet_name}' with {len(sheet_df)} rows")
                except Exception as e:
                    logger.warning(f"Failed to read sheet '{sheet_name}': {e}")
        
        if not sheets_found:
            logger.error("Could not find any valid data sheets in Excel file")
            return None
        
        # Combine sheets if we have both current and history
        if 'SDR2025 Data' in sheets_found and 'Backdated SDG Index' in sheets_found:
            logger.info("Merging current data (2025) with backdated index (2000-2024)")
            df = pd.concat([sheets_found['Backdated SDG Index'], sheets_found['SDR2025 Data']], ignore_index=True)
            # Remove duplicated country-year entries (prefer 2025 sheet if overlap)
            df = df.drop_duplicates(subset=['country', 'year'], keep='last')
        else:
            # Revert to whichever sheet we found first (priority order)
            for name in ['SDR2025 Data', 'Backdated SDG Index', 'Overview', 'Data']:
                if name in sheets_found:
                    df = sheets_found[name]
                    break
        
        # Clean column names
        original_columns = list(df.columns)
        df.columns = [str(col).strip() for col in df.columns]
        logger.info(f"Cleaned column names from: {original_columns[:5]}{'...' if len(original_columns) > 5 else ''} to: {list(df.columns)[:5]}{'...' if len(df.columns) > 5 else ''}")
        
        # Apply 2025 Excel-specific column mapping
        df = apply_2025_excel_mapping(df)
        logger.info(f"After column mapping, available columns: {list(df.columns)[:10]}{'...' if len(df.columns) > 10 else ''}")
        
        # Handle country name normalization
        if 'country' in df.columns:
            df['country'] = df['country'].replace(COUNTRY_NAME_MAPPING)
        
        # Ensure required columns exist (add defaults if missing)
        if 'year' not in df.columns:
            df['year'] = 2025  # Default to 2025 if missing
        elif isinstance(df['year'], pd.DataFrame):
            df['year'] = df['year'].iloc[:, 0]
        
        # Convert year to numeric
        df['year'] = pd.to_numeric(df['year'], errors='coerce').fillna(2025).astype(int)
        
        # Handle SDG Index Score
        if 'sdg_index_score' not in df.columns:
            # Try to find alternative score columns
            score_cols = [col for col in df.columns if 'score' in col.lower() and 'index' in col.lower()]
            if score_cols:
                # Use first matching column, handle if it's a DataFrame (duplicate names)
                target_col = df[score_cols[0]]
                if isinstance(target_col, pd.DataFrame):
                    target_col = target_col.iloc[:, 0]
                df['sdg_index_score'] = pd.to_numeric(target_col, errors='coerce')
            else:
                df['sdg_index_score'] = 0
        else:
            # Handle if it's a DataFrame
            target_col = df['sdg_index_score']
            if isinstance(target_col, pd.DataFrame):
                target_col = target_col.iloc[:, 0]
            df['sdg_index_score'] = pd.to_numeric(target_col, errors='coerce')
        
        # Convert goal scores to numeric
        goal_score_cols = [col for col in df.columns if col.startswith('goal_') and col.endswith('_score')]
        for col in goal_score_cols:
            target_col = df[col]
            if isinstance(target_col, pd.DataFrame):
                target_col = target_col.iloc[:, 0]
            df[col] = pd.to_numeric(target_col, errors='coerce')
        
        logger.info(f"Successfully processed Excel data: {len(df)} rows, {len(goal_score_cols)} goal score columns")
        
        return df
    except Exception as e:
        logger.error(f"Error loading Excel data: {e}")
        return None

def apply_2025_excel_mapping(df):
    """Apply flexible column mapping for 2025 Excel file structure with comprehensive logging."""
    logger.info("Applying flexible 2025 Excel column mapping...")
    
    # 2025 Excel specific mappings - comprehensive and flexible
    excel_mapping = {
        # Country field variations
        'Country': 'country',
        'country': 'country',
        'Country Name': 'country',
        'country_name': 'country',
        'Nation': 'country',
        'nation': 'country',
        'country code iso3': 'country',
        'ISO3': 'country',
        
        # SDG Index Score variations - very flexible matching
        '2025 SDG Index Score': 'sdg_index_score',
        'SDG Index Score': 'sdg_index_score',
        'sdg_index_score': 'sdg_index_score',
        'SDG Index': 'sdg_index_score',
        'index score': 'sdg_index_score',
        'overall score': 'sdg_index_score',
        'sdgi_s': 'sdg_index_score',
        'SDGI_S': 'sdg_index_score',
        
        # Additional metadata
        '2025 SDG Index Rank': 'global_rank',
        'SDG Index Rank': 'global_rank',
        'Region': 'region',
        'region': 'region',
        
        # Goal mappings for SDR2025 Data sheet format (Goal X Score)
        'Goal 1 Score': 'goal_1_score',
        'Goal 2 Score': 'goal_2_score',
        'Goal 3 Score': 'goal_3_score',
        'Goal 4 Score': 'goal_4_score',
        'Goal 5 Score': 'goal_5_score',
        'Goal 6 Score': 'goal_6_score',
        'Goal 7 Score': 'goal_7_score',
        'Goal 8 Score': 'goal_8_score',
        'Goal 9 Score': 'goal_9_score',
        'Goal 10 Score': 'goal_10_score',
        'Goal 11 Score': 'goal_11_score',
        'Goal 12 Score': 'goal_12_score',
        'Goal 13 Score': 'goal_13_score',
        'Goal 14 Score': 'goal_14_score',
        'Goal 15 Score': 'goal_15_score',
        'Goal 16 Score': 'goal_16_score',
        'Goal 17 Score': 'goal_17_score',
        
        # Variations from backdated index
        'goal1': 'goal_1_score',
        'goal2': 'goal_2_score',
        'goal3': 'goal_3_score',
        'goal4': 'goal_4_score',
        'goal5': 'goal_5_score',
        'goal6': 'goal_6_score',
        'goal7': 'goal_7_score',
        'goal8': 'goal_8_score',
        'goal9': 'goal_9_score',
        'goal10': 'goal_10_score',
        'goal11': 'goal_11_score',
        'goal12': 'goal_12_score',
        'goal13': 'goal_13_score',
        'goal14': 'goal_14_score',
        'goal15': 'goal_15_score',
        'goal16': 'goal_16_score',
        'goal17': 'goal_17_score',
        
        # Goal mappings for Overview sheet format (SDGX: Name)
        'SDG1: No Poverty': 'goal_1_score',
        'SDG2: No Hunger': 'goal_2_score', 
        'SDG3: Good Health and Well-Being': 'goal_3_score',
        'SDG4: Quality Education': 'goal_4_score',
        'SDG5: Gender Equality': 'goal_5_score',
        'SDG6: Clean Water and Sanitation': 'goal_6_score',
        'SDG7: Affordable and Clean Energy': 'goal_7_score',
        'SDG8: Decent Work and Economic Growth': 'goal_8_score',
        'SDG9: Industry, Innovation & Infrastructure': 'goal_9_score',
        'SDG10: Reduced Inequalities': 'goal_10_score',
        'SDG11: Sustainable Cities and Communities': 'goal_11_score',
        'SDG12: Responsible Consumption & Production': 'goal_12_score',
        'SDG13: Climate Action': 'goal_13_score',
        'SDG14: Life Below Water': 'goal_14_score',
        'SDG15: Life on Land': 'goal_15_score',
        'SDG16: Peace, Justice and Strong Institutions': 'goal_16_score',
        'SDG17: Partnerships for the Goals': 'goal_17_score',
    }
    
    # Apply the mapping - only for columns that exist
    actual_mappings = {old: new for old, new in excel_mapping.items() if old in df.columns}
    
    if actual_mappings:
        # Before renaming, check if any of the target names already exist
        # and would cause duplicates that aren't intended
        df = df.rename(columns=actual_mappings)
        logger.info(f"Successfully mapped columns: {list(actual_mappings.keys())}")
    else:
        logger.info("No columns found to map")
    
    # Flexible Goal Score mapping - handle 'Goal X Score' format dynamically
    for col in list(df.columns):
        new_col_name = None
        # Match patterns like 'Goal 1 Score', 'Goal 2 Score', etc.
        if col.startswith('Goal ') and col.endswith(' Score'):
            try:
                goal_num = col.split(' ')[1]  # Extract number from 'Goal X Score'
                new_col_name = f'goal_{goal_num}_score'
            except Exception as e:
                logger.warning(f"Failed to parse goal column '{col}': {e}")
        
        # Match SDG naming patterns like 'SDG1: No Poverty'
        elif col.startswith('SDG') and ':' in col:
            try:
                goal_part = col.split(':')[0]  # Get 'SDG1' from 'SDG1: No Poverty'
                goal_num = goal_part.replace('SDG', '')  # Remove 'SDG' prefix
                new_col_name = f'goal_{goal_num}_score'
            except Exception as e:
                logger.warning(f"Failed to parse SDG column '{col}': {e}")
        
        if new_col_name and new_col_name != col:
            # If target column already exists, we might want to consolidate or just drop this one
            # For now, we rename and then we will deduplicate
            df = df.rename(columns={col: new_col_name})
            logger.info(f"Mapped: '{col}' -> '{new_col_name}'")

    # CRITICAL: Handle duplicate column names that might have been created
    if df.columns.duplicated().any():
        logger.warning(f"Duplicate columns detected after mapping: {df.columns[df.columns.duplicated()].unique().tolist()}")
        # Keep the first occurrence of each column name
        df = df.loc[:, ~df.columns.duplicated()]
        logger.info("Successfully deduplicated columns by keeping the first occurrence.")
    
    # Remove unnamed columns (these are usually status indicators)
    unnamed_cols = [col for col in df.columns if col.startswith('Unnamed:')]
    if unnamed_cols:
        df = df.drop(columns=unnamed_cols)
        logger.info(f"Removed unnamed columns: {unnamed_cols}")
    
    # Final column validation
    remaining_cols = list(df.columns)
    logger.info(f"Final columns after mapping: {remaining_cols[:10]}{'...' if len(remaining_cols) > 10 else ''}")
    
    return df

def validate_data_schema(df):
    """
    Validate the data schema and return validation results.
    Updated to be more flexible for 2025 Excel data.
    """
    # Core required columns (minimum needed for the application to work)
    core_required = ['country', 'sdg_index_score']
    
    # Optional goal columns (application can work without them)
    goal_columns = [f'goal_{i}_score' for i in range(1, 18)]
    
    errors = []
    warnings = []
    
    # Check core required columns
    missing_core = [col for col in core_required if col not in df.columns]
    if missing_core:
        errors.append(f"Missing core required columns: {missing_core}")
    
    # Check goal columns (warnings only, not errors)
    missing_goals = [col for col in goal_columns if col not in df.columns]
    if missing_goals:
        warnings.append(f"Missing goal columns: {len(missing_goals)} out of 17 goals available")
    
    # Check year column (optional but recommended)
    if 'year' not in df.columns:
        warnings.append("Year column not found - will use default year 2025")
    else:
        min_year = df['year'].min()
        max_year = df['year'].max()
        if min_year < 2000 or max_year > 2030:
            warnings.append(f"Year range unusual: {min_year}-{max_year} (expected 2000-2030)")
    
    # Check score ranges (0-100) for available score columns
    score_columns = [col for col in df.columns if col.endswith('_score') and col != 'country_score']
    for col in score_columns:
        if col in df.columns:
            # Skip if all values are NaN
            if df[col].isna().all():
                warnings.append(f"Score column {col} contains only NaN values")
                continue
                
            min_score = df[col].min()
            max_score = df[col].max()
            if pd.notna(min_score) and pd.notna(max_score):
                if min_score < -10 or max_score > 110:  # Allow some tolerance
                    warnings.append(f"Score column {col} has unusual range: {min_score:.1f}-{max_score:.1f} (expected 0-100)")
    
    # Check data completeness
    if 'country' in df.columns:
        duplicate_countries = df['country'].duplicated().sum()
        if duplicate_countries > 0:
            warnings.append(f"Found {duplicate_countries} duplicate country entries")
    
    # Overall validation result
    is_valid = len(errors) == 0
    
    return {
        'valid': is_valid,
        'errors': errors,
        'warnings': warnings,
        'column_count': len(df.columns),
        'row_count': len(df),
        'available_goals': len([col for col in goal_columns if col in df.columns]),
        'year_range': (int(df['year'].min()), int(df['year'].max())) if 'year' in df.columns else (2025, 2025)
    }

def normalize_country_names(df):
    """
    Normalize country names to handle variations like "Taiwan" vs "Taiwan, Province of China".
    """
    # Country name mapping for normalization
    country_mappings = {
        'Taiwan, Province of China': 'Taiwan',
        'Korea, Republic of': 'South Korea',
        'Korea (South)': 'South Korea',
        'United States of America': 'United States',
        'UK': 'United Kingdom',
        'Russian Federation': 'Russia',
        'Iran, Islamic Republic of': 'Iran',
        'Venezuela, Bolivarian Republic of': 'Venezuela'
    }
    
    # Apply mappings
    for old_name, new_name in country_mappings.items():
        df['country'] = df['country'].replace(old_name, new_name)
    
    return df

def handle_missing_values(df):
    """
    Handle missing values with appropriate strategies for different column types.
    """
    # For score columns, use forward fill then backward fill
    score_columns = [col for col in df.columns if col.endswith('_score')]
    
    for col in score_columns:
        if col in df.columns:
            try:
                # Ensure column is numeric and handle if it's a DataFrame
                target_col = df[col]
                if isinstance(target_col, pd.DataFrame):
                    target_col = target_col.iloc[:, 0]
                df[col] = pd.to_numeric(target_col, errors='coerce')
                
                # Forward fill within country groups
                df[col] = df.groupby('country', group_keys=False)[col].apply(lambda x: x.ffill().bfill())
                
                # If still NaN, use global median for that year with better error handling
                def safe_fillna(x):
                    try:
                        median_val = x.median()
                        # Check if median is valid (not NaN and is numeric)
                        if pd.notna(median_val) and isinstance(median_val, (int, float)):
                            return x.fillna(median_val)
                        else:
                            # If median is invalid, use overall column median
                            overall_median = df[col].median()
                            if pd.notna(overall_median):
                                return x.fillna(overall_median)
                            else:
                                # If still no valid median, use 0
                                return x.fillna(0)
                    except Exception as e:
                        logger.warning(f"Error in fillna for column {col}: {e}")
                        return x.fillna(0)
                
                df[col] = df.groupby('year', group_keys=False)[col].apply(safe_fillna)
                
                # Ensure column is numeric and handle if it's a DataFrame
                target_col = df[col]
                if isinstance(target_col, pd.DataFrame):
                    target_col = target_col.iloc[:, 0]
                df[col] = pd.to_numeric(target_col, errors='coerce').fillna(0) # Fill final NaNs with 0
            except Exception as e:
                logger.warning(f"Error processing column {col}: {e}")
    
    # For country_code, forward fill within country
    if 'country_code' in df.columns:
        try:
            df['country_code'] = df.groupby('country', group_keys=False)['country_code'].apply(lambda x: x.ffill().bfill())
        except Exception as e:
            logger.warning(f"Error handling country_code missing values: {e}")
    
    return df

def apply_data_quality_checks(df):
    """
    Apply data quality checks and clean anomalous values.
    """
    # Score columns should be between 0 and 100
    score_columns = [col for col in df.columns if col.endswith('_score')]
    
    for col in score_columns:
        if col in df.columns:
            # Cap scores at 0-100 range
            df[col] = df[col].clip(lower=0, upper=100)
    
    # Year should be integer
    if 'year' in df.columns:
        df['year'] = df['year'].astype(int)
    
    return df

def add_data_metadata(df, file_path):
    """
    Add metadata columns to track data source and quality.
    """
    # Add data source information
    df['data_source'] = os.path.basename(file_path) if file_path else 'sample_data'
    df['last_updated'] = datetime.now().strftime('%Y-%m-%d')
    df['is_sample_data'] = file_path is None
    
    return df

def create_sample_data():
    """
    Create sample data for demonstration when real data is not available.
    This data is clearly marked as sample data.
    """
    countries = [
        "United States", "China", "Japan", "Germany", "United Kingdom", 
        "France", "Taiwan", "South Korea", "Canada", "Australia",
        "Italy", "Spain", "Netherlands", "Sweden", "Norway"
    ]
    
    data = []
    for country in countries:
        for year in range(2000, 2024):
            # Create realistic but synthetic data
            base_score = 65 + (year - 2000) * 0.8
            # Add some country-specific variations
            country_factor = 1.0
            if country in ["Norway", "Sweden", "Denmark"]:
                country_factor = 1.15  # Nordic countries perform better
            elif country in ["Taiwan", "South Korea"]:
                country_factor = 1.1   # East Asian countries perform well
            elif country in ["China"]:
                country_factor = 0.9   # Some variation
            
            final_score = min(100, base_score * country_factor)
            
            row = {
                "country": country,
                "country_code": f"{country[:3].upper()}",  # Simple country code
                "year": year,
                "sdg_index_score": final_score,
                "data_source": "sample_data",
                "last_updated": datetime.now().strftime('%Y-%m-%d'),
                "is_sample_data": True
            }
            
            # Generate individual goal scores
            for i in range(1, 18):
                goal_score = min(100, 50 + (year - 2000) * 1.2 + (i * 1.1) + np.random.normal(0, 5))
                row[f"goal_{i}_score"] = max(0, goal_score)
            
            data.append(row)
    
    sample_df = pd.DataFrame(data)
    
    # Add warning in the dataframe name
    sample_df.attrs['warning'] = "⚠️ SAMPLE DATA - This is demonstration data only and should not be used for actual analysis or decision-making."
    
    return sample_df

def get_country_list(df):
    """Get sorted list of countries with normalization."""
    if df is None or df.empty:
        return []
    try:
        # Return only countries present in the latest year to avoid listing countries without recent data
        latest_df = get_latest_data(df)
        if latest_df is not None and not latest_df.empty:
            return sorted(latest_df['country'].dropna().unique().tolist())
    except Exception:
        # Fallback to all countries if something unexpected happens
        pass
    return sorted(df['country'].dropna().unique().tolist())

def filter_data(df, country, year_range):
    """Filter data by country and year range."""
    if df is None or df.empty:
        return pd.DataFrame()
    
    mask = (df['country'] == country) & (df['year'] >= year_range[0]) & (df['year'] <= year_range[1])
    return df[mask].sort_values('year')

def get_latest_data(df):
    """Returns the latest year's data for all countries."""
    if df is None or df.empty:
        return pd.DataFrame()
    
    latest_year = df['year'].max()
    return df[df['year'] == latest_year]

def get_data_summary(df):
    """Get comprehensive data summary including quality metrics."""
    if df is None or df.empty:
        return {
            'total_countries': 0,
            'total_records': 0,
            'year_range': None,
            'data_quality_score': 0,
            'has_sample_data': False,
            'warnings': ['No data available']
        }
    
    summary = {
        'total_countries': df['country'].nunique(),
        'total_records': len(df),
        'year_range': (df['year'].min(), df['year'].max()),
        'data_quality_score': calculate_data_quality_score(df),
        'has_sample_data': df['is_sample_data'].any() if 'is_sample_data' in df.columns else False,
        'warnings': []
    }
    
    # Add warnings for sample data
    if summary['has_sample_data']:
        summary['warnings'].append('⚠️ This analysis uses sample data for demonstration purposes')
    
    # Check for data quality issues
    missing_pct = df.isnull().sum().sum() / (len(df) * len(df.columns)) * 100
    if missing_pct > 5:
        summary['warnings'].append(f'High missing data percentage: {missing_pct:.1f}%')
    
    return summary

def calculate_data_quality_score(df):
    """Calculate a data quality score (0-100)."""
    if df is None or df.empty:
        return 0
    
    score = 100.0  # Ensure float
    
    # Deduct for missing values
    missing_pct = float(df.isnull().sum().sum() / (len(df) * len(df.columns)) * 100)
    score -= missing_pct * 2  # 2 points per percent missing
    
    # Deduct for unusual year ranges
    if 'year' in df.columns:
        year_range = int(df['year'].max() - df['year'].min())
        if year_range > 30:  # More than 30 years seems unusual
            score -= 10
    
    # Skip outlier detection for now due to duplicate column issues
    # This will be fixed in a future update
    # TODO: Implement robust outlier detection that handles duplicate columns
    score_columns = [col for col in df.columns if col.endswith('_score')]
    if score_columns:
        logger.debug(f"Found {len(score_columns)} score columns, skipping outlier detection")
    
    # Ensure score is a scalar value
    if hasattr(score, 'iloc'):  # If it's a pandas Series/DataFrame
        score = float(score.iloc[0] if len(score) > 0 else 0)
    
    return max(0.0, min(100.0, float(score)))

def get_country_metrics(df, country, year):
    """
    Get comprehensive metrics for a specific country in a specific year.
    """
    if df is None or df.empty:
        return None
    
    country_data = df[(df['country'] == country) & (df['year'] == year)]
    if country_data.empty:
        return None
    
    # Calculate global average for that year
    year_data = df[df['year'] == year]
    if year_data.empty:
        return None
        
    global_avg = year_data['sdg_index_score'].mean()
    
    # Get rank
    year_data_with_rank = year_data.copy()
    year_data_with_rank['rank'] = year_data_with_rank['sdg_index_score'].rank(ascending=False, method='min')
    country_rank = year_data_with_rank[year_data_with_rank['country'] == country]['rank'].values[0]
    
    # Calculate trend (improvement over last 5 years)
    trend_data = df[df['country'] == country]
    recent_years = trend_data[trend_data['year'] >= (year - 5)]
    if len(recent_years) > 1:
        recent_avg = recent_years['sdg_index_score'].mean()
        trend = recent_avg - trend_data['sdg_index_score'].iloc[0] if len(recent_years) > 0 else 0
    else:
        trend = 0
    
    # Goal performance breakdown
    goal_scores = {}
    for i in range(1, 18):
        goal_col = f'goal_{i}_score'
        if goal_col in country_data.columns:
            goal_scores[f'goal_{i}'] = country_data[goal_col].values[0]
    
    metrics = {
        'score': country_data['sdg_index_score'].values[0],
        'rank': int(country_rank),
        'global_avg': global_avg,
        'country_count': len(year_data),
        'trend_5yr': trend,
        'goal_scores': goal_scores,
        'data_quality': 'high' if ('is_sample_data' in country_data.columns and not country_data['is_sample_data'].any()) else 'sample',
        'last_updated': country_data['last_updated'].values[0] if ('last_updated' in country_data.columns and len(country_data) > 0) else 'N/A'
    }
    
    return metrics