import pandas as pd
import numpy as np
import os
import logging
from datetime import datetime

# The config manager belongs to the host application. Guard the import so this
# module stays importable (tests, notebooks, standalone use) without it; every
# caller path already falls back to bundled/sample data when no config exists.
try:
    from src.config_manager import get_config
except ImportError:  # pragma: no cover - optional outside the full application
    get_config = None

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Column mapping for standardizing column names across different data versions.
# The 17 goal-score aliases ('Goal N Score', 'Goal N', 'sdg_N_score', 'goalN')
# are generated from a comprehension rather than written out 68 times by hand.
COLUMN_MAPPING = {
    # Country field variations
    'Country': 'country',
    'Country Name': 'country',
    'country_name': 'country',
    'nation': 'country',
    'Nation': 'country',
    'COUNTRY': 'country',
    'country': 'country',
    # Year variations
    'Year': 'year',
    'year_value': 'year',
    'year': 'year',
    # SDG Index Score variations - flexible matching
    '2025 SDG Index Score': 'sdg_index_score',
    'SDG Index Score': 'sdg_index_score',
    'SDG Index': 'sdg_index_score',
    'sdg_score': 'sdg_index_score',
    'overall_score': 'sdg_index_score',
    'index score': 'sdg_index_score',
    'sdgi_s': 'sdg_index_score',
    'SDGI_S': 'sdg_index_score',
    'SDG_Score': 'sdg_index_score',
    'sdg_index_score': 'sdg_index_score',
    'SDG_Index_Score': 'sdg_index_score',
    'Overall_Score': 'sdg_index_score',
    'INDEX_SCORE': 'sdg_index_score',
    # Individual goal score mappings for goals 1-17, including the short-name
    # variations ('goalN') used by the backdated index sheet.
    **{
        alias: f'goal_{i}_score'
        for i in range(1, 18)
        for alias in (f'Goal {i} Score', f'Goal {i}', f'sdg_{i}_score', f'goal{i}')
    },
    # Additional common mappings
    'Country Code': 'country_code',
    'country_code_alpha_3': 'country_code',
    'iso3': 'country_code',
}

# Country name mapping for standardizing country names
COUNTRY_NAME_MAPPING = {
    # Common variations and alternative names
    'United States of America': 'United States',
    'USA': 'United States',
    'US': 'United States',
    'United Kingdom of Great Britain and Northern Ireland': 'United Kingdom',
    'UK': 'United Kingdom',
    'Britain': 'United Kingdom',
    'Great Britain': 'United Kingdom',
    'Russian Federation': 'Russia',
    'Russia': 'Russia',
    'Korea, Republic of': 'South Korea',
    'Republic of Korea': 'South Korea',
    'Korea (South)': 'South Korea',
    'South Korea': 'South Korea',
    'Korea, Democratic People\'s Republic of': 'North Korea',
    'North Korea': 'North Korea',
    'Iran, Islamic Republic of': 'Iran',
    'Islamic Republic of Iran': 'Iran',
    'Venezuela, Bolivarian Republic of': 'Venezuela',
    'Bolivarian Republic of Venezuela': 'Venezuela',
    'Bolivia, Plurinational State of': 'Bolivia',
    'Plurinational State of Bolivia': 'Bolivia',
    'United Republic of Tanzania': 'Tanzania',
    'Tanzania, United Republic of': 'Tanzania',
    'Viet Nam': 'Vietnam',
    'Vietnam': 'Vietnam',
    'Republic of Moldova': 'Moldova',
    'Moldova, Republic of': 'Moldova',
    'The former Yugoslav Republic of Macedonia': 'North Macedonia',
    'North Macedonia': 'North Macedonia',
    'Macedonia': 'North Macedonia',
    'Syrian Arab Republic': 'Syria',
    'Syria': 'Syria',
    'Côte d\'Ivoire': 'Ivory Coast',
    'Ivory Coast': 'Ivory Coast',
    'Congo, Democratic Republic of the': 'Democratic Republic of the Congo',
    'Congo, Republic of the': 'Republic of the Congo',
    'Congo': 'Republic of the Congo',
    'Myanmar': 'Myanmar',
    'Burma': 'Myanmar',
    'Czech Republic': 'Czech Republic',
    'Czechia': 'Czech Republic',
    'Eswatini': 'Eswatini',
    'Swaziland': 'Eswatini',
    'Cabo Verde': 'Cape Verde',
    'Cape Verde': 'Cape Verde',
    # Common variant for Taiwan used in some datasets
    'Chinese Taipei': 'Taiwan',
    'Taipei, Chinese': 'Taiwan',
}


def load_sdg_data(file_path=None):
    """
    Load and clean SDG data (CSV or Excel) with comprehensive validation and error handling.

    Features:
    - Supports both CSV and Excel formats
    - BOM handling (encoding='utf-8-sig')
    - Missing value imputation
    - Schema validation
    - Country name normalization
    - Data source tracking

    Args:
        file_path: Optional explicit path to a CSV/Excel file. When None, the
            path is taken from configuration, then from the bundled SDR2025
            Excel file, and finally synthetic sample data is used.

    Returns:
        A pandas DataFrame; on any failure, clearly-marked sample data.
    """
    try:
        # Get file path from configuration if not provided (get_config may be
        # unavailable when running outside the full application)
        if file_path is None and get_config is not None:
            config = get_config()
            file_path = config.get('data_sources.primary_data_path')

        # If no file_path provided or file not found, prefer SDR2025 file in data folder
        if file_path is None or not os.path.exists(file_path):
            # Try repository-relative data/SDR2025-data.xlsx
            repo_data_path = os.path.join(
                os.path.dirname(os.path.dirname(__file__)), 'data', 'SDR2025-data.xlsx'
            )
            if os.path.exists(repo_data_path):
                file_path = repo_data_path
                logger.info(f"Using SDR2025 data at {repo_data_path}")
            else:
                # Fallback to any configured path or sample data
                if file_path and not os.path.exists(file_path):
                    logger.warning(f"Configured file {file_path} not found. Falling back to sample data.")
                else:
                    logger.warning("No data file provided. Using sample data.")
                return create_sample_data()

        df = None
        # Detect file type and load accordingly
        if file_path.endswith('.xlsx') or file_path.endswith('.xls'):
            # Handle Excel files
            df = load_excel_data(file_path)
        else:
            # Handle CSV files
            df = load_csv_data(file_path)

        if df is None or df.empty:
            logger.warning(f"No valid data loaded from {file_path}. Using sample data.")
            return create_sample_data()

        logger.info(f"Successfully loaded data from {file_path}: {len(df)} rows, {len(df.columns)} columns")

        # Add metadata columns
        df = add_data_metadata(df, file_path)
        return df

    except Exception as e:
        logger.error(f"Error loading data from {file_path}: {e}")
        logger.info("Falling back to sample data")
        return create_sample_data()


def load_csv_data(file_path):
    """Load CSV data with proper encoding handling.

    Returns a cleaned DataFrame, or None when loading fails.
    """
    try:
        # Load data with BOM handling
        df = pd.read_csv(file_path, encoding='utf-8-sig')

        # Remove BOM from column names if present
        df.columns = [c.replace('\ufeff', '') for c in df.columns]
        # Basic cleaning: remove whitespace
        df.columns = [c.strip() for c in df.columns]

        # Apply column mapping
        df = df.rename(columns=COLUMN_MAPPING)

        # Deduplicate columns if any mappings caused duplicates
        if df.columns.duplicated().any():
            df = df.loc[:, ~df.columns.duplicated()]

        # Handle Country Name Normalization
        if 'country' in df.columns:
            df['country'] = df['country'].replace(COUNTRY_NAME_MAPPING)

        # Ensure data types and handle potential duplicate columns
        if 'year' in df.columns:
            target_year = df['year']
            if isinstance(target_year, pd.DataFrame):
                target_year = target_year.iloc[:, 0]
            df['year'] = pd.to_numeric(target_year, errors='coerce').fillna(0).astype(int)

        if 'sdg_index_score' in df.columns:
            target_score = df['sdg_index_score']
            if isinstance(target_score, pd.DataFrame):
                target_score = target_score.iloc[:, 0]
            df['sdg_index_score'] = pd.to_numeric(target_score, errors='coerce')

        # Missing value strategy: Forward fill by country with improved error handling
        if 'country' in df.columns and 'year' in df.columns:
            try:
                df = df.sort_values(['country', 'year'])
                # Use the improved handle_missing_values function
                df = handle_missing_values(df)
            except Exception as e:
                logger.warning(f"Error in missing value handling for CSV data: {e}")
                # Fallback to simple forward fill
                try:
                    df = df.sort_values(['country', 'year'])
                    for col in df.columns:
                        if col.endswith('_score'):
                            df[col] = df.groupby('country')[col].ffill().bfill()
                except Exception as e2:
                    logger.warning(f"Fallback missing value handling also failed: {e2}")

        return df

    except Exception as e:
        logger.error(f"Error loading CSV data: {e}")
        return None


def load_excel_data(file_path):
    """Load and process Excel data with 2025 SDR structure and enhanced column detection.

    Merges the current-year sheet with the backdated index when both exist.
    Returns a cleaned DataFrame, or None when no usable sheet is found.
    """
    try:
        logger.info(f"Loading Excel file: {file_path}")

        # Load Excel file and find the right sheet
        xlsx_file = pd.ExcelFile(file_path)
        logger.info(f"Available sheets: {xlsx_file.sheet_names}")

        df = None
        # Try to load both current data and historical data if available
        sheets_found = {}
        for sheet_name in ['SDR2025 Data', 'Backdated SDG Index', 'Overview', 'Data']:
            if sheet_name in xlsx_file.sheet_names:
                try:
                    sheet_df = pd.read_excel(file_path, sheet_name=sheet_name)
                    # Basic cleaning
                    sheet_df.columns = [str(col).strip() for col in sheet_df.columns]
                    # Map columns
                    sheet_df = apply_2025_excel_mapping(sheet_df)
                    if 'country' in sheet_df.columns:
                        # Ensure year
                        if 'year' not in sheet_df.columns:
                            # If it's the main data sheet, default to 2025
                            if sheet_name == 'SDR2025 Data':
                                sheet_df['year'] = 2025
                            else:
                                sheet_df['year'] = 2024  # Fallback
                        sheets_found[sheet_name] = sheet_df
                        logger.info(f"Loaded sheet '{sheet_name}' with {len(sheet_df)} rows")
                except Exception as e:
                    logger.warning(f"Failed to read sheet '{sheet_name}': {e}")

        if not sheets_found:
            logger.error("Could not find any valid data sheets in Excel file")
            return None

        # Combine sheets if we have both current and history
        if 'SDR2025 Data' in sheets_found and 'Backdated SDG Index' in sheets_found:
            logger.info("Merging current data (2025) with backdated index (2000-2024)")
            df = pd.concat(
                [sheets_found['Backdated SDG Index'], sheets_found['SDR2025 Data']],
                ignore_index=True,
            )
            # Remove duplicated country-year entries (prefer 2025 sheet if overlap)
            df = df.drop_duplicates(subset=['country', 'year'], keep='last')
        else:
            # Revert to whichever sheet we found first (priority order)
            for name in ['SDR2025 Data', 'Backdated SDG Index', 'Overview', 'Data']:
                if name in sheets_found:
                    df = sheets_found[name]
                    break

        # Clean column names
        original_columns = list(df.columns)
        df.columns = [str(col).strip() for col in df.columns]
        logger.info(
            f"Cleaned column names from: {original_columns[:5]}{'...' if len(original_columns) > 5 else ''} "
            f"to: {list(df.columns)[:5]}{'...' if len(df.columns) > 5 else ''}"
        )

        # Apply 2025 Excel-specific column mapping
        df = apply_2025_excel_mapping(df)
        logger.info(
            f"After column mapping, available columns: {list(df.columns)[:10]}{'...' if len(df.columns) > 10 else ''}"
        )

        # Handle country name normalization
        if 'country' in df.columns:
            df['country'] = df['country'].replace(COUNTRY_NAME_MAPPING)

        # Ensure required columns exist (add defaults if missing)
        if 'year' not in df.columns:
            df['year'] = 2025  # Default to 2025 if missing
        elif isinstance(df['year'], pd.DataFrame):
            df['year'] = df['year'].iloc[:, 0]

        # Convert year to numeric
        df['year'] = pd.to_numeric(df['year'], errors='coerce').fillna(2025).astype(int)

        # Handle SDG Index Score
        if 'sdg_index_score' not in df.columns:
            # Try to find alternative score columns
            score_cols = [col for col in df.columns if 'score' in col.lower() and 'index' in col.lower()]
            if score_cols:
                # Use first matching column, handle if it's a DataFrame (duplicate names)
                target_col = df[score_cols[0]]
                if isinstance(target_col, pd.DataFrame):
                    target_col = target_col.iloc[:, 0]
                df['sdg_index_score'] = pd.to_numeric(target_col, errors='coerce')
            else:
                df['sdg_index_score'] = 0
        else:
            # Handle if it's a DataFrame
            target_col = df['sdg_index_score']
            if isinstance(target_col, pd.DataFrame):
                target_col = target_col.iloc[:, 0]
            df['sdg_index_score'] = pd.to_numeric(target_col, errors='coerce')

        # Convert goal scores to numeric
        goal_score_cols = [col for col in df.columns if col.startswith('goal_') and col.endswith('_score')]
        for col in goal_score_cols:
            target_col = df[col]
            if isinstance(target_col, pd.DataFrame):
                target_col = target_col.iloc[:, 0]
            df[col] = pd.to_numeric(target_col, errors='coerce')

        logger.info(f"Successfully processed Excel data: {len(df)} rows, {len(goal_score_cols)} goal score columns")
        return df

    except Exception as e:
        logger.error(f"Error loading Excel data: {e}")
        return None


def apply_2025_excel_mapping(df):
    """Apply flexible column mapping for 2025 Excel file structure with comprehensive logging."""
    logger.info("Applying flexible 2025 Excel column mapping...")

    # 2025 Excel specific mappings - comprehensive and flexible.
    # Goal aliases for 'Goal N Score' / 'goalN' are generated for goals 1-17.
    excel_mapping = {
        # Country field variations
        'Country': 'country',
        'country': 'country',
        'Country Name': 'country',
        'country_name': 'country',
        'Nation': 'country',
        'nation': 'country',
        'country code iso3': 'country',
        'ISO3': 'country',
        # SDG Index Score variations - very flexible matching
        '2025 SDG Index Score': 'sdg_index_score',
        'SDG Index Score': 'sdg_index_score',
        'sdg_index_score': 'sdg_index_score',
        'SDG Index': 'sdg_index_score',
        'index score': 'sdg_index_score',
        'overall score': 'sdg_index_score',
        'sdgi_s': 'sdg_index_score',
        'SDGI_S': 'sdg_index_score',
        # Additional metadata
        '2025 SDG Index Rank': 'global_rank',
        'SDG Index Rank': 'global_rank',
        'Region': 'region',
        'region': 'region',
        # Goal mappings for the SDR2025 Data sheet format ('Goal X Score')
        # and the short names from the backdated index ('goalX')
        **{f'Goal {i} Score': f'goal_{i}_score' for i in range(1, 18)},
        **{f'goal{i}': f'goal_{i}_score' for i in range(1, 18)},
        # Goal mappings for Overview sheet format (SDGX: Name)
        'SDG1: No Poverty': 'goal_1_score',
        'SDG2: No Hunger': 'goal_2_score',
        'SDG3: Good Health and Well-Being': 'goal_3_score',
        'SDG4: Quality Education': 'goal_4_score',
        'SDG5: Gender Equality': 'goal_5_score',
        'SDG6: Clean Water and Sanitation': 'goal_6_score',
        'SDG7: Affordable and Clean Energy': 'goal_7_score',
        'SDG8: Decent Work and Economic Growth': 'goal_8_score',
        'SDG9: Industry, Innovation & Infrastructure': 'goal_9_score',
        'SDG10: Reduced Inequalities': 'goal_10_score',
        'SDG11: Sustainable Cities and Communities': 'goal_11_score',
        'SDG12: Responsible Consumption & Production': 'goal_12_score',
        'SDG13: Climate Action': 'goal_13_score',
        'SDG14: Life Below Water': 'goal_14_score',
        'SDG15: Life on Land': 'goal_15_score',
        'SDG16: Peace, Justice and Strong Institutions': 'goal_16_score',
        'SDG17: Partnerships for the Goals': 'goal_17_score',
    }

    # Apply the mapping - only for columns that exist
    actual_mappings = {old: new for old, new in excel_mapping.items() if old in df.columns}
    if actual_mappings:
        # Before renaming, check if any of the target names already exist
        # and would cause duplicates that aren't intended
        df = df.rename(columns=actual_mappings)
        logger.info(f"Successfully mapped columns: {list(actual_mappings.keys())}")
    else:
        logger.info("No columns found to map")

    # Flexible Goal Score mapping - handle 'Goal X Score' format dynamically
    for col in list(df.columns):
        new_col_name = None
        # Match patterns like 'Goal 1 Score', 'Goal 2 Score', etc.
        if col.startswith('Goal ') and col.endswith(' Score'):
            try:
                goal_num = col.split(' ')[1]  # Extract number from 'Goal X Score'
                new_col_name = f'goal_{goal_num}_score'
            except Exception as e:
                logger.warning(f"Failed to parse goal column '{col}': {e}")
        # Match SDG naming patterns like 'SDG1: No Poverty'
        elif col.startswith('SDG') and ':' in col:
            try:
                goal_part = col.split(':')[0]  # Get 'SDG1' from 'SDG1: No Poverty'
                goal_num = goal_part.replace('SDG', '')  # Remove 'SDG' prefix
                new_col_name = f'goal_{goal_num}_score'
            except Exception as e:
                logger.warning(f"Failed to parse SDG column '{col}': {e}")

        if new_col_name and new_col_name != col:
            # If target column already exists, we might want to consolidate or just drop this one
            # For now, we rename and then we will deduplicate
            df = df.rename(columns={col: new_col_name})
            logger.info(f"Mapped: '{col}' -> '{new_col_name}'")

    # CRITICAL: Handle duplicate column names that might have been created
    if df.columns.duplicated().any():
        logger.warning(
            f"Duplicate columns detected after mapping: {df.columns[df.columns.duplicated()].unique().tolist()}"
        )
        # Keep the first occurrence of each column name
        df = df.loc[:, ~df.columns.duplicated()]
        logger.info("Successfully deduplicated columns by keeping the first occurrence.")

    # Remove unnamed columns (these are usually status indicators)
    unnamed_cols = [col for col in df.columns if col.startswith('Unnamed:')]
    if unnamed_cols:
        df = df.drop(columns=unnamed_cols)
        logger.info(f"Removed unnamed columns: {unnamed_cols}")

    # Final column validation
    remaining_cols = list(df.columns)
    logger.info(f"Final columns after mapping: {remaining_cols[:10]}{'...' if len(remaining_cols) > 10 else ''}")

    return df


def validate_data_schema(df):
    """
    Validate the data schema and return validation results.
    Updated to be more flexible for 2025 Excel data.

    Returns a dict with 'valid', 'errors', 'warnings', and summary counts.
    """
    # Core required columns (minimum needed for the application to work)
    core_required = ['country', 'sdg_index_score']
    # Optional goal columns (application can work without them)
    goal_columns = [f'goal_{i}_score' for i in range(1, 18)]

    errors = []
    warnings = []

    # Check core required columns
    missing_core = [col for col in core_required if col not in df.columns]
    if missing_core:
        errors.append(f"Missing core required columns: {missing_core}")

    # Check goal columns (warnings only, not errors)
    missing_goals = [col for col in goal_columns if col not in df.columns]
    if missing_goals:
        warnings.append(f"Missing goal columns: {len(missing_goals)} out of 17 goals available")

    # Check year column (optional but recommended)
    if 'year' not in df.columns:
        warnings.append("Year column not found - will use default year 2025")
    else:
        min_year = df['year'].min()
        max_year = df['year'].max()
        if min_year < 2000 or max_year > 2030:
            warnings.append(f"Year range unusual: {min_year}-{max_year} (expected 2000-2030)")

    # Check score ranges (0-100) for available score columns
    score_columns = [col for col in df.columns if col.endswith('_score') and col != 'country_score']
    for col in score_columns:
        if col in df.columns:
            # Skip if all values are NaN
            if df[col].isna().all():
                warnings.append(f"Score column {col} contains only NaN values")
                continue
            min_score = df[col].min()
            max_score = df[col].max()
            if pd.notna(min_score) and pd.notna(max_score):
                if min_score < -10 or max_score > 110:  # Allow some tolerance
                    warnings.append(
                        f"Score column {col} has unusual range: {min_score:.1f}-{max_score:.1f} (expected 0-100)"
                    )

    # Check data completeness
    if 'country' in df.columns:
        duplicate_countries = df['country'].duplicated().sum()
        if duplicate_countries > 0:
            warnings.append(f"Found {duplicate_countries} duplicate country entries")

    # Overall validation result
    is_valid = len(errors) == 0

    return {
        'valid': is_valid,
        'errors': errors,
        'warnings': warnings,
        'column_count': len(df.columns),
        'row_count': len(df),
        'available_goals': len([col for col in goal_columns if col in df.columns]),
        'year_range': (int(df['year'].min()), int(df['year'].max())) if 'year' in df.columns else (2025, 2025),
    }


def normalize_country_names(df):
    """
    Normalize country names to handle variations like "Taiwan" vs "Taiwan, Province of China".
    """
    # Country name mapping for normalization
    country_mappings = {
        'Taiwan, Province of China': 'Taiwan',
        'Korea, Republic of': 'South Korea',
        'Korea (South)': 'South Korea',
        'United States of America': 'United States',
        'UK': 'United Kingdom',
        'Russian Federation': 'Russia',
        'Iran, Islamic Republic of': 'Iran',
        'Venezuela, Bolivarian Republic of': 'Venezuela',
    }

    # Apply mappings
    for old_name, new_name in country_mappings.items():
        df['country'] = df['country'].replace(old_name, new_name)

    return df


def handle_missing_values(df):
    """
    Handle missing values with appropriate strategies for different column types.

    Score columns: forward/backward fill within each country, then per-year
    median, then 0. Country codes: forward/backward fill within each country.
    """
    # For score columns, use forward fill then backward fill
    score_columns = [col for col in df.columns if col.endswith('_score')]

    for col in score_columns:
        if col in df.columns:
            try:
                # Ensure column is numeric and handle if it's a DataFrame
                target_col = df[col]
                if isinstance(target_col, pd.DataFrame):
                    target_col = target_col.iloc[:, 0]
                df[col] = pd.to_numeric(target_col, errors='coerce')

                # Forward fill within country groups
                df[col] = df.groupby('country', group_keys=False)[col].apply(lambda x: x.ffill().bfill())

                # If still NaN, use global median for that year with better error handling
                def safe_fillna(x):
                    try:
                        median_val = x.median()
                        # Check if median is valid (not NaN and is numeric)
                        if pd.notna(median_val) and isinstance(median_val, (int, float)):
                            return x.fillna(median_val)
                        else:
                            # If median is invalid, use overall column median
                            overall_median = df[col].median()
                            if pd.notna(overall_median):
                                return x.fillna(overall_median)
                            else:
                                # If still no valid median, use 0
                                return x.fillna(0)
                    except Exception as e:
                        logger.warning(f"Error in fillna for column {col}: {e}")
                        return x.fillna(0)

                df[col] = df.groupby('year', group_keys=False)[col].apply(safe_fillna)

                # Ensure column is numeric and handle if it's a DataFrame
                target_col = df[col]
                if isinstance(target_col, pd.DataFrame):
                    target_col = target_col.iloc[:, 0]
                df[col] = pd.to_numeric(target_col, errors='coerce').fillna(0)  # Fill final NaNs with 0
            except Exception as e:
                logger.warning(f"Error processing column {col}: {e}")

    # For country_code, forward fill within country
    if 'country_code' in df.columns:
        try:
            df['country_code'] = df.groupby('country', group_keys=False)['country_code'].apply(
                lambda x: x.ffill().bfill()
            )
        except Exception as e:
            logger.warning(f"Error handling country_code missing values: {e}")

    return df


def apply_data_quality_checks(df):
    """
    Apply data quality checks and clean anomalous values.
    """
    # Score columns should be between 0 and 100
    score_columns = [col for col in df.columns if col.endswith('_score')]
    for col in score_columns:
        if col in df.columns:
            # Cap scores at 0-100 range
            df[col] = df[col].clip(lower=0, upper=100)

    # Year should be integer
    if 'year' in df.columns:
        df['year'] = df['year'].astype(int)

    return df


def add_data_metadata(df, file_path):
    """
    Add metadata columns to track data source and quality.
    """
    # Add data source information
    df['data_source'] = os.path.basename(file_path) if file_path else 'sample_data'
    df['last_updated'] = datetime.now().strftime('%Y-%m-%d')
    df['is_sample_data'] = file_path is None

    return df


def create_sample_data():
    """
    Create sample data for demonstration when real data is not available.
    This data is clearly marked as sample data.
    """
    countries = [
        "United States", "China", "Japan", "Germany", "United Kingdom",
        "France", "Taiwan", "South Korea", "Canada", "Australia",
        "Italy", "Spain", "Netherlands", "Sweden", "Norway",
    ]

    data = []
    for country in countries:
        for year in range(2000, 2024):
            # Create realistic but synthetic data
            base_score = 65 + (year - 2000) * 0.8

            # Add some country-specific variations
            country_factor = 1.0
            if country in ["Norway", "Sweden", "Denmark"]:
                country_factor = 1.15  # Nordic countries perform better
            elif country in ["Taiwan", "South Korea"]:
                country_factor = 1.1  # East Asian countries perform well
            elif country in ["China"]:
                country_factor = 0.9  # Some variation

            final_score = min(100, base_score * country_factor)

            row = {
                "country": country,
                "country_code": f"{country[:3].upper()}",  # Simple country code
                "year": year,
                "sdg_index_score": final_score,
                "data_source": "sample_data",
                "last_updated": datetime.now().strftime('%Y-%m-%d'),
                "is_sample_data": True,
            }

            # Generate individual goal scores
            for i in range(1, 18):
                goal_score = min(100, 50 + (year - 2000) * 1.2 + (i * 1.1) + np.random.normal(0, 5))
                row[f"goal_{i}_score"] = max(0, goal_score)

            data.append(row)

    sample_df = pd.DataFrame(data)

    # Add warning in the dataframe name
    sample_df.attrs['warning'] = (
        "⚠️ SAMPLE DATA - This is demonstration data only and should not be used "
        "for actual analysis or decision-making."
    )

    return sample_df


def get_country_list(df):
    """Get sorted list of countries with normalization."""
    if df is None or df.empty:
        return []
    try:
        # Return only countries present in the latest year to avoid listing countries without recent data
        latest_df = get_latest_data(df)
        if latest_df is not None and not latest_df.empty:
            return sorted(latest_df['country'].dropna().unique().tolist())
    except Exception:
        # Fallback to all countries if something unexpected happens
        pass
    return sorted(df['country'].dropna().unique().tolist())


def filter_data(df, country, year_range):
    """Filter data by country and year range (inclusive), sorted by year."""
    if df is None or df.empty:
        return pd.DataFrame()
    mask = (df['country'] == country) & (df['year'] >= year_range[0]) & (df['year'] <= year_range[1])
    return df[mask].sort_values('year')


def get_latest_data(df):
    """Returns the latest year's data for all countries."""
    if df is None or df.empty:
        return pd.DataFrame()
    latest_year = df['year'].max()
    return df[df['year'] == latest_year]


def get_data_summary(df):
    """Get comprehensive data summary including quality metrics."""
    if df is None or df.empty:
        return {
            'total_countries': 0,
            'total_records': 0,
            'year_range': None,
            'data_quality_score': 0,
            'has_sample_data': False,
            'warnings': ['No data available'],
        }

    summary = {
        'total_countries': df['country'].nunique(),
        'total_records': len(df),
        'year_range': (df['year'].min(), df['year'].max()),
        'data_quality_score': calculate_data_quality_score(df),
        'has_sample_data': df['is_sample_data'].any() if 'is_sample_data' in df.columns else False,
        'warnings': [],
    }

    # Add warnings for sample data
    if summary['has_sample_data']:
        summary['warnings'].append('⚠️ This analysis uses sample data for demonstration purposes')

    # Check for data quality issues
    missing_pct = df.isnull().sum().sum() / (len(df) * len(df.columns)) * 100
    if missing_pct > 5:
        summary['warnings'].append(f'High missing data percentage: {missing_pct:.1f}%')

    return summary


def calculate_data_quality_score(df):
    """Calculate a data quality score (0-100)."""
    if df is None or df.empty:
        return 0

    score = 100.0  # Ensure float

    # Deduct for missing values
    missing_pct = float(df.isnull().sum().sum() / (len(df) * len(df.columns)) * 100)
    score -= missing_pct * 2  # 2 points per percent missing

    # Deduct for unusual year ranges
    if 'year' in df.columns:
        year_range = int(df['year'].max() - df['year'].min())
        if year_range > 30:  # More than 30 years seems unusual
            score -= 10

    # Skip outlier detection for now due to duplicate column issues
    # TODO: Implement robust outlier detection that handles duplicate columns
    score_columns = [col for col in df.columns if col.endswith('_score')]
    if score_columns:
        logger.debug(f"Found {len(score_columns)} score columns, skipping outlier detection")

    # Ensure score is a scalar value
    if hasattr(score, 'iloc'):  # If it's a pandas Series/DataFrame
        score = float(score.iloc[0] if len(score) > 0 else 0)

    return max(0.0, min(100.0, float(score)))


def get_country_metrics(df, country, year):
    """
    Get comprehensive metrics for a specific country in a specific year.

    Returns None when the country/year combination is absent; otherwise a dict
    with score, rank, global average, 5-year trend, and per-goal scores.
    """
    if df is None or df.empty:
        return None

    country_data = df[(df['country'] == country) & (df['year'] == year)]
    if country_data.empty:
        return None

    # Calculate global average for that year
    year_data = df[df['year'] == year]
    if year_data.empty:
        return None

    global_avg = year_data['sdg_index_score'].mean()

    # Get rank
    year_data_with_rank = year_data.copy()
    year_data_with_rank['rank'] = year_data_with_rank['sdg_index_score'].rank(ascending=False, method='min')
    country_rank = year_data_with_rank[year_data_with_rank['country'] == country]['rank'].values[0]

    # Calculate trend (improvement over last 5 years)
    trend_data = df[df['country'] == country]
    recent_years = trend_data[trend_data['year'] >= (year - 5)]
    if len(recent_years) > 1:
        recent_avg = recent_years['sdg_index_score'].mean()
        # NOTE: compares the recent average against the country's earliest
        # record overall, not the record from exactly 5 years ago
        trend = recent_avg - trend_data['sdg_index_score'].iloc[0]
    else:
        trend = 0

    # Goal performance breakdown
    goal_scores = {}
    for i in range(1, 18):
        goal_col = f'goal_{i}_score'
        if goal_col in country_data.columns:
            goal_scores[f'goal_{i}'] = country_data[goal_col].values[0]

    metrics = {
        'score': country_data['sdg_index_score'].values[0],
        'rank': int(country_rank),
        'global_avg': global_avg,
        'country_count': len(year_data),
        'trend_5yr': trend,
        'goal_scores': goal_scores,
        'data_quality': 'high' if (
            'is_sample_data' in country_data.columns and not country_data['is_sample_data'].any()
        ) else 'sample',
        'last_updated': country_data['last_updated'].values[0] if (
            'last_updated' in country_data.columns and len(country_data) > 0
        ) else 'N/A',
    }

    return metrics