Spaces:

tbqguy
/

eda-gapminder

Sleeping

File size: 30,623 Bytes

da952bf

import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Configure the page title and switch to the wide layout.
st.set_page_config(layout="wide", page_title="Gapminder EDA v2")

# Font overrides injected as raw HTML: body-level widgets use a Helvetica
# stack, while h1-h6 are pinned to Source Sans Pro.
_FONT_OVERRIDE_CSS = """
<style>
    /* Body text, paragraphs, and general content */
    .stMarkdown p, .stMarkdown li, .stMarkdown span {
        font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif !important;
    }

    /* Dataframe text */
    .stDataFrame, .stDataFrame * {
        font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif !important;
    }

    /* Metric values */
    .stMetric {
        font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif !important;
    }

    /* Select boxes and inputs */
    .stSelectbox, .stMultiSelect {
        font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif !important;
    }

    /* Tab labels */
    .stTabs [data-baseweb="tab"] {
        font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif !important;
    }

    /* Info boxes */
    .stAlert {
        font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif !important;
    }

    /* Caption text */
    .stCaption {
        font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif !important;
    }

    /* Keep headers in Source Sans Pro (default Streamlit font) */
    h1, h2, h3, h4, h5, h6 {
        font-family: 'Source Sans Pro', sans-serif !important;
    }
</style>
"""
st.markdown(_FONT_OVERRIDE_CSS, unsafe_allow_html=True)

# Main heading of the app
st.title("🌍 Gapminder Dataset - Exploratory Data Analysis v2")

# Load data
@st.cache_data
def load_data():
    """Return the Gapminder table as a DataFrame.

    Sources are tried in order: ``gapminder.tsv`` next to the app (the
    HuggingFace Spaces layout), then ``data/gapminder.tsv`` (local
    development), and finally the copy in the jennybc/gapminder GitHub
    repository. Only a ``FileNotFoundError`` moves on to the next source;
    any other read error propagates to the caller.
    """
    remote_url = "https://raw.githubusercontent.com/jennybc/gapminder/master/inst/extdata/gapminder.tsv"

    for candidate in ('gapminder.tsv', 'data/gapminder.tsv'):
        try:
            return pd.read_csv(candidate, sep='\t')
        except FileNotFoundError:
            continue

    # Neither local copy exists: download the file instead.
    return pd.read_csv(remote_url, sep='\t')

df = load_data()

# Headline figures for the loaded table, one metric card per column.
st.header("Dataset Overview")
row_count, column_count = df.shape
overview_cards = (
    ("Total Rows", row_count),
    ("Total Columns", column_count),
    ("Countries", df['country'].nunique()),
)
for card_column, (card_label, card_value) in zip(st.columns(3), overview_cards):
    with card_column:
        st.metric(card_label, card_value)

# Preview of the first ten rows.
st.subheader("Sample Data")
st.dataframe(df.head(10))

# Descriptive Statistics Section
st.header("📊 Descriptive Statistics")

# Numeric columns analysed by every statistics section below.
numerical_cols = ['lifeExp', 'pop', 'gdpPercap']

# One tab per statistic, in display order.
stat_tab_titles = [
    "Mean",
    "Median",
    "Mode",
    "Standard Deviation",
    "Range & IQR",
    "Coefficient of Variation",
    "Percentiles",
]
tab1, tab2, tab3, tab4, tab5, tab6, tab7 = st.tabs(stat_tab_titles)

with tab1:
    st.subheader("Mean (Average)")
    st.write("The mean represents the average value of the data.")

    # One row per numeric column, holding that column's arithmetic mean.
    mean_values = [df[name].mean() for name in numerical_cols]
    mean_df = pd.DataFrame({'Metric': numerical_cols, 'Mean Value': mean_values})

    # Table on the left (1/3 width), bar chart on the right (2/3 width).
    table_col, chart_col = st.columns([1, 2])
    with table_col:
        st.dataframe(mean_df, hide_index=True)
    with chart_col:
        mean_chart = px.bar(
            mean_df,
            x='Metric',
            y='Mean Value',
            color='Metric',
            title='Mean Values by Metric',
        )
        st.plotly_chart(mean_chart, use_container_width=True)

with tab2:
    st.subheader("Median")
    st.write("The median represents the middle value when data is sorted.")

    # One row per numeric column, holding that column's median.
    median_values = [df[name].median() for name in numerical_cols]
    median_df = pd.DataFrame({'Metric': numerical_cols, 'Median Value': median_values})

    # Table on the left (1/3 width), bar chart on the right (2/3 width).
    table_col, chart_col = st.columns([1, 2])
    with table_col:
        st.dataframe(median_df, hide_index=True)
    with chart_col:
        median_chart = px.bar(
            median_df,
            x='Metric',
            y='Median Value',
            color='Metric',
            title='Median Values by Metric',
        )
        st.plotly_chart(median_chart, use_container_width=True)

with tab3:
    st.subheader("Mode")
    st.write("The mode represents the most frequently occurring value(s).")

    def _mode_row(name):
        """Build the table row for one column, or None if it has no mode."""
        modes = df[name].mode()
        if len(modes) == 0:
            return None
        # Several values can tie; only the first one reported is shown.
        first_mode = modes[0]
        return {
            'Metric': name,
            'Mode Value': first_mode,
            'Frequency': (df[name] == first_mode).sum(),
        }

    candidate_rows = (_mode_row(name) for name in numerical_cols)
    mode_results = [row for row in candidate_rows if row is not None]

    mode_df = pd.DataFrame(mode_results)
    st.dataframe(mode_df, hide_index=True)

    st.info("Note: For continuous data like life expectancy and GDP, mode may not be very meaningful as values rarely repeat exactly.")

with tab4:
    st.subheader("Standard Deviation")
    st.write("Standard deviation measures the amount of variation or dispersion in the data.")

    # One row per numeric column with its standard deviation and variance.
    std_df = pd.DataFrame([
        {
            'Metric': name,
            'Standard Deviation': df[name].std(),
            'Variance': df[name].var(),
        }
        for name in numerical_cols
    ])

    # Table on the left (1/3 width), bar chart on the right (2/3 width).
    table_col, chart_col = st.columns([1, 2])
    with table_col:
        st.dataframe(std_df, hide_index=True)
    with chart_col:
        std_chart = px.bar(
            std_df,
            x='Metric',
            y='Standard Deviation',
            color='Metric',
            title='Standard Deviation by Metric',
        )
        st.plotly_chart(std_chart, use_container_width=True)

with tab5:
    st.subheader("Range & Interquartile Range (IQR)")
    st.write("Range shows the total spread (Max - Min), while IQR shows the spread of the middle 50% of data.")

    def _spread_row(name):
        """Collect min, quartiles, max, range and IQR for one column."""
        values = df[name]
        lowest, highest = values.min(), values.max()
        first_quartile = values.quantile(0.25)
        third_quartile = values.quantile(0.75)
        return {
            'Metric': name,
            'Min': lowest,
            'Q1 (25%)': first_quartile,
            'Q3 (75%)': third_quartile,
            'Max': highest,
            'Range': highest - lowest,
            'IQR': third_quartile - first_quartile,
        }

    range_df = pd.DataFrame([_spread_row(name) for name in numerical_cols])

    st.dataframe(range_df, hide_index=True)

    # Two side-by-side bar charts: total range, then interquartile range.
    spread_charts = (
        ('Range', 'Range by Metric'),
        ('IQR', 'Interquartile Range (IQR) by Metric'),
    )
    for chart_col, (y_field, chart_title) in zip(st.columns(2), spread_charts):
        with chart_col:
            fig = px.bar(range_df, x='Metric', y=y_field,
                         title=chart_title,
                         color='Metric')
            fig.update_layout(transition={'duration': 500})
            st.plotly_chart(fig, use_container_width=True)

    st.info("💡 IQR is more robust to outliers than Range. A larger IQR indicates more variability in the middle 50% of the data.")

with tab6:
    st.subheader("Coefficient of Variation (CV)")
    st.write("CV = (Standard Deviation / Mean) × 100. It allows comparison of variability across different scales.")

    def _cv_row(name):
        """Return mean, standard deviation and CV (in percent) for one column."""
        average = df[name].mean()
        deviation = df[name].std()
        return {
            'Metric': name,
            'Mean': average,
            'Std Dev': deviation,
            'CV (%)': (deviation / average) * 100,
        }

    cv_df = pd.DataFrame([_cv_row(name) for name in numerical_cols])

    table_col, chart_col = st.columns([1, 2])

    with table_col:
        st.dataframe(cv_df, hide_index=True)

        # Rule-of-thumb bands; the chart draws lines at the same 15 and 30 values.
        for guidance in (
            "**Interpretation:**",
            "- CV < 15%: Low variability",
            "- CV 15-30%: Moderate variability",
            "- CV > 30%: High variability",
        ):
            st.markdown(guidance)

    with chart_col:
        fig = px.bar(cv_df, x='Metric', y='CV (%)',
                     title='Coefficient of Variation by Metric',
                     color='Metric')
        fig.update_layout(
            yaxis_title="CV (%)",
            transition={'duration': 500}
        )

        # Dashed reference lines marking the interpretation thresholds.
        thresholds = (
            (15, "green", "Low variability threshold"),
            (30, "orange", "High variability threshold"),
        )
        for level, line_color, caption in thresholds:
            fig.add_hline(y=level, line_dash="dash", line_color=line_color,
                          annotation_text=caption)

        st.plotly_chart(fig, use_container_width=True)

    st.info("💡 Higher CV indicates greater relative variability. This is especially useful when comparing metrics with different units or scales.")

with tab7:
    st.subheader("Percentiles Analysis")
    st.write("Percentiles divide the data into 100 equal parts. Key percentiles help understand data distribution.")

    percentiles = [0, 10, 25, 50, 75, 90, 100]

    # The two endpoints are labelled Min/Max; every other entry is "<p>th".
    endpoint_names = {0: 'Min', 100: 'Max'}
    percentile_data = {
        'Percentile': [endpoint_names.get(p, f'{p}th') for p in percentiles]
    }
    percentile_data.update({
        name: [df[name].quantile(p/100) for p in percentiles]
        for name in numerical_cols
    })

    percentile_df = pd.DataFrame(percentile_data)

    st.dataframe(percentile_df, hide_index=True)

    # Visualizations
    st.subheader("Percentile Visualizations")

    metric_titles = {
        'lifeExp': 'Life Expectancy',
        'pop': 'Population',
        'gdpPercap': 'GDP per Capita'
    }
    selected_percentile_metric = st.selectbox(
        "Select metric for percentile visualization",
        options=numerical_cols,
        format_func=lambda x: metric_titles[x],
        key='percentile_metric'
    )
    chosen_series = df[selected_percentile_metric]

    line_col, box_col = st.columns(2)

    with line_col:
        # Line chart of the chosen metric's value at each listed percentile.
        perc_viz_data = pd.DataFrame({
            'Percentile': percentiles,
            'Value': [chosen_series.quantile(p/100) for p in percentiles]
        })

        fig = px.line(perc_viz_data, x='Percentile', y='Value',
                      title=f'Percentile Progression - {selected_percentile_metric}',
                      markers=True)
        fig.update_traces(line=dict(width=3), marker=dict(size=10))
        fig.update_layout(transition={'duration': 600})
        st.plotly_chart(fig, use_container_width=True)

    with box_col:
        # Box plot of the full column with dotted lines at selected percentiles.
        fig = px.box(df, y=selected_percentile_metric,
                     title=f'Box Plot with Percentiles - {selected_percentile_metric}')

        for p in (10, 25, 50, 75, 90):
            fig.add_hline(y=chosen_series.quantile(p/100),
                          line_dash="dot", line_color="red",
                          annotation_text=f"{p}th percentile",
                          annotation_position="right")

        fig.update_layout(transition={'duration': 600})
        st.plotly_chart(fig, use_container_width=True)

    st.info("💡 The 50th percentile is the median. The difference between 75th and 25th percentiles is the IQR.")

# Comprehensive Statistics Table
st.header("📈 Complete Statistical Summary")


def _first_mode(values):
    """Return the first mode of ``values``, or NaN when there is none."""
    modes = values.mode()
    return modes[0] if len(modes) > 0 else np.nan


# describe() output transposed so each numeric column becomes a row,
# extended with a mode column.
summary_stats = df[numerical_cols].describe().T
summary_stats['mode'] = [_first_mode(df[name]) for name in numerical_cols]

# Select the columns to show (in display order) and give them readable titles.
summary_titles = {
    'count': 'Count',
    'mean': 'Mean',
    'mode': 'Mode',
    'std': 'Std Dev',
    '50%': 'Median',
    'min': 'Min',
    'max': 'Max',
}
summary_stats = summary_stats[list(summary_titles)].rename(columns=summary_titles)
st.dataframe(summary_stats)

# Add Min/Max country details
st.subheader("🌟 Extreme Values - Countries")
st.write("Discover which countries hold the minimum and maximum values for each metric")

extreme_titles = {
    'lifeExp': 'Life Expectancy',
    'pop': 'Population',
    'gdpPercap': 'GDP per Capita'
}

# One panel per numeric column: its title, then the minimum and maximum value,
# each followed by the country, continent and year of the first matching row.
for panel, name in zip(st.columns(3), numerical_cols):
    with panel:
        st.markdown(f"**{extreme_titles[name]}**")

        extremes = (
            ("📉 Minimum", df[name].min()),
            ("📈 Maximum", df[name].max()),
        )
        for heading, extreme_value in extremes:
            holder_row = df.loc[df[name] == extreme_value].iloc[0]
            st.markdown(f"**{heading}:** {extreme_value:,.2f}")
            st.caption(f"📍 {holder_row['country']} ({holder_row['continent']}) in {int(holder_row['year'])}")

        st.markdown("---")

# Correlation Analysis
st.header("🔗 Correlation Analysis")
st.write("Correlation measures the strength and direction of relationships between variables (-1 to +1).")

matrix_col, heatmap_col = st.columns([1, 1])

with matrix_col:
    # Pairwise correlations between the numeric columns; the insights section
    # further down reads individual cells from this matrix.
    correlation_matrix = df[numerical_cols].corr()

    st.subheader("Correlation Matrix")
    st.dataframe(correlation_matrix.round(3))

    # Reading guide, rendered as one markdown element per line.
    for guidance in (
        "**Interpretation:**",
        "- **+1**: Perfect positive correlation",
        "- **0**: No correlation",
        "- **-1**: Perfect negative correlation",
        "- **|r| > 0.7**: Strong correlation",
        "- **|r| 0.3-0.7**: Moderate correlation",
        "- **|r| < 0.3**: Weak correlation",
    ):
        st.markdown(guidance)

with heatmap_col:
    # Same matrix as a heatmap on a red-yellow-green scale centred on zero.
    heatmap_options = dict(
        text_auto='.2f',
        aspect='auto',
        color_continuous_scale='RdYlGn',
        color_continuous_midpoint=0,
        title='Correlation Heatmap',
        labels=dict(color="Correlation"),
    )
    fig = px.imshow(correlation_matrix, **heatmap_options)

    # Blank axis titles.
    fig.update_layout(height=400, xaxis_title="", yaxis_title="")

    st.plotly_chart(fig, use_container_width=True)

# Detailed correlation insights
st.subheader("🔍 Key Correlation Insights")


def _correlation_strength(r):
    """Classify a correlation coefficient for display.

    Uses the same bands as the interpretation guide: |r| > 0.7 is strong,
    |r| > 0.3 is moderate, anything else (including NaN) is weak. Strong
    values also report their sign, so a strongly negative coefficient is
    never labelled "positive".
    """
    magnitude = abs(r)
    if magnitude > 0.7:
        return "Strong positive" if r > 0 else "Strong negative"
    if magnitude > 0.3:
        return "Moderate"
    return "Weak"


# (metric title, matrix row, matrix column, caption) for each variable pair.
insight_specs = (
    ("Life Expectancy ↔ GDP per Capita", 'lifeExp', 'gdpPercap',
     "Higher GDP tends to correlate with longer life expectancy"),
    ("Life Expectancy ↔ Population", 'lifeExp', 'pop',
     "Relationship between population size and life expectancy"),
    ("GDP per Capita ↔ Population", 'gdpPercap', 'pop',
     "Relationship between wealth and population size"),
)

for insight_col, (pair_title, row_key, col_key, note) in zip(st.columns(3), insight_specs):
    coefficient = correlation_matrix.loc[row_key, col_key]
    with insight_col:
        # All three cards share one classifier so their labels stay consistent.
        st.metric(
            pair_title,
            f"{coefficient:.3f}",
            delta=_correlation_strength(coefficient)
        )
        st.caption(note)

st.info("💡 Correlation does not imply causation! High correlation indicates variables move together, but one doesn't necessarily cause the other.")

# Filter by Year or Country
st.header("🔎 Filter & Analyze")
year_col, continent_col = st.columns(2)

with year_col:
    year_options = sorted(df['year'].unique())
    selected_year = st.selectbox("Select Year", year_options)
with continent_col:
    # Every continent is offered and pre-selected.
    continent_options = df['continent'].unique()
    selected_continent = st.multiselect("Select Continent(s)",
                                        continent_options,
                                        default=continent_options)

# Rows for the chosen year that belong to one of the chosen continents.
year_matches = df['year'] == selected_year
continent_matches = df['continent'].isin(selected_continent)
filtered_df = df[year_matches & continent_matches]

if not filtered_df.empty:
    st.subheader(f"Statistics for {selected_year} - {', '.join(selected_continent)}")

    # (title, column, number format) per metric. One format per metric is
    # applied to mean, median and standard deviation alike, so all three
    # GDP figures carry the thousands separator.
    summary_specs = (
        ("Life Expectancy", 'lifeExp', "{:.2f}"),
        ("Population", 'pop', "{:,.0f}"),
        ("GDP per Capita", 'gdpPercap', "${:,.2f}"),
    )

    # One column per metric, each stacking the three statistics.
    for summary_col, (metric_title, column_name, number_format) in zip(st.columns(3), summary_specs):
        column_values = filtered_df[column_name]
        with summary_col:
            st.metric(f"{metric_title} (Mean)", number_format.format(column_values.mean()))
            st.metric(f"{metric_title} (Median)", number_format.format(column_values.median()))
            st.metric(f"{metric_title} (Std Dev)", number_format.format(column_values.std()))

    # NEW: Add visualizations for filtered data
    st.subheader("📊 Visual Analysis")

    # Create tabs for different visualizations
    viz_tab1, viz_tab2, viz_tab3, viz_tab4, viz_tab5 = st.tabs([
        "Comparative Bar Charts",
        "Scatter Analysis",
        "Geographic Distribution",
        "Statistical Distribution",
        "Animated Timeline"
    ])

    with viz_tab1:
        st.write("Compare Mean, Median, and Standard Deviation across metrics")

        left_panel, right_panel = st.columns(2)

        with left_panel:
            # Long-format table for the grouped bars: all means first, then
            # all medians, each in numerical_cols order.
            filtered_means = [filtered_df[name].mean() for name in ('lifeExp', 'pop', 'gdpPercap')]
            filtered_medians = [filtered_df[name].median() for name in ('lifeExp', 'pop', 'gdpPercap')]
            comparison_data = pd.DataFrame({
                'Metric': numerical_cols + numerical_cols,
                'Statistic': ['Mean', 'Mean', 'Mean', 'Median', 'Median', 'Median'],
                'Value': filtered_means + filtered_medians,
            })

            fig = px.bar(comparison_data, x='Metric', y='Value',
                         color='Statistic', barmode='group',
                         title='Mean vs Median Comparison',
                         labels={'Value': 'Value'})
            fig.update_layout(transition={'duration': 500})
            st.plotly_chart(fig, use_container_width=True)

        with right_panel:
            # Standard deviation of each metric within the filtered rows.
            filtered_spread = pd.DataFrame({
                'Metric': numerical_cols,
                'Std Dev': [filtered_df[name].std() for name in ('lifeExp', 'pop', 'gdpPercap')],
            })

            fig = px.bar(filtered_spread, x='Metric', y='Std Dev',
                         title='Standard Deviation by Metric',
                         color='Metric')
            fig.update_layout(transition={'duration': 500})
            st.plotly_chart(fig, use_container_width=True)

    with viz_tab2:
        st.write("Explore relationships between different metrics")

        # The two bubble charts share the y axis (life expectancy), colouring
        # and log-scaled x axis; only the x field, bubble size, title and
        # axis labels differ.
        scatter_specs = (
            dict(
                x='gdpPercap',
                size='pop',
                title=f'GDP per Capita vs Life Expectancy ({selected_year})',
                labels={'gdpPercap': 'GDP per Capita',
                        'lifeExp': 'Life Expectancy'},
            ),
            dict(
                x='pop',
                size='gdpPercap',
                title=f'Population vs Life Expectancy ({selected_year})',
                labels={'pop': 'Population',
                        'lifeExp': 'Life Expectancy'},
            ),
        )

        for scatter_col, spec in zip(st.columns(2), scatter_specs):
            with scatter_col:
                fig = px.scatter(filtered_df, y='lifeExp',
                                 color='continent',
                                 hover_name='country',
                                 log_x=True,
                                 **spec)
                # Thin dark outline on each marker.
                fig.update_traces(marker=dict(line=dict(width=1, color='DarkSlateGrey')))
                fig.update_layout(transition={'duration': 800, 'easing': 'cubic-in-out'})
                st.plotly_chart(fig, use_container_width=True)

    with viz_tab3:
        st.write("Geographic distribution of metrics across countries")

        # Readable names for the raw column identifiers. Used for the select
        # box and for every title/label below, because str.title() on the
        # camelCase ids would render as "Lifeexp" / "Gdppercap".
        metric_display_names = {
            'lifeExp': 'Life Expectancy',
            'pop': 'Population',
            'gdpPercap': 'GDP per Capita'
        }

        # Bar chart by country
        metric_choice = st.selectbox(
            "Select metric to visualize by country",
            options=['lifeExp', 'pop', 'gdpPercap'],
            format_func=lambda x: metric_display_names[x]
        )
        metric_label = metric_display_names[metric_choice]

        # Highest value first, so head() is the top and tail() the bottom.
        sorted_df = filtered_df.sort_values(metric_choice, ascending=False)

        fig = px.bar(sorted_df, x='country', y=metric_choice,
                     color='continent',
                     title=f'{metric_label} by Country ({selected_year})',
                     labels={metric_choice: metric_label})
        fig.update_layout(
            xaxis_tickangle=-45,
            height=500,
            transition={'duration': 500, 'easing': 'cubic-in-out'}
        )
        st.plotly_chart(fig, use_container_width=True)

        # Top 10 and Bottom 10
        ranking_columns = ['country', metric_choice, 'continent']
        col1, col2 = st.columns(2)

        with col1:
            st.write(f"🏆 Top 10 Countries - {metric_label}")
            top_10 = sorted_df[ranking_columns].head(10)
            st.dataframe(top_10, hide_index=True)

        with col2:
            st.write(f"⚠️ Bottom 10 Countries - {metric_label}")
            bottom_10 = sorted_df[ranking_columns].tail(10)
            st.dataframe(bottom_10, hide_index=True)

    with viz_tab4:
        st.write("Distribution patterns and box plots for filtered data")

        # Readable names for the raw column identifiers. Used for the select
        # box and for the chart titles/labels, because str.title() on the
        # camelCase ids would render as "Lifeexp" / "Gdppercap".
        metric_display_names = {
            'lifeExp': 'Life Expectancy',
            'pop': 'Population',
            'gdpPercap': 'GDP per Capita'
        }

        # Box plots by continent
        selected_metric_dist = st.selectbox(
            "Select metric for distribution analysis",
            options=['lifeExp', 'pop', 'gdpPercap'],
            format_func=lambda x: metric_display_names[x],
            key='dist_metric'
        )
        dist_label = metric_display_names[selected_metric_dist]

        col1, col2 = st.columns(2)

        with col1:
            # Box plot of the chosen metric, one box per continent.
            fig = px.box(filtered_df, x='continent', y=selected_metric_dist,
                         color='continent',
                         title=f'{dist_label} Distribution by Continent',
                         labels={selected_metric_dist: dist_label})
            fig.update_layout(transition={'duration': 600})
            st.plotly_chart(fig, use_container_width=True)

        with col2:
            # Violin plot of the same data with an embedded box (box=True).
            fig = px.violin(filtered_df, x='continent', y=selected_metric_dist,
                            color='continent', box=True,
                            title=f'{dist_label} Violin Plot',
                            labels={selected_metric_dist: dist_label})
            fig.update_layout(transition={'duration': 600})
            st.plotly_chart(fig, use_container_width=True)

    with viz_tab5:
        st.write("🎬 Watch how metrics evolve over time with animated visualizations")

        # Readable names for the raw column identifiers. Used for the select
        # boxes and for chart titles/labels, because str.title() on the
        # camelCase ids would render as "Lifeexp" / "Gdppercap".
        metric_display_names = {
            'lifeExp': 'Life Expectancy',
            'pop': 'Population',
            'gdpPercap': 'GDP per Capita'
        }

        def _set_play_speed(figure, frame_ms, transition_ms):
            """Set the frame and transition durations of the first button.

            Does nothing when the figure has no update menus, instead of
            raising IndexError on ``updatemenus[0]``.
            """
            if not figure.layout.updatemenus:
                return
            play_args = figure.layout.updatemenus[0].buttons[0].args[1]
            play_args['frame']['duration'] = frame_ms
            play_args['transition']['duration'] = transition_ms

        # Animation type selector
        anim_type = st.radio(
            "Select Animation Type",
            options=["Scatter Plot Timeline", "Bar Chart Race", "Line Chart Evolution"],
            horizontal=True
        )

        # The animations span every year, restricted to the chosen continents.
        in_selected_continents = df['continent'].isin(selected_continent)

        if anim_type == "Scatter Plot Timeline":
            # Animated scatter plot over all years
            st.subheader("GDP vs Life Expectancy Over Time")

            anim_df = df[in_selected_continents]

            # Fixed axis ranges are passed so the axes do not change per frame.
            fig = px.scatter(anim_df,
                             x='gdpPercap',
                             y='lifeExp',
                             animation_frame='year',
                             animation_group='country',
                             size='pop',
                             color='continent',
                             hover_name='country',
                             log_x=True,
                             size_max=60,
                             range_x=[100, 120000],
                             range_y=[20, 90],
                             title='GDP per Capita vs Life Expectancy (Animated)',
                             labels={'gdpPercap': 'GDP per Capita',
                                     'lifeExp': 'Life Expectancy',
                                     'pop': 'Population'})

            fig.update_layout(
                height=600,
                transition={'duration': 800}
            )

            _set_play_speed(fig, frame_ms=800, transition_ms=500)

            st.plotly_chart(fig, use_container_width=True)
            st.info("Click the Play button to watch the animation! Each frame represents a year from 1952 to 2007.")

        elif anim_type == "Bar Chart Race":
            # Animated bar chart race
            st.subheader("Top Countries Bar Chart Race")

            metric_race = st.selectbox(
                "Select metric for bar chart race",
                options=['lifeExp', 'pop', 'gdpPercap'],
                format_func=lambda x: metric_display_names[x],
                key='race_metric'
            )
            race_label = metric_display_names[metric_race]

            # Rank within the selected continents. Ranking across all
            # continents and filtering afterwards can leave no rows at all
            # (e.g. only Oceania selected, ranked by population).
            latest_year_df = df[(df['year'] == df['year'].max()) & in_selected_continents]
            top_countries = latest_year_df.nlargest(15, metric_race)['country'].tolist()

            # Every year of data for the countries that made the ranking.
            race_df = df[df['country'].isin(top_countries) & in_selected_continents]

            if race_df.empty:
                st.warning("No data available for the selected continents.")
            else:
                fig = px.bar(race_df,
                             x=metric_race,
                             y='country',
                             color='continent',
                             animation_frame='year',
                             orientation='h',
                             title=f'Top Countries by {race_label} Over Time',
                             labels={metric_race: race_label},
                             range_x=[0, race_df[metric_race].max() * 1.1])

                fig.update_layout(
                    height=600,
                    yaxis={'categoryorder': 'total ascending'}
                )

                _set_play_speed(fig, frame_ms=1000, transition_ms=500)

                st.plotly_chart(fig, use_container_width=True)
                st.info("Watch the ranking change over time! Countries rise and fall based on their performance.")

        else:  # Line Chart Evolution
            st.subheader("Metric Evolution by Continent")

            metric_line = st.selectbox(
                "Select metric for time evolution",
                options=['lifeExp', 'pop', 'gdpPercap'],
                format_func=lambda x: metric_display_names[x],
                key='line_metric'
            )
            line_label = metric_display_names[metric_line]

            # Calculate mean by continent and year
            line_df = (
                df[in_selected_continents]
                .groupby(['year', 'continent'])[metric_line]
                .mean()
                .reset_index()
            )

            fig = px.line(line_df,
                          x='year',
                          y=metric_line,
                          color='continent',
                          markers=True,
                          title=f'Average {line_label} Evolution by Continent',
                          labels={metric_line: f'Average {line_label}',
                                  'year': 'Year'})

            fig.update_traces(
                mode='lines+markers',
                line=dict(width=3),
                marker=dict(size=8)
            )

            fig.update_layout(
                height=500,
                hovermode='x unified',
                transition={'duration': 600}
            )

            # Light grey grid lines on both axes.
            fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='LightGray')
            fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='LightGray')

            st.plotly_chart(fig, use_container_width=True)
            st.info("Hover over the chart to see detailed values for each year and continent.")

# Distribution Visualizations
st.header("📉 Overall Data Distributions")

# Readable names for the raw column identifiers. Used for the select box and
# for the chart titles/labels, because str.title() on the camelCase ids would
# render as "Lifeexp" / "Gdppercap".
distribution_display_names = {
    'lifeExp': 'Life Expectancy',
    'pop': 'Population',
    'gdpPercap': 'GDP per Capita'
}
selected_metric = st.selectbox("Select metric to visualize", numerical_cols,
                               format_func=lambda x: distribution_display_names[x])
selected_metric_label = distribution_display_names[selected_metric]

col1, col2 = st.columns(2)

with col1:
    # Histogram over the whole dataset with dashed mean and median markers.
    fig = px.histogram(df, x=selected_metric, nbins=50,
                       title=f'Distribution of {selected_metric_label}',
                       labels={selected_metric: selected_metric_label})
    fig.add_vline(x=df[selected_metric].mean(), line_dash="dash",
                  line_color="red", annotation_text="Mean")
    fig.add_vline(x=df[selected_metric].median(), line_dash="dash",
                  line_color="green", annotation_text="Median")
    fig.update_layout(transition={'duration': 700})
    st.plotly_chart(fig, use_container_width=True)

with col2:
    # Box plot of the same metric, one box per continent.
    fig = px.box(df, y=selected_metric, x='continent',
                 title=f'{selected_metric_label} by Continent',
                 labels={selected_metric: selected_metric_label})
    fig.update_layout(transition={'duration': 700})
    st.plotly_chart(fig, use_container_width=True)

# Footer: a divider, the data attribution, and the version note, each
# rendered as its own markdown element.
footer_lines = (
    "---",
    "Data source: [Gapminder](https://www.gapminder.org/)",
    "**Version 2** - Enhanced with interactive visualizations in Filter & Analyze section",
)
for footer_line in footer_lines:
    st.markdown(footer_line)