"""Streamlit app: exploratory data analysis of the Gapminder dataset (v2).

The script renders, top to bottom:

* a dataset overview and sample rows,
* descriptive statistics (one tab per statistic),
* a complete statistical summary and the countries holding the extremes,
* a correlation analysis of the numeric columns,
* a year / continent filter with comparative, scatter, per-country,
  distribution and animated visualizations,
* overall distributions of a selected metric.
"""

import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Display names for the numeric columns.  The column names are camelCase and
# contain no underscores, so deriving a label with
# ``name.replace("_", " ").title()`` yields "Lifeexp" / "Gdppercap"; every
# widget and chart label is taken from this single mapping instead.
METRIC_LABELS = {
    'lifeExp': 'Life Expectancy',
    'pop': 'Population',
    'gdpPercap': 'GDP per Capita',
}


def metric_label(column):
    """Return the display name for one of the numeric Gapminder columns."""
    return METRIC_LABELS[column]


def correlation_strength(r):
    """Classify a correlation by magnitude: "Strong" (> 0.7), "Moderate" (> 0.3) or "Weak"."""
    magnitude = abs(r)
    if magnitude > 0.7:
        return "Strong"
    if magnitude > 0.3:
        return "Moderate"
    return "Weak"


def show_table_with_bar(stats_df, value_col, title):
    """Render ``stats_df`` as a table next to a bar chart of ``value_col`` per metric."""
    table_col, chart_col = st.columns([1, 2])
    with table_col:
        st.dataframe(stats_df, hide_index=True)
    with chart_col:
        fig = px.bar(stats_df, x='Metric', y=value_col, title=title, color='Metric')
        st.plotly_chart(fig, use_container_width=True)


# Set page config
st.set_page_config(page_title="Gapminder EDA v2", layout="wide")

# Custom CSS to set Helvetica for body text while keeping headers in Source Sans Pro
# NOTE(review): the markup payload is whitespace-only in this copy of the file;
# confirm whether a <style> block was lost and needs restoring.
st.markdown(""" """, unsafe_allow_html=True)

# Title
st.title("🌍 Gapminder Dataset - Exploratory Data Analysis v2")


# Load data
@st.cache_data
def load_data():
    """Load the Gapminder dataset, trying local copies before downloading.

    Sources are tried in order: ``gapminder.tsv`` next to the app (the layout
    used on HuggingFace Spaces), ``data/gapminder.tsv`` (local development),
    and finally the raw file on GitHub.

    Returns:
        The dataset as a ``pandas.DataFrame`` parsed from tab-separated text.
    """
    local_path = 'gapminder.tsv'
    try:
        # Try loading from local file first
        df = pd.read_csv(local_path, sep='\t')
    except FileNotFoundError:
        # Fallback to data/ directory (for local development)
        try:
            df = pd.read_csv('data/gapminder.tsv', sep='\t')
        except FileNotFoundError:
            # Final fallback to downloading from GitHub
            url = "https://raw.githubusercontent.com/jennybc/gapminder/master/inst/extdata/gapminder.tsv"
            df = pd.read_csv(url, sep='\t')
    return df


df = load_data()

# ---------------------------------------------------------------------------
# Dataset overview
# ---------------------------------------------------------------------------
st.header("Dataset Overview")
col1, col2, col3 = st.columns(3)
with col1:
    st.metric("Total Rows", df.shape[0])
with col2:
    st.metric("Total Columns", df.shape[1])
with col3:
    st.metric("Countries", df['country'].nunique())

st.subheader("Sample Data")
st.dataframe(df.head(10))

# ---------------------------------------------------------------------------
# Descriptive statistics
# ---------------------------------------------------------------------------
st.header("📊 Descriptive Statistics")

# Select numerical columns
numerical_cols = ['lifeExp', 'pop', 'gdpPercap']

# Create tabs for each metric
tab1, tab2, tab3, tab4, tab5, tab6, tab7 = st.tabs([
    "Mean",
    "Median",
    "Mode",
    "Standard Deviation",
    "Range & IQR",
    "Coefficient of Variation",
    "Percentiles",
])

with tab1:
    st.subheader("Mean (Average)")
    st.write("The mean represents the average value of the data.")
    mean_df = pd.DataFrame({
        'Metric': numerical_cols,
        'Mean Value': [df[col].mean() for col in numerical_cols],
    })
    show_table_with_bar(mean_df, 'Mean Value', 'Mean Values by Metric')

with tab2:
    st.subheader("Median")
    st.write("The median represents the middle value when data is sorted.")
    median_df = pd.DataFrame({
        'Metric': numerical_cols,
        'Median Value': [df[col].median() for col in numerical_cols],
    })
    show_table_with_bar(median_df, 'Median Value', 'Median Values by Metric')

with tab3:
    st.subheader("Mode")
    st.write("The mode represents the most frequently occurring value(s).")
    mode_results = []
    for col in numerical_cols:
        mode_val = df[col].mode()
        # mode() is empty only when the column has no non-null values.
        if len(mode_val) > 0:
            mode_results.append({
                'Metric': col,
                'Mode Value': mode_val[0],
                'Frequency': (df[col] == mode_val[0]).sum(),
            })
    mode_df = pd.DataFrame(mode_results)
    st.dataframe(mode_df, hide_index=True)
    st.info("Note: For continuous data like life expectancy and GDP, mode may not be very meaningful as values rarely repeat exactly.")

with tab4:
    st.subheader("Standard Deviation")
    st.write("Standard deviation measures the amount of variation or dispersion in the data.")
    std_df = pd.DataFrame({
        'Metric': numerical_cols,
        'Standard Deviation': [df[col].std() for col in numerical_cols],
        'Variance': [df[col].var() for col in numerical_cols],
    })
    show_table_with_bar(std_df, 'Standard Deviation', 'Standard Deviation by Metric')

with tab5:
    st.subheader("Range & Interquartile Range (IQR)")
    st.write("Range shows the total spread (Max - Min), while IQR shows the spread of the middle 50% of data.")
    range_df = pd.DataFrame({
        'Metric': numerical_cols,
        'Min': [df[col].min() for col in numerical_cols],
        'Q1 (25%)': [df[col].quantile(0.25) for col in numerical_cols],
        'Q3 (75%)': [df[col].quantile(0.75) for col in numerical_cols],
        'Max': [df[col].max() for col in numerical_cols],
        'Range': [df[col].max() - df[col].min() for col in numerical_cols],
        'IQR': [df[col].quantile(0.75) - df[col].quantile(0.25) for col in numerical_cols],
    })
    st.dataframe(range_df, hide_index=True)

    # Visualizations
    col1, col2 = st.columns(2)
    with col1:
        fig = px.bar(range_df, x='Metric', y='Range',
                     title='Range by Metric', color='Metric')
        fig.update_layout(transition={'duration': 500})
        st.plotly_chart(fig, use_container_width=True)
    with col2:
        fig = px.bar(range_df, x='Metric', y='IQR',
                     title='Interquartile Range (IQR) by Metric', color='Metric')
        fig.update_layout(transition={'duration': 500})
        st.plotly_chart(fig, use_container_width=True)

    st.info("💡 IQR is more robust to outliers than Range. A larger IQR indicates more variability in the middle 50% of the data.")

with tab6:
    st.subheader("Coefficient of Variation (CV)")
    st.write("CV = (Standard Deviation / Mean) × 100. It allows comparison of variability across different scales.")
    cv_df = pd.DataFrame({
        'Metric': numerical_cols,
        'Mean': [df[col].mean() for col in numerical_cols],
        'Std Dev': [df[col].std() for col in numerical_cols],
        'CV (%)': [(df[col].std() / df[col].mean()) * 100 for col in numerical_cols],
    })

    col1, col2 = st.columns([1, 2])
    with col1:
        st.dataframe(cv_df, hide_index=True)
        st.markdown("**Interpretation:**")
        st.markdown("- CV < 15%: Low variability")
        st.markdown("- CV 15-30%: Moderate variability")
        st.markdown("- CV > 30%: High variability")
    with col2:
        fig = px.bar(cv_df, x='Metric', y='CV (%)',
                     title='Coefficient of Variation by Metric', color='Metric')
        fig.update_layout(
            yaxis_title="CV (%)",
            transition={'duration': 500},
        )
        # Add reference lines for interpretation
        fig.add_hline(y=15, line_dash="dash", line_color="green",
                      annotation_text="Low variability threshold")
        fig.add_hline(y=30, line_dash="dash", line_color="orange",
                      annotation_text="High variability threshold")
        st.plotly_chart(fig, use_container_width=True)

    st.info("💡 Higher CV indicates greater relative variability. This is especially useful when comparing metrics with different units or scales.")

with tab7:
    st.subheader("Percentiles Analysis")
    st.write("Percentiles divide the data into 100 equal parts. Key percentiles help understand data distribution.")
    percentiles = [0, 10, 25, 50, 75, 90, 100]
    percentile_data = {
        'Percentile': [
            f'{p}th' if p not in [0, 100] else ('Min' if p == 0 else 'Max')
            for p in percentiles
        ]
    }
    for col in numerical_cols:
        percentile_data[col] = [df[col].quantile(p / 100) for p in percentiles]
    percentile_df = pd.DataFrame(percentile_data)
    st.dataframe(percentile_df, hide_index=True)

    # Visualizations
    st.subheader("Percentile Visualizations")
    selected_percentile_metric = st.selectbox(
        "Select metric for percentile visualization",
        options=numerical_cols,
        format_func=metric_label,
        key='percentile_metric',
    )

    col1, col2 = st.columns(2)
    with col1:
        # Line chart showing percentile progression
        perc_viz_data = pd.DataFrame({
            'Percentile': percentiles,
            'Value': [df[selected_percentile_metric].quantile(p / 100) for p in percentiles],
        })
        fig = px.line(perc_viz_data, x='Percentile', y='Value',
                      title=f'Percentile Progression - {selected_percentile_metric}',
                      markers=True)
        fig.update_traces(line=dict(width=3), marker=dict(size=10))
        fig.update_layout(transition={'duration': 600})
        st.plotly_chart(fig, use_container_width=True)
    with col2:
        # Enhanced box plot with percentile annotations
        fig = px.box(df, y=selected_percentile_metric,
                     title=f'Box Plot with Percentiles - {selected_percentile_metric}')
        # Add percentile annotations
        for p in [10, 25, 50, 75, 90]:
            val = df[selected_percentile_metric].quantile(p / 100)
            fig.add_hline(y=val, line_dash="dot", line_color="red",
                          annotation_text=f"{p}th percentile",
                          annotation_position="right")
        fig.update_layout(transition={'duration': 600})
        st.plotly_chart(fig, use_container_width=True)

    st.info("💡 The 50th percentile is the median. The difference between 75th and 25th percentiles is the IQR.")

# ---------------------------------------------------------------------------
# Comprehensive statistics table
# ---------------------------------------------------------------------------
st.header("📈 Complete Statistical Summary")
summary_stats = df[numerical_cols].describe().T

# Compute each column's mode once; NaN when the column has no mode.
first_modes = []
for col in numerical_cols:
    col_modes = df[col].mode()
    first_modes.append(col_modes.iloc[0] if not col_modes.empty else np.nan)
summary_stats['mode'] = first_modes

summary_stats = summary_stats[['count', 'mean', 'mode', 'std', '50%', 'min', 'max']]
summary_stats.columns = ['Count', 'Mean', 'Mode', 'Std Dev', 'Median', 'Min', 'Max']
st.dataframe(summary_stats)

# Add Min/Max country details
st.subheader("🌟 Extreme Values - Countries")
st.write("Discover which countries hold the minimum and maximum values for each metric")
extreme_cols = st.columns(3)
for idx, col in enumerate(numerical_cols):
    with extreme_cols[idx]:
        col_name = metric_label(col)

        # Find min and max (first matching row when several rows tie)
        min_val = df[col].min()
        max_val = df[col].max()
        min_row = df[df[col] == min_val].iloc[0]
        max_row = df[df[col] == max_val].iloc[0]

        st.markdown(f"**{col_name}**")

        # Min value
        st.markdown(f"**📉 Minimum:** {min_val:,.2f}")
        st.caption(f"📍 {min_row['country']} ({min_row['continent']}) in {int(min_row['year'])}")

        # Max value
        st.markdown(f"**📈 Maximum:** {max_val:,.2f}")
        st.caption(f"📍 {max_row['country']} ({max_row['continent']}) in {int(max_row['year'])}")

st.markdown("---")

# ---------------------------------------------------------------------------
# Correlation analysis
# ---------------------------------------------------------------------------
st.header("🔗 Correlation Analysis")
st.write("Correlation measures the strength and direction of relationships between variables (-1 to +1).")

col1, col2 = st.columns([1, 1])
with col1:
    # Calculate correlation matrix
    correlation_matrix = df[numerical_cols].corr()
    st.subheader("Correlation Matrix")
    st.dataframe(correlation_matrix.round(3))
    st.markdown("**Interpretation:**")
    st.markdown("- **+1**: Perfect positive correlation")
    st.markdown("- **0**: No correlation")
    st.markdown("- **-1**: Perfect negative correlation")
    st.markdown("- **|r| > 0.7**: Strong correlation")
    st.markdown("- **|r| 0.3-0.7**: Moderate correlation")
    st.markdown("- **|r| < 0.3**: Weak correlation")
with col2:
    # Heatmap visualization
    fig = px.imshow(correlation_matrix,
                    text_auto='.2f',
                    aspect='auto',
                    color_continuous_scale='RdYlGn',
                    color_continuous_midpoint=0,
                    title='Correlation Heatmap',
                    labels=dict(color="Correlation"))
    fig.update_layout(
        height=400,
        xaxis_title="",
        yaxis_title="",
    )
    st.plotly_chart(fig, use_container_width=True)

# Detailed correlation insights
st.subheader("🔍 Key Correlation Insights")
col1, col2, col3 = st.columns(3)

# Get correlations
corr_life_gdp = correlation_matrix.loc['lifeExp', 'gdpPercap']
corr_life_pop = correlation_matrix.loc['lifeExp', 'pop']
corr_gdp_pop = correlation_matrix.loc['gdpPercap', 'pop']

# Only a strong life-expectancy/GDP correlation carries a direction in its
# label; derive it from the sign instead of always saying "positive".
life_gdp_label = correlation_strength(corr_life_gdp)
if life_gdp_label == "Strong":
    life_gdp_label += " positive" if corr_life_gdp > 0 else " negative"

with col1:
    st.metric(
        "Life Expectancy ↔ GDP per Capita",
        f"{corr_life_gdp:.3f}",
        delta=life_gdp_label,
    )
    st.caption("Higher GDP tends to correlate with longer life expectancy")
with col2:
    st.metric(
        "Life Expectancy ↔ Population",
        f"{corr_life_pop:.3f}",
        delta=correlation_strength(corr_life_pop),
    )
    st.caption("Relationship between population size and life expectancy")
with col3:
    st.metric(
        "GDP per Capita ↔ Population",
        f"{corr_gdp_pop:.3f}",
        delta=correlation_strength(corr_gdp_pop),
    )
    st.caption("Relationship between wealth and population size")

st.info("💡 Correlation does not imply causation! High correlation indicates variables move together, but one doesn't necessarily cause the other.")

# ---------------------------------------------------------------------------
# Filter by year and continent
# ---------------------------------------------------------------------------
st.header("🔎 Filter & Analyze")
col1, col2 = st.columns(2)
with col1:
    selected_year = st.selectbox("Select Year", sorted(df['year'].unique()))
with col2:
    selected_continent = st.multiselect("Select Continent(s)",
                                        df['continent'].unique(),
                                        default=df['continent'].unique())

filtered_df = df[(df['year'] == selected_year) & (df['continent'].isin(selected_continent))]

# Everything below in this section needs at least one matching row.
if not filtered_df.empty:
    st.subheader(f"Statistics for {selected_year} - {', '.join(selected_continent)}")

    # Metrics in columns
    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric("Life Expectancy (Mean)", f"{filtered_df['lifeExp'].mean():.2f}")
        st.metric("Life Expectancy (Median)", f"{filtered_df['lifeExp'].median():.2f}")
        st.metric("Life Expectancy (Std Dev)", f"{filtered_df['lifeExp'].std():.2f}")
    with col2:
        st.metric("Population (Mean)", f"{filtered_df['pop'].mean():,.0f}")
        st.metric("Population (Median)", f"{filtered_df['pop'].median():,.0f}")
        st.metric("Population (Std Dev)", f"{filtered_df['pop'].std():,.0f}")
    with col3:
        st.metric("GDP per Capita (Mean)", f"${filtered_df['gdpPercap'].mean():,.2f}")
        # Thousands separator added so the median matches the mean / std dev format.
        st.metric("GDP per Capita (Median)", f"${filtered_df['gdpPercap'].median():,.2f}")
        st.metric("GDP per Capita (Std Dev)", f"${filtered_df['gdpPercap'].std():,.2f}")

    # Visualizations for the filtered data
    st.subheader("📊 Visual Analysis")

    # Create tabs for different visualizations
    viz_tab1, viz_tab2, viz_tab3, viz_tab4, viz_tab5 = st.tabs([
        "Comparative Bar Charts",
        "Scatter Analysis",
        "Geographic Distribution",
        "Statistical Distribution",
        "Animated Timeline",
    ])

    with viz_tab1:
        st.write("Compare Mean, Median, and Standard Deviation across metrics")

        # Create comparison charts
        col1, col2 = st.columns(2)
        with col1:
            # Mean vs Median comparison
            comparison_data = pd.DataFrame({
                'Metric': numerical_cols * 2,
                'Statistic': ['Mean'] * 3 + ['Median'] * 3,
                'Value': [
                    filtered_df['lifeExp'].mean(),
                    filtered_df['pop'].mean(),
                    filtered_df['gdpPercap'].mean(),
                    filtered_df['lifeExp'].median(),
                    filtered_df['pop'].median(),
                    filtered_df['gdpPercap'].median(),
                ],
            })
            fig = px.bar(comparison_data, x='Metric', y='Value', color='Statistic',
                         barmode='group',
                         title='Mean vs Median Comparison',
                         labels={'Value': 'Value'})
            fig.update_layout(transition={'duration': 500})
            st.plotly_chart(fig, use_container_width=True)
        with col2:
            # Standard Deviation
            std_data = pd.DataFrame({
                'Metric': numerical_cols,
                'Std Dev': [
                    filtered_df['lifeExp'].std(),
                    filtered_df['pop'].std(),
                    filtered_df['gdpPercap'].std(),
                ],
            })
            fig = px.bar(std_data, x='Metric', y='Std Dev',
                         title='Standard Deviation by Metric', color='Metric')
            fig.update_layout(transition={'duration': 500})
            st.plotly_chart(fig, use_container_width=True)

    with viz_tab2:
        st.write("Explore relationships between different metrics")
        col1, col2 = st.columns(2)
        with col1:
            # GDP vs Life Expectancy
            fig = px.scatter(filtered_df, x='gdpPercap', y='lifeExp',
                             size='pop', color='continent',
                             hover_name='country',
                             title=f'GDP per Capita vs Life Expectancy ({selected_year})',
                             labels={'gdpPercap': 'GDP per Capita', 'lifeExp': 'Life Expectancy'},
                             log_x=True)
            fig.update_traces(marker=dict(line=dict(width=1, color='DarkSlateGrey')))
            fig.update_layout(transition={'duration': 800, 'easing': 'cubic-in-out'})
            st.plotly_chart(fig, use_container_width=True)
        with col2:
            # Population vs Life Expectancy
            fig = px.scatter(filtered_df, x='pop', y='lifeExp',
                             size='gdpPercap', color='continent',
                             hover_name='country',
                             title=f'Population vs Life Expectancy ({selected_year})',
                             labels={'pop': 'Population', 'lifeExp': 'Life Expectancy'},
                             log_x=True)
            fig.update_traces(marker=dict(line=dict(width=1, color='DarkSlateGrey')))
            fig.update_layout(transition={'duration': 800, 'easing': 'cubic-in-out'})
            st.plotly_chart(fig, use_container_width=True)

    with viz_tab3:
        st.write("Geographic distribution of metrics across countries")

        # Bar chart by country
        metric_choice = st.selectbox(
            "Select metric to visualize by country",
            options=['lifeExp', 'pop', 'gdpPercap'],
            format_func=metric_label,
        )
        choice_label = metric_label(metric_choice)

        # Sort by selected metric
        sorted_df = filtered_df.sort_values(metric_choice, ascending=False)
        fig = px.bar(sorted_df, x='country', y=metric_choice,
                     color='continent',
                     title=f'{choice_label} by Country ({selected_year})',
                     labels={metric_choice: choice_label})
        fig.update_layout(
            xaxis_tickangle=-45,
            height=500,
            transition={'duration': 500, 'easing': 'cubic-in-out'},
        )
        st.plotly_chart(fig, use_container_width=True)

        # Top 10 and Bottom 10
        col1, col2 = st.columns(2)
        with col1:
            st.write(f"🏆 Top 10 Countries - {choice_label}")
            top_10 = sorted_df[['country', metric_choice, 'continent']].head(10)
            st.dataframe(top_10, hide_index=True)
        with col2:
            st.write(f"⚠️ Bottom 10 Countries - {choice_label}")
            bottom_10 = sorted_df[['country', metric_choice, 'continent']].tail(10)
            st.dataframe(bottom_10, hide_index=True)

    with viz_tab4:
        st.write("Distribution patterns and box plots for filtered data")

        # Box plots by continent
        selected_metric_dist = st.selectbox(
            "Select metric for distribution analysis",
            options=['lifeExp', 'pop', 'gdpPercap'],
            format_func=metric_label,
            key='dist_metric',
        )
        dist_label = metric_label(selected_metric_dist)

        col1, col2 = st.columns(2)
        with col1:
            # Box plot
            fig = px.box(filtered_df, x='continent', y=selected_metric_dist,
                         color='continent',
                         title=f'{dist_label} Distribution by Continent',
                         labels={selected_metric_dist: dist_label})
            fig.update_layout(transition={'duration': 600})
            st.plotly_chart(fig, use_container_width=True)
        with col2:
            # Violin plot
            fig = px.violin(filtered_df, x='continent', y=selected_metric_dist,
                            color='continent', box=True,
                            title=f'{dist_label} Violin Plot',
                            labels={selected_metric_dist: dist_label})
            fig.update_layout(transition={'duration': 600})
            st.plotly_chart(fig, use_container_width=True)

    with viz_tab5:
        st.write("🎬 Watch how metrics evolve over time with animated visualizations")

        # Animation type selector
        anim_type = st.radio(
            "Select Animation Type",
            options=["Scatter Plot Timeline", "Bar Chart Race", "Line Chart Evolution"],
            horizontal=True,
        )

        if anim_type == "Scatter Plot Timeline":
            # Animated scatter plot over all years
            st.subheader("GDP vs Life Expectancy Over Time")

            # Filter by selected continents
            anim_df = df[df['continent'].isin(selected_continent)]
            fig = px.scatter(anim_df, x='gdpPercap', y='lifeExp',
                             animation_frame='year',
                             animation_group='country',
                             size='pop', color='continent',
                             hover_name='country',
                             log_x=True, size_max=60,
                             range_x=[100, 120000],
                             range_y=[20, 90],
                             title='GDP per Capita vs Life Expectancy (Animated)',
                             labels={'gdpPercap': 'GDP per Capita',
                                     'lifeExp': 'Life Expectancy',
                                     'pop': 'Population'})
            fig.update_layout(
                height=600,
                transition={'duration': 800},
            )
            # Slow down the Play button: per-frame and transition durations (ms).
            fig.layout.updatemenus[0].buttons[0].args[1]['frame']['duration'] = 800
            fig.layout.updatemenus[0].buttons[0].args[1]['transition']['duration'] = 500
            st.plotly_chart(fig, use_container_width=True)
            st.info("Click the Play button to watch the animation! Each frame represents a year from 1952 to 2007.")

        elif anim_type == "Bar Chart Race":
            # Animated bar chart race
            st.subheader("Top Countries Bar Chart Race")
            metric_race = st.selectbox(
                "Select metric for bar chart race",
                options=['lifeExp', 'pop', 'gdpPercap'],
                format_func=metric_label,
                key='race_metric',
            )
            race_label = metric_label(metric_race)

            # Rank the top 15 *within the selected continents* for the latest
            # year.  Ranking globally and filtering afterwards can leave no
            # rows at all (e.g. only Oceania selected), which gives a NaN axis
            # range and no animation controls to configure.
            latest_year_df = df[(df['year'] == df['year'].max())
                                & df['continent'].isin(selected_continent)]
            top_countries = latest_year_df.nlargest(15, metric_race)['country'].tolist()

            # Filter data for these countries
            race_df = df[df['country'].isin(top_countries)
                         & df['continent'].isin(selected_continent)]

            if race_df.empty:
                st.warning("No data available for the selected continent(s).")
            else:
                fig = px.bar(race_df, x=metric_race, y='country',
                             color='continent',
                             animation_frame='year',
                             orientation='h',
                             title=f'Top Countries by {race_label} Over Time',
                             labels={metric_race: race_label},
                             range_x=[0, race_df[metric_race].max() * 1.1])
                fig.update_layout(
                    height=600,
                    yaxis={'categoryorder': 'total ascending'},
                )
                # The Play button only exists when there are several frames.
                if fig.layout.updatemenus:
                    fig.layout.updatemenus[0].buttons[0].args[1]['frame']['duration'] = 1000
                    fig.layout.updatemenus[0].buttons[0].args[1]['transition']['duration'] = 500
                st.plotly_chart(fig, use_container_width=True)
                st.info("Watch the ranking change over time! Countries rise and fall based on their performance.")

        else:  # Line Chart Evolution
            st.subheader("Metric Evolution by Continent")
            metric_line = st.selectbox(
                "Select metric for time evolution",
                options=['lifeExp', 'pop', 'gdpPercap'],
                format_func=metric_label,
                key='line_metric',
            )
            line_label = metric_label(metric_line)

            # Calculate mean by continent and year
            line_df = (
                df[df['continent'].isin(selected_continent)]
                .groupby(['year', 'continent'])[metric_line]
                .mean()
                .reset_index()
            )
            fig = px.line(line_df, x='year', y=metric_line,
                          color='continent',
                          markers=True,
                          title=f'Average {line_label} Evolution by Continent',
                          labels={metric_line: f'Average {line_label}', 'year': 'Year'})
            fig.update_traces(
                mode='lines+markers',
                line=dict(width=3),
                marker=dict(size=8),
            )
            fig.update_layout(
                height=500,
                hovermode='x unified',
                transition={'duration': 600},
            )
            # Light grid lines on both axes
            fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='LightGray')
            fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='LightGray')
            st.plotly_chart(fig, use_container_width=True)
            st.info("Hover over the chart to see detailed values for each year and continent.")

# ---------------------------------------------------------------------------
# Overall distributions
# ---------------------------------------------------------------------------
st.header("📉 Overall Data Distributions")
selected_metric = st.selectbox("Select metric to visualize",
                               numerical_cols,
                               format_func=metric_label)
selected_label = metric_label(selected_metric)

col1, col2 = st.columns(2)
with col1:
    fig = px.histogram(df, x=selected_metric, nbins=50,
                       title=f'Distribution of {selected_metric}',
                       labels={selected_metric: selected_label})
    fig.add_vline(x=df[selected_metric].mean(), line_dash="dash",
                  line_color="red", annotation_text="Mean")
    fig.add_vline(x=df[selected_metric].median(), line_dash="dash",
                  line_color="green", annotation_text="Median")
    fig.update_layout(transition={'duration': 700})
    st.plotly_chart(fig, use_container_width=True)
with col2:
    fig = px.box(df, y=selected_metric, x='continent',
                 title=f'{selected_metric} by Continent',
                 labels={selected_metric: selected_label})
    fig.update_layout(transition={'duration': 700})
    st.plotly_chart(fig, use_container_width=True)

# Footer
st.markdown("---")
st.markdown("Data source: [Gapminder](https://www.gapminder.org/)")
st.markdown("**Version 2** - Enhanced with interactive visualizations in Filter & Analyze section")