eda-gapminder / app.py
tbqguy's picture
Initial deployment: Full-featured EDA Gapminder dashboard
da952bf
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# Page-wide configuration: browser tab title and a full-width layout.
st.set_page_config(layout="wide", page_title="Gapminder EDA v2")

# Stylesheet injected into the page. Body-level widgets get a Helvetica font
# stack, while h1-h6 headings are pinned to Source Sans Pro.
_FONT_OVERRIDES_CSS = """
<style>
/* Body text, paragraphs, and general content */
.stMarkdown p, .stMarkdown li, .stMarkdown span {
font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif !important;
}
/* Dataframe text */
.stDataFrame, .stDataFrame * {
font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif !important;
}
/* Metric values */
.stMetric {
font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif !important;
}
/* Select boxes and inputs */
.stSelectbox, .stMultiSelect {
font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif !important;
}
/* Tab labels */
.stTabs [data-baseweb="tab"] {
font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif !important;
}
/* Info boxes */
.stAlert {
font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif !important;
}
/* Caption text */
.stCaption {
font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif !important;
}
/* Keep headers in Source Sans Pro (default Streamlit font) */
h1, h2, h3, h4, h5, h6 {
font-family: 'Source Sans Pro', sans-serif !important;
}
</style>
"""
# unsafe_allow_html is needed so the <style> element is emitted as markup
# instead of being rendered as text.
st.markdown(_FONT_OVERRIDES_CSS, unsafe_allow_html=True)

# Main page heading
st.title("🌍 Gapminder Dataset - Exploratory Data Analysis v2")
# Load data
@st.cache_data
def load_data():
    """Return the Gapminder table from the first source that can be read.

    Sources are tried in this order:

    1. ``gapminder.tsv`` in the working directory (HuggingFace Spaces layout),
    2. ``data/gapminder.tsv`` (local development layout),
    3. the copy hosted in the jennybc/gapminder GitHub repository.

    Only ``FileNotFoundError`` moves on to the next source; any other error
    raised by ``pd.read_csv`` propagates to the caller.
    """
    local_candidates = ('gapminder.tsv', 'data/gapminder.tsv')
    for candidate in local_candidates:
        try:
            return pd.read_csv(candidate, sep='\t')
        except FileNotFoundError:
            # Not present in this layout; try the next location.
            continue

    # No local copy found: download the file instead.
    remote_url = "https://raw.githubusercontent.com/jennybc/gapminder/master/inst/extdata/gapminder.tsv"
    return pd.read_csv(remote_url, sep='\t')
# Load the dataset once; every section below reads from ``df``.
df = load_data()

# Headline figures: row count, column count and number of distinct countries.
st.header("Dataset Overview")
row_count, column_count = df.shape
overview_figures = (
    ("Total Rows", row_count),
    ("Total Columns", column_count),
    ("Countries", df['country'].nunique()),
)
for slot, (caption, figure) in zip(st.columns(3), overview_figures):
    with slot:
        st.metric(caption, figure)

# First ten rows as a preview of the raw table.
st.subheader("Sample Data")
st.dataframe(df.head(10))
# Descriptive Statistics Section
# (header emoji restored: it was stored as mis-decoded UTF-8 bytes)
st.header("📊 Descriptive Statistics")

# Numeric columns that every statistic in the tabs below is computed over.
numerical_cols = ['lifeExp', 'pop', 'gdpPercap']

# One tab per statistic; the ``with tabN:`` sections that follow fill them in.
tab1, tab2, tab3, tab4, tab5, tab6, tab7 = st.tabs([
    "Mean", "Median", "Mode", "Standard Deviation",
    "Range & IQR", "Coefficient of Variation", "Percentiles"
])
with tab1:
    st.subheader("Mean (Average)")
    st.write("The mean represents the average value of the data.")

    # One row per numeric column, holding that column's arithmetic mean.
    mean_df = pd.DataFrame({
        'Metric': numerical_cols,
        'Mean Value': [df[name].mean() for name in numerical_cols],
    })

    # Narrow table on the left, wider bar chart on the right.
    table_col, chart_col = st.columns([1, 2])
    with table_col:
        st.dataframe(mean_df, hide_index=True)
    with chart_col:
        mean_chart = px.bar(
            mean_df,
            x='Metric',
            y='Mean Value',
            color='Metric',
            title='Mean Values by Metric',
        )
        st.plotly_chart(mean_chart, use_container_width=True)
with tab2:
    st.subheader("Median")
    st.write("The median represents the middle value when data is sorted.")

    # Median of each numeric column, in the order given by numerical_cols.
    medians = [df[name].median() for name in numerical_cols]
    median_df = pd.DataFrame({'Metric': numerical_cols, 'Median Value': medians})

    # 1:2 split: table first, chart second.
    left, right = st.columns([1, 2])
    with left:
        st.dataframe(median_df, hide_index=True)
    with right:
        median_chart = px.bar(
            median_df,
            x='Metric',
            y='Median Value',
            color='Metric',
            title='Median Values by Metric',
        )
        st.plotly_chart(median_chart, use_container_width=True)
with tab3:
    st.subheader("Mode")
    st.write("The mode represents the most frequently occurring value(s).")

    # For each column keep the first value returned by ``Series.mode()`` and
    # count how many rows equal it. Columns with no mode are left out.
    mode_rows = []
    for name in numerical_cols:
        modes = df[name].mode()
        if len(modes) == 0:
            continue
        first_mode = modes[0]
        mode_rows.append({
            'Metric': name,
            'Mode Value': first_mode,
            'Frequency': (df[name] == first_mode).sum(),
        })

    st.dataframe(pd.DataFrame(mode_rows), hide_index=True)
    st.info("Note: For continuous data like life expectancy and GDP, mode may not be very meaningful as values rarely repeat exactly.")
with tab4:
    st.subheader("Standard Deviation")
    st.write("Standard deviation measures the amount of variation or dispersion in the data.")

    # Standard deviation and variance per column, using the pandas defaults
    # for ``Series.std()`` and ``Series.var()``.
    std_df = pd.DataFrame({
        'Metric': numerical_cols,
        'Standard Deviation': [df[name].std() for name in numerical_cols],
        'Variance': [df[name].var() for name in numerical_cols],
    })

    # The table shows both columns; the chart plots standard deviation only.
    table_col, chart_col = st.columns([1, 2])
    with table_col:
        st.dataframe(std_df, hide_index=True)
    with chart_col:
        std_chart = px.bar(
            std_df,
            x='Metric',
            y='Standard Deviation',
            color='Metric',
            title='Standard Deviation by Metric',
        )
        st.plotly_chart(std_chart, use_container_width=True)
with tab5:
    st.subheader("Range & Interquartile Range (IQR)")
    st.write("Range shows the total spread (Max - Min), while IQR shows the spread of the middle 50% of data.")

    # Compute each extreme and quartile once per column, then derive the
    # range and IQR from those values instead of recomputing them.
    minimums = [df[name].min() for name in numerical_cols]
    maximums = [df[name].max() for name in numerical_cols]
    first_quartiles = [df[name].quantile(0.25) for name in numerical_cols]
    third_quartiles = [df[name].quantile(0.75) for name in numerical_cols]

    range_df = pd.DataFrame({
        'Metric': numerical_cols,
        'Min': minimums,
        'Q1 (25%)': first_quartiles,
        'Q3 (75%)': third_quartiles,
        'Max': maximums,
        'Range': [high - low for low, high in zip(minimums, maximums)],
        'IQR': [q3 - q1 for q1, q3 in zip(first_quartiles, third_quartiles)],
    })
    st.dataframe(range_df, hide_index=True)

    # Visualizations: range on the left, IQR on the right.
    col1, col2 = st.columns(2)
    with col1:
        fig = px.bar(range_df, x='Metric', y='Range',
                     title='Range by Metric',
                     color='Metric')
        fig.update_layout(transition={'duration': 500})
        st.plotly_chart(fig, use_container_width=True)
    with col2:
        fig = px.bar(range_df, x='Metric', y='IQR',
                     title='Interquartile Range (IQR) by Metric',
                     color='Metric')
        fig.update_layout(transition={'duration': 500})
        st.plotly_chart(fig, use_container_width=True)

    # Leading emoji restored: it was stored as mis-decoded UTF-8 bytes.
    st.info("💡 IQR is more robust to outliers than Range. A larger IQR indicates more variability in the middle 50% of the data.")
with tab6:
    st.subheader("Coefficient of Variation (CV)")
    # Multiplication sign restored: it was stored as mis-decoded UTF-8 bytes.
    st.write("CV = (Standard Deviation / Mean) × 100. It allows comparison of variability across different scales.")

    # Mean and standard deviation are computed once and reused for the CV.
    means = [df[name].mean() for name in numerical_cols]
    std_devs = [df[name].std() for name in numerical_cols]
    cv_df = pd.DataFrame({
        'Metric': numerical_cols,
        'Mean': means,
        'Std Dev': std_devs,
        'CV (%)': [(std / mean) * 100 for std, mean in zip(std_devs, means)],
    })

    col1, col2 = st.columns([1, 2])
    with col1:
        st.dataframe(cv_df, hide_index=True)
        st.markdown("**Interpretation:**")
        st.markdown("- CV < 15%: Low variability")
        st.markdown("- CV 15-30%: Moderate variability")
        st.markdown("- CV > 30%: High variability")
    with col2:
        fig = px.bar(cv_df, x='Metric', y='CV (%)',
                     title='Coefficient of Variation by Metric',
                     color='Metric')
        fig.update_layout(
            yaxis_title="CV (%)",
            transition={'duration': 500}
        )
        # Reference lines at the 15% and 30% thresholds listed in the
        # interpretation notes beside the chart.
        fig.add_hline(y=15, line_dash="dash", line_color="green",
                      annotation_text="Low variability threshold")
        fig.add_hline(y=30, line_dash="dash", line_color="orange",
                      annotation_text="High variability threshold")
        st.plotly_chart(fig, use_container_width=True)

    # Leading emoji restored: it was stored as mis-decoded UTF-8 bytes.
    st.info("💡 Higher CV indicates greater relative variability. This is especially useful when comparing metrics with different units or scales.")
with tab7:
    st.subheader("Percentiles Analysis")
    st.write("Percentiles divide the data into 100 equal parts. Key percentiles help understand data distribution.")

    percentiles = [0, 10, 25, 50, 75, 90, 100]

    # Quantiles are computed once per column and reused by the table and by
    # both charts below.
    percentile_values = {
        name: [df[name].quantile(p / 100) for p in percentiles]
        for name in numerical_cols
    }

    # The 0th and 100th percentiles are labelled 'Min' and 'Max' in the table.
    edge_labels = {0: 'Min', 100: 'Max'}
    percentile_data = {
        'Percentile': [edge_labels.get(p, f'{p}th') for p in percentiles]
    }
    percentile_data.update(percentile_values)
    percentile_df = pd.DataFrame(percentile_data)
    st.dataframe(percentile_df, hide_index=True)

    # Visualizations
    st.subheader("Percentile Visualizations")
    selected_percentile_metric = st.selectbox(
        "Select metric for percentile visualization",
        options=numerical_cols,
        format_func=lambda x: {
            'lifeExp': 'Life Expectancy',
            'pop': 'Population',
            'gdpPercap': 'GDP per Capita'
        }[x],
        key='percentile_metric'
    )
    selected_values = percentile_values[selected_percentile_metric]

    col1, col2 = st.columns(2)
    with col1:
        # Line chart showing how the value grows across the percentiles.
        perc_viz_data = pd.DataFrame({
            'Percentile': percentiles,
            'Value': selected_values,
        })
        fig = px.line(perc_viz_data, x='Percentile', y='Value',
                      title=f'Percentile Progression - {selected_percentile_metric}',
                      markers=True)
        fig.update_traces(line=dict(width=3), marker=dict(size=10))
        fig.update_layout(transition={'duration': 600})
        st.plotly_chart(fig, use_container_width=True)
    with col2:
        # Box plot with a dotted reference line at each interior percentile.
        fig = px.box(df, y=selected_percentile_metric,
                     title=f'Box Plot with Percentiles - {selected_percentile_metric}')
        for p, val in zip(percentiles, selected_values):
            if p in (0, 100):
                # Min and Max are not annotated.
                continue
            fig.add_hline(y=val, line_dash="dot", line_color="red",
                          annotation_text=f"{p}th percentile",
                          annotation_position="right")
        fig.update_layout(transition={'duration': 600})
        st.plotly_chart(fig, use_container_width=True)

    # Leading emoji restored: it was stored as mis-decoded UTF-8 bytes.
    st.info("💡 The 50th percentile is the median. The difference between 75th and 25th percentiles is the IQR.")
# Comprehensive Statistics Table
# (emoji in this section restored: they were stored as mis-decoded UTF-8 bytes)
st.header("📈 Complete Statistical Summary")
summary_stats = df[numerical_cols].describe().T

# First mode of each column, or NaN when ``Series.mode()`` returns nothing.
# The mode is computed once per column.
first_modes = []
for name in numerical_cols:
    candidates = df[name].mode()
    first_modes.append(candidates[0] if len(candidates) > 0 else np.nan)
summary_stats['mode'] = first_modes

# Keep a fixed subset of the describe() output and give it display headings.
summary_stats = summary_stats[['count', 'mean', 'mode', 'std', '50%', 'min', 'max']]
summary_stats.columns = ['Count', 'Mean', 'Mode', 'Std Dev', 'Median', 'Min', 'Max']
st.dataframe(summary_stats)

# Add Min/Max country details
st.subheader("🌟 Extreme Values - Countries")
st.write("Discover which countries hold the minimum and maximum values for each metric")

extreme_labels = {
    'lifeExp': 'Life Expectancy',
    'pop': 'Population',
    'gdpPercap': 'GDP per Capita'
}
extreme_cols = st.columns(3)
for slot, name in zip(extreme_cols, numerical_cols):
    with slot:
        # idxmin/idxmax return the label of the first row holding the
        # extreme, which is the row the caption describes.
        min_row = df.loc[df[name].idxmin()]
        max_row = df.loc[df[name].idxmax()]

        st.markdown(f"**{extreme_labels[name]}**")
        # Min value
        st.markdown(f"**📉 Minimum:** {min_row[name]:,.2f}")
        st.caption(f"📍 {min_row['country']} ({min_row['continent']}) in {int(min_row['year'])}")
        # Max value
        st.markdown(f"**📈 Maximum:** {max_row[name]:,.2f}")
        st.caption(f"📍 {max_row['country']} ({max_row['continent']}) in {int(max_row['year'])}")

# Horizontal rule closing the section.
st.markdown("---")
# Correlation Analysis
# (emoji in this section restored: they were stored as mis-decoded UTF-8 bytes)
st.header("🔗 Correlation Analysis")
st.write("Correlation measures the strength and direction of relationships between variables (-1 to +1).")


def _correlation_strength(r, signed=False):
    """Describe the strength of the correlation coefficient ``r``.

    Uses the thresholds listed in the interpretation notes: |r| > 0.7 is
    strong, |r| > 0.3 is moderate, anything else is weak. With ``signed=True``
    a strong result also names its direction, so a strongly negative
    coefficient is not reported as positive.
    """
    magnitude = abs(r)
    if magnitude > 0.7:
        if not signed:
            return "Strong"
        return "Strong positive" if r > 0 else "Strong negative"
    if magnitude > 0.3:
        return "Moderate"
    return "Weak"


# Pairwise correlations between the numeric columns (``DataFrame.corr()``
# with its default method).
correlation_matrix = df[numerical_cols].corr()

col1, col2 = st.columns([1, 1])
with col1:
    st.subheader("Correlation Matrix")
    st.dataframe(correlation_matrix.round(3))
    st.markdown("**Interpretation:**")
    st.markdown("- **+1**: Perfect positive correlation")
    st.markdown("- **0**: No correlation")
    st.markdown("- **-1**: Perfect negative correlation")
    st.markdown("- **|r| > 0.7**: Strong correlation")
    st.markdown("- **|r| 0.3-0.7**: Moderate correlation")
    st.markdown("- **|r| < 0.3**: Weak correlation")
with col2:
    # Heatmap visualization; the colour scale is centred on zero.
    fig = px.imshow(correlation_matrix,
                    text_auto='.2f',
                    aspect='auto',
                    color_continuous_scale='RdYlGn',
                    color_continuous_midpoint=0,
                    title='Correlation Heatmap',
                    labels=dict(color="Correlation"))
    fig.update_layout(
        height=400,
        xaxis_title="",
        yaxis_title=""
    )
    st.plotly_chart(fig, use_container_width=True)

# Detailed correlation insights
st.subheader("🔍 Key Correlation Insights")
col1, col2, col3 = st.columns(3)

# The three distinct pairs among the numeric columns.
corr_life_gdp = correlation_matrix.loc['lifeExp', 'gdpPercap']
corr_life_pop = correlation_matrix.loc['lifeExp', 'pop']
corr_gdp_pop = correlation_matrix.loc['gdpPercap', 'pop']

with col1:
    st.metric(
        "Life Expectancy ↔ GDP per Capita",
        f"{corr_life_gdp:.3f}",
        delta=_correlation_strength(corr_life_gdp, signed=True)
    )
    st.caption("Higher GDP tends to correlate with longer life expectancy")
with col2:
    st.metric(
        "Life Expectancy ↔ Population",
        f"{corr_life_pop:.3f}",
        delta=_correlation_strength(corr_life_pop)
    )
    st.caption("Relationship between population size and life expectancy")
with col3:
    st.metric(
        "GDP per Capita ↔ Population",
        f"{corr_gdp_pop:.3f}",
        delta=_correlation_strength(corr_gdp_pop)
    )
    st.caption("Relationship between wealth and population size")

st.info("💡 Correlation does not imply causation! High correlation indicates variables move together, but one doesn't necessarily cause the other.")
# Filter by year and continent
# (emoji in this section restored: they were stored as mis-decoded UTF-8 bytes)
st.header("🔎 Filter & Analyze")

# Display names for the metric columns. Used for selector options, chart
# titles and axis labels, because the column names are camelCase and
# ``str.title()`` would turn them into "Lifeexp" / "Gdppercap".
metric_labels = {
    'lifeExp': 'Life Expectancy',
    'pop': 'Population',
    'gdpPercap': 'GDP per Capita',
}


def _set_play_speed(figure, frame_ms, transition_ms):
    """Set the Play button's frame and transition durations in milliseconds.

    Does nothing when the figure has no update menus, so a selection that
    yields no animation controls cannot raise ``IndexError`` here.
    """
    if not figure.layout.updatemenus:
        return
    play_args = figure.layout.updatemenus[0].buttons[0].args[1]
    play_args['frame']['duration'] = frame_ms
    play_args['transition']['duration'] = transition_ms


col1, col2 = st.columns(2)
with col1:
    selected_year = st.selectbox("Select Year", sorted(df['year'].unique()))
with col2:
    selected_continent = st.multiselect("Select Continent(s)",
                                        df['continent'].unique(),
                                        default=df['continent'].unique())

# Rows belonging to the chosen continents (all years), and the subset of
# those rows for the chosen year.
in_selected_continents = df['continent'].isin(selected_continent)
filtered_df = df[(df['year'] == selected_year) & in_selected_continents]

if not filtered_df.empty:
    st.subheader(f"Statistics for {selected_year} - {', '.join(selected_continent)}")

    # Mean / median / standard deviation of each metric, one metric per column.
    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric("Life Expectancy (Mean)", f"{filtered_df['lifeExp'].mean():.2f}")
        st.metric("Life Expectancy (Median)", f"{filtered_df['lifeExp'].median():.2f}")
        st.metric("Life Expectancy (Std Dev)", f"{filtered_df['lifeExp'].std():.2f}")
    with col2:
        st.metric("Population (Mean)", f"{filtered_df['pop'].mean():,.0f}")
        st.metric("Population (Median)", f"{filtered_df['pop'].median():,.0f}")
        st.metric("Population (Std Dev)", f"{filtered_df['pop'].std():,.0f}")
    with col3:
        st.metric("GDP per Capita (Mean)", f"${filtered_df['gdpPercap'].mean():,.2f}")
        # Thousands separator added so the median matches mean and std dev.
        st.metric("GDP per Capita (Median)", f"${filtered_df['gdpPercap'].median():,.2f}")
        st.metric("GDP per Capita (Std Dev)", f"${filtered_df['gdpPercap'].std():,.2f}")

    # Visualizations for the filtered data
    st.subheader("📊 Visual Analysis")
    viz_tab1, viz_tab2, viz_tab3, viz_tab4, viz_tab5 = st.tabs([
        "Comparative Bar Charts",
        "Scatter Analysis",
        "Geographic Distribution",
        "Statistical Distribution",
        "Animated Timeline"
    ])

    with viz_tab1:
        st.write("Compare Mean, Median, and Standard Deviation across metrics")
        col1, col2 = st.columns(2)
        with col1:
            # Mean vs Median comparison: three mean rows followed by three
            # median rows, grouped per metric in the chart.
            comparison_data = pd.DataFrame({
                'Metric': numerical_cols * 2,
                'Statistic': ['Mean'] * 3 + ['Median'] * 3,
                'Value': (
                    [filtered_df[name].mean() for name in numerical_cols]
                    + [filtered_df[name].median() for name in numerical_cols]
                ),
            })
            fig = px.bar(comparison_data, x='Metric', y='Value',
                         color='Statistic', barmode='group',
                         title='Mean vs Median Comparison',
                         labels={'Value': 'Value'})
            fig.update_layout(transition={'duration': 500})
            st.plotly_chart(fig, use_container_width=True)
        with col2:
            # Standard deviation per metric
            std_data = pd.DataFrame({
                'Metric': numerical_cols,
                'Std Dev': [filtered_df[name].std() for name in numerical_cols],
            })
            fig = px.bar(std_data, x='Metric', y='Std Dev',
                         title='Standard Deviation by Metric',
                         color='Metric')
            fig.update_layout(transition={'duration': 500})
            st.plotly_chart(fig, use_container_width=True)

    with viz_tab2:
        st.write("Explore relationships between different metrics")
        col1, col2 = st.columns(2)
        with col1:
            # GDP vs Life Expectancy; marker size follows population.
            fig = px.scatter(filtered_df, x='gdpPercap', y='lifeExp',
                             size='pop', color='continent',
                             hover_name='country',
                             title=f'GDP per Capita vs Life Expectancy ({selected_year})',
                             labels={'gdpPercap': 'GDP per Capita',
                                     'lifeExp': 'Life Expectancy'},
                             log_x=True)
            fig.update_traces(marker=dict(line=dict(width=1, color='DarkSlateGrey')))
            fig.update_layout(transition={'duration': 800, 'easing': 'cubic-in-out'})
            st.plotly_chart(fig, use_container_width=True)
        with col2:
            # Population vs Life Expectancy; marker size follows GDP per capita.
            fig = px.scatter(filtered_df, x='pop', y='lifeExp',
                             size='gdpPercap', color='continent',
                             hover_name='country',
                             title=f'Population vs Life Expectancy ({selected_year})',
                             labels={'pop': 'Population',
                                     'lifeExp': 'Life Expectancy'},
                             log_x=True)
            fig.update_traces(marker=dict(line=dict(width=1, color='DarkSlateGrey')))
            fig.update_layout(transition={'duration': 800, 'easing': 'cubic-in-out'})
            st.plotly_chart(fig, use_container_width=True)

    with viz_tab3:
        st.write("Geographic distribution of metrics across countries")
        # Bar chart by country
        metric_choice = st.selectbox(
            "Select metric to visualize by country",
            options=['lifeExp', 'pop', 'gdpPercap'],
            format_func=lambda x: metric_labels[x]
        )
        metric_choice_label = metric_labels[metric_choice]

        # Highest value first, so head() is the top and tail() the bottom.
        sorted_df = filtered_df.sort_values(metric_choice, ascending=False)
        fig = px.bar(sorted_df, x='country', y=metric_choice,
                     color='continent',
                     title=f'{metric_choice_label} by Country ({selected_year})',
                     labels={metric_choice: metric_choice_label})
        fig.update_layout(
            xaxis_tickangle=-45,
            height=500,
            transition={'duration': 500, 'easing': 'cubic-in-out'}
        )
        st.plotly_chart(fig, use_container_width=True)

        # Top 10 and Bottom 10
        col1, col2 = st.columns(2)
        with col1:
            st.write(f"🏆 Top 10 Countries - {metric_choice_label}")
            top_10 = sorted_df[['country', metric_choice, 'continent']].head(10)
            st.dataframe(top_10, hide_index=True)
        with col2:
            st.write(f"⚠️ Bottom 10 Countries - {metric_choice_label}")
            bottom_10 = sorted_df[['country', metric_choice, 'continent']].tail(10)
            st.dataframe(bottom_10, hide_index=True)

    with viz_tab4:
        st.write("Distribution patterns and box plots for filtered data")
        selected_metric_dist = st.selectbox(
            "Select metric for distribution analysis",
            options=['lifeExp', 'pop', 'gdpPercap'],
            format_func=lambda x: metric_labels[x],
            key='dist_metric'
        )
        dist_label = metric_labels[selected_metric_dist]

        col1, col2 = st.columns(2)
        with col1:
            # Box plot per continent
            fig = px.box(filtered_df, x='continent', y=selected_metric_dist,
                         color='continent',
                         title=f'{dist_label} Distribution by Continent',
                         labels={selected_metric_dist: dist_label})
            fig.update_layout(transition={'duration': 600})
            st.plotly_chart(fig, use_container_width=True)
        with col2:
            # Violin plot per continent, with an embedded box
            fig = px.violin(filtered_df, x='continent', y=selected_metric_dist,
                            color='continent', box=True,
                            title=f'{dist_label} Violin Plot',
                            labels={selected_metric_dist: dist_label})
            fig.update_layout(transition={'duration': 600})
            st.plotly_chart(fig, use_container_width=True)

    with viz_tab5:
        st.write("🎬 Watch how metrics evolve over time with animated visualizations")
        # Animation type selector
        anim_type = st.radio(
            "Select Animation Type",
            options=["Scatter Plot Timeline", "Bar Chart Race", "Line Chart Evolution"],
            horizontal=True
        )

        if anim_type == "Scatter Plot Timeline":
            # Animated scatter plot over all years of the selected continents
            st.subheader("GDP vs Life Expectancy Over Time")
            anim_df = df[in_selected_continents]
            fig = px.scatter(anim_df,
                             x='gdpPercap',
                             y='lifeExp',
                             animation_frame='year',
                             animation_group='country',
                             size='pop',
                             color='continent',
                             hover_name='country',
                             log_x=True,
                             size_max=60,
                             range_x=[100, 120000],
                             range_y=[20, 90],
                             title='GDP per Capita vs Life Expectancy (Animated)',
                             labels={'gdpPercap': 'GDP per Capita',
                                     'lifeExp': 'Life Expectancy',
                                     'pop': 'Population'})
            fig.update_layout(
                height=600,
                transition={'duration': 800}
            )
            _set_play_speed(fig, frame_ms=800, transition_ms=500)
            st.plotly_chart(fig, use_container_width=True)
            st.info("Click the Play button to watch the animation! Each frame represents a year from 1952 to 2007.")

        elif anim_type == "Bar Chart Race":
            # Animated bar chart race
            st.subheader("Top Countries Bar Chart Race")
            metric_race = st.selectbox(
                "Select metric for bar chart race",
                options=['lifeExp', 'pop', 'gdpPercap'],
                format_func=lambda x: metric_labels[x],
                key='race_metric'
            )
            race_label = metric_labels[metric_race]

            # Rank the latest year within the selected continents. Ranking
            # the whole dataset first and filtering afterwards could leave
            # nothing to plot, e.g. when none of the global top 15 lies in
            # the selection.
            latest_year_df = df[in_selected_continents & (df['year'] == df['year'].max())]
            top_countries = latest_year_df.nlargest(15, metric_race)['country'].tolist()
            race_df = df[in_selected_continents & df['country'].isin(top_countries)]

            if race_df.empty:
                st.warning("No data available for the selected continents in the latest year.")
            else:
                fig = px.bar(race_df,
                             x=metric_race,
                             y='country',
                             color='continent',
                             animation_frame='year',
                             orientation='h',
                             title=f'Top Countries by {race_label} Over Time',
                             labels={metric_race: race_label},
                             range_x=[0, race_df[metric_race].max() * 1.1])
                fig.update_layout(
                    height=600,
                    yaxis={'categoryorder': 'total ascending'}
                )
                _set_play_speed(fig, frame_ms=1000, transition_ms=500)
                st.plotly_chart(fig, use_container_width=True)
                st.info("Watch the ranking change over time! Countries rise and fall based on their performance.")

        else:  # Line Chart Evolution
            st.subheader("Metric Evolution by Continent")
            metric_line = st.selectbox(
                "Select metric for time evolution",
                options=['lifeExp', 'pop', 'gdpPercap'],
                format_func=lambda x: metric_labels[x],
                key='line_metric'
            )
            line_label = metric_labels[metric_line]

            # Mean of the metric per (year, continent) pair
            line_df = (
                df[in_selected_continents]
                .groupby(['year', 'continent'])[metric_line]
                .mean()
                .reset_index()
            )
            fig = px.line(line_df,
                          x='year',
                          y=metric_line,
                          color='continent',
                          markers=True,
                          title=f'Average {line_label} Evolution by Continent',
                          labels={metric_line: f'Average {line_label}',
                                  'year': 'Year'})
            fig.update_traces(
                mode='lines+markers',
                line=dict(width=3),
                marker=dict(size=8)
            )
            fig.update_layout(
                height=500,
                hovermode='x unified',
                transition={'duration': 600}
            )
            # Light grid lines on both axes
            fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='LightGray')
            fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='LightGray')
            st.plotly_chart(fig, use_container_width=True)
            st.info("Hover over the chart to see detailed values for each year and continent.")
# Distribution Visualizations
# (header emoji restored: it was stored as mis-decoded UTF-8 bytes)
st.header("📉 Overall Data Distributions")

# Display names for the metric columns. Used for the selector and the axis
# labels, because ``str.title()`` on the camelCase column names would give
# "Lifeexp" / "Gdppercap".
distribution_labels = {
    'lifeExp': 'Life Expectancy',
    'pop': 'Population',
    'gdpPercap': 'GDP per Capita'
}
selected_metric = st.selectbox("Select metric to visualize", numerical_cols,
                               format_func=lambda x: distribution_labels[x])
axis_labels = {selected_metric: distribution_labels[selected_metric]}

col1, col2 = st.columns(2)
with col1:
    # Histogram over the whole dataset, with the mean and median marked.
    fig = px.histogram(df, x=selected_metric, nbins=50,
                       title=f'Distribution of {selected_metric}',
                       labels=axis_labels)
    fig.add_vline(x=df[selected_metric].mean(), line_dash="dash",
                  line_color="red", annotation_text="Mean")
    fig.add_vline(x=df[selected_metric].median(), line_dash="dash",
                  line_color="green", annotation_text="Median")
    fig.update_layout(transition={'duration': 700})
    st.plotly_chart(fig, use_container_width=True)
with col2:
    # Box plot of the same metric, one box per continent.
    fig = px.box(df, y=selected_metric, x='continent',
                 title=f'{selected_metric} by Continent',
                 labels=axis_labels)
    fig.update_layout(transition={'duration': 700})
    st.plotly_chart(fig, use_container_width=True)
# Page footer: a horizontal rule, the data attribution and a version note,
# each rendered as its own markdown element.
_FOOTER_LINES = (
    "---",
    "Data source: [Gapminder](https://www.gapminder.org/)",
    "**Version 2** - Enhanced with interactive visualizations in Filter & Analyze section",
)
for footer_line in _FOOTER_LINES:
    st.markdown(footer_line)