Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| import plotly.express as px | |
| import plotly.graph_objects as go | |
| from plotly.subplots import make_subplots | |
# Browser-tab title and wide layout for the whole app.
page_options = {"page_title": "Gapminder EDA v2", "layout": "wide"}
st.set_page_config(**page_options)

# Stylesheet override: body-level widgets (markdown text, dataframes, metrics,
# select boxes, tab labels, alerts, captions) are switched to a Helvetica
# stack, while h1-h6 are pinned to Source Sans Pro.
custom_css = """
<style>
/* Body text, paragraphs, and general content */
.stMarkdown p, .stMarkdown li, .stMarkdown span {
font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif !important;
}
/* Dataframe text */
.stDataFrame, .stDataFrame * {
font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif !important;
}
/* Metric values */
.stMetric {
font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif !important;
}
/* Select boxes and inputs */
.stSelectbox, .stMultiSelect {
font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif !important;
}
/* Tab labels */
.stTabs [data-baseweb="tab"] {
font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif !important;
}
/* Info boxes */
.stAlert {
font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif !important;
}
/* Caption text */
.stCaption {
font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif !important;
}
/* Keep headers in Source Sans Pro (default Streamlit font) */
h1, h2, h3, h4, h5, h6 {
font-family: 'Source Sans Pro', sans-serif !important;
}
</style>
"""
# unsafe_allow_html is needed so the <style> element is emitted as markup.
st.markdown(custom_css, unsafe_allow_html=True)

# Page heading.
st.title("π Gapminder Dataset - Exploratory Data Analysis v2")
| # Load data | |
def load_data(
    local_paths=('gapminder.tsv', 'data/gapminder.tsv'),
    url="https://raw.githubusercontent.com/jennybc/gapminder/master/inst/extdata/gapminder.tsv",
):
    """Load the Gapminder dataset, preferring local files over the remote copy.

    The sources are tried in order and the first one that can be read wins:
    ``gapminder.tsv`` next to the script (the HuggingFace Spaces layout),
    then ``data/gapminder.tsv`` (local development), then the GitHub URL.

    Args:
        local_paths: Tab-separated files to try, in order.
        url: Remote tab-separated file read when no local path exists.
            Pass ``None`` to disable the remote fallback.

    Returns:
        A DataFrame parsed from the first readable source.

    Raises:
        FileNotFoundError: If no local file exists and ``url`` is ``None``.
        Any error raised while reading ``url`` is propagated unchanged.
    """
    # NOTE(review): nothing here caches the result, so each call re-reads the
    # source; consider Streamlit's data cache if this shows up as a cost.
    for path in local_paths:
        try:
            return pd.read_csv(path, sep='\t')
        except FileNotFoundError:
            # Only a missing file moves on to the next source; parse errors
            # in an existing file still surface to the caller.
            continue
    if url is None:
        raise FileNotFoundError(
            f"None of the local Gapminder files exist: {list(local_paths)}"
        )
    return pd.read_csv(url, sep='\t')
df = load_data()

# Headline numbers for the dataset: row count, column count, distinct countries.
st.header("Dataset Overview")
row_count, column_count = df.shape
overview_metrics = (
    ("Total Rows", row_count),
    ("Total Columns", column_count),
    ("Countries", df['country'].nunique()),
)
col1, col2, col3 = st.columns(3)
for slot, (metric_title, metric_value) in zip((col1, col2, col3), overview_metrics):
    with slot:
        st.metric(metric_title, metric_value)

# First ten rows as a quick preview of the raw table.
st.subheader("Sample Data")
preview_rows = df.head(10)
st.dataframe(preview_rows)
# Descriptive statistics: one tab per statistic, all computed over the same
# three numeric columns.
st.header("π Descriptive Statistics")
numerical_cols = ['lifeExp', 'pop', 'gdpPercap']
stat_tab_titles = [
    "Mean",
    "Median",
    "Mode",
    "Standard Deviation",
    "Range & IQR",
    "Coefficient of Variation",
    "Percentiles",
]
tab1, tab2, tab3, tab4, tab5, tab6, tab7 = st.tabs(stat_tab_titles)
with tab1:
    # Mean of each numeric column: table on the left, bar chart on the right.
    st.subheader("Mean (Average)")
    st.write("The mean represents the average value of the data.")
    mean_df = pd.DataFrame({
        'Metric': numerical_cols,
        'Mean Value': [df[name].mean() for name in numerical_cols],
    })
    table_col, chart_col = st.columns([1, 2])
    with table_col:
        st.dataframe(mean_df, hide_index=True)
    with chart_col:
        mean_chart = px.bar(
            mean_df,
            x='Metric',
            y='Mean Value',
            color='Metric',
            title='Mean Values by Metric',
        )
        st.plotly_chart(mean_chart, use_container_width=True)
with tab2:
    # Median of each numeric column: table on the left, bar chart on the right.
    st.subheader("Median")
    st.write("The median represents the middle value when data is sorted.")
    median_df = pd.DataFrame({
        'Metric': numerical_cols,
        'Median Value': [df[name].median() for name in numerical_cols],
    })
    table_col, chart_col = st.columns([1, 2])
    with table_col:
        st.dataframe(median_df, hide_index=True)
    with chart_col:
        median_chart = px.bar(
            median_df,
            x='Metric',
            y='Median Value',
            color='Metric',
            title='Median Values by Metric',
        )
        st.plotly_chart(median_chart, use_container_width=True)
with tab3:
    # First mode of each numeric column plus how many rows carry that value.
    st.subheader("Mode")
    st.write("The mode represents the most frequently occurring value(s).")
    mode_rows = []
    for metric_name in numerical_cols:
        series = df[metric_name]
        modes = series.mode()
        if modes.empty:
            # No mode reported for this column: leave it out of the table.
            continue
        first_mode = modes[0]
        mode_rows.append({
            'Metric': metric_name,
            'Mode Value': first_mode,
            'Frequency': (series == first_mode).sum(),
        })
    mode_df = pd.DataFrame(mode_rows)
    st.dataframe(mode_df, hide_index=True)
    st.info("Note: For continuous data like life expectancy and GDP, mode may not be very meaningful as values rarely repeat exactly.")
with tab4:
    # Standard deviation and variance per numeric column; only the standard
    # deviation is charted.
    st.subheader("Standard Deviation")
    st.write("Standard deviation measures the amount of variation or dispersion in the data.")
    std_values = [df[name].std() for name in numerical_cols]
    var_values = [df[name].var() for name in numerical_cols]
    std_df = pd.DataFrame({
        'Metric': numerical_cols,
        'Standard Deviation': std_values,
        'Variance': var_values,
    })
    table_col, chart_col = st.columns([1, 2])
    with table_col:
        st.dataframe(std_df, hide_index=True)
    with chart_col:
        std_chart = px.bar(
            std_df,
            x='Metric',
            y='Standard Deviation',
            color='Metric',
            title='Standard Deviation by Metric',
        )
        st.plotly_chart(std_chart, use_container_width=True)
with tab5:
    # Min, quartiles, max, full range and interquartile range per column.
    st.subheader("Range & Interquartile Range (IQR)")
    st.write("Range shows the total spread (Max - Min), while IQR shows the spread of the middle 50% of data.")
    lows = [df[name].min() for name in numerical_cols]
    first_quartiles = [df[name].quantile(0.25) for name in numerical_cols]
    third_quartiles = [df[name].quantile(0.75) for name in numerical_cols]
    highs = [df[name].max() for name in numerical_cols]
    range_df = pd.DataFrame({
        'Metric': numerical_cols,
        'Min': lows,
        'Q1 (25%)': first_quartiles,
        'Q3 (75%)': third_quartiles,
        'Max': highs,
        'Range': [high - low for low, high in zip(lows, highs)],
        'IQR': [q3 - q1 for q1, q3 in zip(first_quartiles, third_quartiles)],
    })
    st.dataframe(range_df, hide_index=True)

    # Side-by-side bar charts: full range on the left, IQR on the right.
    chart_specs = (
        ('Range', 'Range by Metric'),
        ('IQR', 'Interquartile Range (IQR) by Metric'),
    )
    for slot, (value_column, chart_title) in zip(st.columns(2), chart_specs):
        with slot:
            spread_chart = px.bar(
                range_df,
                x='Metric',
                y=value_column,
                color='Metric',
                title=chart_title,
            )
            spread_chart.update_layout(transition={'duration': 500})
            st.plotly_chart(spread_chart, use_container_width=True)
    st.info("π‘ IQR is more robust to outliers than Range. A larger IQR indicates more variability in the middle 50% of the data.")
with tab6:
    # Coefficient of variation per numeric column: std / mean, as a percentage.
    st.subheader("Coefficient of Variation (CV)")
    # The multiplication sign in this formula was mis-encoded; restored to "×".
    st.write("CV = (Standard Deviation / Mean) × 100. It allows comparison of variability across different scales.")
    # Compute mean and std once per column and derive CV from those values
    # instead of recomputing both inside the CV expression.
    means = [df[col].mean() for col in numerical_cols]
    std_devs = [df[col].std() for col in numerical_cols]
    cv_data = {
        'Metric': numerical_cols,
        'Mean': means,
        'Std Dev': std_devs,
        'CV (%)': [(std / mean) * 100 for std, mean in zip(std_devs, means)]
    }
    cv_df = pd.DataFrame(cv_data)
    col1, col2 = st.columns([1, 2])
    with col1:
        st.dataframe(cv_df, hide_index=True)
        st.markdown("**Interpretation:**")
        st.markdown("- CV < 15%: Low variability")
        st.markdown("- CV 15-30%: Moderate variability")
        st.markdown("- CV > 30%: High variability")
    with col2:
        fig = px.bar(cv_df, x='Metric', y='CV (%)',
                     title='Coefficient of Variation by Metric',
                     color='Metric')
        fig.update_layout(
            yaxis_title="CV (%)",
            transition={'duration': 500}
        )
        # Reference lines at the same 15% / 30% cut-offs listed in the
        # interpretation bullets.
        fig.add_hline(y=15, line_dash="dash", line_color="green",
                      annotation_text="Low variability threshold")
        fig.add_hline(y=30, line_dash="dash", line_color="orange",
                      annotation_text="High variability threshold")
        st.plotly_chart(fig, use_container_width=True)
    st.info("π‘ Higher CV indicates greater relative variability. This is especially useful when comparing metrics with different units or scales.")
with tab7:
    # Table of selected percentiles for every numeric column, followed by a
    # line chart and an annotated box plot for one user-chosen column.
    st.subheader("Percentiles Analysis")
    st.write("Percentiles divide the data into 100 equal parts. Key percentiles help understand data distribution.")
    percentiles = [0, 10, 25, 50, 75, 90, 100]
    # The two endpoints are shown as Min / Max rather than "0th" / "100th".
    endpoint_names = {0: 'Min', 100: 'Max'}
    percentile_df = pd.DataFrame({
        'Percentile': [endpoint_names.get(p, f'{p}th') for p in percentiles],
        **{
            name: [df[name].quantile(p / 100) for p in percentiles]
            for name in numerical_cols
        },
    })
    st.dataframe(percentile_df, hide_index=True)

    st.subheader("Percentile Visualizations")
    percentile_metric_names = {
        'lifeExp': 'Life Expectancy',
        'pop': 'Population',
        'gdpPercap': 'GDP per Capita'
    }
    selected_percentile_metric = st.selectbox(
        "Select metric for percentile visualization",
        options=numerical_cols,
        format_func=lambda code: percentile_metric_names[code],
        key='percentile_metric'
    )
    chosen_series = df[selected_percentile_metric]
    progression_col, box_col = st.columns(2)
    with progression_col:
        # Value reached at each percentile, drawn as a connected line.
        perc_viz_data = pd.DataFrame({
            'Percentile': percentiles,
            'Value': [chosen_series.quantile(p / 100) for p in percentiles],
        })
        progression_chart = px.line(
            perc_viz_data,
            x='Percentile',
            y='Value',
            markers=True,
            title=f'Percentile Progression - {selected_percentile_metric}',
        )
        progression_chart.update_traces(line=dict(width=3), marker=dict(size=10))
        progression_chart.update_layout(transition={'duration': 600})
        st.plotly_chart(progression_chart, use_container_width=True)
    with box_col:
        # Box plot with dotted red guide lines at five interior percentiles.
        box_chart = px.box(
            df,
            y=selected_percentile_metric,
            title=f'Box Plot with Percentiles - {selected_percentile_metric}',
        )
        for p in (10, 25, 50, 75, 90):
            box_chart.add_hline(
                y=chosen_series.quantile(p / 100),
                line_dash="dot",
                line_color="red",
                annotation_text=f"{p}th percentile",
                annotation_position="right",
            )
        box_chart.update_layout(transition={'duration': 600})
        st.plotly_chart(box_chart, use_container_width=True)
    st.info("π‘ The 50th percentile is the median. The difference between 75th and 25th percentiles is the IQR.")
# One-table summary: describe() output plus the first mode of each column,
# reduced to seven statistics and given display-friendly column titles.
st.header("π Complete Statistical Summary")
described = df[numerical_cols].describe().T
first_modes = []
for metric_name in numerical_cols:
    mode_candidates = df[metric_name].mode()
    # NaN stands in when a column reports no mode at all.
    first_modes.append(mode_candidates[0] if len(mode_candidates) > 0 else np.nan)
described['mode'] = first_modes
column_titles = {
    'count': 'Count',
    'mean': 'Mean',
    'mode': 'Mode',
    'std': 'Std Dev',
    '50%': 'Median',
    'min': 'Min',
    'max': 'Max',
}
summary_stats = described[list(column_titles)].rename(columns=column_titles)
st.dataframe(summary_stats)
# For each numeric column, show the smallest and largest value together with
# the country, continent and year of the first row holding that value.
st.subheader("π Extreme Values - Countries")
st.write("Discover which countries hold the minimum and maximum values for each metric")
extreme_titles = {
    'lifeExp': 'Life Expectancy',
    'pop': 'Population',
    'gdpPercap': 'GDP per Capita'
}
extreme_cols = st.columns(3)
for slot, metric_name in zip(extreme_cols, numerical_cols):
    with slot:
        series = df[metric_name]
        lowest, highest = series.min(), series.max()
        # When several rows tie on the extreme value, the first one is shown.
        lowest_row = df[series == lowest].iloc[0]
        highest_row = df[series == highest].iloc[0]
        st.markdown(f"**{extreme_titles[metric_name]}**")
        entries = (
            ("**π Minimum:**", lowest, lowest_row),
            ("**π Maximum:**", highest, highest_row),
        )
        for heading, value, row in entries:
            st.markdown(f"{heading} {value:,.2f}")
            st.caption(f"π {row['country']} ({row['continent']}) in {int(row['year'])}")
        st.markdown("---")
# Pairwise correlation of the numeric columns: rounded table with a reading
# guide on the left, colour-coded heatmap on the right.
st.header("π Correlation Analysis")
st.write("Correlation measures the strength and direction of relationships between variables (-1 to +1).")
correlation_matrix = df[numerical_cols].corr()
matrix_col, heatmap_col = st.columns([1, 1])
with matrix_col:
    st.subheader("Correlation Matrix")
    st.dataframe(correlation_matrix.round(3))
    reading_guide = (
        "**Interpretation:**",
        "- **+1**: Perfect positive correlation",
        "- **0**: No correlation",
        "- **-1**: Perfect negative correlation",
        "- **|r| > 0.7**: Strong correlation",
        "- **|r| 0.3-0.7**: Moderate correlation",
        "- **|r| < 0.3**: Weak correlation",
    )
    for guide_line in reading_guide:
        st.markdown(guide_line)
with heatmap_col:
    # Colour scale is centred on zero; each cell shows its value to 2 decimals.
    heatmap = px.imshow(
        correlation_matrix,
        title='Correlation Heatmap',
        labels=dict(color="Correlation"),
        color_continuous_scale='RdYlGn',
        color_continuous_midpoint=0,
        text_auto='.2f',
        aspect='auto',
    )
    heatmap.update_layout(height=400, xaxis_title="", yaxis_title="")
    st.plotly_chart(heatmap, use_container_width=True)
# Detailed correlation insights: one metric card per pair of numeric columns.
def _correlation_strength(r, signed=False):
    """Classify a correlation coefficient by magnitude.

    Args:
        r: Correlation coefficient.
        signed: When True, a strong result is reported as "Strong positive"
            or "Strong negative" according to the sign of ``r``.

    Returns:
        "Strong" (|r| > 0.7), "Moderate" (|r| > 0.3) or "Weak" otherwise.
    """
    magnitude = abs(r)
    if magnitude > 0.7:
        if not signed:
            return "Strong"
        # The sign is taken from r itself; classifying on abs(r) alone would
        # label a strong negative correlation as positive.
        return "Strong positive" if r > 0 else "Strong negative"
    return "Moderate" if magnitude > 0.3 else "Weak"


st.subheader("π Key Correlation Insights")
col1, col2, col3 = st.columns(3)
# Pairwise coefficients read from the matrix computed in the section above.
corr_life_gdp = correlation_matrix.loc['lifeExp', 'gdpPercap']
corr_life_pop = correlation_matrix.loc['lifeExp', 'pop']
corr_gdp_pop = correlation_matrix.loc['gdpPercap', 'pop']
with col1:
    st.metric(
        "Life Expectancy β GDP per Capita",
        f"{corr_life_gdp:.3f}",
        delta=_correlation_strength(corr_life_gdp, signed=True)
    )
    st.caption("Higher GDP tends to correlate with longer life expectancy")
with col2:
    st.metric(
        "Life Expectancy β Population",
        f"{corr_life_pop:.3f}",
        delta=_correlation_strength(corr_life_pop)
    )
    st.caption("Relationship between population size and life expectancy")
with col3:
    st.metric(
        "GDP per Capita β Population",
        f"{corr_gdp_pop:.3f}",
        delta=_correlation_strength(corr_gdp_pop)
    )
    st.caption("Relationship between wealth and population size")
st.info("π‘ Correlation does not imply causation! High correlation indicates variables move together, but one doesn't necessarily cause the other.")
# Filter by Year or Country
# NOTE(review): the original indentation of this section was lost. Everything
# from the summary metrics down to the animated timeline is placed inside the
# non-empty branch because the section is introduced as visualizations "for
# filtered data" -- confirm against the running app.

# Display names for the three numeric columns; used for selectbox options,
# chart titles and axis labels so they all read the same.
_METRIC_LABELS = {
    'lifeExp': 'Life Expectancy',
    'pop': 'Population',
    'gdpPercap': 'GDP per Capita',
}


def _metric_selectbox(prompt, key=None):
    """Render a metric selectbox with readable labels; return the column name."""
    return st.selectbox(
        prompt,
        options=list(_METRIC_LABELS),
        format_func=_METRIC_LABELS.get,
        key=key,
    )


def _set_play_speed(fig, frame_ms, transition_ms):
    """Set the Play button's frame/transition durations when the figure has one."""
    if not fig.layout.updatemenus:
        # Without an animation menu there is no Play button to configure.
        return
    play_button = fig.layout.updatemenus[0].buttons[0]
    play_button.args[1]['frame']['duration'] = frame_ms
    play_button.args[1]['transition']['duration'] = transition_ms


def _render_summary_metrics(filtered_df):
    """Show mean, median and standard deviation of each metric as metric cards."""
    col1, col2, col3 = st.columns(3)
    with col1:
        life = filtered_df['lifeExp']
        st.metric("Life Expectancy (Mean)", f"{life.mean():.2f}")
        st.metric("Life Expectancy (Median)", f"{life.median():.2f}")
        st.metric("Life Expectancy (Std Dev)", f"{life.std():.2f}")
    with col2:
        population = filtered_df['pop']
        st.metric("Population (Mean)", f"{population.mean():,.0f}")
        st.metric("Population (Median)", f"{population.median():,.0f}")
        st.metric("Population (Std Dev)", f"{population.std():,.0f}")
    with col3:
        gdp = filtered_df['gdpPercap']
        st.metric("GDP per Capita (Mean)", f"${gdp.mean():,.2f}")
        # Thousands separator added so the median matches mean and std dev.
        st.metric("GDP per Capita (Median)", f"${gdp.median():,.2f}")
        st.metric("GDP per Capita (Std Dev)", f"${gdp.std():,.2f}")


def _render_comparison_tab(filtered_df, metrics):
    """Draw grouped mean-vs-median bars and a standard-deviation bar chart."""
    st.write("Compare Mean, Median, and Standard Deviation across metrics")
    col1, col2 = st.columns(2)
    with col1:
        comparison_data = pd.DataFrame({
            'Metric': list(metrics) * 2,
            'Statistic': ['Mean'] * len(metrics) + ['Median'] * len(metrics),
            'Value': [filtered_df[m].mean() for m in metrics]
                     + [filtered_df[m].median() for m in metrics],
        })
        fig = px.bar(comparison_data, x='Metric', y='Value',
                     color='Statistic', barmode='group',
                     title='Mean vs Median Comparison')
        fig.update_layout(transition={'duration': 500})
        st.plotly_chart(fig, use_container_width=True)
    with col2:
        std_data = pd.DataFrame({
            'Metric': list(metrics),
            'Std Dev': [filtered_df[m].std() for m in metrics],
        })
        fig = px.bar(std_data, x='Metric', y='Std Dev',
                     title='Standard Deviation by Metric',
                     color='Metric')
        fig.update_layout(transition={'duration': 500})
        st.plotly_chart(fig, use_container_width=True)


def _render_scatter_tab(filtered_df, selected_year):
    """Draw GDP-vs-life-expectancy and population-vs-life-expectancy scatters."""
    st.write("Explore relationships between different metrics")
    col1, col2 = st.columns(2)
    with col1:
        fig = px.scatter(filtered_df, x='gdpPercap', y='lifeExp',
                         size='pop', color='continent',
                         hover_name='country',
                         title=f'GDP per Capita vs Life Expectancy ({selected_year})',
                         labels={'gdpPercap': 'GDP per Capita',
                                 'lifeExp': 'Life Expectancy'},
                         log_x=True)
        fig.update_traces(marker=dict(line=dict(width=1, color='DarkSlateGrey')))
        fig.update_layout(transition={'duration': 800, 'easing': 'cubic-in-out'})
        st.plotly_chart(fig, use_container_width=True)
    with col2:
        fig = px.scatter(filtered_df, x='pop', y='lifeExp',
                         size='gdpPercap', color='continent',
                         hover_name='country',
                         title=f'Population vs Life Expectancy ({selected_year})',
                         labels={'pop': 'Population',
                                 'lifeExp': 'Life Expectancy'},
                         log_x=True)
        fig.update_traces(marker=dict(line=dict(width=1, color='DarkSlateGrey')))
        fig.update_layout(transition={'duration': 800, 'easing': 'cubic-in-out'})
        st.plotly_chart(fig, use_container_width=True)


def _render_geographic_tab(filtered_df, selected_year):
    """Draw a per-country bar chart for one metric plus top/bottom-10 tables."""
    st.write("Geographic distribution of metrics across countries")
    metric_choice = _metric_selectbox("Select metric to visualize by country")
    metric_label = _METRIC_LABELS[metric_choice]
    sorted_df = filtered_df.sort_values(metric_choice, ascending=False)
    fig = px.bar(sorted_df, x='country', y=metric_choice,
                 color='continent',
                 title=f'{metric_label} by Country ({selected_year})',
                 labels={metric_choice: metric_label})
    fig.update_layout(
        xaxis_tickangle=-45,
        height=500,
        transition={'duration': 500, 'easing': 'cubic-in-out'}
    )
    st.plotly_chart(fig, use_container_width=True)
    ranking = sorted_df[['country', metric_choice, 'continent']]
    col1, col2 = st.columns(2)
    with col1:
        st.write(f"π Top 10 Countries - {metric_label}")
        st.dataframe(ranking.head(10), hide_index=True)
    with col2:
        st.write(f"β οΈ Bottom 10 Countries - {metric_label}")
        st.dataframe(ranking.tail(10), hide_index=True)


def _render_distribution_tab(filtered_df):
    """Draw a box plot and a violin plot of one metric, split by continent."""
    st.write("Distribution patterns and box plots for filtered data")
    selected_metric_dist = _metric_selectbox(
        "Select metric for distribution analysis", key='dist_metric')
    metric_label = _METRIC_LABELS[selected_metric_dist]
    col1, col2 = st.columns(2)
    with col1:
        fig = px.box(filtered_df, x='continent', y=selected_metric_dist,
                     color='continent',
                     title=f'{metric_label} Distribution by Continent',
                     labels={selected_metric_dist: metric_label})
        fig.update_layout(transition={'duration': 600})
        st.plotly_chart(fig, use_container_width=True)
    with col2:
        fig = px.violin(filtered_df, x='continent', y=selected_metric_dist,
                        color='continent', box=True,
                        title=f'{metric_label} Violin Plot',
                        labels={selected_metric_dist: metric_label})
        fig.update_layout(transition={'duration': 600})
        st.plotly_chart(fig, use_container_width=True)


def _render_scatter_timeline(df, selected_continent):
    """Draw the year-animated GDP-vs-life-expectancy bubble chart."""
    st.subheader("GDP vs Life Expectancy Over Time")
    anim_df = df[df['continent'].isin(selected_continent)]
    fig = px.scatter(anim_df,
                     x='gdpPercap',
                     y='lifeExp',
                     animation_frame='year',
                     animation_group='country',
                     size='pop',
                     color='continent',
                     hover_name='country',
                     log_x=True,
                     size_max=60,
                     range_x=[100, 120000],
                     range_y=[20, 90],
                     title='GDP per Capita vs Life Expectancy (Animated)',
                     labels={'gdpPercap': 'GDP per Capita',
                             'lifeExp': 'Life Expectancy',
                             'pop': 'Population'})
    fig.update_layout(height=600, transition={'duration': 800})
    _set_play_speed(fig, frame_ms=800, transition_ms=500)
    st.plotly_chart(fig, use_container_width=True)
    st.info("Click the Play button to watch the animation! Each frame represents a year from 1952 to 2007.")


def _render_bar_race(df, selected_continent):
    """Draw the year-animated horizontal bar chart of the leading countries."""
    st.subheader("Top Countries Bar Chart Race")
    metric_race = _metric_selectbox(
        "Select metric for bar chart race", key='race_metric')
    metric_label = _METRIC_LABELS[metric_race]
    # Rank only the countries of the selected continents. Ranking the whole
    # dataset first and filtering afterwards can leave nothing to draw (for
    # example a continent with no country among the overall leaders), which
    # then breaks the Play-button lookup on a figure with no animation menu.
    in_selection = df['continent'].isin(selected_continent)
    latest_year_df = df[(df['year'] == df['year'].max()) & in_selection]
    top_countries = latest_year_df.nlargest(15, metric_race)['country'].tolist()
    race_df = df[df['country'].isin(top_countries) & in_selection]
    fig = px.bar(race_df,
                 x=metric_race,
                 y='country',
                 color='continent',
                 animation_frame='year',
                 orientation='h',
                 title=f'Top Countries by {metric_label} Over Time',
                 labels={metric_race: metric_label},
                 range_x=[0, race_df[metric_race].max() * 1.1])
    fig.update_layout(
        height=600,
        yaxis={'categoryorder': 'total ascending'}
    )
    _set_play_speed(fig, frame_ms=1000, transition_ms=500)
    st.plotly_chart(fig, use_container_width=True)
    st.info("Watch the ranking change over time! Countries rise and fall based on their performance.")


def _render_line_evolution(df, selected_continent):
    """Draw the per-continent yearly average of one metric as a line chart."""
    st.subheader("Metric Evolution by Continent")
    metric_line = _metric_selectbox(
        "Select metric for time evolution", key='line_metric')
    metric_label = _METRIC_LABELS[metric_line]
    # Mean of the metric for every (year, continent) pair in the selection.
    in_selection = df[df['continent'].isin(selected_continent)]
    line_df = in_selection.groupby(['year', 'continent'])[metric_line].mean().reset_index()
    fig = px.line(line_df,
                  x='year',
                  y=metric_line,
                  color='continent',
                  markers=True,
                  title=f'Average {metric_label} Evolution by Continent',
                  labels={metric_line: f'Average {metric_label}',
                          'year': 'Year'})
    fig.update_traces(
        mode='lines+markers',
        line=dict(width=3),
        marker=dict(size=8)
    )
    fig.update_layout(
        height=500,
        hovermode='x unified',
        transition={'duration': 600}
    )
    fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='LightGray')
    fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='LightGray')
    st.plotly_chart(fig, use_container_width=True)
    st.info("Hover over the chart to see detailed values for each year and continent.")


def _render_animation_tab(df, selected_continent):
    """Let the user pick one of three time-based charts and draw it."""
    st.write("π¬ Watch how metrics evolve over time with animated visualizations")
    anim_type = st.radio(
        "Select Animation Type",
        options=["Scatter Plot Timeline", "Bar Chart Race", "Line Chart Evolution"],
        horizontal=True
    )
    if anim_type == "Scatter Plot Timeline":
        _render_scatter_timeline(df, selected_continent)
    elif anim_type == "Bar Chart Race":
        _render_bar_race(df, selected_continent)
    else:  # Line Chart Evolution
        _render_line_evolution(df, selected_continent)


st.header("π Filter & Analyze")
col1, col2 = st.columns(2)
with col1:
    selected_year = st.selectbox("Select Year", sorted(df['year'].unique()))
with col2:
    selected_continent = st.multiselect("Select Continent(s)",
                                        df['continent'].unique(),
                                        default=df['continent'].unique())
filtered_df = df[(df['year'] == selected_year) & (df['continent'].isin(selected_continent))]
if filtered_df.empty:
    # Say why the section is blank instead of rendering nothing.
    st.warning("No data matches the current selection. Select at least one continent.")
else:
    st.subheader(f"Statistics for {selected_year} - {', '.join(selected_continent)}")
    _render_summary_metrics(filtered_df)
    st.subheader("π Visual Analysis")
    viz_tab1, viz_tab2, viz_tab3, viz_tab4, viz_tab5 = st.tabs([
        "Comparative Bar Charts",
        "Scatter Analysis",
        "Geographic Distribution",
        "Statistical Distribution",
        "Animated Timeline"
    ])
    with viz_tab1:
        _render_comparison_tab(filtered_df, numerical_cols)
    with viz_tab2:
        _render_scatter_tab(filtered_df, selected_year)
    with viz_tab3:
        _render_geographic_tab(filtered_df, selected_year)
    with viz_tab4:
        _render_distribution_tab(filtered_df)
    with viz_tab5:
        # The time-based charts use every year of the dataset, restricted
        # only by the continent selection.
        _render_animation_tab(df, selected_continent)
# Distribution Visualizations: histogram with mean/median markers and a
# per-continent box plot for one user-chosen metric, over the whole dataset.
st.header("π Overall Data Distributions")
# Display names for the metric columns. Deriving them with
# str.replace('_', ' ').title() lower-cases the inner capitals
# ('lifeExp' -> 'Lifeexp'), so the readable names are looked up instead.
distribution_labels = {
    'lifeExp': 'Life Expectancy',
    'pop': 'Population',
    'gdpPercap': 'GDP per Capita'
}
selected_metric = st.selectbox("Select metric to visualize", numerical_cols,
                               format_func=lambda x: distribution_labels[x])
selected_label = distribution_labels[selected_metric]
col1, col2 = st.columns(2)
with col1:
    fig = px.histogram(df, x=selected_metric, nbins=50,
                       title=f'Distribution of {selected_label}',
                       labels={selected_metric: selected_label})
    # Dashed vertical guides mark the mean (red) and the median (green).
    fig.add_vline(x=df[selected_metric].mean(), line_dash="dash",
                  line_color="red", annotation_text="Mean")
    fig.add_vline(x=df[selected_metric].median(), line_dash="dash",
                  line_color="green", annotation_text="Median")
    fig.update_layout(transition={'duration': 700})
    st.plotly_chart(fig, use_container_width=True)
with col2:
    fig = px.box(df, y=selected_metric, x='continent',
                 title=f'{selected_label} by Continent',
                 labels={selected_metric: selected_label})
    fig.update_layout(transition={'duration': 700})
    st.plotly_chart(fig, use_container_width=True)
# Footer: horizontal rule, data-source link and version note.
footer_lines = (
    "---",
    "Data source: [Gapminder](https://www.gapminder.org/)",
    "**Version 2** - Enhanced with interactive visualizations in Filter & Analyze section",
)
for footer_line in footer_lines:
    st.markdown(footer_line)