# Streamlit app: exploratory data analysis of the "Students Performance in
# Exams" dataset (score distributions, correlations, and the effect of each
# categorical variable on the average score).

import io

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import streamlit as st
from scipy import stats
from statsmodels.formula.api import ols

# Must be the first Streamlit command executed in the script.
st.set_page_config(layout="wide", page_title="Student Performance EDA")

st.title("📊 Student Performance in Exams: Exploratory Data Analysis")
st.markdown("This application presents an Exploratory Data Analysis of the Students Performance in Exams dataset.")


# Load the dataset
@st.cache_data
def load_data() -> pd.DataFrame:
    """Read 'StudentsPerformance.csv' and return a cleaned DataFrame.

    Columns containing spaces or slashes are renamed to snake_case so they
    can be used directly in statsmodels formulas, and an ``average_score``
    column (row-wise mean of the three subject scores) is added.

    Raises:
        FileNotFoundError: if the CSV is not in the current working directory.
    """
    df = pd.read_csv('StudentsPerformance.csv')
    # Rename columns for consistency and easier use with statsmodels and plotting
    df = df.rename(columns={
        'parental level of education': 'parental_level_of_education',
        'test preparation course': 'test_preparation_course',
        'math score': 'math_score',
        'reading score': 'reading_score',
        'writing score': 'writing_score',
        'race/ethnicity': 'race_ethnicity',
    })
    df['average_score'] = df[['math_score', 'reading_score', 'writing_score']].mean(axis=1)
    return df


try:
    df = load_data()
except FileNotFoundError:
    # Show an actionable message in the app instead of a raw traceback.
    st.error("Could not find 'StudentsPerformance.csv'. Place it in the working directory and rerun the app.")
    st.stop()

st.header("1. Dataset Overview")
st.write("**Shape of the dataset:**", df.shape)
st.write("**First 5 rows:**")
st.dataframe(df.head())

st.write("**Column Information:**")
# DataFrame.info() prints to stdout and returns None, so capture its output
# in a buffer and render that text.
info_buffer = io.StringIO()
df.info(buf=info_buffer)
st.text(info_buffer.getvalue())

st.write("**Descriptive Statistics for Scores:**")
st.dataframe(df[['math_score', 'reading_score', 'writing_score', 'average_score']].describe())
st.write("**Missing values per column:**")
st.dataframe(df.isnull().sum().to_frame(name='Missing Values'))

st.header("2. Score Distributions")
fig_hist, axes_hist = plt.subplots(nrows=1, ncols=3, figsize=(20, 6))
fig_hist.suptitle('Distribution of Student Scores', fontsize=18)
histogram_specs = [
    ('math_score', 'Math Scores', 'skyblue'),
    ('reading_score', 'Reading Scores', 'lightcoral'),
    ('writing_score', 'Writing Scores', 'lightgreen'),
]
for ax_hist, (hist_col, hist_title, hist_color) in zip(axes_hist, histogram_specs):
    sns.histplot(df[hist_col], kde=True, ax=ax_hist, color=hist_color)
    ax_hist.set_title(hist_title)
st.pyplot(fig_hist)
# Close the figure once rendered; the script reruns on every interaction and
# pyplot would otherwise keep every figure alive.
plt.close(fig_hist)

st.header("3. Correlation Between Scores")
st.write("The scores (math, reading, writing) are highly correlated with each other, indicating that performance in one subject is generally reflected in others.")
score_cols = ['math_score', 'reading_score', 'writing_score', 'average_score']
correlation_matrix = df[score_cols].corr()
fig_corr, ax_corr = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax_corr)
ax_corr.set_title('Correlation Matrix of Student Scores')
st.pyplot(fig_corr)
plt.close(fig_corr)

st.header("4. Impact of Categorical Variables on Average Score")


def plot_and_analyze_categorical(
    dataframe: pd.DataFrame,
    category_col: str,
    score_col: str = 'average_score',
) -> None:
    """Render a violin plot and a significance test for one categorical column.

    A numbered subheader is written first, using (and then incrementing) the
    ``plot_and_analyze_categorical.counter`` function attribute, which must be
    initialised before the first call. If the column has exactly two non-null
    categories an independent t-test is run; otherwise a one-way ANOVA
    (type II) is fitted with statsmodels.

    Args:
        dataframe: data containing both ``category_col`` and ``score_col``.
        category_col: name of the categorical column to group by.
        score_col: name of the numeric column to compare across groups.
    """
    pretty_category = category_col.replace('_', ' ').title()
    pretty_score = score_col.replace('_', ' ')

    st.subheader(f"4.{plot_and_analyze_categorical.counter}. {pretty_category}")
    plot_and_analyze_categorical.counter += 1

    # Violin Plot
    fig_violin, ax_violin = plt.subplots(figsize=(10, 6))
    sns.violinplot(x=category_col, y=score_col, data=dataframe, ax=ax_violin)
    ax_violin.set_title(f"{pretty_score.title()} by {pretty_category}")
    # tick_params() has no horizontal-alignment option, so only rotate here.
    ax_violin.tick_params(axis='x', rotation=45)
    st.pyplot(fig_violin)
    plt.close(fig_violin)

    # Statistical Test
    st.write("**Statistical Test Results:**")
    # Drop NaN before collecting labels so a missing value is never treated
    # as one of the two groups (nunique() ignores NaN, unique() does not).
    group_labels = dataframe[category_col].dropna().unique()
    if len(group_labels) == 2:
        first_label, second_label = group_labels
        group1 = dataframe.loc[dataframe[category_col] == first_label, score_col]
        group2 = dataframe.loc[dataframe[category_col] == second_label, score_col]
        t_stat, p_val = stats.ttest_ind(group1, group2)
        st.write(f"Independent t-test between **{first_label}** and **{second_label}** for {pretty_score}:")
        st.write(f" t-statistic = {t_stat:.3f}, p-value = {p_val:.3f}")
        if p_val < 0.05:
            st.success(" **Conclusion: Statistically significant difference between groups (p < 0.05).**")
        else:
            st.info(" Conclusion: No statistically significant difference between groups (p >= 0.05).")
    else:
        model = ols(f'{score_col} ~ C({category_col})', data=dataframe).fit()
        anova_table = sm.stats.anova_lm(model, typ=2)
        st.write(f"ANOVA for {category_col} on {pretty_score}:")
        st.dataframe(anova_table)
        # The table is indexed by term name; use positional access explicitly
        # to read the p-value of the first (categorical) term.
        if anova_table['PR(>F)'].iloc[0] < 0.05:
            st.success(" **Conclusion: Statistically significant differences between group means (p < 0.05).**")
        else:
            st.info(" Conclusion: No statistically significant differences between group means (p >= 0.05).")


plot_and_analyze_categorical.counter = 1  # Initialize counter

categorical_cols = ['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch', 'test_preparation_course']
for col in categorical_cols:
    plot_and_analyze_categorical(df, col)

st.markdown("--- ")
st.success("**EDA complete!** The application highlights key insights into factors influencing student performance.")