Spaces:
Sleeping
Sleeping
import io

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import streamlit as st
from scipy import stats
from statsmodels.formula.api import ols
# Configure the page (wide layout, browser-tab title), then render the app heading
# and a one-line introduction.
st.set_page_config(page_title="Student Performance EDA", layout="wide")

st.title("📊 Student Performance in Exams: Exploratory Data Analysis")
st.markdown(
    "This application presents an Exploratory Data Analysis of the "
    "Students Performance in Exams dataset."
)
# Load the dataset
def load_data(path='StudentsPerformance.csv'):
    """Read the student performance CSV and prepare it for analysis.

    Args:
        path: Source of the CSV data; anything ``pandas.read_csv`` accepts.
            Defaults to ``'StudentsPerformance.csv'``, the file name this
            function previously hard-coded.

    Returns:
        A DataFrame in which the columns containing spaces or a slash are
        renamed to snake_case, plus an ``average_score`` column holding the
        row-wise mean of the math, reading and writing scores.
    """
    # Rename columns for consistency and easier use with statsmodels and plotting
    # (formula strings cannot reference names containing spaces or '/').
    renamed_columns = {
        'parental level of education': 'parental_level_of_education',
        'test preparation course': 'test_preparation_course',
        'math score': 'math_score',
        'reading score': 'reading_score',
        'writing score': 'writing_score',
        'race/ethnicity': 'race_ethnicity',
    }
    df = pd.read_csv(path).rename(columns=renamed_columns)

    subject_columns = ['math_score', 'reading_score', 'writing_score']
    df['average_score'] = df[subject_columns].mean(axis=1)
    return df
df = load_data()

# Section 1: basic shape, preview, schema, summary statistics and missing-value counts.
st.header("1. Dataset Overview")
st.write("**Shape of the dataset:**", df.shape)

st.write("**First 5 rows:**")
st.dataframe(df.head())

st.write("**Column Information:**")
# DataFrame.info() writes its report to a stream and returns None, so its return
# value cannot be displayed; capture the report in a buffer and show that text.
info_buffer = io.StringIO()
df.info(buf=info_buffer)
st.text(info_buffer.getvalue())

st.write("**Descriptive Statistics for Scores:**")
st.dataframe(df[['math_score', 'reading_score', 'writing_score', 'average_score']].describe())

st.write("**Missing values per column:**")
st.dataframe(df.isnull().sum().to_frame(name='Missing Values'))
# Section 2: side-by-side histograms (with KDE overlay) of the three subject scores.
st.header("2. Score Distributions")

fig_hist, axes_hist = plt.subplots(nrows=1, ncols=3, figsize=(20, 6))
fig_hist.suptitle('Distribution of Student Scores', fontsize=18)

# (column, panel title, bar colour) for each of the three panels, left to right.
histogram_specs = [
    ('math_score', 'Math Scores', 'skyblue'),
    ('reading_score', 'Reading Scores', 'lightcoral'),
    ('writing_score', 'Writing Scores', 'lightgreen'),
]
for hist_ax, (hist_column, hist_title, hist_color) in zip(axes_hist, histogram_specs):
    sns.histplot(df[hist_column], kde=True, ax=hist_ax, color=hist_color)
    hist_ax.set_title(hist_title)

st.pyplot(fig_hist)
# Figures created through pyplot stay registered until closed; the script is
# re-executed on every interaction, so close the figure once it has been rendered.
plt.close(fig_hist)
# Section 3: pairwise correlation of the subject scores and their average.
st.header("3. Correlation Between Scores")
st.write("The scores (math, reading, writing) are highly correlated with each other, indicating that performance in one subject is generally reflected in others.")

score_cols = ['math_score', 'reading_score', 'writing_score', 'average_score']
correlation_matrix = df[score_cols].corr()

fig_corr, ax_corr = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax_corr)
ax_corr.set_title('Correlation Matrix of Student Scores')
st.pyplot(fig_corr)
# Figures created through pyplot stay registered until closed; the script is
# re-executed on every interaction, so close the figure once it has been rendered.
plt.close(fig_corr)

st.header("4. Impact of Categorical Variables on Average Score")
def plot_and_analyze_categorical(dataframe, category_col, score_col='average_score'):
    """Render a violin plot and a significance test for one categorical column.

    Writes a numbered "4.x" subheader, a violin plot of ``score_col`` grouped by
    ``category_col``, and the result of a statistical test to the Streamlit page:
    an independent t-test when the column has exactly two categories, otherwise
    a one-way ANOVA (type 2) fitted with an OLS formula. Results are judged
    against a 0.05 significance threshold.

    The subsection number is kept in the function attribute ``counter``, which
    is incremented on every call; callers may set it to choose the first number.

    Args:
        dataframe: DataFrame containing ``category_col`` and ``score_col``.
        category_col: Name of the categorical column to group by. It is
            interpolated into a statsmodels formula, so it must be a valid
            Python identifier.
        score_col: Name of the numeric column to compare across groups.

    Returns:
        None. All output is rendered through Streamlit.
    """
    category_label = category_col.replace('_', ' ').title()
    score_label = score_col.replace('_', ' ')

    # Fall back to 1 so a call made before the attribute is initialised still works.
    section_number = getattr(plot_and_analyze_categorical, 'counter', 1)
    st.subheader(f"4.{section_number}. {category_label}")
    plot_and_analyze_categorical.counter = section_number + 1

    # Violin Plot
    fig_violin, ax_violin = plt.subplots(figsize=(10, 6))
    sns.violinplot(x=category_col, y=score_col, data=dataframe, ax=ax_violin)
    ax_violin.set_title(f"{score_label.title()} by {category_label}")
    ax_violin.tick_params(axis='x', rotation=45)
    st.pyplot(fig_violin)
    # Figures created through pyplot stay registered until closed; release this
    # one now that it has been rendered so reruns do not accumulate figures.
    plt.close(fig_violin)

    # Statistical Test
    st.write("**Statistical Test Results:**")
    # Drop missing values so the category list agrees with nunique(), which ignores NaN.
    categories = dataframe[category_col].dropna().unique()
    if len(categories) == 2:
        first_level, second_level = categories
        group1 = dataframe.loc[dataframe[category_col] == first_level, score_col]
        group2 = dataframe.loc[dataframe[category_col] == second_level, score_col]
        t_stat, p_val = stats.ttest_ind(group1, group2)
        st.write(f"Independent t-test between **{first_level}** and **{second_level}** for {score_label}:")
        st.write(f" t-statistic = {t_stat:.3f}, p-value = {p_val:.3f}")
        if p_val < 0.05:
            st.success(" **Conclusion: Statistically significant difference between groups (p < 0.05).**")
        else:
            st.info(" Conclusion: No statistically significant difference between groups (p >= 0.05).")
    else:
        model = ols(f'{score_col} ~ C({category_col})', data=dataframe).fit()
        anova_table = sm.stats.anova_lm(model, typ=2)
        st.write(f"ANOVA for {category_col} on {score_label}:")
        st.dataframe(anova_table)
        # The table is indexed by term name, so take the first row (the categorical
        # term) by position with .iloc instead of an integer key lookup.
        if anova_table['PR(>F)'].iloc[0] < 0.05:
            st.success(" **Conclusion: Statistically significant differences between group means (p < 0.05).**")
        else:
            st.info(" Conclusion: No statistically significant differences between group means (p >= 0.05).")
# Subsection numbering ("4.x") starts at 1.
plot_and_analyze_categorical.counter = 1  # Initialize counter

categorical_cols = [
    'gender',
    'race_ethnicity',
    'parental_level_of_education',
    'lunch',
    'test_preparation_course',
]

# One subsection (violin plot + significance test) per categorical column.
# NOTE(review): the source's indentation was lost; the horizontal rule is assumed
# to belong to the loop body (one separator after each subsection) — confirm.
for col in categorical_cols:
    plot_and_analyze_categorical(df, col)
    st.markdown("--- ")

st.success("**EDA complete!** The application highlights key insights into factors influencing student performance.")