# eagle0504's picture
# Upload folder using huggingface_hub
# 305e4f3 verified
import io

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import streamlit as st
from scipy import stats
from statsmodels.formula.api import ols
# Page-level configuration, followed by the app title and a one-line intro.
st.set_page_config(
    page_title="Student Performance EDA",
    layout="wide",
)
st.title("📊 Student Performance in Exams: Exploratory Data Analysis")
st.markdown(
    "This application presents an Exploratory Data Analysis of the Students Performance in Exams dataset."
)
# Load the dataset
@st.cache_data
def load_data():
    """Read the student-performance CSV and return it ready for analysis.

    Column names that contain spaces or a slash are replaced with snake_case
    names (so they can be used in statsmodels formulas and plot arguments),
    and an ``average_score`` column holding the row-wise mean of the three
    subject scores is appended.

    Returns:
        The prepared DataFrame.
    """
    column_aliases = {
        'parental level of education': 'parental_level_of_education',
        'test preparation course': 'test_preparation_course',
        'math score': 'math_score',
        'reading score': 'reading_score',
        'writing score': 'writing_score',
        'race/ethnicity': 'race_ethnicity',
    }
    subject_columns = ['math_score', 'reading_score', 'writing_score']

    scores = pd.read_csv('StudentsPerformance.csv').rename(columns=column_aliases)
    scores['average_score'] = scores[subject_columns].mean(axis=1)
    return scores
df = load_data()

# --- Section 1: shape, sample rows, column info, summary stats, null counts ---
st.header("1. Dataset Overview")
st.write("**Shape of the dataset:**", df.shape)

st.write("**First 5 rows:**")
st.dataframe(df.head())

st.write("**Column Information:**")
# DataFrame.info() writes its report to a stream and returns None, so passing
# its return value to st.text never shows the report. Capture the report in an
# in-memory buffer and render that text instead.
info_buffer = io.StringIO()
df.info(buf=info_buffer)
st.text(info_buffer.getvalue())

st.write("**Descriptive Statistics for Scores:**")
st.dataframe(df[['math_score', 'reading_score', 'writing_score', 'average_score']].describe())

st.write("**Missing values per column:**")
st.dataframe(df.isnull().sum().to_frame(name='Missing Values'))
# --- Section 2: one histogram (with KDE overlay) per subject, side by side ---
st.header("2. Score Distributions")
fig_hist, axes_hist = plt.subplots(nrows=1, ncols=3, figsize=(20, 6))
fig_hist.suptitle('Distribution of Student Scores', fontsize=18)

# (column, panel title, bar colour) for each of the three panels, left to right.
histogram_specs = (
    ('math_score', 'Math Scores', 'skyblue'),
    ('reading_score', 'Reading Scores', 'lightcoral'),
    ('writing_score', 'Writing Scores', 'lightgreen'),
)
for hist_ax, (hist_column, hist_title, hist_color) in zip(axes_hist, histogram_specs):
    sns.histplot(df[hist_column], kde=True, ax=hist_ax, color=hist_color)
    hist_ax.set_title(hist_title)

st.pyplot(fig_hist)
# pyplot keeps every figure it creates registered until it is explicitly
# closed; without this, each rerun of the script leaves another figure in memory.
plt.close(fig_hist)
# --- Section 3: pairwise correlation of the three subject scores and their mean ---
st.header("3. Correlation Between Scores")
st.write("The scores (math, reading, writing) are highly correlated with each other, indicating that performance in one subject is generally reflected in others.")

score_cols = ['math_score', 'reading_score', 'writing_score', 'average_score']
correlation_matrix = df[score_cols].corr()

fig_corr, ax_corr = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax_corr)
ax_corr.set_title('Correlation Matrix of Student Scores')
st.pyplot(fig_corr)
# Release the figure once rendered so figures do not pile up across reruns.
plt.close(fig_corr)
st.header("4. Impact of Categorical Variables on Average Score")


def plot_and_analyze_categorical(dataframe, category_col, score_col='average_score'):
    """Render a violin plot and a significance test for one categorical column.

    Writes a numbered "4.N" subheader, a violin plot of ``score_col`` grouped
    by ``category_col``, and then the result of either an independent
    two-sample t-test (exactly two non-null categories) or a one-way ANOVA
    (any other number of categories), with a conclusion at the 0.05 level.

    The subsection number is stored in the function attribute ``counter`` and
    is incremented on every call; it starts at 1 if the caller never set it.

    Args:
        dataframe: DataFrame containing both columns.
        category_col: Name of the grouping column. It is interpolated into a
            statsmodels formula as ``C(<name>)``, so it must be a valid
            formula identifier.
        score_col: Name of the numeric column compared across the groups.
    """
    # Default to 1 so a call made before the attribute is initialised does not
    # fail with AttributeError.
    section_number = getattr(plot_and_analyze_categorical, 'counter', 1)
    plot_and_analyze_categorical.counter = section_number + 1

    category_label = category_col.replace('_', ' ').title()
    score_label = score_col.replace('_', ' ')
    st.subheader(f"4.{section_number}. {category_label}")

    # Violin Plot
    fig_violin, ax_violin = plt.subplots(figsize=(10, 6))
    sns.violinplot(x=category_col, y=score_col, data=dataframe, ax=ax_violin)
    ax_violin.set_title(f"{score_label.title()} by {category_label}")
    ax_violin.tick_params(axis='x', rotation=45)
    st.pyplot(fig_violin)
    # Close the figure after rendering; pyplot otherwise keeps it alive and
    # figures accumulate with every call and every rerun.
    plt.close(fig_violin)

    # Statistical Test
    st.write("**Statistical Test Results:**")

    # Take the group labels from the non-null values only, so that the number
    # of labels and the labels themselves always agree (a missing value must
    # never be picked as one of the two t-test groups).
    categories = dataframe[category_col].dropna().unique()

    if len(categories) == 2:
        first_label, second_label = categories
        group1 = dataframe.loc[dataframe[category_col] == first_label, score_col]
        group2 = dataframe.loc[dataframe[category_col] == second_label, score_col]
        t_stat, p_val = stats.ttest_ind(group1, group2)
        st.write(f"Independent t-test between **{first_label}** and **{second_label}** for {score_label}:")
        st.write(f" t-statistic = {t_stat:.3f}, p-value = {p_val:.3f}")
        if p_val < 0.05:
            st.success(" **Conclusion: Statistically significant difference between groups (p < 0.05).**")
        else:
            st.info(" Conclusion: No statistically significant difference between groups (p >= 0.05).")
    else:
        model = ols(f'{score_col} ~ C({category_col})', data=dataframe).fit()
        anova_table = sm.stats.anova_lm(model, typ=2)
        st.write(f"ANOVA for {category_col} on {score_label}:")
        st.dataframe(anova_table)
        # The table is indexed by term name, so select the first row (the
        # categorical term) by position with .iloc; an integer key passed to
        # [] on a label-indexed Series is deprecated in pandas.
        anova_p_val = anova_table['PR(>F)'].iloc[0]
        if anova_p_val < 0.05:
            st.success(" **Conclusion: Statistically significant differences between group means (p < 0.05).**")
        else:
            st.info(" Conclusion: No statistically significant differences between group means (p >= 0.05).")
# Subsection numbering for section 4 starts at 1; the helper increments it on
# every call.
plot_and_analyze_categorical.counter = 1

# Run the plot-and-test routine once per categorical column, in display order.
categorical_cols = [
    'gender',
    'race_ethnicity',
    'parental_level_of_education',
    'lunch',
    'test_preparation_course',
]
for categorical_col in categorical_cols:
    plot_and_analyze_categorical(df, categorical_col)

# Footer: horizontal rule followed by a closing banner.
st.markdown("--- ")
st.success(
    "**EDA complete!** The application highlights key insights into factors influencing student performance."
)