Spaces:
Sleeping
Sleeping
File size: 5,123 Bytes
3e50e1d fa7a811 3e50e1d 5b7fadd 3e50e1d ca7cccf 305e4f3 3e50e1d 0752a49 3e50e1d a14e5d3 3e50e1d | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 | import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
# Page chrome: wide layout and browser-tab title, followed by the on-page title
# and a one-sentence description of the app.
st.set_page_config(page_title="Student Performance EDA", layout="wide")
st.title("📊 Student Performance in Exams: Exploratory Data Analysis")
st.markdown(
    "This application presents an Exploratory Data Analysis of the Students Performance in Exams dataset."
)
# Dataset loading (cached by Streamlit via the decorator below).
@st.cache_data
def load_data():
    """Read the exam-results CSV and return it with normalized column names.

    Column names containing spaces or ``/`` are renamed to snake_case so they
    can be used in statsmodels formulas and plotting calls. An
    ``average_score`` column holding the row-wise mean of the math, reading
    and writing scores is appended.

    Returns:
        The prepared ``pandas.DataFrame``.
    """
    snake_case_names = {
        'parental level of education': 'parental_level_of_education',
        'test preparation course': 'test_preparation_course',
        'math score': 'math_score',
        'reading score': 'reading_score',
        'writing score': 'writing_score',
        'race/ethnicity': 'race_ethnicity',
    }
    subject_columns = ['math_score', 'reading_score', 'writing_score']

    frame = pd.read_csv('StudentsPerformance.csv').rename(columns=snake_case_names)
    frame['average_score'] = frame[subject_columns].mean(axis=1)
    return frame
# Imported here so this section is self-contained; used to capture the text
# report produced by DataFrame.info().
import io

df = load_data()

# --- Section 1: shape, sample rows, column info, summary stats, missing values ---
st.header("1. Dataset Overview")
st.write("**Shape of the dataset:**", df.shape)

st.write("**First 5 rows:**")
st.dataframe(df.head())

st.write("**Column Information:**")
# DataFrame.info() writes its report to a stream and returns None, so passing
# its return value to st.text() would render the literal "None". Capture the
# report in an in-memory buffer and display the captured text instead.
info_buffer = io.StringIO()
df.info(buf=info_buffer)
st.text(info_buffer.getvalue())

st.write("**Descriptive Statistics for Scores:**")
st.dataframe(df[['math_score', 'reading_score', 'writing_score', 'average_score']].describe())

st.write("**Missing values per column:**")
st.dataframe(df.isnull().sum().to_frame(name='Missing Values'))
# --- Section 2: one histogram (with KDE overlay) per subject score ---
st.header("2. Score Distributions")
fig_hist, axes_hist = plt.subplots(nrows=1, ncols=3, figsize=(20, 6))
fig_hist.suptitle('Distribution of Student Scores', fontsize=18)

# One (column, bar colour, panel title) triple per panel, left to right.
hist_panels = [
    ('math_score', 'skyblue', 'Math Scores'),
    ('reading_score', 'lightcoral', 'Reading Scores'),
    ('writing_score', 'lightgreen', 'Writing Scores'),
]
for ax_hist, (hist_column, hist_color, hist_title) in zip(axes_hist, hist_panels):
    sns.histplot(df[hist_column], kde=True, ax=ax_hist, color=hist_color)
    ax_hist.set_title(hist_title)

st.pyplot(fig_hist)
# Figures created through pyplot stay registered until explicitly closed;
# close this one after rendering so reruns of the script do not pile up
# open figures.
plt.close(fig_hist)
# --- Section 3: pairwise correlation of the three subject scores and their mean ---
st.header("3. Correlation Between Scores")
st.write("The scores (math, reading, writing) are highly correlated with each other, indicating that performance in one subject is generally reflected in others.")

score_cols = ['math_score', 'reading_score', 'writing_score', 'average_score']
correlation_matrix = df[score_cols].corr()

fig_corr, ax_corr = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax_corr)
ax_corr.set_title('Correlation Matrix of Student Scores')
st.pyplot(fig_corr)
# Figures created through pyplot stay registered until explicitly closed;
# close this one after rendering so reruns of the script do not pile up
# open figures.
plt.close(fig_corr)
st.header("4. Impact of Categorical Variables on Average Score")


def plot_and_analyze_categorical(dataframe, category_col, score_col='average_score'):
    """Render a violin plot and a significance test for one categorical column.

    Writes a numbered subsection to the Streamlit page containing a violin
    plot of ``score_col`` grouped by ``category_col`` and the result of a
    statistical test: an independent two-sample t-test when the column has
    exactly two non-null levels, otherwise a one-way ANOVA (type II) fitted
    with an OLS formula. The conclusion is reported at the 0.05 level.

    The subsection number is kept on the function object as
    ``plot_and_analyze_categorical.counter`` and is incremented on each call.

    Args:
        dataframe: DataFrame holding both ``category_col`` and ``score_col``.
        category_col: Name of the categorical column to group by. It is
            interpolated into a statsmodels formula, so it has to be a valid
            formula identifier.
        score_col: Name of the numeric column to compare across groups.

    Returns:
        None. All output is written to the Streamlit page.
    """
    # Fall back to 1 when the caller has not preset the counter attribute,
    # instead of failing with AttributeError on the first call.
    section_no = getattr(plot_and_analyze_categorical, 'counter', 1)
    plot_and_analyze_categorical.counter = section_no + 1

    category_label = category_col.replace('_', ' ').title()
    score_label = score_col.replace('_', ' ')
    st.subheader(f"4.{section_no}. {category_label}")

    # Violin plot of the score distribution per category level.
    fig_violin, ax_violin = plt.subplots(figsize=(10, 6))
    sns.violinplot(x=category_col, y=score_col, data=dataframe, ax=ax_violin)
    ax_violin.set_title(f"{score_label.title()} by {category_label}")
    # Rotate the x tick labels by 45 degrees.
    ax_violin.tick_params(axis='x', rotation=45)
    st.pyplot(fig_violin)
    # Figures created through pyplot stay registered until explicitly closed;
    # this function runs once per category on every rerun, so close it here.
    plt.close(fig_violin)

    st.write("**Statistical Test Results:**")
    # Compute the levels once. dropna() keeps this consistent with nunique(),
    # which ignores missing values, so a NaN can never be picked as a group.
    levels = dataframe[category_col].dropna().unique()
    if len(levels) == 2:
        first_level, second_level = levels
        group1 = dataframe.loc[dataframe[category_col] == first_level, score_col]
        group2 = dataframe.loc[dataframe[category_col] == second_level, score_col]
        t_stat, p_val = stats.ttest_ind(group1, group2)
        st.write(f"Independent t-test between **{first_level}** and **{second_level}** for {score_label}:")
        st.write(f" t-statistic = {t_stat:.3f}, p-value = {p_val:.3f}")
        if p_val < 0.05:
            st.success(" **Conclusion: Statistically significant difference between groups (p < 0.05).**")
        else:
            st.info(" Conclusion: No statistically significant difference between groups (p >= 0.05).")
    else:
        model = ols(f'{score_col} ~ C({category_col})', data=dataframe).fit()
        anova_table = sm.stats.anova_lm(model, typ=2)
        st.write(f"ANOVA for {category_col} on {score_label}:")
        st.dataframe(anova_table)
        # The table is indexed by term name, so take the first row (the
        # categorical term) by position with .iloc; plain [0] on a
        # label-indexed Series is deprecated positional access.
        anova_p_value = anova_table['PR(>F)'].iloc[0]
        if anova_p_value < 0.05:
            st.success(" **Conclusion: Statistically significant differences between group means (p < 0.05).**")
        else:
            st.info(" Conclusion: No statistically significant differences between group means (p >= 0.05).")
# Section 4 driver: run the plot-and-test routine once per categorical column.
# Subsection numbering starts at 1; the function increments this attribute on
# every call.
plot_and_analyze_categorical.counter = 1 # Initialize counter
# Categorical columns to analyse, using the snake_case names the loader assigns.
categorical_cols = ['gender', 'race_ethnicity', 'parental_level_of_education', 'lunch', 'test_preparation_course']
# NOTE(review): indentation was lost in this copy of the file. The call on the
# line after the `for` must be the loop body; whether the divider
# (`st.markdown("--- ")`) is also inside the loop or runs once afterwards
# cannot be determined from here — confirm against the original before
# re-indenting.
for col in categorical_cols:
plot_and_analyze_categorical(df, col)
st.markdown("--- ")
# Closing banner shown at the end of the page.
st.success("**EDA complete!** The application highlights key insights into factors influencing student performance.")
|