# Source: Hugging Face Spaces - eagle0504/eda_on_exam_data (status: Sleeping; file size: 5,123 bytes)

import io

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import streamlit as st
from scipy import stats
from statsmodels.formula.api import ols

# Configure the page (wide layout, page title) before rendering anything.
st.set_page_config(
    page_title="Student Performance EDA",
    layout="wide",
)

# Main heading followed by a one-sentence description of the app.
st.title("📊 Student Performance in Exams: Exploratory Data Analysis")
st.markdown(
    "This application presents an Exploratory Data Analysis of the Students Performance in Exams dataset."
)

# Load the dataset
@st.cache_data
def load_data(csv_path: str = 'StudentsPerformance.csv') -> pd.DataFrame:
    """Read the exam-results CSV and return it with normalized column names.

    Args:
        csv_path: Path of the CSV file to read. Defaults to
            'StudentsPerformance.csv', the file this app originally
            hard-coded.

    Returns:
        The loaded DataFrame with the space- and slash-separated column
        names replaced by snake_case names, plus an 'average_score' column
        holding the row-wise mean of the math, reading and writing scores.
    """
    # snake_case names keep the columns consistent and easier to use with
    # statsmodels formulas and plotting.
    renamed_columns = {
        'parental level of education': 'parental_level_of_education',
        'test preparation course': 'test_preparation_course',
        'math score': 'math_score',
        'reading score': 'reading_score',
        'writing score': 'writing_score',
        'race/ethnicity': 'race_ethnicity',
    }
    df = pd.read_csv(csv_path).rename(columns=renamed_columns)
    df['average_score'] = df[['math_score', 'reading_score', 'writing_score']].mean(axis=1)
    return df

df = load_data()

# Section 1: shape, a preview, column metadata, summary statistics and
# missing-value counts.
st.header("1. Dataset Overview")
st.write("**Shape of the dataset:**", df.shape)
st.write("**First 5 rows:**")
st.dataframe(df.head())

st.write("**Column Information:**")
# DataFrame.info() writes its report to a stream (stdout by default) and
# returns None, so passing its result to st.text would display "None".
# Capture the report in a buffer and show the captured text instead.
info_buffer = io.StringIO()
df.info(buf=info_buffer)
st.text(info_buffer.getvalue())

st.write("**Descriptive Statistics for Scores:**")
st.dataframe(df[['math_score', 'reading_score', 'writing_score', 'average_score']].describe())

st.write("**Missing values per column:**")
st.dataframe(df.isnull().sum().to_frame(name='Missing Values'))

# Section 2: one histogram (with KDE overlay) per subject, side by side.
st.header("2. Score Distributions")
fig_hist, axes_hist = plt.subplots(nrows=1, ncols=3, figsize=(20, 6))
fig_hist.suptitle('Distribution of Student Scores', fontsize=18)

# (column, subplot title, bar colour) for each of the three panels.
histogram_specs = [
    ('math_score', 'Math Scores', 'skyblue'),
    ('reading_score', 'Reading Scores', 'lightcoral'),
    ('writing_score', 'Writing Scores', 'lightgreen'),
]
for hist_ax, (hist_column, hist_title, hist_color) in zip(axes_hist, histogram_specs):
    sns.histplot(df[hist_column], kde=True, ax=hist_ax, color=hist_color)
    hist_ax.set_title(hist_title)

st.pyplot(fig_hist)
# Figures created through pyplot stay registered until closed; release this
# one once it has been rendered so reruns of the script do not accumulate
# open figures.
plt.close(fig_hist)

# Section 3: pairwise correlation of the three subject scores and their mean.
st.header("3. Correlation Between Scores")
st.write("The scores (math, reading, writing) are highly correlated with each other, indicating that performance in one subject is generally reflected in others.")
score_cols = ['math_score', 'reading_score', 'writing_score', 'average_score']
correlation_matrix = df[score_cols].corr()
fig_corr, ax_corr = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax_corr)
ax_corr.set_title('Correlation Matrix of Student Scores')
st.pyplot(fig_corr)
# Figures created through pyplot stay registered until closed; release this
# one once it has been rendered so reruns of the script do not accumulate
# open figures.
plt.close(fig_corr)

# Section 4: its numbered subsections (4.1, 4.2, ...) are rendered by
# plot_and_analyze_categorical, called once per categorical column below.
st.header("4. Impact of Categorical Variables on Average Score")

def plot_and_analyze_categorical(dataframe, category_col, score_col='average_score'):
    """Render a violin plot and a significance test for one categorical column.

    Writes a numbered "4.N" subheader, a violin plot of ``score_col`` grouped
    by ``category_col``, and the result of a statistical test to the
    Streamlit page. With exactly two groups an independent two-sample t-test
    (``scipy.stats.ttest_ind``) is used; with more groups an ANOVA table is
    computed from an OLS fit of ``score_col ~ C(category_col)``. A p-value
    below 0.05 is reported as significant.

    The subsection number is kept in the function attribute ``counter``,
    which is incremented on every call.

    Args:
        dataframe: DataFrame containing both columns.
        category_col: Name of the grouping column. It is interpolated into a
            statsmodels formula, so it must be a valid identifier.
        score_col: Name of the numeric column to compare across groups.

    Returns:
        None. All output goes to the Streamlit page.
    """
    # Fall back to 1 so the function still works when the caller has not
    # initialised the counter attribute.
    section_number = getattr(plot_and_analyze_categorical, 'counter', 1)
    category_label = category_col.replace('_', ' ').title()
    score_label = score_col.replace('_', ' ')

    st.subheader(f"4.{section_number}. {category_label}")
    plot_and_analyze_categorical.counter = section_number + 1

    # Violin Plot
    fig_violin, ax_violin = plt.subplots(figsize=(10, 6))
    sns.violinplot(x=category_col, y=score_col, data=dataframe, ax=ax_violin)
    ax_violin.set_title(f"{score_label.title()} by {category_label}")
    ax_violin.tick_params(axis='x', rotation=45)
    st.pyplot(fig_violin)
    # Release the figure once rendered so repeated calls and script reruns do
    # not accumulate open pyplot figures.
    plt.close(fig_violin)

    # Statistical Test
    st.write("**Statistical Test Results:**")
    # Compute the distinct groups once; dropping NaN keeps this in line with
    # a nunique()-style count, which ignores missing values.
    levels = dataframe[category_col].dropna().unique()

    if len(levels) < 2:
        # Neither test is defined without at least two groups to compare.
        st.info("  Not enough groups to run a statistical test.")
        return

    if len(levels) == 2:
        first_level, second_level = levels
        group1 = dataframe.loc[dataframe[category_col] == first_level, score_col]
        group2 = dataframe.loc[dataframe[category_col] == second_level, score_col]
        t_stat, p_val = stats.ttest_ind(group1, group2)
        st.write(f"Independent t-test between **{first_level}** and **{second_level}** for {score_label}:")
        st.write(f"  t-statistic = {t_stat:.3f}, p-value = {p_val:.3f}")
        if p_val < 0.05:
            st.success("  **Conclusion: Statistically significant difference between groups (p < 0.05).**")
        else:
            st.info("  Conclusion: No statistically significant difference between groups (p >= 0.05).")
    else:
        model = ols(f'{score_col} ~ C({category_col})', data=dataframe).fit()
        anova_table = sm.stats.anova_lm(model, typ=2)
        st.write(f"ANOVA for {category_col} on {score_label}:")
        st.dataframe(anova_table)
        # The table is indexed by term name, so take the first row by
        # position explicitly; plain [0] on a label-indexed Series is
        # deprecated in pandas.
        if anova_table['PR(>F)'].iloc[0] < 0.05:
            st.success("  **Conclusion: Statistically significant differences between group means (p < 0.05).**")
        else:
            st.info("  Conclusion: No statistically significant differences between group means (p >= 0.05).")

# Subsection numbering for section 4 starts at 1; the function increments
# this attribute on every call.
plot_and_analyze_categorical.counter = 1

# Categorical columns to analyse, in the order their subsections appear.
categorical_cols = [
    'gender',
    'race_ethnicity',
    'parental_level_of_education',
    'lunch',
    'test_preparation_course',
]
for category_name in categorical_cols:
    plot_and_analyze_categorical(df, category_name)

# Divider, then the closing banner.
st.markdown("--- ")
st.success(
    "**EDA complete!** The application highlights key insights into factors influencing student performance."
)