File size: 5,123 Bytes
3e50e1d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fa7a811
3e50e1d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5b7fadd
3e50e1d
 
 
 
 
ca7cccf
305e4f3
 
3e50e1d
 
 
 
 
 
 
 
0752a49
3e50e1d
 
 
 
 
 
 
 
a14e5d3
3e50e1d
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import io

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import streamlit as st
from scipy import stats
from statsmodels.formula.api import ols

# Page-level configuration, followed by the title and a one-line introduction.
st.set_page_config(
    page_title="Student Performance EDA",
    layout="wide",
)

st.title("📊 Student Performance in Exams: Exploratory Data Analysis")
st.markdown(
    "This application presents an Exploratory Data Analysis "
    "of the Students Performance in Exams dataset."
)

# Load the dataset
@st.cache_data
def load_data():
    """Read the student performance CSV and return it with normalised columns.

    Column names that contain spaces or a slash are renamed to snake_case so
    they are easier to use with statsmodels and plotting, and an
    ``average_score`` column holding the row-wise mean of the three subject
    scores is added.

    Returns:
        The prepared DataFrame.
    """
    snake_case_names = {
        'parental level of education': 'parental_level_of_education',
        'test preparation course': 'test_preparation_course',
        'math score': 'math_score',
        'reading score': 'reading_score',
        'writing score': 'writing_score',
        'race/ethnicity': 'race_ethnicity',
    }
    frame = pd.read_csv('StudentsPerformance.csv').rename(columns=snake_case_names)

    subject_scores = frame[['math_score', 'reading_score', 'writing_score']]
    frame['average_score'] = subject_scores.mean(axis=1)
    return frame

df = load_data()

# Section 1: shape, a sample of rows, column info, summary statistics and
# missing-value counts.
st.header("1. Dataset Overview")
st.write("**Shape of the dataset:**", df.shape)
st.write("**First 5 rows:**")
st.dataframe(df.head())

st.write("**Column Information:**")
# DataFrame.info() prints its report to stdout and returns None, so handing its
# return value to st.text() never shows the report. Capture it in a buffer.
info_buffer = io.StringIO()
df.info(buf=info_buffer)
st.text(info_buffer.getvalue())

st.write("**Descriptive Statistics for Scores:**")
st.dataframe(df[['math_score', 'reading_score', 'writing_score', 'average_score']].describe())

st.write("**Missing values per column:**")
st.dataframe(df.isnull().sum().to_frame(name='Missing Values'))

# Section 2: one histogram (with KDE overlay) per subject, side by side.
st.header("2. Score Distributions")
fig_hist, axes_hist = plt.subplots(nrows=1, ncols=3, figsize=(20, 6))
fig_hist.suptitle('Distribution of Student Scores', fontsize=18)

# (column, panel title, bar colour) for each of the three panels.
hist_panels = [
    ('math_score', 'Math Scores', 'skyblue'),
    ('reading_score', 'Reading Scores', 'lightcoral'),
    ('writing_score', 'Writing Scores', 'lightgreen'),
]
for hist_ax, (hist_column, hist_title, hist_color) in zip(axes_hist, hist_panels):
    sns.histplot(df[hist_column], kde=True, ax=hist_ax, color=hist_color)
    hist_ax.set_title(hist_title)

st.pyplot(fig_hist)
# pyplot keeps a reference to every figure it creates until it is closed;
# release it once rendered so figures do not pile up across script reruns.
plt.close(fig_hist)

# Section 3: pairwise correlation of the subject scores and their average.
st.header("3. Correlation Between Scores")
st.write("The scores (math, reading, writing) are highly correlated with each other, indicating that performance in one subject is generally reflected in others.")
score_cols = ['math_score', 'reading_score', 'writing_score', 'average_score']
correlation_matrix = df[score_cols].corr()
fig_corr, ax_corr = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax_corr)
ax_corr.set_title('Correlation Matrix of Student Scores')
st.pyplot(fig_corr)
# pyplot keeps a reference to every figure it creates until it is closed;
# release it once rendered so figures do not pile up across script reruns.
plt.close(fig_corr)

st.header("4. Impact of Categorical Variables on Average Score")

def plot_and_analyze_categorical(dataframe, category_col, score_col='average_score'):
    """Render a violin plot and a significance test of a score across a category.

    Writes to the Streamlit page, in order: a numbered sub-header, a violin
    plot of ``score_col`` grouped by ``category_col``, and the result of an
    independent two-sample t-test (when the category has exactly two non-null
    levels) or a one-way ANOVA (otherwise), each followed by a conclusion at
    the 0.05 significance level.

    The sub-header number is read from, and then incremented on, the
    ``plot_and_analyze_categorical.counter`` function attribute, which must be
    set before the first call.

    Args:
        dataframe: DataFrame containing ``category_col`` and ``score_col``.
        category_col: Name of the grouping column.
        score_col: Name of the numeric column being compared.
    """
    category_label = category_col.replace('_', ' ').title()
    score_label = score_col.replace('_', ' ')

    st.subheader(f"4.{plot_and_analyze_categorical.counter}. {category_label}")
    plot_and_analyze_categorical.counter += 1

    # Violin Plot
    fig_violin, ax_violin = plt.subplots(figsize=(10, 6))
    sns.violinplot(x=category_col, y=score_col, data=dataframe, ax=ax_violin)
    ax_violin.set_title(f"{score_label.title()} by {category_label}")
    ax_violin.tick_params(axis='x', rotation=45)
    st.pyplot(fig_violin)
    # pyplot holds on to every figure until it is closed; release it once it
    # has been rendered so repeated calls and script reruns do not leak figures.
    plt.close(fig_violin)

    # Statistical Test
    st.write("**Statistical Test Results:**")
    # Drop NaN before listing levels so the level list agrees with the count
    # used to choose the test (nunique() ignores NaN, unique() does not).
    levels = dataframe[category_col].dropna().unique()
    if len(levels) == 2:
        first_level, second_level = levels
        group1 = dataframe.loc[dataframe[category_col] == first_level, score_col]
        group2 = dataframe.loc[dataframe[category_col] == second_level, score_col]
        t_stat, p_val = stats.ttest_ind(group1, group2)
        st.write(f"Independent t-test between **{first_level}** and **{second_level}** for {score_label}:")
        st.write(f"  t-statistic = {t_stat:.3f}, p-value = {p_val:.3f}")
        if p_val < 0.05:
            st.success("  **Conclusion: Statistically significant difference between groups (p < 0.05).**")
        else:
            st.info("  Conclusion: No statistically significant difference between groups (p >= 0.05).")
    else:
        model = ols(f'{score_col} ~ C({category_col})', data=dataframe).fit()
        anova_table = sm.stats.anova_lm(model, typ=2)
        st.write(f"ANOVA for {category_col} on {score_label}:")
        st.dataframe(anova_table)
        # The table is indexed by term name, so take the first row (the
        # category term) by position rather than with an integer key.
        category_p_value = anova_table['PR(>F)'].iloc[0]
        if category_p_value < 0.05:
            st.success("  **Conclusion: Statistically significant differences between group means (p < 0.05).**")
        else:
            st.info("  Conclusion: No statistically significant differences between group means (p >= 0.05).")

# The sub-section number is kept on the function object; numbering starts at 1.
plot_and_analyze_categorical.counter = 1

# Analyse each categorical column against the default score column, in order.
categorical_cols = [
    'gender',
    'race_ethnicity',
    'parental_level_of_education',
    'lunch',
    'test_preparation_course',
]
for col in categorical_cols:
    plot_and_analyze_categorical(dataframe=df, category_col=col)

# Closing rule and completion banner.
st.markdown("--- ")
st.success(
    "**EDA complete!** The application highlights key insights "
    "into factors influencing student performance."
)