import io

import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Page configuration; kept as the first Streamlit command in the script.
st.set_page_config(page_title="Student Performance EDA", layout="wide")

# Headline and one-sentence description shown at the top of the page.
st.title("📊 Student Performance in Exams: Exploratory Data Analysis")
st.markdown(
    "This application presents an Exploratory Data Analysis of the "
    "Students Performance in Exams dataset."
)

# Load the dataset
@st.cache_data
def load_data():
    """Read ``StudentsPerformance.csv`` and return it with normalized columns.

    Column names containing spaces or a slash are renamed to snake_case so
    they are easier to use with statsmodels formulas and plotting calls, and
    an ``average_score`` column (row-wise mean of the math, reading, and
    writing scores) is appended.

    Returns:
        The prepared DataFrame.
    """
    snake_case_names = {
        'parental level of education': 'parental_level_of_education',
        'test preparation course': 'test_preparation_course',
        'math score': 'math_score',
        'reading score': 'reading_score',
        'writing score': 'writing_score',
        'race/ethnicity': 'race_ethnicity',
    }
    students = pd.read_csv('StudentsPerformance.csv').rename(columns=snake_case_names)

    # Row-wise mean across the three subject scores.
    subject_scores = students[['math_score', 'reading_score', 'writing_score']]
    students['average_score'] = subject_scores.mean(axis=1)
    return students

df = load_data()

# --- Section 1: shape, preview, schema, summary statistics, missing values ---
st.header("1. Dataset Overview")
st.write("**Shape of the dataset:**", df.shape)
st.write("**First 5 rows:**")
st.dataframe(df.head())

st.write("**Column Information:**")
# DataFrame.info() writes its report to a stream and returns None, so passing
# its return value straight to st.text rendered "None" instead of the report.
# Capture the report in an in-memory buffer and display the captured text.
info_buffer = io.StringIO()
df.info(buf=info_buffer)
st.text(info_buffer.getvalue())

st.write("**Descriptive Statistics for Scores:**")
st.dataframe(df[['math_score', 'reading_score', 'writing_score', 'average_score']].describe())

st.write("**Missing values per column:**")
st.dataframe(df.isnull().sum().to_frame(name='Missing Values'))

# --- Section 2: one histogram (with KDE overlay) per subject score ---
st.header("2. Score Distributions")
fig_hist, axes_hist = plt.subplots(nrows=1, ncols=3, figsize=(20, 6))
fig_hist.suptitle('Distribution of Student Scores', fontsize=18)

# (column, panel title, bar color) for each subplot, left to right.
hist_panels = [
    ('math_score', 'Math Scores', 'skyblue'),
    ('reading_score', 'Reading Scores', 'lightcoral'),
    ('writing_score', 'Writing Scores', 'lightgreen'),
]
for hist_ax, (score_column, panel_title, bar_color) in zip(axes_hist, hist_panels):
    sns.histplot(df[score_column], kde=True, ax=hist_ax, color=bar_color)
    hist_ax.set_title(panel_title)

st.pyplot(fig_hist)
# pyplot keeps every figure it creates registered until it is closed. Close
# this one once it has been rendered so figures do not accumulate each time
# the script is re-executed.
plt.close(fig_hist)

# --- Section 3: pairwise correlation heatmap of the score columns ---
st.header("3. Correlation Between Scores")
st.write("The scores (math, reading, writing) are highly correlated with each other, indicating that performance in one subject is generally reflected in others.")
score_cols = ['math_score', 'reading_score', 'writing_score', 'average_score']
correlation_matrix = df[score_cols].corr()
fig_corr, ax_corr = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax_corr)
ax_corr.set_title('Correlation Matrix of Student Scores')
st.pyplot(fig_corr)
# pyplot keeps every figure it creates registered until it is closed. Close
# this one once it has been rendered so figures do not accumulate each time
# the script is re-executed.
plt.close(fig_corr)

# --- Section 4: per-category plots and significance tests ---
st.header("4. Impact of Categorical Variables on Average Score")

def plot_and_analyze_categorical(dataframe, category_col, score_col='average_score'):
    """Render a violin plot and a significance test for one categorical column.

    Writes to the Streamlit page, in order: a numbered "4.N" subheader, a
    violin plot of ``score_col`` grouped by ``category_col``, and the result
    of a statistical test. An independent two-sample t-test is used when the
    column has exactly two non-missing categories; otherwise a one-way ANOVA
    (type II table) is fitted with ``score_col ~ C(category_col)``.

    The subsection number is stored on the function attribute ``counter``.
    It is read at the start of each call (defaulting to 1 if the attribute
    has not been set) and incremented by one.

    Args:
        dataframe: DataFrame containing both ``category_col`` and ``score_col``.
        category_col: Name of the categorical grouping column.
        score_col: Name of the numeric column compared across groups.

    Returns:
        None. All output goes to the Streamlit page.
    """
    # Default to 1 so the function also works when the caller has not
    # initialised the counter attribute beforehand.
    section_no = getattr(plot_and_analyze_categorical, 'counter', 1)
    plot_and_analyze_categorical.counter = section_no + 1

    category_title = category_col.replace('_', ' ').title()
    score_title = score_col.replace('_', ' ').title()
    score_label = score_col.replace('_', ' ')

    st.subheader(f"4.{section_no}. {category_title}")

    # Violin Plot
    fig_violin, ax_violin = plt.subplots(figsize=(10, 6))
    sns.violinplot(x=category_col, y=score_col, data=dataframe, ax=ax_violin)
    ax_violin.set_title(f"{score_title} by {category_title}")
    ax_violin.tick_params(axis='x', rotation=45)
    st.pyplot(fig_violin)
    # pyplot keeps figures registered until closed; release this one now that
    # it has been rendered so repeated calls do not accumulate open figures.
    plt.close(fig_violin)

    # Statistical Test
    st.write("**Statistical Test Results:**")
    # nunique() ignores NaN but unique() does not, so take the labels from the
    # NaN-free values; otherwise NaN could be picked as one of the two groups.
    labels = dataframe[category_col].dropna().unique()
    if len(labels) == 2:
        first_label, second_label = labels
        group1 = dataframe.loc[dataframe[category_col] == first_label, score_col]
        group2 = dataframe.loc[dataframe[category_col] == second_label, score_col]
        t_stat, p_val = stats.ttest_ind(group1, group2)
        st.write(f"Independent t-test between **{first_label}** and **{second_label}** for {score_label}:")
        st.write(f"  t-statistic = {t_stat:.3f}, p-value = {p_val:.3f}")
        if p_val < 0.05:
            st.success("  **Conclusion: Statistically significant difference between groups (p < 0.05).**")
        else:
            st.info("  Conclusion: No statistically significant difference between groups (p >= 0.05).")
    else:
        model = ols(f'{score_col} ~ C({category_col})', data=dataframe).fit()
        anova_table = sm.stats.anova_lm(model, typ=2)
        st.write(f"ANOVA for {category_col} on {score_label}:")
        st.dataframe(anova_table)
        # The table is indexed by term name, so the first row must be fetched
        # by position with .iloc; integer keys via [] on a label-indexed
        # Series are deprecated in pandas.
        anova_p_val = anova_table['PR(>F)'].iloc[0]
        if anova_p_val < 0.05:
            st.success("  **Conclusion: Statistically significant differences between group means (p < 0.05).**")
        else:
            st.info("  Conclusion: No statistically significant differences between group means (p >= 0.05).")

# Subsection numbering for section 4 starts at 4.1.
plot_and_analyze_categorical.counter = 1

# Categorical columns analysed against the average score, in display order.
categorical_cols = [
    'gender',
    'race_ethnicity',
    'parental_level_of_education',
    'lunch',
    'test_preparation_course',
]
for category_name in categorical_cols:
    plot_and_analyze_categorical(df, category_name)

# Closing divider and completion banner.
st.markdown("--- ")
st.success(
    "**EDA complete!** The application highlights key insights into "
    "factors influencing student performance."
)