Spaces:

eagle0504
/

eda_on_exam_data

Sleeping

App Files Files Community

eda_on_exam_data / app.py

eagle0504

Upload folder using huggingface_hub

305e4f3 verified 2 months ago

raw

history blame contribute delete

5.12 kB

	import io

	import matplotlib.pyplot as plt
	import pandas as pd
	import seaborn as sns
	import statsmodels.api as sm
	import streamlit as st
	from scipy import stats
	from statsmodels.formula.api import ols

	# Configure the browser tab title and let the content span the full width.
	st.set_page_config(
	    page_title="Student Performance EDA",
	    layout="wide",
	)

	# Page heading followed by a one-sentence description of the app.
	st.title("📊 Student Performance in Exams: Exploratory Data Analysis")
	st.markdown(
	    "This application presents an Exploratory Data Analysis of the "
	    "Students Performance in Exams dataset."
	)

	# Load the dataset
	@st.cache_data
	def load_data():
	    """Read the exam-results CSV and return it as a cleaned DataFrame.

	    The file ``StudentsPerformance.csv`` is read relative to the current
	    working directory. Columns whose names contain spaces or a slash are
	    renamed to snake_case so they can be used directly in statsmodels
	    formulas and as plotting keys, and an ``average_score`` column holding
	    the row-wise mean of the three subject scores is appended.

	    Returns:
	        The loaded DataFrame with renamed columns plus ``average_score``.
	    """
	    # Original CSV header -> identifier-friendly name.
	    column_aliases = {
	        'parental level of education': 'parental_level_of_education',
	        'test preparation course': 'test_preparation_course',
	        'math score': 'math_score',
	        'reading score': 'reading_score',
	        'writing score': 'writing_score',
	        'race/ethnicity': 'race_ethnicity',
	    }
	    subject_scores = ['math_score', 'reading_score', 'writing_score']

	    frame = pd.read_csv('StudentsPerformance.csv').rename(columns=column_aliases)
	    frame['average_score'] = frame[subject_scores].mean(axis=1)
	    return frame

	df = load_data()

	# --- Section 1: basic shape, preview, schema and summary statistics ---
	st.header("1. Dataset Overview")
	st.write("Shape of the dataset:", df.shape)
	st.write("First 5 rows:")
	st.dataframe(df.head())

	st.write("Column Information:")
	# DataFrame.info() writes its report to a stream and returns None, so
	# passing its return value to st.text() rendered nothing useful. Capture
	# the report in an in-memory buffer and display that text instead.
	info_buffer = io.StringIO()
	df.info(buf=info_buffer)
	st.text(info_buffer.getvalue())

	st.write("Descriptive Statistics for Scores:")
	st.dataframe(df[['math_score', 'reading_score', 'writing_score', 'average_score']].describe())

	st.write("Missing values per column:")
	st.dataframe(df.isnull().sum().to_frame(name='Missing Values'))

	# --- Section 2: one histogram (with KDE overlay) per subject ---
	st.header("2. Score Distributions")
	fig_hist, axes_hist = plt.subplots(nrows=1, ncols=3, figsize=(20, 6))
	fig_hist.suptitle('Distribution of Student Scores', fontsize=18)

	# (column, panel title, bar colour) for each subject, in left-to-right order.
	hist_panels = [
	    ('math_score', 'Math Scores', 'skyblue'),
	    ('reading_score', 'Reading Scores', 'lightcoral'),
	    ('writing_score', 'Writing Scores', 'lightgreen'),
	]
	for ax_hist, (score_column, panel_title, bar_color) in zip(axes_hist, hist_panels):
	    sns.histplot(df[score_column], kde=True, ax=ax_hist, color=bar_color)
	    ax_hist.set_title(panel_title)

	st.pyplot(fig_hist)
	# pyplot keeps every figure it creates registered until it is closed;
	# release it once rendered so figures do not accumulate across script runs.
	plt.close(fig_hist)

	# --- Section 3: pairwise correlation of the score columns ---
	st.header("3. Correlation Between Scores")
	st.write("The scores (math, reading, writing) are highly correlated with each other, indicating that performance in one subject is generally reflected in others.")
	score_cols = ['math_score', 'reading_score', 'writing_score', 'average_score']
	correlation_matrix = df[score_cols].corr()
	fig_corr, ax_corr = plt.subplots(figsize=(8, 6))
	sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax_corr)
	ax_corr.set_title('Correlation Matrix of Student Scores')
	st.pyplot(fig_corr)
	# pyplot keeps every figure it creates registered until it is closed;
	# release it once rendered so figures do not accumulate across script runs.
	plt.close(fig_corr)

	st.header("4. Impact of Categorical Variables on Average Score")

	def plot_and_analyze_categorical(dataframe, category_col, score_col='average_score'):
	    """Render a violin plot and a significance test for one categorical column.

	    Writes a numbered "4.N" sub-section to the Streamlit page containing a
	    violin plot of ``score_col`` grouped by ``category_col``, followed by an
	    independent two-sample t-test when the category has exactly two levels,
	    or a one-way ANOVA (type 2) otherwise, and a conclusion at the 0.05 level.

	    The sub-section number is kept in the function attribute ``counter``,
	    which is incremented on every call; it falls back to 1 when the
	    attribute has not been set yet.

	    Args:
	        dataframe: Data containing both ``category_col`` and ``score_col``.
	        category_col: Name of the grouping column. It is interpolated into a
	            statsmodels formula, so it must be usable as a formula term.
	        score_col: Name of the numeric column to compare across groups.

	    Returns:
	        None. All output is written to the Streamlit page.
	    """
	    # Fall back to 1 so a call made before the attribute is initialised
	    # does not raise AttributeError.
	    section_no = getattr(plot_and_analyze_categorical, 'counter', 1)
	    category_label = category_col.replace('_', ' ').title()
	    score_label = score_col.replace('_', ' ').title()
	    score_text = score_col.replace('_', ' ')

	    st.subheader(f"4.{section_no}. {category_label}")
	    plot_and_analyze_categorical.counter = section_no + 1

	    # Violin Plot
	    fig_violin, ax_violin = plt.subplots(figsize=(10, 6))
	    sns.violinplot(x=category_col, y=score_col, data=dataframe, ax=ax_violin)
	    ax_violin.set_title(f"{score_label} by {category_label}")
	    # Only the rotation is set here: tick_params takes no horizontal-alignment option.
	    ax_violin.tick_params(axis='x', rotation=45)
	    st.pyplot(fig_violin)
	    # pyplot keeps every figure it creates registered until it is closed;
	    # release it once rendered so figures do not accumulate across calls.
	    plt.close(fig_violin)

	    # Statistical Test
	    st.write("Statistical Test Results:")
	    # Compute the levels once. NaN is dropped so the level count matches
	    # nunique() and a missing value is never picked as a group label.
	    levels = dataframe[category_col].dropna().unique()
	    if len(levels) == 2:
	        first_level, second_level = levels[0], levels[1]
	        group1 = dataframe[dataframe[category_col] == first_level][score_col]
	        group2 = dataframe[dataframe[category_col] == second_level][score_col]
	        t_stat, p_val = stats.ttest_ind(group1, group2)
	        st.write(f"Independent t-test between {first_level} and {second_level} for {score_text}:")
	        st.write(f" t-statistic = {t_stat:.3f}, p-value = {p_val:.3f}")
	        if p_val < 0.05:
	            st.success(" Conclusion: Statistically significant difference between groups (p < 0.05).")
	        else:
	            st.info(" Conclusion: No statistically significant difference between groups (p >= 0.05).")
	    else:
	        model = ols(f'{score_col} ~ C({category_col})', data=dataframe).fit()
	        anova_table = sm.stats.anova_lm(model, typ=2)
	        st.write(f"ANOVA for {category_col} on {score_text}:")
	        st.dataframe(anova_table)
	        # The table is indexed by term name, so the first row (the category
	        # term) must be selected by position explicitly; a bare [0] relies
	        # on deprecated positional fallback in pandas.
	        anova_p_val = anova_table['PR(>F)'].iloc[0]
	        if anova_p_val < 0.05:
	            st.success(" Conclusion: Statistically significant differences between group means (p < 0.05).")
	        else:
	            st.info(" Conclusion: No statistically significant differences between group means (p >= 0.05).")

	# Columns whose effect on the average score is analysed, in display order.
	categorical_cols = [
	    'gender',
	    'race_ethnicity',
	    'parental_level_of_education',
	    'lunch',
	    'test_preparation_course',
	]

	# Sub-section numbering starts at 4.1; the helper reads and bumps this attribute.
	plot_and_analyze_categorical.counter = 1
	for col in categorical_cols:
	    plot_and_analyze_categorical(df, col)

	# Closing divider and completion banner.
	st.markdown("--- ")
	st.success("EDA complete! The application highlights key insights into factors influencing student performance.")