Spaces:

enacimie
/

SimpleStats

Sleeping

App Files Files Community

SimpleStats / src /streamlit_app.py

enacimie

Update src/streamlit_app.py

094716a verified 4 months ago

raw

history blame contribute delete

8.63 kB

	import streamlit as st
	import pandas as pd
	import numpy as np
	import plotly.express as px
	from scipy import stats
	import io

	# Metadata shown in the page header and footer.
	AUTHOR = "Eduardo Nacimiento García"
	EMAIL = "enacimie@ull.edu.es"
	LICENSE = "Apache 2.0"

	# Page config: st.set_page_config is called before any other element is rendered.
	st.set_page_config(
	    page_title="SimpleStats",
	    page_icon="📊",
	    layout="wide",
	    initial_sidebar_state="expanded",
	)

	# Title and credits
	st.title("📊 SimpleStats")
	# The backslash is doubled so Python sees a valid escape sequence; the
	# resulting text still contains a literal "\|".
	st.markdown(f"Author: {AUTHOR} \\| Email: {EMAIL} \\| License: {LICENSE}")

	st.write("""
	Upload a CSV file or try the built-in demo dataset to perform statistical analysis: summary, charts, hypothesis tests, and more.
	""")

	# Generate demo dataset
	@st.cache_data
	def create_demo_data():
	    """Build the synthetic demo dataset offered when no CSV is uploaded.

	    The frame has 200 rows and six columns: ``Age``, ``Income``,
	    ``Satisfaction`` (integers 1-10), ``Group`` (A/B/C), ``Gender`` (M/F)
	    and ``Purchase`` (0/1). Missing values are injected into ``Income`` and
	    ``Age`` so the missing-data features have something to display.

	    Returns:
	        pandas.DataFrame: the generated demo data. The result is wrapped by
	        ``st.cache_data``.
	    """
	    # A private legacy generator seeded with 42 produces the same stream as
	    # ``np.random.seed(42)`` followed by the module-level functions, but it
	    # does not overwrite NumPy's global random state.
	    rng = np.random.RandomState(42)
	    n = 200
	    data = {
	        "Age": rng.normal(35, 12, n).astype(int),
	        "Income": rng.normal(45000, 15000, n),
	        "Satisfaction": rng.randint(1, 11, n),  # scale 1-10
	        "Group": rng.choice(["A", "B", "C"], n),
	        "Gender": rng.choice(["M", "F"], n, p=[0.6, 0.4]),
	        "Purchase": rng.choice([0, 1], n, p=[0.7, 0.3]),
	    }
	    df = pd.DataFrame(data)

	    # NaN cannot live in an integer column; cast explicitly instead of
	    # relying on implicit upcasting when the NaNs are assigned below.
	    df["Age"] = df["Age"].astype(float)

	    # Introduce some nulls for demo. Sampling is with replacement, so the
	    # number of distinct rows blanked may be lower than the sample size.
	    df.loc[rng.choice(df.index, 10), "Income"] = np.nan
	    df.loc[rng.choice(df.index, 5), "Age"] = np.nan
	    return df

	demo_df = create_demo_data()

	# Button to load demo data: the CSV bytes are kept in session state so the
	# demo stays selected on later reruns of the script.
	if st.button("🧪 Load Demo Dataset"):
	    st.session_state['uploaded_file'] = demo_df.to_csv(index=False).encode('utf-8')
	    st.session_state['file_name'] = "demo_data.csv"
	    st.success("✅ Demo dataset loaded. Explore the features!")

	# File uploader
	uploaded_file = st.file_uploader("📂 Upload your CSV file", type=["csv"])

	# Determine data source. An uploaded file takes precedence over the demo
	# data; df stays None when neither is available or the upload is unreadable.
	df = None
	if uploaded_file is not None:
	    try:
	        df = pd.read_csv(uploaded_file)
	    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
	        # Report a bad upload in the page instead of crashing the app.
	        st.error(f"❌ Could not read the uploaded CSV file: {exc}")
	    else:
	        st.success("✅ File uploaded successfully.")
	elif 'uploaded_file' in st.session_state:
	    # Use demo if no file uploaded
	    csv_bytes = st.session_state['uploaded_file']
	    file_name = st.session_state['file_name']
	    df = pd.read_csv(io.BytesIO(csv_bytes))
	    st.info(f"Using demo dataset: `{file_name}`")

	# Main report: preview, summary metrics, charts and hypothesis tests for the
	# loaded frame, or a prompt when no data is available.
	if df is not None:
	    # Show data preview
	    with st.expander("🔍 Data Preview (first 10 rows)"):
	        st.dataframe(df.head(10))

	    # Basic info
	    st.subheader("📌 Dataset Information")
	    col1, col2, col3 = st.columns(3)
	    col1.metric("Rows", df.shape[0])
	    col2.metric("Columns", df.shape[1])
	    # Convert the NumPy integer to a built-in int before handing it to st.metric.
	    col3.metric("Missing Values", int(df.isnull().sum().sum()))

	    # Identify numeric and categorical columns
	    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
	    categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()

	    if len(numeric_cols) == 0:
	        st.warning("⚠️ No numeric columns found for statistical analysis.")
	    else:
	        st.subheader("📈 Descriptive Statistics")
	        st.dataframe(df[numeric_cols].describe())

	        # Histogram
	        st.subheader("📊 Histogram")
	        selected_col = st.selectbox("Select a numeric column for histogram:", numeric_cols)
	        fig_hist = px.histogram(df, x=selected_col, nbins=30, title=f"Histogram of {selected_col}", marginal="box")
	        st.plotly_chart(fig_hist, use_container_width=True)

	        # Scatter plot
	        if len(numeric_cols) >= 2:
	            st.subheader("📉 Scatter Plot")
	            col_x = st.selectbox("Select X-axis column:", numeric_cols, key="x")
	            col_y = st.selectbox("Select Y-axis column:", numeric_cols, key="y")
	            color_by = st.selectbox("Color by (optional):", [None] + categorical_cols, key="color")
	            if col_x != col_y:
	                fig_scatter = px.scatter(df, x=col_x, y=col_y, color=color_by, title=f"{col_x} vs {col_y}")
	                st.plotly_chart(fig_scatter, use_container_width=True)
	            else:
	                st.warning("⚠️ Please select two different columns.")

	        # Correlation matrix
	        st.subheader("🔗 Correlation Matrix")
	        corr = df[numeric_cols].corr()
	        fig_corr = px.imshow(corr, text_auto=".2f", aspect="auto", title="Correlation Matrix", color_continuous_scale='RdBu_r')
	        st.plotly_chart(fig_corr, use_container_width=True)

	    # Missing values per column
	    st.subheader("🕳️ Missing Values by Column")
	    nulls = df.isnull().sum()
	    if nulls.sum() > 0:
	        fig_nulls = px.bar(nulls, title="Missing Values by Column", labels={'value': 'Count', 'index': 'Column'}, color=nulls)
	        st.plotly_chart(fig_nulls, use_container_width=True)
	    else:
	        st.success("✅ No missing values in the dataset.")

	    # === STATISTICAL TESTS ===
	    st.header("🧪 Statistical Tests")

	    # Independent T-Test (for 2 groups)
	    if len(numeric_cols) > 0 and len(categorical_cols) > 0:
	        st.subheader("Independent T-Test (2 groups)")
	        t_num_col = st.selectbox("Numeric variable:", numeric_cols, key="t_num")
	        two_level_cols = [col for col in categorical_cols if df[col].nunique() == 2]
	        t_cat_col = st.selectbox("Categorical variable (must have exactly 2 groups):",
	                                 two_level_cols,
	                                 key="t_cat")
	        if t_cat_col is None:
	            # The selectbox has no options when no column qualifies.
	            st.info("No categorical column with exactly 2 groups is available for the t-test.")
	        else:
	            # nunique() ignores NaN while unique() keeps it, so missing labels
	            # are dropped here to keep the group count consistent with the filter.
	            groups = df[t_cat_col].dropna().unique()
	            if len(groups) == 2:
	                group1 = df.loc[df[t_cat_col] == groups[0], t_num_col].dropna()
	                group2 = df.loc[df[t_cat_col] == groups[1], t_num_col].dropna()
	                if len(group1) < 2 or len(group2) < 2:
	                    st.warning("⚠️ Each group needs at least 2 non-missing values to run the t-test.")
	                else:
	                    # equal_var=False selects Welch's t-test.
	                    t_stat, p_val = stats.ttest_ind(group1, group2, equal_var=False)
	                    st.write(f"T-Test result between `{groups[0]}` and `{groups[1]}` for `{t_num_col}`:")
	                    st.write(f"- T-statistic: {t_stat:.4f}")
	                    st.write(f"- P-value: {p_val:.4f}")
	                    if np.isnan(p_val):
	                        # A NaN p-value is not evidence either way; do not report it as "not significant".
	                        st.warning("⚠️ The test statistic is undefined for this data.")
	                    elif p_val < 0.05:
	                        st.success("🟢 Statistically significant difference (p < 0.05)")
	                    else:
	                        st.error("🔴 No statistically significant difference (p >= 0.05)")
	            else:
	                st.warning("Selected categorical variable does not have exactly 2 groups.")

	    # ANOVA (for 3+ groups)
	    if len(numeric_cols) > 0 and len(categorical_cols) > 0:
	        st.subheader("ANOVA (3 or more groups)")
	        anova_num_col = st.selectbox("Numeric variable:", numeric_cols, key="anova_num")
	        multi_level_cols = [col for col in categorical_cols if df[col].nunique() >= 3]
	        anova_cat_col = st.selectbox("Categorical variable (3 or more groups):",
	                                     multi_level_cols,
	                                     key="anova_cat")
	        if anova_cat_col is None:
	            st.info("No categorical column with 3 or more groups is available for ANOVA.")
	        else:
	            # Skip the NaN label (it never matches ==) and any level whose
	            # numeric values are all missing, so no empty sample reaches f_oneway.
	            levels = df[anova_cat_col].dropna().unique()
	            samples = [df.loc[df[anova_cat_col] == level, anova_num_col].dropna() for level in levels]
	            groups = [sample for sample in samples if len(sample) > 0]
	            if len(groups) >= 3:
	                f_stat, p_val = stats.f_oneway(*groups)
	                st.write(f"ANOVA result for `{anova_num_col}` grouped by `{anova_cat_col}`:")
	                st.write(f"- F-statistic: {f_stat:.4f}")
	                st.write(f"- P-value: {p_val:.4f}")
	                if np.isnan(p_val):
	                    st.warning("⚠️ The test statistic is undefined for this data.")
	                elif p_val < 0.05:
	                    st.success("🟢 At least one group is significantly different (p < 0.05)")
	                else:
	                    st.error("🔴 No significant differences between groups (p >= 0.05)")
	            else:
	                st.warning("⚠️ Fewer than 3 groups have non-missing values for the selected numeric variable.")

	    # Chi-Square Test (between two categorical variables)
	    if len(categorical_cols) >= 2:
	        st.subheader("Chi-Square Test (Association between categorical variables)")
	        chi_col1 = st.selectbox("First categorical variable:", categorical_cols, key="chi1")
	        chi_col2 = st.selectbox("Second categorical variable:", [col for col in categorical_cols if col != chi_col1], key="chi2")
	        if chi_col1 is not None and chi_col2 is not None:
	            contingency_table = pd.crosstab(df[chi_col1], df[chi_col2])
	            if contingency_table.size == 0:
	                # No row has both variables present, so there is nothing to test.
	                st.warning("⚠️ No rows have values for both selected variables.")
	            else:
	                try:
	                    chi2, p_val, dof, expected = stats.chi2_contingency(contingency_table)
	                except ValueError as exc:
	                    # Report a table SciPy rejects instead of crashing the page.
	                    st.warning(f"⚠️ Chi-Square test could not be computed: {exc}")
	                else:
	                    st.write(f"Chi-Square result between `{chi_col1}` and `{chi_col2}`:")
	                    st.write(f"- Chi² statistic: {chi2:.4f}")
	                    st.write(f"- P-value: {p_val:.4f}")
	                    st.write(f"- Degrees of freedom: {dof}")
	                    if p_val < 0.05:
	                        st.success("🟢 Variables are associated (p < 0.05)")
	                    else:
	                        st.error("🔴 No evidence of association between variables (p >= 0.05)")
	                with st.expander("📋 Contingency Table"):
	                    st.dataframe(contingency_table)

	else:
	    st.info("👆 Upload a CSV file or click 'Load Demo Dataset' to get started.")

	# Footer: horizontal rule followed by a credits caption.
	st.markdown("---")
	# The backslash is doubled so Python sees a valid escape sequence; the
	# resulting text still contains a literal "\|".
	st.caption(f"© {AUTHOR} \\| License {LICENSE} \\| Contact: {EMAIL}")