# Hugging Face Space page metadata (scrape residue, kept as comments so the file parses):
#   CGA_Insights / app1.py
#   Deevyankar's picture
#   Rename app.py to app1.py
#   b903f55 verified
# app.py — Marks Clustering (Student + Section) for RE.xlsx
# Hugging Face Gradio app:
# - Student-level clustering (k=3)
# - Section-level clustering (k=3)
# - Graphs + auto-generated reasons
# - Downloadable Excel report (fixed: returns file path, not tuple)
import io
import tempfile
import numpy as np
import pandas as pd
import gradio as gr
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import plotly.express as px
# Maximum obtainable marks for each assessment component.
# Used to clamp raw marks in _clean and to convert marks into 0-1
# percentage features (the "*_pct" columns) in _features.
MAXIMA = {
    "Quiz": 10.0,
    "Mid": 20.0,
    "Lab Report": 20.0,
    "Lab Activity": 10.0,
    "Final": 40.0,
}
def _normalize_colname(c: str) -> str:
c = str(c).strip().lower()
c = c.replace("_", " ").replace("-", " ")
c = c.replace("0", "o") # fixes '0ctivity' -> 'octivity'
c = " ".join(c.split())
return c
def _read_excel_safely(path: str) -> pd.DataFrame:
    """Read an Excel file, retrying with header=1 when row 1 looks blank.

    pandas names columns 'Unnamed: N' when the header row is empty; when
    every column looks like that, the real headers are on the second row.
    """
    frame = pd.read_excel(path)
    headers_blank = all(str(col).startswith("Unnamed") for col in frame.columns)
    if headers_blank:
        frame = pd.read_excel(path, header=1)
    return frame
def _map_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Map fuzzily-named input columns onto the canonical schema.

    Returns a copy of *df* with columns renamed to Section / Quiz / Mid /
    Lab Report / Lab Activity / Final. A StudentID column is matched when
    present, otherwise synthesized as S001, S002, ...

    Raises:
        ValueError: when any required column cannot be matched.
    """
    norm_to_actual = {_normalize_colname(col): col for col in df.columns}
    candidates = {
        "studentid": ["studentid", "student id", "id", "student", "student number", "student no"],
        "section": ["section", "sec", "class", "group"],
        "quiz": ["quiz", "quiz (10)", "quiz(10)", "quizzes"],
        "mid": ["mid", "mid (20)", "mid(20)", "midterm", "mid term"],
        "lab report": ["lab report", "lab report(20)", "labreport", "report", "lab reports", "labreport(20)"],
        "lab activity": [
            "lab activity", "lab activity(10)", "labactivity", "activity", "lab act",
            "lab octivity(10)"  # after normalization, 0->o helps too
        ],
        "final": ["final", "final (40)", "final(40)", "final exam", "exam", "endterm", "end term"],
    }
    mapped = {}
    for logical, aliases in candidates.items():
        for alias in aliases:
            actual = norm_to_actual.get(_normalize_colname(alias))
            if actual is not None:
                mapped[logical] = actual
                break
    # StudentID is optional (we can generate)
    required = ["section", "quiz", "mid", "lab report", "lab activity", "final"]
    missing = [key for key in required if key not in mapped]
    if missing:
        raise ValueError(
            "Missing required columns (after auto-matching): "
            + ", ".join(missing)
            + "\n\nYour file should include: Section, Quiz, Mid, Lab Report, Lab Activity, Final."
        )
    rename_map = {
        mapped["section"]: "Section",
        mapped["quiz"]: "Quiz",
        mapped["mid"]: "Mid",
        mapped["lab report"]: "Lab Report",
        mapped["lab activity"]: "Lab Activity",
        mapped["final"]: "Final",
    }
    out = df.rename(columns=rename_map).copy()
    if "studentid" in mapped:
        out = out.rename(columns={mapped["studentid"]: "StudentID"})
    else:
        out.insert(0, "StudentID", [f"S{i+1:03d}" for i in range(len(out))])
    return out
def _clean(df: pd.DataFrame) -> pd.DataFrame:
    """Coerce types, drop incomplete rows, and clamp marks to [0, max].

    Marks columns are coerced to numeric (invalid entries become NaN and
    are dropped); IDs are stripped strings and must be non-empty.
    """
    out = df.copy()
    for id_col in ("StudentID", "Section"):
        out[id_col] = out[id_col].astype(str).str.strip()
    mark_cols = ["Quiz", "Mid", "Lab Report", "Lab Activity", "Final"]
    for col in mark_cols:
        out[col] = pd.to_numeric(out[col], errors="coerce")
    out = out.dropna(subset=["Section"] + mark_cols)
    out = out[(out["StudentID"] != "") & (out["Section"] != "")]
    # Clamp each component into its valid marks range.
    for col, max_marks in MAXIMA.items():
        out[col] = out[col].clip(lower=0, upper=max_marks)
    return out
def _features(df: pd.DataFrame) -> pd.DataFrame:
    """Derive per-student percentage scores and gap/variability features.

    Adds, per component, a 0-1 "<name>_pct" column, then aggregate
    features (mean/std across components) and several "gap" features that
    contrast exam vs coursework and lab vs theory performance.
    """
    out = df.copy()
    for component, max_marks in MAXIMA.items():
        out[f"{component}_pct"] = out[component] / max_marks
    pct_cols = [f"{c}_pct" for c in ["Quiz", "Mid", "Lab Report", "Lab Activity", "Final"]]
    out["mean_score"] = out[pct_cols].mean(axis=1)
    out["std_score"] = out[pct_cols].std(axis=1)
    # Practical vs theoretical sub-averages.
    out["lab_avg"] = out[["Lab Report_pct", "Lab Activity_pct"]].mean(axis=1)
    out["theory_avg"] = out[["Quiz_pct", "Mid_pct", "Final_pct"]].mean(axis=1)
    # Final exam relative to the in-term assessments.
    out["exam_gap"] = out["Final_pct"] - out[["Quiz_pct", "Mid_pct"]].mean(axis=1)
    out["coursework_avg"] = out[["Quiz_pct", "Mid_pct", "Lab Report_pct", "Lab Activity_pct"]].mean(axis=1)
    out["final_vs_coursework_gap"] = out["Final_pct"] - out["coursework_avg"]
    out["lab_theory_gap"] = out["lab_avg"] - out["theory_avg"]
    return out
def _kmeans(df_feat: pd.DataFrame, k=3):
    """Standardize the feature matrix and cluster it with KMeans.

    Returns:
        (labels, X, silhouette) where X is the standardized matrix and
        silhouette is None when it cannot be computed (too few rows, a
        single resulting cluster, or any scoring error).
    """
    X = StandardScaler().fit_transform(df_feat.values)
    labels = KMeans(n_clusters=k, random_state=42, n_init=10).fit_predict(X)
    sil = None
    try:
        enough_rows = len(df_feat) >= k + 2
        multiple_clusters = len(set(labels)) > 1
        if enough_rows and multiple_clusters:
            sil = float(silhouette_score(X, labels))
    except Exception:
        sil = None
    return labels, X, sil
def _pca_fig(X, labels, hover_df, title):
    """Project X onto its first two principal components and scatter-plot.

    Points are colored by cluster label (as a string so the legend is
    categorical); all columns of *hover_df* appear in the hover tooltip.
    """
    coords = PCA(n_components=2, random_state=42).fit_transform(X)
    plot_df = hover_df.copy()
    plot_df["PC1"] = coords[:, 0]
    plot_df["PC2"] = coords[:, 1]
    plot_df["Cluster"] = labels.astype(int).astype(str)
    fig = px.scatter(
        plot_df,
        x="PC1",
        y="PC2",
        color="Cluster",
        hover_data=list(hover_df.columns),
        title=title,
    )
    fig.update_layout(height=480)
    return fig
def _student_reason(r):
s = float(r["std_score"])
eg = float(r["final_vs_coursework_gap"])
ltg = float(r["lab_theory_gap"])
bullets = []
if s <= 0.10:
bullets.append("Low variation across assessments (consistent distribution).")
elif s <= 0.18:
bullets.append("Moderate variation across assessments (expected academic spread).")
else:
bullets.append("High variation across components (irregular distribution).")
if abs(eg) >= 0.25:
bullets.append("Large mismatch between final exam and coursework trend.")
elif abs(eg) >= 0.15:
bullets.append("Noticeable final vs coursework difference.")
if abs(ltg) >= 0.20:
bullets.append("Large mismatch between lab/practical and theory performance.")
elif abs(ltg) >= 0.12:
bullets.append("Some lab vs theory mismatch.")
if (s > 0.18) or (abs(eg) >= 0.25) or (abs(ltg) >= 0.20):
rec = "Recommendation: Consider academic review for assessment alignment and/or student support (not an allegation)."
else:
rec = "Recommendation: Pattern looks academically reasonable; routine moderation is sufficient."
return "Key observations:\n- " + "\n- ".join(bullets) + "\n\n" + rec
def _section_reason(r):
s = float(r["std_score"])
eg = float(r["final_vs_coursework_gap"])
ltg = float(r["lab_theory_gap"])
bullets = []
if s <= 0.08:
bullets.append("Uniform distribution across components in this section.")
elif s <= 0.14:
bullets.append("Genuine variation consistent with expected student diversity.")
else:
bullets.append("Higher variability across components (less uniform distribution).")
if abs(eg) >= 0.18:
bullets.append("Final vs coursework trend mismatch at section level.")
if abs(ltg) >= 0.15:
bullets.append("Lab vs theory mismatch at section level.")
if (s > 0.14) or (abs(eg) >= 0.18) or (abs(ltg) >= 0.15):
rec = "Recommendation: Review assessment design/moderation consistency for this section (not an allegation)."
else:
rec = "Recommendation: Distribution appears aligned; routine moderation is sufficient."
return "Key observations:\n- " + "\n- ".join(bullets) + "\n\n" + rec
def _make_excel(student_out, section_out, student_profile, section_profile) -> bytes:
    """Serialize the four result tables into one multi-sheet Excel workbook.

    Returns:
        The .xlsx workbook as raw bytes (caller persists it to disk).
    """
    buffer = io.BytesIO()
    sheets = [
        (student_out, "Student_Level"),
        (section_out, "Section_Level"),
        (student_profile, "Student_Cluster_Profile"),
        (section_profile, "Section_Cluster_Profile"),
    ]
    with pd.ExcelWriter(buffer, engine="openpyxl") as writer:
        for frame, sheet in sheets:
            frame.to_excel(writer, index=False, sheet_name=sheet)
    buffer.seek(0)
    return buffer.getvalue()
def run_analysis(file_obj):
    """Full pipeline behind the "Run Clustering" button.

    Reads the uploaded marks file, normalizes it to the canonical schema,
    engineers percentage/gap features, runs k=3 KMeans at student and
    section granularity, and prepares every Gradio output plus a
    downloadable Excel report.

    Parameters
    ----------
    file_obj :
        Gradio file value (an object exposing a ``.name`` filesystem
        path), or None when nothing was uploaded.

    Returns
    -------
    tuple
        (summary text, student table, student PCA figure, student cluster
        profile, section table, section PCA figure, section cluster
        profile, path to the generated .xlsx report) — order must match
        the ``outputs=`` list wired to the button.

    Raises
    ------
    gr.Error
        If no file was uploaded, the file cannot be read, or no valid
        rows remain after cleaning.
    """
    if file_obj is None:
        raise gr.Error("Please upload your Excel/CSV file first.")
    # Pick the reader from the extension; anything non-CSV is treated as Excel.
    name = (getattr(file_obj, "name", "") or "").lower()
    try:
        if name.endswith(".csv"):
            raw = pd.read_csv(file_obj.name)
        else:
            raw = _read_excel_safely(file_obj.name)
    except Exception as e:
        raise gr.Error(f"Could not read the file. Details: {e}")
    df = _map_columns(raw)   # fuzzy-match headers -> canonical column names
    df = _clean(df)          # coerce numerics, drop incomplete rows, clamp marks
    df = _features(df)       # add *_pct columns plus mean/std and gap features
    if df.empty:
        raise gr.Error("No valid rows after cleaning. Please check missing values.")
    # Student-level clustering
    stud_feat = df[["mean_score", "std_score", "exam_gap", "lab_theory_gap", "final_vs_coursework_gap"]]
    stud_labels, Xs, stud_sil = _kmeans(stud_feat, k=3)
    df["StudentCluster"] = stud_labels.astype(int)
    df["StudentReason"] = df.apply(_student_reason, axis=1)
    # Columns surfaced in the PCA scatter-plot hover tooltip.
    hover_student = df[
        [
            "StudentID",
            "Section",
            "Quiz",
            "Mid",
            "Lab Report",
            "Lab Activity",
            "Final",
            "StudentCluster",
            "mean_score",
            "std_score",
            "final_vs_coursework_gap",
            "lab_theory_gap",
        ]
    ].copy()
    fig_student = _pca_fig(Xs, stud_labels, hover_student, "Student-Level Clustering (PCA)")
    # Per-cluster mean feature values (still on the 0-1 scale at this point).
    stud_profile = (
        df.groupby("StudentCluster")[
            [
                "Quiz_pct",
                "Mid_pct",
                "Lab Report_pct",
                "Lab Activity_pct",
                "Final_pct",
                "mean_score",
                "std_score",
                "final_vs_coursework_gap",
                "lab_theory_gap",
            ]
        ]
        .mean()
        .reset_index()
        .sort_values("StudentCluster")
    )
    # Section-level aggregation + clustering
    sec_cols = [
        "Quiz_pct",
        "Mid_pct",
        "Lab Report_pct",
        "Lab Activity_pct",
        "Final_pct",
        "mean_score",
        "std_score",
        "exam_gap",
        "lab_theory_gap",
        "final_vs_coursework_gap",
    ]
    # One row per section: the mean of each feature over its students.
    sec = df.groupby("Section")[sec_cols].mean().reset_index()
    sec_feat = sec[["mean_score", "std_score", "exam_gap", "lab_theory_gap", "final_vs_coursework_gap"]]
    sec_labels, Xsec, sec_sil = _kmeans(sec_feat, k=3)
    sec["SectionCluster"] = sec_labels.astype(int)
    sec["SectionReason"] = sec.apply(_section_reason, axis=1)
    hover_section = sec[
        ["Section", "SectionCluster", "mean_score", "std_score", "final_vs_coursework_gap", "lab_theory_gap"]
    ].copy()
    fig_section = _pca_fig(Xsec, sec_labels, hover_section, "Section-Level Clustering (PCA)")
    sec_profile = (
        sec.groupby("SectionCluster")[
            [
                "Quiz_pct",
                "Mid_pct",
                "Lab Report_pct",
                "Lab Activity_pct",
                "Final_pct",
                "mean_score",
                "std_score",
                "final_vs_coursework_gap",
                "lab_theory_gap",
            ]
        ]
        .mean()
        .reset_index()
        .sort_values("SectionCluster")
    )
    # Output tables (convert key metrics to % for readability)
    student_out = df[
        [
            "StudentID",
            "Section",
            "Quiz",
            "Mid",
            "Lab Report",
            "Lab Activity",
            "Final",
            "StudentCluster",
            "mean_score",
            "std_score",
            "final_vs_coursework_gap",
            "lab_theory_gap",
            "StudentReason",
        ]
    ].copy()
    for c in ["mean_score", "std_score", "final_vs_coursework_gap", "lab_theory_gap"]:
        student_out[c] = (student_out[c] * 100).round(1)
    section_out = sec[
        [
            "Section",
            "SectionCluster",
            "mean_score",
            "std_score",
            "final_vs_coursework_gap",
            "lab_theory_gap",
            "SectionReason",
        ]
    ].copy()
    for c in ["mean_score", "std_score", "final_vs_coursework_gap", "lab_theory_gap"]:
        section_out[c] = (section_out[c] * 100).round(1)
    # Profiles as %
    stud_profile_show = stud_profile.copy()
    sec_profile_show = sec_profile.copy()
    for t in [stud_profile_show, sec_profile_show]:
        for c in t.columns:
            # First column is the cluster id; leave it unscaled.
            if c != t.columns[0]:
                t[c] = (t[c] * 100).round(1)
    # Summary
    sil_s = f"{stud_sil:.3f}" if stud_sil is not None else "N/A"
    sil_c = f"{sec_sil:.3f}" if sec_sil is not None else "N/A"
    summary = (
        f"Students analysed: {len(df)}\n"
        f"Sections analysed: {df['Section'].nunique()}\n\n"
        f"Student-level silhouette (k=3): {sil_s}\n"
        f"Section-level silhouette (k=3): {sil_c}\n\n"
        f"Note: Clusters indicate patterns for review/support; they do not imply misconduct."
    )
    # Create downloadable Excel file and RETURN FILE PATH (Gradio expects a path)
    excel_bytes = _make_excel(student_out, section_out, stud_profile_show, sec_profile_show)
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
    tmp.write(excel_bytes)
    tmp.flush()
    tmp.close()
    return summary, student_out, fig_student, stud_profile_show, section_out, fig_section, sec_profile_show, tmp.name
# ---- Gradio UI wiring (built at import time; launched under __main__) ----
with gr.Blocks(title="Marks Clustering (Student + Section)") as demo:
    gr.Markdown(
        """
# Marks Clustering Dashboard (Unsupervised)
Upload your Excel/CSV marks file.
This app produces **Student-level + Section-level clustering (3 clusters)** with **graphs and reasons**.
**Output is for academic QA/moderation and student support** (no allegations).
""".strip()
    )
    file_in = gr.File(label="Upload Excel/CSV", file_types=[".xlsx", ".xls", ".csv"])
    run_btn = gr.Button("Run Clustering", variant="primary")
    summary = gr.Textbox(label="Summary", lines=8)
    with gr.Tabs():
        with gr.Tab("Student-Level"):
            student_df = gr.Dataframe(label="Student-Level Results (with reasons)", wrap=True)
            student_plot = gr.Plot(label="Student PCA Plot")
            student_profile = gr.Dataframe(label="Student Cluster Profile (avg %)", wrap=True)
        with gr.Tab("Section-Level"):
            section_df = gr.Dataframe(label="Section-Level Results (with reasons)", wrap=True)
            section_plot = gr.Plot(label="Section PCA Plot")
            section_profile = gr.Dataframe(label="Section Cluster Profile (avg %)", wrap=True)
        with gr.Tab("Download"):
            dl = gr.File(label="Download Excel Report")
    # The outputs list must stay in the same order as run_analysis's return tuple.
    run_btn.click(
        run_analysis,
        inputs=[file_in],
        outputs=[summary, student_df, student_plot, student_profile, section_df, section_plot, section_profile, dl],
    )

if __name__ == "__main__":
    demo.launch()