Spaces:

Deevyankar
/

cga

Sleeping

App Files Files Community

cga / app.py

Deevyankar

Update app.py

bb85f6c verified about 1 month ago

raw

history blame contribute delete

17 kB

	import io
	import numpy as np
	import pandas as pd
	import gradio as gr
	import matplotlib.pyplot as plt


	# =============================
	# Helpers
	# =============================
	def _read_file_bytes(file_obj):
	if file_obj is None:
	raise ValueError("No file uploaded.")
	file_path = getattr(file_obj, "name", None)
	if not file_path:
	raise ValueError("Invalid uploaded file object.")
	with open(file_path, "rb") as f:
	b = f.read()
	if not b:
	raise ValueError("Uploaded file is empty.")
	return b


	def _drop_useless(df: pd.DataFrame) -> pd.DataFrame:
	df = df.dropna(axis=1, how="all").dropna(axis=0, how="all")
	unnamed = [c for c in df.columns if str(c).strip().lower().startswith("unnamed")]
	if unnamed:
	df = df.drop(columns=unnamed, errors="ignore")
	return df


	def _safe_numeric(s):
	return pd.to_numeric(s, errors="coerce")


	def _guess_marks_col(df: pd.DataFrame):
	cols = list(df.columns)
	best, best_score = cols[0], -1
	for c in cols:
	score = _safe_numeric(df[c]).notna().mean()
	if score > best_score:
	best_score = score
	best = c
	return best


	def _guess_grade_col(cols):
	low = {c: str(c).strip().lower() for c in cols}
	return next((c for c in cols if "grade" in low[c] or "grde" in low[c]), cols[0])


	def _guess_optional(cols):
	low = {c: str(c).strip().lower() for c in cols}
	course = next((c for c in cols if any(k in low[c] for k in ["course", "module", "subject"])), None)
	section = next((c for c in cols if any(k in low[c] for k in ["section", "group", "batch", "class"])), None)
	return course, section


	def apply_filters(df, course_col, section_col, course_filter, section_filter):
	d = df.copy()
	if course_col and course_col in d.columns and course_filter and course_filter != "(all)":
	d = d[d[course_col].astype(str).fillna("NA") == course_filter]
	if section_col and section_col in d.columns and section_filter and section_filter != "(all)":
	d = d[d[section_col].astype(str).fillna("NA") == section_filter]
	return d


	# =============================
	# Core HoD Insights (no student tables)
	# =============================
	def compute_hod_insights(df, marks_col, grade_col, pass_mark, course_col, section_col, course_filter, section_filter):
	if df is None or df.empty:
	raise gr.Error("Sheet is empty.")

	d = apply_filters(df, course_col, section_col, course_filter, section_filter).copy()

	d["_marks"] = _safe_numeric(d[marks_col]) if marks_col in d.columns else np.nan
	d["_grade"] = d[grade_col].astype(str).str.strip().replace({"nan": "NA"}) if grade_col in d.columns else "NA"

	total = int(len(d))
	valid = d[d["_marks"].notna()].copy()
	n = int(len(valid))
	missing = int(d["_marks"].isna().sum())

	pass_mark = int(pass_mark)
	mean = float(valid["_marks"].mean()) if n else 0.0
	std = float(valid["_marks"].std(ddof=0)) if n else 0.0
	minv = float(valid["_marks"].min()) if n else 0.0
	maxv = float(valid["_marks"].max()) if n else 0.0

	pass_count = int((valid["_marks"] >= pass_mark).sum()) if n else 0
	pass_rate = (pass_count / n * 100.0) if n else 0.0

	# Borderline (pass to pass+5) and just-below (pass-5 to pass-1)
	borderline_pass = int(((valid["_marks"] >= pass_mark) & (valid["_marks"] < pass_mark + 5)).sum()) if n else 0
	borderline_fail = int(((valid["_marks"] < pass_mark) & (valid["_marks"] >= pass_mark - 5)).sum()) if n else 0

	# Distribution shape
	skew = float(valid["_marks"].skew()) if n else 0.0
	kurt = float(valid["_marks"].kurt()) if n else 0.0

	# Outliers by IQR
	outlier_count = 0
	low_thr = high_thr = 0.0
	if n:
	q1 = float(np.percentile(valid["_marks"], 25))
	q3 = float(np.percentile(valid["_marks"], 75))
	iqr = q3 - q1
	low_thr = q1 - 1.5 * iqr
	high_thr = q3 + 1.5 * iqr
	outlier_count = int(((valid["_marks"] < low_thr) \| (valid["_marks"] > high_thr)).sum())

	# Percentiles
	pct_df = pd.DataFrame(
	[(f"P{p}", round(float(np.percentile(valid["_marks"], p)), 2)) for p in [10, 25, 50, 75, 90]]
	if n else [],
	columns=["Percentile", "Marks"]
	)

	# Grade distribution
	grade_dist = d["_grade"].value_counts(dropna=False).rename("count").to_frame().reset_index()
	grade_dist.columns = [grade_col, "count"]
	grade_dist["%"] = (grade_dist["count"] / grade_dist["count"].sum() * 100).round(2) if len(grade_dist) else 0

	# Grade ↔ marks mapping (moderation evidence)
	grade_stats = (
	valid.groupby(d["_grade"])["_marks"]
	.agg(["count", "mean", "std", "min", "median", "max"])
	.reset_index()
	.rename(columns={"_grade": "Grade"})
	.sort_values("mean", ascending=False)
	)

	# Mark heaping (repeated marks)
	heaping = (
	valid["_marks"].round(0).astype(int)
	.value_counts().head(12)
	.rename("count").reset_index()
	.rename(columns={"index": "Mark"})
	)

	# Course status
	if pass_rate >= 80:
	status = "GREEN"
	elif pass_rate >= 60:
	status = "AMBER"
	else:
	status = "RED"

	# Flags
	flags = []
	if missing > 0:
	flags.append(f"{missing} missing mark(s) — verify completeness.")
	if borderline_fail > max(5, 0.03 * n):
	flags.append("Many students just below pass — consider targeted support / moderation review.")
	if borderline_pass > max(5, 0.03 * n):
	flags.append("Many students just above pass — borderline attainment cluster.")
	if abs(skew) > 0.7:
	flags.append("Skewed distribution — check assessment balance and marking consistency.")
	if outlier_count > 0:
	flags.append(f"{outlier_count} outlier(s) by IQR — spot-check extremes.")
	if len(heaping) and heaping["count"].iloc[0] >= max(10, 0.06 * n):
	flags.append("Heaping detected — many students share identical marks (rounding/marking pattern).")

	flags_text = " \| ".join(flags) if flags else "No major warning patterns detected."

	insight = (
	f"Status: {status} \n"
	f"Pass rate: {pass_rate:.1f}% (Pass mark = {pass_mark}) \n"
	f"Avg: {mean:.1f} \| Std: {std:.1f} \| Min/Max: {minv:.1f}/{maxv:.1f} \n"
	f"Borderline (just below pass): {borderline_fail} \| Borderline (just above pass): {borderline_pass} \n"
	f"Skew: {skew:.2f} \| Kurtosis: {kurt:.2f} \| Outliers: {outlier_count} \| Missing: {missing} \n"
	f"Flags: {flags_text}"
	)

	# KPI table
	kpi = pd.DataFrame(
	[
	("Total rows (filtered)", total),
	("Students with numeric marks", n),
	("Missing marks", missing),
	("Pass mark", pass_mark),
	("Pass count", pass_count),
	("Pass rate (%)", round(pass_rate, 2)),
	("Borderline just below pass", borderline_fail),
	("Borderline just above pass", borderline_pass),
	("Average", round(mean, 2)),
	("Std deviation", round(std, 2)),
	("Minimum", round(minv, 2)),
	("Maximum", round(maxv, 2)),
	("Skewness", round(skew, 3)),
	("Kurtosis", round(kurt, 3)),
	("Outlier low threshold (IQR)", round(low_thr, 2)),
	("Outlier high threshold (IQR)", round(high_thr, 2)),
	("Outlier count (IQR)", outlier_count),
	("Status", status),
	],
	columns=["Metric", "Value"],
	)

	# Charts
	# 1 Histogram
	fig1 = plt.figure()
	plt.hist(valid["_marks"].dropna(), bins=12)
	plt.axvline(pass_mark, linestyle="--")
	plt.title("Marks distribution (Histogram)")
	plt.xlabel("Marks")
	plt.ylabel("Students")

	# 2 CDF
	fig2 = plt.figure()
	xs = np.sort(valid["_marks"].dropna().values) if n else np.array([])
	ys = np.arange(1, len(xs) + 1) / len(xs) if len(xs) else np.array([])
	if len(xs):
	plt.plot(xs, ys)
	plt.axvline(pass_mark, linestyle="--")
	plt.title("CDF (Proportion of students ≤ mark)")
	plt.xlabel("Marks")
	plt.ylabel("Proportion")

	# 3 Grade distribution
	fig3 = plt.figure()
	gd = grade_dist.set_index(grade_col)["count"]
	plt.bar(gd.index.astype(str), gd.values)
	plt.title("Grade distribution")
	plt.xlabel("Grade")
	plt.ylabel("Count")
	plt.xticks(rotation=45, ha="right")

	# 4 Boxplot by grade (moderation)
	fig4 = plt.figure()
	if not grade_stats.empty:
	order = grade_stats["Grade"].tolist()
	data = [valid.loc[d["_grade"] == g, "_marks"].dropna().values for g in order]
	plt.boxplot(data, tick_labels=[str(g) for g in order], vert=True)
	plt.title("Marks spread by Grade (Boxplot)")
	plt.xlabel("Grade")
	plt.ylabel("Marks")
	plt.xticks(rotation=45, ha="right")
	else:
	plt.title("Marks spread by Grade (Boxplot)")

	# 5 Section comparison (optional)
	fig5 = plt.figure()
	section_table = pd.DataFrame()
	if section_col and section_col in d.columns and n:
	sec = valid.groupby(d[section_col].astype(str).fillna("NA"))["_marks"].agg(["count", "mean"]).reset_index()
	sec["pass_rate_%"] = (valid.groupby(d[section_col].astype(str).fillna("NA"))["_marks"].apply(lambda x: (x >= pass_mark).mean() * 100)).values
	sec = sec.rename(columns={section_col: "Section"})
	section_table = sec.sort_values("pass_rate_%", ascending=False)

	plt.bar(section_table["Section"].astype(str), section_table["pass_rate_%"].values)
	plt.title("Section-wise Pass Rate (%)")
	plt.xlabel("Section")
	plt.ylabel("Pass rate (%)")
	plt.xticks(rotation=45, ha="right")
	else:
	plt.title("Section-wise Pass Rate (%) — not available (no section column)")
	plt.axis("off")

	return kpi, pct_df, grade_dist, grade_stats, heaping, section_table, insight, fig1, fig2, fig3, fig4, fig5


	# =============================
	# UI
	# =============================
	with gr.Blocks(title="HoD Result Dashboard") as demo:
	gr.Markdown("## 📊 HoD Result Dashboard — Insights & Patterns (No Student Tables, No PDF)")

	file_bytes_state = gr.State(None) # bytes
	sheet_state = gr.State(None) # sheet name string

	with gr.Row():
	upload = gr.File(label="Upload Excel (.xlsx)", file_types=[".xlsx"])
	sheet_dd = gr.Dropdown(label="Sheet", choices=[], interactive=False)

	with gr.Row():
	marks_col = gr.Dropdown(label="Marks column", choices=[], interactive=False)
	grade_col = gr.Dropdown(label="Grade column", choices=[], interactive=False)
	pass_mark = gr.Number(label="Pass mark", value=50, precision=0)

	with gr.Row():
	course_col = gr.Dropdown(label="Course column (optional)", choices=[], interactive=False, visible=False)
	section_col = gr.Dropdown(label="Section column (optional)", choices=[], interactive=False, visible=False)

	with gr.Row():
	course_filter = gr.Dropdown(label="Course filter", choices=["(all)"], value="(all)", interactive=False, visible=False)
	section_filter = gr.Dropdown(label="Section filter", choices=["(all)"], value="(all)", interactive=False, visible=False)

	analyze_btn = gr.Button("🔍 Refresh HoD Dashboard")

	insight_md = gr.Markdown("")

	with gr.Tab("Tables"):
	with gr.Row():
	kpi_table = gr.Dataframe(label="KPI Summary", interactive=False, wrap=True)
	pct_table = gr.Dataframe(label="Percentiles", interactive=False, wrap=True)
	with gr.Row():
	grade_dist_table = gr.Dataframe(label="Grade Distribution", interactive=False, wrap=True)
	heaping_table = gr.Dataframe(label="Mark Heaping (Top repeated marks)", interactive=False, wrap=True)
	grade_stats_table = gr.Dataframe(label="Grade ↔ Marks (Moderation evidence)", interactive=False, wrap=True)
	section_table = gr.Dataframe(label="Section Comparison (if available)", interactive=False, wrap=True)

	with gr.Tab("Charts"):
	with gr.Row():
	hist_plot = gr.Plot(label="Histogram")
	cdf_plot = gr.Plot(label="CDF")
	with gr.Row():
	grade_plot = gr.Plot(label="Grade distribution")
	grade_box = gr.Plot(label="Boxplot by grade")
	section_plot = gr.Plot(label="Section-wise pass rate")

	# -------- callbacks
	def on_upload(file_obj):
	b = _read_file_bytes(file_obj)
	xls = pd.ExcelFile(io.BytesIO(b), engine="openpyxl")
	sheets = xls.sheet_names or []
	if not sheets:
	raise gr.Error("No sheets found in workbook.")

	sheet0 = sheets[0]
	df0 = _drop_useless(pd.read_excel(io.BytesIO(b), sheet_name=sheet0, engine="openpyxl"))

	cols = list(df0.columns)
	m_guess = _guess_marks_col(df0)
	g_guess = _guess_grade_col(cols)
	c_guess, s_guess = _guess_optional(cols)

	# Optional filters
	course_col_upd = gr.update(choices=cols, value=(c_guess or cols[0]), visible=bool(c_guess), interactive=bool(c_guess))
	section_col_upd = gr.update(choices=cols, value=(s_guess or cols[0]), visible=bool(s_guess), interactive=bool(s_guess))

	course_filter_upd = gr.update(choices=["(all)"], value="(all)", visible=False, interactive=False)
	section_filter_upd = gr.update(choices=["(all)"], value="(all)", visible=False, interactive=False)

	if c_guess and c_guess in df0.columns:
	vals = ["(all)"] + sorted(df0[c_guess].astype(str).fillna("NA").unique().tolist())
	course_filter_upd = gr.update(choices=vals, value="(all)", visible=True, interactive=True)

	if s_guess and s_guess in df0.columns:
	vals = ["(all)"] + sorted(df0[s_guess].astype(str).fillna("NA").unique().tolist())
	section_filter_upd = gr.update(choices=vals, value="(all)", visible=True, interactive=True)

	return (
	gr.update(choices=sheets, value=sheet0, interactive=True), # sheet_dd
	gr.update(choices=cols, value=m_guess, interactive=True), # marks_col
	gr.update(choices=cols, value=g_guess, interactive=True), # grade_col
	course_col_upd,
	section_col_upd,
	course_filter_upd,
	section_filter_upd,
	b, # file_bytes_state
	sheet0, # sheet_state
	)

	upload.change(
	fn=on_upload,
	inputs=[upload],
	outputs=[sheet_dd, marks_col, grade_col, course_col, section_col, course_filter, section_filter, file_bytes_state, sheet_state],
	)

	def on_sheet_change(sheet_name, file_bytes, course_col_val, section_col_val):
	if not file_bytes:
	raise gr.Error("Upload Excel first.")

	df = _drop_useless(pd.read_excel(io.BytesIO(file_bytes), sheet_name=sheet_name, engine="openpyxl"))

	cf = gr.update(choices=["(all)"], value="(all)", visible=False, interactive=False)
	sf = gr.update(choices=["(all)"], value="(all)", visible=False, interactive=False)

	if course_col_val and course_col_val in df.columns:
	vals = ["(all)"] + sorted(df[course_col_val].astype(str).fillna("NA").unique().tolist())
	cf = gr.update(choices=vals, value="(all)", visible=True, interactive=True)

	if section_col_val and section_col_val in df.columns:
	vals = ["(all)"] + sorted(df[section_col_val].astype(str).fillna("NA").unique().tolist())
	sf = gr.update(choices=vals, value="(all)", visible=True, interactive=True)

	return cf, sf, sheet_name

	sheet_dd.change(
	fn=on_sheet_change,
	inputs=[sheet_dd, file_bytes_state, course_col, section_col],
	outputs=[course_filter, section_filter, sheet_state],
	)

	def on_refresh(file_bytes, sheet_name, m_col, g_col, pmark, c_col, s_col, c_filter, s_filter):
	if not file_bytes:
	raise gr.Error("Upload Excel first.")
	if not sheet_name:
	raise gr.Error("Select a sheet.")

	df = _drop_useless(pd.read_excel(io.BytesIO(file_bytes), sheet_name=sheet_name, engine="openpyxl"))

	kpi, pct, gdist, gstats, heap, sec_tbl, insight, f1, f2, f3, f4, f5 = compute_hod_insights(
	df, m_col, g_col, int(pmark), c_col, s_col, c_filter, s_filter
	)

	return (
	"### HoD Insight\n\n" + insight,
	kpi, pct, gdist, heap, gstats, sec_tbl,
	f1, f2, f3, f4, f5
	)

	analyze_btn.click(
	fn=on_refresh,
	inputs=[file_bytes_state, sheet_state, marks_col, grade_col, pass_mark, course_col, section_col, course_filter, section_filter],
	outputs=[insight_md, kpi_table, pct_table, grade_dist_table, heaping_table, grade_stats_table, section_table,
	hist_plot, cdf_plot, grade_plot, grade_box, section_plot],
	)

	demo.launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False)