# CGA_Insights / app3.py
# Origin: Deevyankar's Hugging Face Space, commit d0c53ff (verified) — "Rename app.py to app3.py"
# app.py — Unsupervised Clustering of Marks (Student + Section) with Manager Summary Graphs
# Robust to different column names + row-2 headers + CSV/XLSX.
# Adds: Manager Summary (graphs + brief recommendations) without changing clustering logic.
import io
import tempfile
import numpy as np
import pandas as pd
import gradio as gr
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import plotly.express as px
# Maximum attainable marks per assessment component.
# Used by _clean() to clip raw scores into [0, max] and by _features()
# to normalize each component to a 0–1 percentage (<component>_pct).
MAXIMA = {
    "Quiz": 10.0,
    "Mid": 20.0,
    "Lab Report": 20.0,
    "Lab Activity": 10.0,
    "Final": 40.0,
}
# ----------------------------
# File reading + column mapping
# ----------------------------
def _normalize_colname(c: str) -> str:
c = str(c).strip().lower()
c = c.replace("_", " ").replace("-", " ")
c = c.replace("0", "o") # fixes Lab 0ctivity -> Lab octivity
c = " ".join(c.split())
return c
def _read_excel_safely(path: str) -> pd.DataFrame:
    """Read an Excel file, tolerating a blank first row.

    When every column comes back as pandas' auto-generated "Unnamed: N"
    placeholder, the real headers are assumed to start on row 2 and the
    file is re-read with header=1.
    """
    frame = pd.read_excel(path)
    headers_missing = len(frame.columns) > 0 and all(
        str(col).startswith("Unnamed") for col in frame.columns
    )
    if headers_missing:
        frame = pd.read_excel(path, header=1)
    return frame
def _map_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Auto-detect and rename assessment columns to canonical names.

    Each logical field is matched against a list of known header aliases
    (both sides normalized via _normalize_colname).  Raises ValueError
    listing any required field that cannot be located.  StudentID is
    optional and synthesized as S001, S002, ... when absent.
    """
    norm = {_normalize_colname(col): col for col in df.columns}
    candidates = {
        "studentid": ["studentid", "student id", "student no", "student number", "id", "roll", "roll no", "roll number"],
        "section": ["section", "sec", "class", "group", "batch", "tutorial", "lab section"],
        "quiz": ["quiz", "quiz (10)", "quiz(10)", "quiz out of 10", "quizzes", "qz"],
        "mid": ["mid", "mid (20)", "mid(20)", "mid out of 20", "midterm", "mid term"],
        "lab report": ["lab report", "lab report (20)", "lab report(20)", "labreport", "report", "lab reports"],
        "lab activity": [
            "lab activity", "lab activity (10)", "lab activity(10)", "labactivity",
            "activity", "lab act", "lab octivity(10)", "lab 0ctivity(10)"
        ],
        "final": ["final", "final (40)", "final(40)", "final out of 40", "final exam", "exam", "endterm", "end term"],
    }
    mapped = {}
    for logical, aliases in candidates.items():
        # First alias whose normalized form matches an actual header wins.
        hit = next(
            (norm[_normalize_colname(a)] for a in aliases if _normalize_colname(a) in norm),
            None,
        )
        if hit is not None:
            mapped[logical] = hit
    # Canonical output names, in the order required fields are reported.
    canonical = {
        "section": "Section",
        "quiz": "Quiz",
        "mid": "Mid",
        "lab report": "Lab Report",
        "lab activity": "Lab Activity",
        "final": "Final",
    }
    missing = [field for field in canonical if field not in mapped]
    if missing:
        raise ValueError(
            "Missing required columns (after auto-matching): "
            + ", ".join(missing)
            + "\n\nYour file must contain columns representing: "
            "Section, Quiz(10), Mid(20), Lab Report(20), Lab Activity(10), Final(40).\n"
            "StudentID is optional."
        )
    out = df.rename(columns={mapped[k]: v for k, v in canonical.items()}).copy()
    if "studentid" in mapped:
        out = out.rename(columns={mapped["studentid"]: "StudentID"})
    else:
        out.insert(0, "StudentID", [f"S{i+1:03d}" for i in range(len(out))])
    return out
def _clean(df: pd.DataFrame) -> pd.DataFrame:
    """Validate and sanitize the canonically-named dataframe.

    Strips StudentID/Section strings, coerces mark columns to numeric,
    drops rows with missing section or marks, removes blank identifiers,
    and clips every component into its legal [0, max] range per MAXIMA.
    """
    out = df.copy()
    for ident in ("StudentID", "Section"):
        out[ident] = out[ident].astype(str).str.strip()
    mark_cols = ["Quiz", "Mid", "Lab Report", "Lab Activity", "Final"]
    for col in mark_cols:
        out[col] = pd.to_numeric(out[col], errors="coerce")
    out = out.dropna(subset=["Section"] + mark_cols)
    out = out[(out["StudentID"] != "") & (out["Section"] != "")]
    for component, ceiling in MAXIMA.items():
        out[component] = out[component].clip(lower=0, upper=ceiling)
    return out
# ----------------------------
# Features
# ----------------------------
def _features(df: pd.DataFrame) -> pd.DataFrame:
    """Append normalized percentages and engineered gap features.

    Adds one <component>_pct column per MAXIMA entry (0–1 scale), then
    summary features: mean/std across components, lab vs theory averages,
    and gaps of the final exam against quizzes/mid and coursework.
    """
    out = df.copy()
    for component, ceiling in MAXIMA.items():
        out[f"{component}_pct"] = out[component] / ceiling
    pct_cols = [f"{c}_pct" for c in ("Quiz", "Mid", "Lab Report", "Lab Activity", "Final")]
    out["mean_score"] = out[pct_cols].mean(axis=1)
    out["std_score"] = out[pct_cols].std(axis=1)
    out["lab_avg"] = out[["Lab Report_pct", "Lab Activity_pct"]].mean(axis=1)
    out["theory_avg"] = out[["Quiz_pct", "Mid_pct", "Final_pct"]].mean(axis=1)
    # How the final compares with the earlier theory assessments.
    out["exam_gap"] = out["Final_pct"] - out[["Quiz_pct", "Mid_pct"]].mean(axis=1)
    out["coursework_avg"] = out[["Quiz_pct", "Mid_pct", "Lab Report_pct", "Lab Activity_pct"]].mean(axis=1)
    out["final_vs_coursework_gap"] = out["Final_pct"] - out["coursework_avg"]
    out["lab_theory_gap"] = out["lab_avg"] - out["theory_avg"]
    return out
# ----------------------------
# Clustering helpers
# ----------------------------
def _kmeans(df_feat: pd.DataFrame, k=3):
    """Standardize the feature matrix and cluster it with KMeans.

    Returns (labels, X, silhouette) where X is the z-scored feature
    matrix and silhouette is None whenever the score is unavailable
    (too few rows, a single resulting cluster, or any scoring error).
    """
    X = StandardScaler().fit_transform(df_feat.values)
    labels = KMeans(n_clusters=k, random_state=42, n_init=10).fit_predict(X)
    sil = None
    enough_rows = len(df_feat) >= k + 2
    multiple_clusters = len(set(labels)) > 1
    if enough_rows and multiple_clusters:
        try:
            sil = float(silhouette_score(X, labels))
        except Exception:
            sil = None
    return labels, X, sil
def _pca_fig(X, labels, hover_df, title):
    """Project X to 2-D via PCA and return a Plotly scatter colored by cluster.

    hover_df supplies one row per point; all of its columns are exposed
    as hover data.  Cluster labels are rendered as strings so Plotly
    treats them as discrete categories.
    """
    coords = PCA(n_components=2, random_state=42).fit_transform(X)
    plot_df = hover_df.copy()
    plot_df["PC1"] = coords[:, 0]
    plot_df["PC2"] = coords[:, 1]
    plot_df["Cluster"] = labels.astype(int).astype(str)
    fig = px.scatter(
        plot_df,
        x="PC1",
        y="PC2",
        color="Cluster",
        hover_data=list(hover_df.columns),
        title=title,
    )
    fig.update_layout(height=420)
    return fig
# ----------------------------
# Reasons (kept)
# ----------------------------
def _student_reason(r):
s = float(r["std_score"])
eg = float(r["final_vs_coursework_gap"])
ltg = float(r["lab_theory_gap"])
bullets = []
if s <= 0.10:
bullets.append("Low variation across assessments (consistent distribution).")
elif s <= 0.18:
bullets.append("Moderate variation across assessments (expected academic spread).")
else:
bullets.append("High variation across components (uneven distribution).")
if abs(eg) >= 0.25:
bullets.append("Large mismatch between final exam and coursework trend.")
elif abs(eg) >= 0.15:
bullets.append("Noticeable final vs coursework difference.")
if abs(ltg) >= 0.20:
bullets.append("Large mismatch between lab/practical and theory performance.")
elif abs(ltg) >= 0.12:
bullets.append("Some lab vs theory mismatch.")
if (s > 0.18) or (abs(eg) >= 0.25) or (abs(ltg) >= 0.20):
rec = "Recommendation: Consider academic review for assessment alignment and/or student support (not an allegation)."
else:
rec = "Recommendation: Pattern looks academically reasonable; routine moderation is sufficient."
return "Key observations:\n- " + "\n- ".join(bullets) + "\n\n" + rec
def _section_reason(r):
s = float(r["std_score"])
eg = float(r["final_vs_coursework_gap"])
ltg = float(r["lab_theory_gap"])
bullets = []
if s <= 0.08:
bullets.append("Uniform distribution across components in this section.")
elif s <= 0.14:
bullets.append("Genuine variation consistent with expected student diversity.")
else:
bullets.append("Higher variability across components (less uniform distribution).")
if abs(eg) >= 0.18:
bullets.append("Final vs coursework trend mismatch at section level.")
if abs(ltg) >= 0.15:
bullets.append("Lab vs theory mismatch at section level.")
if (s > 0.14) or (abs(eg) >= 0.18) or (abs(ltg) >= 0.15):
rec = "Recommendation: Review assessment design/moderation consistency for this section (not an allegation)."
else:
rec = "Recommendation: Distribution appears aligned; routine moderation is sufficient."
return "Key observations:\n- " + "\n- ".join(bullets) + "\n\n" + rec
# ----------------------------
# Manager-friendly meaning mapping (robust; no "white cluster" gaps)
# ----------------------------
def _assign_cluster_meanings(profile_df: pd.DataFrame, cluster_col: str) -> dict:
"""
Robust mapping:
- Final Exam Misalignment = most negative final_vs_coursework_gap
- Lab–Theory Mismatch = largest positive lab_theory_gap
- Remaining = Balanced / Acceptable Pattern
Guarantees ALL clusters get a label.
"""
dfp = profile_df.copy()
cids = [int(x) for x in dfp[cluster_col].tolist()]
if "final_vs_coursework_gap" not in dfp.columns or "lab_theory_gap" not in dfp.columns or len(cids) == 0:
return {cid: "Balanced / Acceptable Pattern" for cid in cids}
cid_final = int(dfp.loc[dfp["final_vs_coursework_gap"].idxmin(), cluster_col])
cid_lab = int(dfp.loc[dfp["lab_theory_gap"].idxmax(), cluster_col])
meaning = {cid_final: "Final Exam Misalignment", cid_lab: "Lab–Theory Mismatch"}
for cid in cids:
if cid not in meaning:
meaning[cid] = "Balanced / Acceptable Pattern"
return meaning
def _traffic_light(meaning: str) -> str:
return {
"Balanced / Acceptable Pattern": "🟢 Green",
"Lab–Theory Mismatch": "🟡 Amber",
"Final Exam Misalignment": "🔴 Red",
}.get(meaning, "⚪")
def _manager_signal(meaning: str) -> str:
if meaning == "Balanced / Acceptable Pattern":
return "Assessment components are broadly aligned (routine moderation)."
if meaning == "Lab–Theory Mismatch":
return "Practical outcomes stronger than theory; consider theory support and alignment review."
if meaning == "Final Exam Misalignment":
return "Final exam outcomes lower than coursework; review exam difficulty/format/weighting."
return ""
def _build_distribution_table(labels: pd.Series, meaning_map: dict, entity_name: str) -> pd.DataFrame:
    """Summarize cluster membership into a manager-facing table.

    One row per cluster id (ascending): meaning, traffic light, member
    count, percentage of all entities (0.0 when the series is empty),
    and a short interpretation line.  entity_name (e.g. "Students")
    only affects the percentage column header.
    """
    total = int(labels.shape[0])
    rows = []
    for cid, cnt in labels.value_counts().sort_index().items():
        cid, cnt = int(cid), int(cnt)
        meaning = meaning_map.get(cid, f"Cluster {cid}")
        share = round((cnt / total) * 100, 1) if total else 0.0
        rows.append({
            "Cluster": cid,
            "Cluster Meaning": meaning,
            "Traffic Light": _traffic_light(meaning),
            "Count": cnt,
            f"% of {entity_name}": share,
            "Manager interpretation": _manager_signal(meaning),
        })
    return pd.DataFrame(rows)
# ----------------------------
# Manager graphs + recommendations
# ----------------------------
def _manager_graphs_and_reco(df_students: pd.DataFrame, df_sections: pd.DataFrame):
    """Build the manager-summary figures plus a brief recommendation text.

    Returns (students_pie, sections_bar, coursework_vs_final_bar, reco)
    where the figures are Plotly objects and reco is a bullet string of
    pattern percentages with suggested actions.
    """
    # Pie: students per assessment pattern.
    pattern_counts = df_students["StudentClusterMeaning"].value_counts().reset_index()
    pattern_counts.columns = ["Pattern", "Count"]
    fig_students = px.pie(pattern_counts, values="Count", names="Pattern",
                          title="Students by Assessment Pattern")
    fig_students.update_layout(height=360)

    # Bar: sections per assessment pattern.
    section_counts = df_sections["SectionClusterMeaning"].value_counts().reset_index()
    section_counts.columns = ["Pattern", "Count"]
    fig_sections = px.bar(section_counts, x="Pattern", y="Count",
                          title="Sections by Assessment Pattern")
    fig_sections.update_layout(height=360)

    # Bar: overall coursework average vs final-exam average (percent scale).
    fig_exam = px.bar(
        x=["Average Coursework", "Average Final Exam"],
        y=[df_students["coursework_avg"].mean() * 100, df_students["Final_pct"].mean() * 100],
        title="Overall Coursework vs Final Exam (Average %)",
        labels={"x": "Assessment", "y": "Average %"},
    )
    fig_exam.update_layout(height=360)

    # Pattern shares as percentages of all students (0 when empty).
    meanings = df_students["StudentClusterMeaning"]
    has_students = len(df_students) > 0
    pct_bal = meanings.eq("Balanced / Acceptable Pattern").mean() * 100 if has_students else 0
    pct_lab = meanings.eq("Lab–Theory Mismatch").mean() * 100 if has_students else 0
    pct_final = meanings.eq("Final Exam Misalignment").mean() * 100 if has_students else 0
    rec = (
        f"• {pct_bal:.1f}% of students show a balanced/acceptable assessment pattern.\n"
        f"• {pct_lab:.1f}% show stronger lab performance than theory → consider targeted theory support and alignment checks.\n"
        f"• {pct_final:.1f}% show final exam outcomes lower than coursework → review exam difficulty, format, and/or weightage.\n"
        f"• These are assessment-pattern signals for QA/moderation; they do not indicate student misconduct."
    )
    return fig_students, fig_sections, fig_exam, rec
# ----------------------------
# Excel download
# ----------------------------
def _make_excel(student_out, section_out, student_profile, section_profile,
                student_distribution, section_distribution, section_list) -> bytes:
    """Serialize all report tables into one multi-sheet XLSX byte blob.

    Sheet names mirror the dashboard tabs; requires the openpyxl engine.
    """
    sheets = [
        (student_out, "Student_Level"),
        (section_out, "Section_Level"),
        (student_profile, "Student_Cluster_Profile"),
        (section_profile, "Section_Cluster_Profile"),
        (student_distribution, "Student_Distribution"),
        (section_distribution, "Section_Distribution"),
        (section_list, "Section_List_With_Meaning"),
    ]
    buffer = io.BytesIO()
    with pd.ExcelWriter(buffer, engine="openpyxl") as writer:
        for table, sheet_name in sheets:
            table.to_excel(writer, index=False, sheet_name=sheet_name)
    buffer.seek(0)
    return buffer.getvalue()
# ----------------------------
# Main analysis
# ----------------------------
def run_analysis(file_obj):
    """End-to-end pipeline for one uploaded marks file.

    Steps: read CSV/XLSX → auto-map columns → clean → engineer features →
    KMeans (k=3) at student and section level → attach cluster meanings,
    traffic lights and manager signals → build manager tables/figures →
    write a multi-sheet Excel report to a temp file.

    Returns 15 values in the exact order of the Gradio `outputs=` list;
    raises gr.Error with a user-facing message on unreadable/invalid input.
    """
    if file_obj is None:
        raise gr.Error("Please upload an Excel/CSV file first.")
    name = (getattr(file_obj, "name", "") or "").lower()
    try:
        if name.endswith(".csv"):
            raw = pd.read_csv(file_obj.name)
        else:
            # Excel path tolerates headers starting on row 2.
            raw = _read_excel_safely(file_obj.name)
    except Exception as e:
        raise gr.Error(f"Could not read the file. Details: {e}")
    try:
        df = _map_columns(raw)
    except Exception as e:
        raise gr.Error(str(e))
    df = _clean(df)
    df = _features(df)
    if df.empty:
        raise gr.Error("No valid rows after cleaning. Please check missing values and column contents.")
    # Student-level clustering
    stud_feat = df[["mean_score", "std_score", "exam_gap", "lab_theory_gap", "final_vs_coursework_gap"]]
    stud_labels, Xs, stud_sil = _kmeans(stud_feat, k=3)
    df["StudentCluster"] = stud_labels.astype(int)
    df["StudentReason"] = df.apply(_student_reason, axis=1)
    # Student profile for meaning mapping
    stud_profile = (
        df.groupby("StudentCluster")[
            ["Quiz_pct", "Mid_pct", "Lab Report_pct", "Lab Activity_pct", "Final_pct",
             "mean_score", "std_score", "final_vs_coursework_gap", "lab_theory_gap"]
        ]
        .mean()
        .reset_index()
        .sort_values("StudentCluster")
    )
    # Section-level aggregation + clustering
    sec_cols = [
        "Quiz_pct", "Mid_pct", "Lab Report_pct", "Lab Activity_pct", "Final_pct",
        "mean_score", "std_score", "exam_gap", "lab_theory_gap", "final_vs_coursework_gap",
    ]
    sec = df.groupby("Section")[sec_cols].mean().reset_index()
    sec_feat = sec[["mean_score", "std_score", "exam_gap", "lab_theory_gap", "final_vs_coursework_gap"]]
    sec_labels, Xsec, sec_sil = _kmeans(sec_feat, k=3)
    sec["SectionCluster"] = sec_labels.astype(int)
    sec["SectionReason"] = sec.apply(_section_reason, axis=1)
    sec_profile = (
        sec.groupby("SectionCluster")[
            ["Quiz_pct", "Mid_pct", "Lab Report_pct", "Lab Activity_pct", "Final_pct",
             "mean_score", "std_score", "final_vs_coursework_gap", "lab_theory_gap"]
        ]
        .mean()
        .reset_index()
        .sort_values("SectionCluster")
    )
    # Meaning mapping (guaranteed labels)
    student_meaning_map = _assign_cluster_meanings(stud_profile, "StudentCluster")
    section_meaning_map = _assign_cluster_meanings(sec_profile, "SectionCluster")
    df["StudentClusterMeaning"] = df["StudentCluster"].map(lambda x: student_meaning_map.get(int(x), f"Cluster {int(x)}"))
    df["StudentTrafficLight"] = df["StudentClusterMeaning"].map(_traffic_light)
    df["StudentManagerSignal"] = df["StudentClusterMeaning"].map(_manager_signal)
    sec["SectionClusterMeaning"] = sec["SectionCluster"].map(lambda x: section_meaning_map.get(int(x), f"Cluster {int(x)}"))
    sec["SectionTrafficLight"] = sec["SectionClusterMeaning"].map(_traffic_light)
    sec["SectionManagerSignal"] = sec["SectionClusterMeaning"].map(_manager_signal)
    # Manager distributions
    student_distribution = _build_distribution_table(df["StudentCluster"], student_meaning_map, "Students")
    section_distribution = _build_distribution_table(sec["SectionCluster"], section_meaning_map, "Sections")
    # Manager graphs + recommendations
    fig_students, fig_sections, fig_exam, reco = _manager_graphs_and_reco(df, sec)
    # Plots (PCA)
    hover_student = df[
        ["StudentID", "Section", "Quiz", "Mid", "Lab Report", "Lab Activity", "Final",
         "StudentCluster", "StudentClusterMeaning", "StudentTrafficLight",
         "mean_score", "std_score", "final_vs_coursework_gap", "lab_theory_gap"]
    ].copy()
    fig_student_pca = _pca_fig(Xs, stud_labels, hover_student, "Student-Level Clustering (PCA)")
    hover_section = sec[
        ["Section", "SectionCluster", "SectionClusterMeaning", "SectionTrafficLight",
         "mean_score", "std_score", "final_vs_coursework_gap", "lab_theory_gap"]
    ].copy()
    fig_section_pca = _pca_fig(Xsec, sec_labels, hover_section, "Section-Level Clustering (PCA)")
    # Section list for managers (section numbers included)
    section_list = sec[
        ["Section", "SectionCluster", "SectionClusterMeaning", "SectionTrafficLight", "SectionManagerSignal",
         "mean_score", "std_score", "final_vs_coursework_gap", "lab_theory_gap", "SectionReason"]
    ].copy()
    for c in ["mean_score", "std_score", "final_vs_coursework_gap", "lab_theory_gap"]:
        # 0–1 fractions -> display percentages
        section_list[c] = (section_list[c] * 100).round(1)
    # Output tables
    student_out = df[
        ["StudentID", "Section",
         "StudentCluster", "StudentClusterMeaning", "StudentTrafficLight", "StudentManagerSignal",
         "Quiz", "Mid", "Lab Report", "Lab Activity", "Final",
         "mean_score", "std_score", "final_vs_coursework_gap", "lab_theory_gap",
         "StudentReason"]
    ].copy()
    for c in ["mean_score", "std_score", "final_vs_coursework_gap", "lab_theory_gap"]:
        student_out[c] = (student_out[c] * 100).round(1)
    section_out = sec[
        ["Section",
         "SectionCluster", "SectionClusterMeaning", "SectionTrafficLight", "SectionManagerSignal",
         "mean_score", "std_score", "final_vs_coursework_gap", "lab_theory_gap",
         "SectionReason"]
    ].copy()
    for c in ["mean_score", "std_score", "final_vs_coursework_gap", "lab_theory_gap"]:
        section_out[c] = (section_out[c] * 100).round(1)
    # Profiles as %
    stud_profile_show = stud_profile.copy()
    sec_profile_show = sec_profile.copy()
    for t in [stud_profile_show, sec_profile_show]:
        for c in t.columns:
            if c != t.columns[0]:  # skip the cluster-id column
                t[c] = (t[c] * 100).round(1)
    sil_s = f"{stud_sil:.3f}" if stud_sil is not None else "N/A"
    sil_c = f"{sec_sil:.3f}" if sec_sil is not None else "N/A"
    summary = (
        f"Students analysed: {len(df)}\n"
        f"Sections analysed: {df['Section'].nunique()}\n\n"
        f"Student-level silhouette (k=3): {sil_s}\n"
        f"Section-level silhouette (k=3): {sil_c}\n\n"
        f"Use 'Cluster Meaning' and 'Traffic Light' for interpretation. "
        f"Clusters reflect assessment patterns for QA/moderation (no allegations)."
    )
    # Download Excel
    excel_bytes = _make_excel(
        student_out, section_out,
        stud_profile_show, sec_profile_show,
        student_distribution, section_distribution,
        section_list
    )
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
    tmp.write(excel_bytes)
    tmp.flush()
    tmp.close()
    return (
        summary,
        # Manager Summary outputs
        fig_students, fig_sections, fig_exam, reco,
        student_distribution, section_distribution, section_list,
        # Student tab outputs
        student_out, fig_student_pca, stud_profile_show,
        # Section tab outputs
        section_out, fig_section_pca, sec_profile_show,
        # Download
        tmp.name
    )
# ----------------------------
# UI
# ----------------------------
# Gradio UI: one file upload + one "Run Analysis" button fan out to every
# tab; the outputs list below must match run_analysis's return order.
with gr.Blocks(title="Marks Clustering (Student + Section) — Manager Friendly") as demo:
    gr.Markdown(
        """
# Marks Clustering Dashboard (Unsupervised) — Manager Friendly
Upload any **Excel/CSV** file that contains the assessment components:
**Section, Quiz(10), Mid(20), Lab Report(20), Lab Activity(10), Final(40)**.
StudentID is optional (auto-generated if missing).
This dashboard provides:
- **Manager Summary** (graphs + short recommendations)
- **Student-level** clustering (with reasons)
- **Section-level** clustering (with reasons)
- Downloadable Excel report
""".strip()
    )
    file_in = gr.File(label="Upload Excel/CSV", file_types=[".xlsx", ".xls", ".csv"])
    run_btn = gr.Button("Run Analysis", variant="primary")
    summary = gr.Textbox(label="Summary", lines=8)
    with gr.Tabs():
        # Graphs + quick tables aimed at non-technical reviewers.
        with gr.Tab("Manager Summary"):
            gr.Markdown("## Visual Summary (Easy for Managers)")
            ms_fig_students = gr.Plot(label="Students by Pattern (Pie)")
            ms_fig_sections = gr.Plot(label="Sections by Pattern (Bar)")
            ms_fig_exam = gr.Plot(label="Coursework vs Final Exam (Overall)")
            ms_reco = gr.Textbox(label="Brief Recommendations", lines=6)
            gr.Markdown("## Quick Tables")
            ms_student_dist = gr.Dataframe(label="Student Distribution", wrap=True, interactive=False)
            ms_section_dist = gr.Dataframe(label="Section Distribution", wrap=True, interactive=False)
            ms_section_list = gr.Dataframe(label="Section List (with meaning)", wrap=True, interactive=False)
        # Detailed per-student view.
        with gr.Tab("Student-Level"):
            student_df = gr.Dataframe(label="Student-Level Results (with meanings & reasons)", wrap=True)
            student_plot = gr.Plot(label="Student PCA Plot")
            student_profile = gr.Dataframe(label="Student Cluster Profile (avg %)", wrap=True)
        # Detailed per-section view.
        with gr.Tab("Section-Level"):
            section_df = gr.Dataframe(label="Section-Level Results (with meanings & reasons)", wrap=True)
            section_plot = gr.Plot(label="Section PCA Plot")
            section_profile = gr.Dataframe(label="Section Cluster Profile (avg %)", wrap=True)
        with gr.Tab("Download"):
            dl = gr.File(label="Download Excel Report")
    # Wire the button; output order mirrors run_analysis's return tuple.
    run_btn.click(
        run_analysis,
        inputs=[file_in],
        outputs=[
            summary,
            # Manager Summary
            ms_fig_students, ms_fig_sections, ms_fig_exam, ms_reco,
            ms_student_dist, ms_section_dist, ms_section_list,
            # Student tab
            student_df, student_plot, student_profile,
            # Section tab
            section_df, section_plot, section_profile,
            # Download
            dl
        ],
    )
if __name__ == "__main__":
    demo.launch()