# Hugging Face Space page metadata (scrape residue, kept as comments so the file parses):
#   CGA_Insights / app1.py
#   Deevyankar's picture
#   Rename app.py to app1.py
#   b903f55 verified
# app.py — Marks Clustering (Student + Section) for RE.xlsx
# Hugging Face Gradio app:
# - Student-level clustering (k=3)
# - Section-level clustering (k=3)
# - Graphs + auto-generated reasons
# - Downloadable Excel report (fixed: returns file path, not tuple)
import io
import tempfile
import numpy as np
import pandas as pd
import gradio as gr
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import plotly.express as px
# Maximum obtainable marks for each assessment component.
# Used to clamp raw marks in _clean and to convert marks into 0-1
# percentage features (the "*_pct" columns) in _features.
MAXIMA = {
    "Quiz": 10.0,
    "Mid": 20.0,
    "Lab Report": 20.0,
    "Lab Activity": 10.0,
    "Final": 40.0,
}
def _normalize_colname(c: str) -> str:
c = str(c).strip().lower()
c = c.replace("_", " ").replace("-", " ")
c = c.replace("0", "o") # fixes '0ctivity' -> 'octivity'
c = " ".join(c.split())
return c
def _read_excel_safely(path: str) -> pd.DataFrame:
    """Read an Excel file, retrying with header=1 when row 1 looks blank.

    pandas names columns 'Unnamed: N' when the header row is empty; when
    every column looks like that, the real headers are on the second row.
    """
    frame = pd.read_excel(path)
    headers_blank = all(str(col).startswith("Unnamed") for col in frame.columns)
    if headers_blank:
        frame = pd.read_excel(path, header=1)
    return frame
def _map_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Map fuzzily-named input columns onto the canonical schema.

    Returns a copy of *df* with columns renamed to Section / Quiz / Mid /
    Lab Report / Lab Activity / Final. A StudentID column is matched when
    present, otherwise synthesized as S001, S002, ...

    Raises:
        ValueError: when any required column cannot be matched.
    """
    norm_to_actual = {_normalize_colname(col): col for col in df.columns}
    candidates = {
        "studentid": ["studentid", "student id", "id", "student", "student number", "student no"],
        "section": ["section", "sec", "class", "group"],
        "quiz": ["quiz", "quiz (10)", "quiz(10)", "quizzes"],
        "mid": ["mid", "mid (20)", "mid(20)", "midterm", "mid term"],
        "lab report": ["lab report", "lab report(20)", "labreport", "report", "lab reports", "labreport(20)"],
        "lab activity": [
            "lab activity", "lab activity(10)", "labactivity", "activity", "lab act",
            "lab octivity(10)"  # after normalization, 0->o helps too
        ],
        "final": ["final", "final (40)", "final(40)", "final exam", "exam", "endterm", "end term"],
    }
    mapped = {}
    for logical, aliases in candidates.items():
        for alias in aliases:
            actual = norm_to_actual.get(_normalize_colname(alias))
            if actual is not None:
                mapped[logical] = actual
                break
    # StudentID is optional (we can generate)
    required = ["section", "quiz", "mid", "lab report", "lab activity", "final"]
    missing = [key for key in required if key not in mapped]
    if missing:
        raise ValueError(
            "Missing required columns (after auto-matching): "
            + ", ".join(missing)
            + "\n\nYour file should include: Section, Quiz, Mid, Lab Report, Lab Activity, Final."
        )
    rename_map = {
        mapped["section"]: "Section",
        mapped["quiz"]: "Quiz",
        mapped["mid"]: "Mid",
        mapped["lab report"]: "Lab Report",
        mapped["lab activity"]: "Lab Activity",
        mapped["final"]: "Final",
    }
    out = df.rename(columns=rename_map).copy()
    if "studentid" in mapped:
        out = out.rename(columns={mapped["studentid"]: "StudentID"})
    else:
        out.insert(0, "StudentID", [f"S{i+1:03d}" for i in range(len(out))])
    return out
def _clean(df: pd.DataFrame) -> pd.DataFrame:
    """Coerce types, drop incomplete rows, and clamp marks to [0, max].

    Marks columns are coerced to numeric (invalid entries become NaN and
    are dropped); IDs are stripped strings and must be non-empty.
    """
    out = df.copy()
    for id_col in ("StudentID", "Section"):
        out[id_col] = out[id_col].astype(str).str.strip()
    mark_cols = ["Quiz", "Mid", "Lab Report", "Lab Activity", "Final"]
    for col in mark_cols:
        out[col] = pd.to_numeric(out[col], errors="coerce")
    out = out.dropna(subset=["Section"] + mark_cols)
    out = out[(out["StudentID"] != "") & (out["Section"] != "")]
    # Clamp each component into its valid marks range.
    for col, max_marks in MAXIMA.items():
        out[col] = out[col].clip(lower=0, upper=max_marks)
    return out
def _features(df: pd.DataFrame) -> pd.DataFrame:
    """Derive per-student percentage scores and gap/variability features.

    Adds, per component, a 0-1 "<name>_pct" column, then aggregate
    features (mean/std across components) and several "gap" features that
    contrast exam vs coursework and lab vs theory performance.
    """
    out = df.copy()
    for component, max_marks in MAXIMA.items():
        out[f"{component}_pct"] = out[component] / max_marks
    pct_cols = [f"{c}_pct" for c in ["Quiz", "Mid", "Lab Report", "Lab Activity", "Final"]]
    out["mean_score"] = out[pct_cols].mean(axis=1)
    out["std_score"] = out[pct_cols].std(axis=1)
    # Practical vs theoretical sub-averages.
    out["lab_avg"] = out[["Lab Report_pct", "Lab Activity_pct"]].mean(axis=1)
    out["theory_avg"] = out[["Quiz_pct", "Mid_pct", "Final_pct"]].mean(axis=1)
    # Final exam relative to the in-term assessments.
    out["exam_gap"] = out["Final_pct"] - out[["Quiz_pct", "Mid_pct"]].mean(axis=1)
    out["coursework_avg"] = out[["Quiz_pct", "Mid_pct", "Lab Report_pct", "Lab Activity_pct"]].mean(axis=1)
    out["final_vs_coursework_gap"] = out["Final_pct"] - out["coursework_avg"]
    out["lab_theory_gap"] = out["lab_avg"] - out["theory_avg"]
    return out
def _kmeans(df_feat: pd.DataFrame, k=3):
    """Standardize the feature matrix and cluster it with KMeans.

    Returns:
        (labels, X, silhouette) where X is the standardized matrix and
        silhouette is None when it cannot be computed (too few rows, a
        single resulting cluster, or any scoring error).
    """
    X = StandardScaler().fit_transform(df_feat.values)
    labels = KMeans(n_clusters=k, random_state=42, n_init=10).fit_predict(X)
    sil = None
    try:
        enough_rows = len(df_feat) >= k + 2
        multiple_clusters = len(set(labels)) > 1
        if enough_rows and multiple_clusters:
            sil = float(silhouette_score(X, labels))
    except Exception:
        sil = None
    return labels, X, sil
def _pca_fig(X, labels, hover_df, title):
    """Project X onto its first two principal components and scatter-plot.

    Points are colored by cluster label (as a string so the legend is
    categorical); all columns of *hover_df* appear in the hover tooltip.
    """
    coords = PCA(n_components=2, random_state=42).fit_transform(X)
    plot_df = hover_df.copy()
    plot_df["PC1"] = coords[:, 0]
    plot_df["PC2"] = coords[:, 1]
    plot_df["Cluster"] = labels.astype(int).astype(str)
    fig = px.scatter(
        plot_df,
        x="PC1",
        y="PC2",
        color="Cluster",
        hover_data=list(hover_df.columns),
        title=title,
    )
    fig.update_layout(height=480)
    return fig
def _student_reason(r):
s = float(r["std_score"])
eg = float(r["final_vs_coursework_gap"])
ltg = float(r["lab_theory_gap"])
bullets = []
if s <= 0.10:
bullets.append("Low variation across assessments (consistent distribution).")
elif s <= 0.18:
bullets.append("Moderate variation across assessments (expected academic spread).")
else:
bullets.append("High variation across components (irregular distribution).")
if abs(eg) >= 0.25:
bullets.append("Large mismatch between final exam and coursework trend.")
elif abs(eg) >= 0.15:
bullets.append("Noticeable final vs coursework difference.")
if abs(ltg) >= 0.20:
bullets.append("Large mismatch between lab/practical and theory performance.")
elif abs(ltg) >= 0.12:
bullets.append("Some lab vs theory mismatch.")
if (s > 0.18) or (abs(eg) >= 0.25) or (abs(ltg) >= 0.20):
rec = "Recommendation: Consider academic review for assessment alignment and/or student support (not an allegation)."
else:
rec = "Recommendation: Pattern looks academically reasonable; routine moderation is sufficient."
return "Key observations:\n- " + "\n- ".join(bullets) + "\n\n" + rec
def _section_reason(r):
s = float(r["std_score"])
eg = float(r["final_vs_coursework_gap"])
ltg = float(r["lab_theory_gap"])
bullets = []
if s <= 0.08:
bullets.append("Uniform distribution across components in this section.")
elif s <= 0.14:
bullets.append("Genuine variation consistent with expected student diversity.")
else:
bullets.append("Higher variability across components (less uniform distribution).")
if abs(eg) >= 0.18:
bullets.append("Final vs coursework trend mismatch at section level.")
if abs(ltg) >= 0.15:
bullets.append("Lab vs theory mismatch at section level.")
if (s > 0.14) or (abs(eg) >= 0.18) or (abs(ltg) >= 0.15):
rec = "Recommendation: Review assessment design/moderation consistency for this section (not an allegation)."
else:
rec = "Recommendation: Distribution appears aligned; routine moderation is sufficient."
return "Key observations:\n- " + "\n- ".join(bullets) + "\n\n" + rec
def _make_excel(student_out, section_out, student_profile, section_profile) -> bytes:
    """Serialize the four result tables into one multi-sheet Excel workbook.

    Returns:
        The .xlsx workbook as raw bytes (caller persists it to disk).
    """
    buffer = io.BytesIO()
    sheets = [
        (student_out, "Student_Level"),
        (section_out, "Section_Level"),
        (student_profile, "Student_Cluster_Profile"),
        (section_profile, "Section_Cluster_Profile"),
    ]
    with pd.ExcelWriter(buffer, engine="openpyxl") as writer:
        for frame, sheet in sheets:
            frame.to_excel(writer, index=False, sheet_name=sheet)
    buffer.seek(0)
    return buffer.getvalue()
def run_analysis(file_obj):
    """Full pipeline behind the "Run Clustering" button.

    Reads the uploaded marks file, normalizes it to the canonical schema,
    engineers percentage/gap features, runs k=3 KMeans at student and
    section granularity, and prepares every Gradio output plus a
    downloadable Excel report.

    Parameters
    ----------
    file_obj :
        Gradio file value (an object exposing a ``.name`` filesystem
        path), or None when nothing was uploaded.

    Returns
    -------
    tuple
        (summary text, student table, student PCA figure, student cluster
        profile, section table, section PCA figure, section cluster
        profile, path to the generated .xlsx report) — order must match
        the ``outputs=`` list wired to the button.

    Raises
    ------
    gr.Error
        If no file was uploaded, the file cannot be read, or no valid
        rows remain after cleaning.
    """
    if file_obj is None:
        raise gr.Error("Please upload your Excel/CSV file first.")
    # Pick the reader from the extension; anything non-CSV is treated as Excel.
    name = (getattr(file_obj, "name", "") or "").lower()
    try:
        if name.endswith(".csv"):
            raw = pd.read_csv(file_obj.name)
        else:
            raw = _read_excel_safely(file_obj.name)
    except Exception as e:
        raise gr.Error(f"Could not read the file. Details: {e}")
    df = _map_columns(raw)   # fuzzy-match headers -> canonical column names
    df = _clean(df)          # coerce numerics, drop incomplete rows, clamp marks
    df = _features(df)       # add *_pct columns plus mean/std and gap features
    if df.empty:
        raise gr.Error("No valid rows after cleaning. Please check missing values.")
    # Student-level clustering
    stud_feat = df[["mean_score", "std_score", "exam_gap", "lab_theory_gap", "final_vs_coursework_gap"]]
    stud_labels, Xs, stud_sil = _kmeans(stud_feat, k=3)
    df["StudentCluster"] = stud_labels.astype(int)
    df["StudentReason"] = df.apply(_student_reason, axis=1)
    # Columns surfaced in the PCA scatter-plot hover tooltip.
    hover_student = df[
        [
            "StudentID",
            "Section",
            "Quiz",
            "Mid",
            "Lab Report",
            "Lab Activity",
            "Final",
            "StudentCluster",
            "mean_score",
            "std_score",
            "final_vs_coursework_gap",
            "lab_theory_gap",
        ]
    ].copy()
    fig_student = _pca_fig(Xs, stud_labels, hover_student, "Student-Level Clustering (PCA)")
    # Per-cluster mean feature values (still on the 0-1 scale at this point).
    stud_profile = (
        df.groupby("StudentCluster")[
            [
                "Quiz_pct",
                "Mid_pct",
                "Lab Report_pct",
                "Lab Activity_pct",
                "Final_pct",
                "mean_score",
                "std_score",
                "final_vs_coursework_gap",
                "lab_theory_gap",
            ]
        ]
        .mean()
        .reset_index()
        .sort_values("StudentCluster")
    )
    # Section-level aggregation + clustering
    sec_cols = [
        "Quiz_pct",
        "Mid_pct",
        "Lab Report_pct",
        "Lab Activity_pct",
        "Final_pct",
        "mean_score",
        "std_score",
        "exam_gap",
        "lab_theory_gap",
        "final_vs_coursework_gap",
    ]
    # One row per section: the mean of each feature over its students.
    sec = df.groupby("Section")[sec_cols].mean().reset_index()
    sec_feat = sec[["mean_score", "std_score", "exam_gap", "lab_theory_gap", "final_vs_coursework_gap"]]
    sec_labels, Xsec, sec_sil = _kmeans(sec_feat, k=3)
    sec["SectionCluster"] = sec_labels.astype(int)
    sec["SectionReason"] = sec.apply(_section_reason, axis=1)
    hover_section = sec[
        ["Section", "SectionCluster", "mean_score", "std_score", "final_vs_coursework_gap", "lab_theory_gap"]
    ].copy()
    fig_section = _pca_fig(Xsec, sec_labels, hover_section, "Section-Level Clustering (PCA)")
    sec_profile = (
        sec.groupby("SectionCluster")[
            [
                "Quiz_pct",
                "Mid_pct",
                "Lab Report_pct",
                "Lab Activity_pct",
                "Final_pct",
                "mean_score",
                "std_score",
                "final_vs_coursework_gap",
                "lab_theory_gap",
            ]
        ]
        .mean()
        .reset_index()
        .sort_values("SectionCluster")
    )
    # Output tables (convert key metrics to % for readability)
    student_out = df[
        [
            "StudentID",
            "Section",
            "Quiz",
            "Mid",
            "Lab Report",
            "Lab Activity",
            "Final",
            "StudentCluster",
            "mean_score",
            "std_score",
            "final_vs_coursework_gap",
            "lab_theory_gap",
            "StudentReason",
        ]
    ].copy()
    for c in ["mean_score", "std_score", "final_vs_coursework_gap", "lab_theory_gap"]:
        student_out[c] = (student_out[c] * 100).round(1)
    section_out = sec[
        [
            "Section",
            "SectionCluster",
            "mean_score",
            "std_score",
            "final_vs_coursework_gap",
            "lab_theory_gap",
            "SectionReason",
        ]
    ].copy()
    for c in ["mean_score", "std_score", "final_vs_coursework_gap", "lab_theory_gap"]:
        section_out[c] = (section_out[c] * 100).round(1)
    # Profiles as %
    stud_profile_show = stud_profile.copy()
    sec_profile_show = sec_profile.copy()
    for t in [stud_profile_show, sec_profile_show]:
        for c in t.columns:
            # First column is the cluster id; leave it unscaled.
            if c != t.columns[0]:
                t[c] = (t[c] * 100).round(1)
    # Summary
    sil_s = f"{stud_sil:.3f}" if stud_sil is not None else "N/A"
    sil_c = f"{sec_sil:.3f}" if sec_sil is not None else "N/A"
    summary = (
        f"Students analysed: {len(df)}\n"
        f"Sections analysed: {df['Section'].nunique()}\n\n"
        f"Student-level silhouette (k=3): {sil_s}\n"
        f"Section-level silhouette (k=3): {sil_c}\n\n"
        f"Note: Clusters indicate patterns for review/support; they do not imply misconduct."
    )
    # Create downloadable Excel file and RETURN FILE PATH (Gradio expects a path)
    excel_bytes = _make_excel(student_out, section_out, stud_profile_show, sec_profile_show)
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
    tmp.write(excel_bytes)
    tmp.flush()
    tmp.close()
    return summary, student_out, fig_student, stud_profile_show, section_out, fig_section, sec_profile_show, tmp.name
# ---- Gradio UI wiring (built at import time; launched under __main__) ----
with gr.Blocks(title="Marks Clustering (Student + Section)") as demo:
    gr.Markdown(
        """
# Marks Clustering Dashboard (Unsupervised)
Upload your Excel/CSV marks file.
This app produces **Student-level + Section-level clustering (3 clusters)** with **graphs and reasons**.
**Output is for academic QA/moderation and student support** (no allegations).
""".strip()
    )
    file_in = gr.File(label="Upload Excel/CSV", file_types=[".xlsx", ".xls", ".csv"])
    run_btn = gr.Button("Run Clustering", variant="primary")
    summary = gr.Textbox(label="Summary", lines=8)
    with gr.Tabs():
        with gr.Tab("Student-Level"):
            student_df = gr.Dataframe(label="Student-Level Results (with reasons)", wrap=True)
            student_plot = gr.Plot(label="Student PCA Plot")
            student_profile = gr.Dataframe(label="Student Cluster Profile (avg %)", wrap=True)
        with gr.Tab("Section-Level"):
            section_df = gr.Dataframe(label="Section-Level Results (with reasons)", wrap=True)
            section_plot = gr.Plot(label="Section PCA Plot")
            section_profile = gr.Dataframe(label="Section Cluster Profile (avg %)", wrap=True)
        with gr.Tab("Download"):
            dl = gr.File(label="Download Excel Report")
    # The outputs list must stay in the same order as run_analysis's return tuple.
    run_btn.click(
        run_analysis,
        inputs=[file_in],
        outputs=[summary, student_df, student_plot, student_profile, section_df, section_plot, section_profile, dl],
    )

if __name__ == "__main__":
    demo.launch()