Spaces:
Sleeping
Sleeping
| # app.py — Marks Clustering (Student + Section) for RE.xlsx | |
| # Hugging Face Gradio app: | |
| # - Student-level clustering (k=3) | |
| # - Section-level clustering (k=3) | |
| # - Graphs + auto-generated reasons | |
| # - Downloadable Excel report (fixed: returns file path, not tuple) | |
| import io | |
| import tempfile | |
| import numpy as np | |
| import pandas as pd | |
| import gradio as gr | |
| from sklearn.preprocessing import StandardScaler | |
| from sklearn.cluster import KMeans | |
| from sklearn.decomposition import PCA | |
| from sklearn.metrics import silhouette_score | |
| import plotly.express as px | |
| MAXIMA = { | |
| "Quiz": 10.0, | |
| "Mid": 20.0, | |
| "Lab Report": 20.0, | |
| "Lab Activity": 10.0, | |
| "Final": 40.0, | |
| } | |
| def _normalize_colname(c: str) -> str: | |
| c = str(c).strip().lower() | |
| c = c.replace("_", " ").replace("-", " ") | |
| c = c.replace("0", "o") # fixes '0ctivity' -> 'octivity' | |
| c = " ".join(c.split()) | |
| return c | |
| def _read_excel_safely(path: str) -> pd.DataFrame: | |
| """ | |
| Auto-handle cases where the first row is blank and headers start on row 2. | |
| """ | |
| df_try = pd.read_excel(path) | |
| if all(str(c).startswith("Unnamed") for c in df_try.columns): | |
| df_try = pd.read_excel(path, header=1) | |
| return df_try | |
| def _map_columns(df: pd.DataFrame) -> pd.DataFrame: | |
| cols = list(df.columns) | |
| norm = {_normalize_colname(c): c for c in cols} | |
| candidates = { | |
| "studentid": ["studentid", "student id", "id", "student", "student number", "student no"], | |
| "section": ["section", "sec", "class", "group"], | |
| "quiz": ["quiz", "quiz (10)", "quiz(10)", "quizzes"], | |
| "mid": ["mid", "mid (20)", "mid(20)", "midterm", "mid term"], | |
| "lab report": ["lab report", "lab report(20)", "labreport", "report", "lab reports", "labreport(20)"], | |
| "lab activity": [ | |
| "lab activity", "lab activity(10)", "labactivity", "activity", "lab act", | |
| "lab octivity(10)" # after normalization, 0->o helps too | |
| ], | |
| "final": ["final", "final (40)", "final(40)", "final exam", "exam", "endterm", "end term"], | |
| } | |
| mapped = {} | |
| for logical, opts in candidates.items(): | |
| for o in opts: | |
| o2 = _normalize_colname(o) | |
| if o2 in norm: | |
| mapped[logical] = norm[o2] | |
| break | |
| # StudentID is optional (we can generate) | |
| required = ["section", "quiz", "mid", "lab report", "lab activity", "final"] | |
| missing = [r for r in required if r not in mapped] | |
| if missing: | |
| raise ValueError( | |
| "Missing required columns (after auto-matching): " | |
| + ", ".join(missing) | |
| + "\n\nYour file should include: Section, Quiz, Mid, Lab Report, Lab Activity, Final." | |
| ) | |
| df2 = df.rename( | |
| columns={ | |
| mapped["section"]: "Section", | |
| mapped["quiz"]: "Quiz", | |
| mapped["mid"]: "Mid", | |
| mapped["lab report"]: "Lab Report", | |
| mapped["lab activity"]: "Lab Activity", | |
| mapped["final"]: "Final", | |
| } | |
| ).copy() | |
| if "studentid" in mapped: | |
| df2 = df2.rename(columns={mapped["studentid"]: "StudentID"}) | |
| else: | |
| df2.insert(0, "StudentID", [f"S{i+1:03d}" for i in range(len(df2))]) | |
| return df2 | |
| def _clean(df: pd.DataFrame) -> pd.DataFrame: | |
| df = df.copy() | |
| df["StudentID"] = df["StudentID"].astype(str).str.strip() | |
| df["Section"] = df["Section"].astype(str).str.strip() | |
| for c in ["Quiz", "Mid", "Lab Report", "Lab Activity", "Final"]: | |
| df[c] = pd.to_numeric(df[c], errors="coerce") | |
| df = df.dropna(subset=["Section", "Quiz", "Mid", "Lab Report", "Lab Activity", "Final"]) | |
| df = df[(df["StudentID"] != "") & (df["Section"] != "")] | |
| for c, mx in MAXIMA.items(): | |
| df[c] = df[c].clip(lower=0, upper=mx) | |
| return df | |
| def _features(df: pd.DataFrame) -> pd.DataFrame: | |
| df = df.copy() | |
| for c, mx in MAXIMA.items(): | |
| df[f"{c}_pct"] = df[c] / mx | |
| comp = [f"{c}_pct" for c in ["Quiz", "Mid", "Lab Report", "Lab Activity", "Final"]] | |
| df["mean_score"] = df[comp].mean(axis=1) | |
| df["std_score"] = df[comp].std(axis=1) | |
| df["lab_avg"] = df[["Lab Report_pct", "Lab Activity_pct"]].mean(axis=1) | |
| df["theory_avg"] = df[["Quiz_pct", "Mid_pct", "Final_pct"]].mean(axis=1) | |
| df["exam_gap"] = df["Final_pct"] - df[["Quiz_pct", "Mid_pct"]].mean(axis=1) | |
| df["coursework_avg"] = df[["Quiz_pct", "Mid_pct", "Lab Report_pct", "Lab Activity_pct"]].mean(axis=1) | |
| df["final_vs_coursework_gap"] = df["Final_pct"] - df["coursework_avg"] | |
| df["lab_theory_gap"] = df["lab_avg"] - df["theory_avg"] | |
| return df | |
| def _kmeans(df_feat: pd.DataFrame, k=3): | |
| scaler = StandardScaler() | |
| X = scaler.fit_transform(df_feat.values) | |
| model = KMeans(n_clusters=k, random_state=42, n_init=10) | |
| labels = model.fit_predict(X) | |
| sil = None | |
| try: | |
| if len(df_feat) >= k + 2 and len(set(labels)) > 1: | |
| sil = float(silhouette_score(X, labels)) | |
| except Exception: | |
| sil = None | |
| return labels, X, sil | |
| def _pca_fig(X, labels, hover_df, title): | |
| pca = PCA(n_components=2, random_state=42) | |
| coords = pca.fit_transform(X) | |
| plot_df = hover_df.copy() | |
| plot_df["PC1"] = coords[:, 0] | |
| plot_df["PC2"] = coords[:, 1] | |
| plot_df["Cluster"] = labels.astype(int).astype(str) | |
| fig = px.scatter( | |
| plot_df, | |
| x="PC1", | |
| y="PC2", | |
| color="Cluster", | |
| hover_data=list(hover_df.columns), | |
| title=title, | |
| ) | |
| fig.update_layout(height=480) | |
| return fig | |
| def _student_reason(r): | |
| s = float(r["std_score"]) | |
| eg = float(r["final_vs_coursework_gap"]) | |
| ltg = float(r["lab_theory_gap"]) | |
| bullets = [] | |
| if s <= 0.10: | |
| bullets.append("Low variation across assessments (consistent distribution).") | |
| elif s <= 0.18: | |
| bullets.append("Moderate variation across assessments (expected academic spread).") | |
| else: | |
| bullets.append("High variation across components (irregular distribution).") | |
| if abs(eg) >= 0.25: | |
| bullets.append("Large mismatch between final exam and coursework trend.") | |
| elif abs(eg) >= 0.15: | |
| bullets.append("Noticeable final vs coursework difference.") | |
| if abs(ltg) >= 0.20: | |
| bullets.append("Large mismatch between lab/practical and theory performance.") | |
| elif abs(ltg) >= 0.12: | |
| bullets.append("Some lab vs theory mismatch.") | |
| if (s > 0.18) or (abs(eg) >= 0.25) or (abs(ltg) >= 0.20): | |
| rec = "Recommendation: Consider academic review for assessment alignment and/or student support (not an allegation)." | |
| else: | |
| rec = "Recommendation: Pattern looks academically reasonable; routine moderation is sufficient." | |
| return "Key observations:\n- " + "\n- ".join(bullets) + "\n\n" + rec | |
| def _section_reason(r): | |
| s = float(r["std_score"]) | |
| eg = float(r["final_vs_coursework_gap"]) | |
| ltg = float(r["lab_theory_gap"]) | |
| bullets = [] | |
| if s <= 0.08: | |
| bullets.append("Uniform distribution across components in this section.") | |
| elif s <= 0.14: | |
| bullets.append("Genuine variation consistent with expected student diversity.") | |
| else: | |
| bullets.append("Higher variability across components (less uniform distribution).") | |
| if abs(eg) >= 0.18: | |
| bullets.append("Final vs coursework trend mismatch at section level.") | |
| if abs(ltg) >= 0.15: | |
| bullets.append("Lab vs theory mismatch at section level.") | |
| if (s > 0.14) or (abs(eg) >= 0.18) or (abs(ltg) >= 0.15): | |
| rec = "Recommendation: Review assessment design/moderation consistency for this section (not an allegation)." | |
| else: | |
| rec = "Recommendation: Distribution appears aligned; routine moderation is sufficient." | |
| return "Key observations:\n- " + "\n- ".join(bullets) + "\n\n" + rec | |
| def _make_excel(student_out, section_out, student_profile, section_profile) -> bytes: | |
| """ | |
| Return Excel file bytes. | |
| """ | |
| out = io.BytesIO() | |
| with pd.ExcelWriter(out, engine="openpyxl") as writer: | |
| student_out.to_excel(writer, index=False, sheet_name="Student_Level") | |
| section_out.to_excel(writer, index=False, sheet_name="Section_Level") | |
| student_profile.to_excel(writer, index=False, sheet_name="Student_Cluster_Profile") | |
| section_profile.to_excel(writer, index=False, sheet_name="Section_Cluster_Profile") | |
| out.seek(0) | |
| return out.getvalue() | |
| def run_analysis(file_obj): | |
| if file_obj is None: | |
| raise gr.Error("Please upload your Excel/CSV file first.") | |
| name = (getattr(file_obj, "name", "") or "").lower() | |
| try: | |
| if name.endswith(".csv"): | |
| raw = pd.read_csv(file_obj.name) | |
| else: | |
| raw = _read_excel_safely(file_obj.name) | |
| except Exception as e: | |
| raise gr.Error(f"Could not read the file. Details: {e}") | |
| df = _map_columns(raw) | |
| df = _clean(df) | |
| df = _features(df) | |
| if df.empty: | |
| raise gr.Error("No valid rows after cleaning. Please check missing values.") | |
| # Student-level clustering | |
| stud_feat = df[["mean_score", "std_score", "exam_gap", "lab_theory_gap", "final_vs_coursework_gap"]] | |
| stud_labels, Xs, stud_sil = _kmeans(stud_feat, k=3) | |
| df["StudentCluster"] = stud_labels.astype(int) | |
| df["StudentReason"] = df.apply(_student_reason, axis=1) | |
| hover_student = df[ | |
| [ | |
| "StudentID", | |
| "Section", | |
| "Quiz", | |
| "Mid", | |
| "Lab Report", | |
| "Lab Activity", | |
| "Final", | |
| "StudentCluster", | |
| "mean_score", | |
| "std_score", | |
| "final_vs_coursework_gap", | |
| "lab_theory_gap", | |
| ] | |
| ].copy() | |
| fig_student = _pca_fig(Xs, stud_labels, hover_student, "Student-Level Clustering (PCA)") | |
| stud_profile = ( | |
| df.groupby("StudentCluster")[ | |
| [ | |
| "Quiz_pct", | |
| "Mid_pct", | |
| "Lab Report_pct", | |
| "Lab Activity_pct", | |
| "Final_pct", | |
| "mean_score", | |
| "std_score", | |
| "final_vs_coursework_gap", | |
| "lab_theory_gap", | |
| ] | |
| ] | |
| .mean() | |
| .reset_index() | |
| .sort_values("StudentCluster") | |
| ) | |
| # Section-level aggregation + clustering | |
| sec_cols = [ | |
| "Quiz_pct", | |
| "Mid_pct", | |
| "Lab Report_pct", | |
| "Lab Activity_pct", | |
| "Final_pct", | |
| "mean_score", | |
| "std_score", | |
| "exam_gap", | |
| "lab_theory_gap", | |
| "final_vs_coursework_gap", | |
| ] | |
| sec = df.groupby("Section")[sec_cols].mean().reset_index() | |
| sec_feat = sec[["mean_score", "std_score", "exam_gap", "lab_theory_gap", "final_vs_coursework_gap"]] | |
| sec_labels, Xsec, sec_sil = _kmeans(sec_feat, k=3) | |
| sec["SectionCluster"] = sec_labels.astype(int) | |
| sec["SectionReason"] = sec.apply(_section_reason, axis=1) | |
| hover_section = sec[ | |
| ["Section", "SectionCluster", "mean_score", "std_score", "final_vs_coursework_gap", "lab_theory_gap"] | |
| ].copy() | |
| fig_section = _pca_fig(Xsec, sec_labels, hover_section, "Section-Level Clustering (PCA)") | |
| sec_profile = ( | |
| sec.groupby("SectionCluster")[ | |
| [ | |
| "Quiz_pct", | |
| "Mid_pct", | |
| "Lab Report_pct", | |
| "Lab Activity_pct", | |
| "Final_pct", | |
| "mean_score", | |
| "std_score", | |
| "final_vs_coursework_gap", | |
| "lab_theory_gap", | |
| ] | |
| ] | |
| .mean() | |
| .reset_index() | |
| .sort_values("SectionCluster") | |
| ) | |
| # Output tables (convert key metrics to % for readability) | |
| student_out = df[ | |
| [ | |
| "StudentID", | |
| "Section", | |
| "Quiz", | |
| "Mid", | |
| "Lab Report", | |
| "Lab Activity", | |
| "Final", | |
| "StudentCluster", | |
| "mean_score", | |
| "std_score", | |
| "final_vs_coursework_gap", | |
| "lab_theory_gap", | |
| "StudentReason", | |
| ] | |
| ].copy() | |
| for c in ["mean_score", "std_score", "final_vs_coursework_gap", "lab_theory_gap"]: | |
| student_out[c] = (student_out[c] * 100).round(1) | |
| section_out = sec[ | |
| [ | |
| "Section", | |
| "SectionCluster", | |
| "mean_score", | |
| "std_score", | |
| "final_vs_coursework_gap", | |
| "lab_theory_gap", | |
| "SectionReason", | |
| ] | |
| ].copy() | |
| for c in ["mean_score", "std_score", "final_vs_coursework_gap", "lab_theory_gap"]: | |
| section_out[c] = (section_out[c] * 100).round(1) | |
| # Profiles as % | |
| stud_profile_show = stud_profile.copy() | |
| sec_profile_show = sec_profile.copy() | |
| for t in [stud_profile_show, sec_profile_show]: | |
| for c in t.columns: | |
| if c != t.columns[0]: | |
| t[c] = (t[c] * 100).round(1) | |
| # Summary | |
| sil_s = f"{stud_sil:.3f}" if stud_sil is not None else "N/A" | |
| sil_c = f"{sec_sil:.3f}" if sec_sil is not None else "N/A" | |
| summary = ( | |
| f"Students analysed: {len(df)}\n" | |
| f"Sections analysed: {df['Section'].nunique()}\n\n" | |
| f"Student-level silhouette (k=3): {sil_s}\n" | |
| f"Section-level silhouette (k=3): {sil_c}\n\n" | |
| f"Note: Clusters indicate patterns for review/support; they do not imply misconduct." | |
| ) | |
| # Create downloadable Excel file and RETURN FILE PATH (Gradio expects a path) | |
| excel_bytes = _make_excel(student_out, section_out, stud_profile_show, sec_profile_show) | |
| tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") | |
| tmp.write(excel_bytes) | |
| tmp.flush() | |
| tmp.close() | |
| return summary, student_out, fig_student, stud_profile_show, section_out, fig_section, sec_profile_show, tmp.name | |
| with gr.Blocks(title="Marks Clustering (Student + Section)") as demo: | |
| gr.Markdown( | |
| """ | |
| # Marks Clustering Dashboard (Unsupervised) | |
| Upload your Excel/CSV marks file. | |
| This app produces **Student-level + Section-level clustering (3 clusters)** with **graphs and reasons**. | |
| **Output is for academic QA/moderation and student support** (no allegations). | |
| """.strip() | |
| ) | |
| file_in = gr.File(label="Upload Excel/CSV", file_types=[".xlsx", ".xls", ".csv"]) | |
| run_btn = gr.Button("Run Clustering", variant="primary") | |
| summary = gr.Textbox(label="Summary", lines=8) | |
| with gr.Tabs(): | |
| with gr.Tab("Student-Level"): | |
| student_df = gr.Dataframe(label="Student-Level Results (with reasons)", wrap=True) | |
| student_plot = gr.Plot(label="Student PCA Plot") | |
| student_profile = gr.Dataframe(label="Student Cluster Profile (avg %)", wrap=True) | |
| with gr.Tab("Section-Level"): | |
| section_df = gr.Dataframe(label="Section-Level Results (with reasons)", wrap=True) | |
| section_plot = gr.Plot(label="Section PCA Plot") | |
| section_profile = gr.Dataframe(label="Section Cluster Profile (avg %)", wrap=True) | |
| with gr.Tab("Download"): | |
| dl = gr.File(label="Download Excel Report") | |
| run_btn.click( | |
| run_analysis, | |
| inputs=[file_in], | |
| outputs=[summary, student_df, student_plot, student_profile, section_df, section_plot, section_profile, dl], | |
| ) | |
| if __name__ == "__main__": | |
| demo.launch() | |