Spaces:

HarshitaSuri
/

StudentDropoutPrediction

Sleeping

App Files Files Community

HarshitaSuri commited on Sep 11, 2025

Commit

50c55fd

verified ·

1 Parent(s): 42beb12

Update app.py

Browse files

Files changed (1) hide show

app.py +319 -150

app.py CHANGED Viewed

@@ -1,171 +1,340 @@
-# app.py
 import pandas as pd
 import numpy as np
 import gradio as gr
-from sklearn.ensemble import RandomForestClassifier
-import joblib
-import os
-# ---------------- CONFIG ----------------
-MODEL_PATH = "rf_balanced_model.pkl"
-REPORT_PATH = "risk_report.csv"
-# ---------------- PIPELINE HELPERS ----------------
-def preprocess(df):
-    """Clean Fees Paid column and create proxy Fail label"""
-    data = df.copy()
-    fees_map = {"yes": 1, "y": 1, "paid": 1, "true": 1,
-                "no": 0, "n": 0, "false": 0}
-    if "Fees Paid" in data.columns:
-        data["Fees_Paid_Flag"] = (
-            data["Fees Paid"].astype(str).str.strip().str.lower().map(fees_map).fillna(0).astype(int)
-        )
-    return data
-def attendance_risk(att):
-    if att < 60: return 1.0
-    elif att < 70: return 0.7
-    elif att < 80: return 0.4
-    elif att < 85: return 0.2
-    else: return 0.0
-def combine_scores(ml_prob, att_risk, mode="max", ml_w=0.6, att_w=0.4):
-    if mode == "max":
-        return np.maximum(ml_prob, att_risk)
-    else:
-        return (ml_prob * ml_w) + (att_risk * att_w)
-def label_from_score(score):
-    if score >= 0.6: return "High"
-    elif score >= 0.4: return "Medium"
-    else: return "Low"
-def explain_row(row):
-    reasons = []
-    if row.get("Attendance_Risk", 0) >= 0.7:
-        reasons.append(f"Low attendance ({row['Attendance (%)']}%)")
-    if row.get("ML_Fail_Prob", 0) >= 0.6:
-        reasons.append(f"ML_prob high ({row['ML_Fail_Prob']:.2f})")
-    if row.get("Fees_Paid_Flag", 1) == 0:
-        reasons.append("Fees unpaid")
-    return "; ".join(reasons) if reasons else "None"
-# ---------------- MERGE SOURCES ----------------
 def normalise_colnames(df):
     df = df.copy()
-    df.columns = [c.strip().title().replace("Roll No", "Roll Number") for c in df.columns]
-    return df
 def merge_sources(att_df, test_df, fee_df):
-    att = normalise_colnames(att_df) if att_df is not None else None
-    test = normalise_colnames(test_df) if test_df is not None else None
-    fee = normalise_colnames(fee_df) if fee_df is not None else None
-    key = "Roll Number" if (
-        (att is not None and "Roll Number" in att.columns) or
-        (test is not None and "Roll Number" in test.columns) or
-        (fee is not None and "Roll Number" in fee.columns)
-    ) else "Name"
-    dfs = [d for d in [att, test, fee] if d is not None]
     if not dfs:
         return pd.DataFrame()
     merged = dfs[0]
     for d in dfs[1:]:
-        merged = pd.merge(merged, d, on=key, how="outer", suffixes=("", "_dup"))
-        merged = merged.loc[:, ~merged.columns.duplicated()]
     return merged
-# ---------------- PREDICTION FUNCTIONS ----------------
-def load_model():
-    if os.path.exists(MODEL_PATH):
-        return joblib.load(MODEL_PATH)
-    return RandomForestClassifier().fit([[80, 1]], [0])  # dummy
-model = load_model()
-def predict_single(name, roll, attendance, marks, fees_paid):
-    df = pd.DataFrame([{
-        "Name": name,
-        "Roll Number": roll,
-        "Attendance (%)": attendance,
-        "Marks (%)": marks,
-        "Fees Paid": fees_paid
-    }])
-    df = preprocess(df)
-    X = df[["Attendance (%)", "Fees_Paid_Flag"]].values
-    prob = model.predict_proba(X)[:,1][0]
-    att_r = attendance_risk(attendance)
-    combined = combine_scores(prob, att_r)
-    label = label_from_score(combined)
-    reason = explain_row({
-        "Attendance_Risk": att_r,
-        "Attendance (%)": attendance,
-        "ML_Fail_Prob": prob,
-        "Fees_Paid_Flag": df["Fees_Paid_Flag"].iloc[0]
-    })
-    return f"Risk: {label}\nReasons: {reason}"
-def process_and_report(att_path, test_path, fee_path):
-    att_df = pd.read_excel(att_path.name) if att_path else None
-    test_df = pd.read_excel(test_path.name) if test_path else None
-    fee_df = pd.read_excel(fee_path.name) if fee_path else None
     merged = merge_sources(att_df, test_df, fee_df)
-    merged = preprocess(merged)
-    if "Attendance (%)" not in merged or "Fees_Paid_Flag" not in merged:
-        return pd.DataFrame(), None
-    X = merged[["Attendance (%)", "Fees_Paid_Flag"]].values
-    merged["ML_Fail_Prob"] = model.predict_proba(X)[:,1]
-    merged["Attendance_Risk"] = merged["Attendance (%)"].apply(attendance_risk)
-    merged["Combined_Risk_Score"] = combine_scores(
-        merged["ML_Fail_Prob"].values,
-        merged["Attendance_Risk"].values
-    )
-    merged["Risk_Label"] = merged["Combined_Risk_Score"].apply(label_from_score)
-    merged["Reason"] = merged.apply(explain_row, axis=1)
-    cols = ["Name", "Roll Number", "Attendance (%)", "Marks (%)", "Fees Paid",
-            "ML_Fail_Prob", "Attendance_Risk", "Combined_Risk_Score", "Risk_Label", "Reason"]
-    merged.to_csv(REPORT_PATH, index=False)
-    return merged[cols], REPORT_PATH
-# ---------------- GRADIO APP ----------------
-with gr.Blocks(title="Student Dropout Prediction") as demo:
-    gr.Markdown("## 🎓 Student Dropout Prediction Dashboard")
-    with gr.Tab("Single Student"):
-        name = gr.Textbox(label="Name")
-        roll = gr.Textbox(label="Roll Number")
-        att = gr.Slider(0, 100, value=75, label="Attendance (%)")
-        marks = gr.Slider(0, 100, value=60, label="Marks (%)")
-        fees = gr.Dropdown(["Yes", "No"], label="Fees Paid?")
-        predict_btn = gr.Button("Predict Risk")
-        result = gr.Textbox(label="Prediction Result")
-        predict_btn.click(
-            fn=predict_single,
-            inputs=[name, roll, att, marks, fees],
-            outputs=result
-        )
-    with gr.Tab("Bulk Dashboard"):
-        upload_att = gr.File(label="Upload Attendance", type="file")
-        upload_test = gr.File(label="Upload Test Results", type="file")
-        upload_fee = gr.File(label="Upload Fees", type="file")
-        run_btn = gr.Button("Run Analysis")
-        output_table = gr.Dataframe(label="Risk Report", wrap=True)
-        output_file = gr.File(label="Download CSV Report")
-        run_btn.click(
-            fn=process_and_report,
-            inputs=[upload_att, upload_test, upload_fee],
-            outputs=[output_table, output_file]
         )
-if __name__ == "__main__":
-    demo.launch()

+# app.py (improved)
+# Requirements: pandas, numpy, gradio, matplotlib, openpyxl
+import os, io, math
+from datetime import datetime
 import pandas as pd
 import numpy as np
 import gradio as gr
+import matplotlib.pyplot as plt
+import smtplib
+from email.message import EmailMessage
+# ---------------- config defaults ----------------
+DEFAULT_ATTENDANCE_THRESHOLD = 75.0
+DEFAULT_ATTENDANCE_HIGH_RISK = 60.0
+DEFAULT_TEST_DECLINE_PERCENT = 10.0
+DEFAULT_MAX_ATTEMPTS = 3
+DEFAULT_HIGH_CUT = 0.6
+DEFAULT_MED_CUT = 0.4
+# ---------------- utils (kept/cleaned) ----------------
+def load_df_from_file(f):
+    if f is None:
+        return None
+    try:
+        name = getattr(f, "name", "")
+        if str(name).lower().endswith((".xls", ".xlsx")):
+            return pd.read_excel(f)
+        else:
+            return pd.read_csv(f)
+    except Exception:
+        try:
+            f.seek(0)
+            return pd.read_csv(f)
+        except Exception as e:
+            raise RuntimeError(f"Could not read file: {e}")
 def normalise_colnames(df):
+    if df is None:
+        return None
     df = df.copy()
+    df.columns = [c.strip() for c in df.columns]
+    colmap = {}
+    for c in df.columns:
+        lc = c.lower().replace(" ", "").replace("_", "")
+        if "roll" in lc and ("no" in lc or "number" in lc or "id" in lc):
+            colmap[c] = "Roll Number"
+        elif "name" == lc or "studentname" == lc or lc == "student":
+            colmap[c] = "Name"
+        elif "attendance" in lc:
+            colmap[c] = "Attendance (%)"
+        elif "mark" in lc or "score" in lc or "percentage" in lc:
+            colmap[c] = "Marks (%)"
+        elif "fee" in lc:
+            colmap[c] = "Fees Paid"
+        elif "attempt" in lc:
+            colmap[c] = "Attempts"
+        elif any(k in lc for k in ("test","exam","mid","quiz","assessment")):
+            colmap[c] = c  # preserve test columns
+    return df.rename(columns=colmap)
 def merge_sources(att_df, test_df, fee_df):
+    att = normalise_colnames(att_df)
+    test = normalise_colnames(test_df)
+    fee = normalise_colnames(fee_df)
+    # choose merge key
+    key = None
+    for d in (att, test, fee):
+        if d is not None and 'Roll Number' in d.columns:
+            key = 'Roll Number'
+            break
+    if key is None:
+        key = 'Name'
+    dfs = []
+    for d in (att, test, fee):
+        if d is None:
+            continue
+        if key not in d.columns:
+            d[key] = np.nan
+        dfs.append(d)
     if not dfs:
         return pd.DataFrame()
     merged = dfs[0]
     for d in dfs[1:]:
+        merged = pd.merge(merged, d, on=key, how='outer', suffixes=(False, False))
     return merged
+def parse_test_scores(df):
+    if df is None:
+        return []
+    test_cols = [c for c in df.columns if any(k in c.lower() for k in ("test","exam","mid","quiz","assessment","score")) and c not in ["Marks (%)", "Attendance (%)"]]
+    if "Marks (%)" in df.columns:
+        test_cols.append("Marks (%)")
+    return test_cols
+def attendance_risk(att, threshold, high_risk):
+    try:
+        att = float(att)
+    except:
+        return 0.0
+    if math.isnan(att):
+        return 0.0
+    if att < high_risk:
+        return 1.0
+    if att < threshold:
+        span = max(1.0, threshold - high_risk)
+        return (threshold - att) / span * 0.9
+    return 0.0
+def test_decline_risk(row, test_cols, decline_pct):
+    scores = []
+    for c in test_cols:
+        v = row.get(c, None)
+        try:
+            scores.append(float(v))
+        except:
+            continue
+    if len(scores) < 2:
+        return 0.0
+    latest, prev = scores[-1], scores[-2]
+    if prev == 0:
+        return 0.0
+    drop = (prev - latest) / prev * 100.0
+    if drop >= decline_pct:
+        return min(1.0, drop / (2 * decline_pct))
+    return 0.0
+def attempts_risk(attempts_val, max_attempts):
+    try:
+        a = int(attempts_val)
+    except:
+        return 0.0
+    if a >= max_attempts:
+        return 1.0
+    if a == 0:
+        return 0.0
+    return a / max(1, max_attempts)
+def combine_signals(signals, mode="max", weights=None):
+    arr = np.array(signals)
+    if mode == "max":
+        return np.max(arr, axis=0)
+    elif mode == "weighted" and weights is not None:
+        w = np.array(weights)
+        w = w / w.sum()
+        return (arr * w[:, None]).sum(axis=0)
+    return np.mean(arr, axis=0)
+def label_from_score(score, high_cut=DEFAULT_HIGH_CUT, med_cut=DEFAULT_MED_CUT):
+    if score >= high_cut:
+        return "High"
+    elif score >= med_cut:
+        return "Medium"
+    else:
+        return "Low"
+def send_email_notification(to_emails, subject, body, smtp_host, smtp_port, smtp_user, smtp_pass):
+    try:
+        msg = EmailMessage()
+        msg["Subject"] = subject
+        msg["From"] = smtp_user
+        msg["To"] = ", ".join(to_emails)
+        msg.set_content(body)
+        with smtplib.SMTP(smtp_host, smtp_port, timeout=15) as s:
+            s.starttls()
+            s.login(smtp_user, smtp_pass)
+            s.send_message(msg)
+        return True, "Sent"
+    except Exception as e:
+        return False, str(e)
+# ---------------- main processing ----------------
+def process_and_report(att_file, test_file, fee_file,
+                       attendance_threshold=DEFAULT_ATTENDANCE_THRESHOLD,
+                       attendance_high_risk=DEFAULT_ATTENDANCE_HIGH_RISK,
+                       decline_pct=DEFAULT_TEST_DECLINE_PERCENT,
+                       max_attempts=DEFAULT_MAX_ATTEMPTS,
+                       combine_mode="max",
+                       weight_att=0.5, weight_test=0.3, weight_attempt=0.2,
+                       notify=False, notify_threshold="High", notify_emails="", smtp_overrides=None):
+    # load files
+    try:
+        att_df = load_df_from_file(att_file) if att_file else None
+        test_df = load_df_from_file(test_file) if test_file else None
+        fee_df = load_df_from_file(fee_file) if fee_file else None
+    except Exception as e:
+        return pd.DataFrame(), {"error": str(e)}
     merged = merge_sources(att_df, test_df, fee_df)
+    if merged.empty:
+        return pd.DataFrame(), {"error": "No data loaded. Upload at least one sheet."}
+    merged = merged.reset_index(drop=True)
+    merged['Attendance (%)'] = merged.get('Attendance (%)', np.nan)
+    merged['Fees Paid'] = merged.get('Fees Paid', merged.get('FeesPaid', merged.get('Fees', np.nan)))
+    test_cols = parse_test_scores(merged)
+    n = len(merged)
+    att_risks = np.zeros(n); test_risks = np.zeros(n); attempt_risks = np.zeros(n)
+    for i, row in merged.iterrows():
+        att_risks[i] = attendance_risk(row.get('Attendance (%)', np.nan), attendance_threshold, attendance_high_risk)
+        test_risks[i] = test_decline_risk(row, test_cols, decline_pct)
+        attempt_risks[i] = attempts_risk(row.get('Attempts', 0), max_attempts)
+    merged['Attendance_Risk'] = att_risks
+    merged['Test_Decline_Risk'] = test_risks
+    merged['Attempts_Risk'] = attempt_risks
+    signals = [att_risks, test_risks, attempt_risks]
+    if combine_mode == "max":
+        combined = combine_signals(signals, mode="max")
+    else:
+        weights = [weight_att, weight_test, weight_attempt]
+        combined = combine_signals(signals, mode="weighted", weights=weights)
+    merged['Combined_Risk_Score'] = combined
+    merged['Risk_Label'] = merged['Combined_Risk_Score'].apply(label_from_score)
+    reasons = []
+    for i, row in merged.iterrows():
+        r = []
+        if row['Attendance_Risk'] >= 0.7:
+            r.append(f"Low attendance ({row.get('Attendance (%)', 'NA')}%)")
+        elif row['Attendance_Risk'] > 0:
+            r.append(f"Attendance below threshold ({row.get('Attendance (%)', 'NA')}%)")
+        if row['Test_Decline_Risk'] > 0:
+            r.append("Recent test decline")
+        if row['Attempts_Risk'] > 0:
+            r.append(f"High attempts ({row.get('Attempts','NA')})")
+        if str(row.get('Fees Paid','')).strip().lower() in ("no","n","false","0","unpaid"):
+            r.append("Fees unpaid")
+        reasons.append("; ".join(r) if r else "None")
+    merged['Flag_Reason'] = reasons
+    summary = merged['Risk_Label'].value_counts().to_dict()
+    notif_result = None
+    if notify:
+        notify_mask = merged['Risk_Label'] == notify_threshold
+        notify_rows = merged[notify_mask]
+        smtp = smtp_overrides or {
+            "host": os.environ.get("SMTP_HOST"),
+            "port": int(os.environ.get("SMTP_PORT", 587)),
+            "user": os.environ.get("SMTP_USER"),
+            "pass": os.environ.get("SMTP_PASS")
+        }
+        if not notify_rows.empty and smtp["user"] and smtp["pass"] and smtp["host"]:
+            emails = [e.strip() for e in str(notify_emails).split(",") if e.strip()]
+            subject = f"[Early Warning] {len(notify_rows)} students flagged {notify_threshold}"
+            body_lines = ["Students flagged:\n"]
+            for _, r in notify_rows.iterrows():
+                body_lines.append(f"{r.get('Name','')}\t{r.get('Roll Number','')}\t{r.get('Risk_Label')}\tReasons: {r.get('Flag_Reason')}")
+            ok, msg = send_email_notification(emails, subject, "\n".join(body_lines), smtp["host"], smtp["port"], smtp["user"], smtp["pass"])
+            notif_result = (ok, msg)
+        else:
+            notif_result = (False, "Notification missing SMTP settings or no flagged students")
+    return merged, {"summary": summary, "notify_result": notif_result}
+# --------------- UI ----------------
+def build_plot(df):
+    if df.empty:
+        return None
+    counts = df['Risk_Label'].value_counts().reindex(['High','Medium','Low']).fillna(0)
+    fig, ax = plt.subplots(figsize=(6,3))
+    ax.bar(counts.index, counts.values)
+    ax.set_title("Risk distribution")
+    ax.set_ylabel("Number of students")
+    plt.tight_layout()
+    return fig
+def df_to_colored_html(df):
+    if df.empty:
+        return "<p>No data</p>"
+    df = df.copy()
+    # show a few important cols if available
+    cols = [c for c in ['Roll Number','Name','Attendance (%)','Marks (%)','Fees Paid','Combined_Risk_Score','Risk_Label','Flag_Reason'] if c in df.columns]
+    df = df[cols]
+    def row_style(r):
+        label = r.get("Risk_Label","Low")
+        if label=="High":
+            return 'background:#ffcccc'  # pale red
+        if label=="Medium":
+            return 'background:#fff2cc'  # pale orange
+        return ''
+    styled = df.style.apply(lambda r: [row_style(r)]*len(r), axis=1)
+    return styled.hide_index().to_html()
+with gr.Blocks() as demo:
+    gr.Markdown("## Student Early-Warning Dashboard")
+    with gr.Row():
+        with gr.Column():
+            att_file = gr.File(label="Attendance", file_types=['.csv','.xlsx'])
+            test_file = gr.File(label="Tests", file_types=['.csv','.xlsx'])
+            fee_file = gr.File(label="Fees", file_types=['.csv','.xlsx'])
+            run_btn = gr.Button("Run")
+            download = gr.File()
+        with gr.Column():
+            att_thresh = gr.Slider(50,100,value=DEFAULT_ATTENDANCE_THRESHOLD,label="Attendance threshold")
+            att_high = gr.Slider(20,80,value=DEFAULT_ATTENDANCE_HIGH_RISK,label="High-risk attendance cutoff")
+            decline_pct = gr.Slider(1,50,value=DEFAULT_TEST_DECLINE_PERCENT,label="Test decline % to flag")
+            max_attempts = gr.Number(value=DEFAULT_MAX_ATTEMPTS,label="Max attempts before flag",precision=0)
+            combine_mode = gr.Radio(["max","weighted"],value="max",label="Combine mode")
+            weight_att = gr.Slider(0.0,1.0,value=0.5,label="Weight: attendance (only for weighted)")
+            weight_test = gr.Slider(0.0,1.0,value=0.3,label="Weight: test decline")
+            weight_attempt = gr.Slider(0.0,1.0,value=0.2,label="Weight: attempts")
+            notify = gr.Checkbox(False,label="Send email notifications to mentors (uses SMTP env vars or enter below)")
+            notify_emails = gr.Textbox(label="Notify emails (comma-separated)")
+            smtp_host = gr.Textbox(label="SMTP host (optional override)")
+            smtp_port = gr.Number(value=587,label="SMTP port (optional override)",precision=0)
+            smtp_user = gr.Textbox(label="SMTP user (optional override)")
+            smtp_pass = gr.Textbox(type="password", label="SMTP pass (optional override)")
+    result_html = gr.HTML()
+    risk_plot = gr.Plot()
+    summary_box = gr.Textbox()
+    def on_run(att_file_, test_file_, fee_file_,
+               att_thresh_, att_high_, decline_pct_, max_attempts_,
+               combine_mode_, weight_att_, weight_test_, weight_attempt_,
+               notify_, notify_emails_, smtp_host_, smtp_port_, smtp_user_, smtp_pass_):
+        smtp_overrides = None
+        if smtp_user_ and smtp_pass_ and smtp_host_:
+            smtp_overrides = {"host": smtp_host_, "port": int(smtp_port_), "user": smtp_user_, "pass": smtp_pass_}
+        df, meta = process_and_report(
+            att_file=att_file_, test_file=test_file_, fee_file=fee_file_,
+            attendance_threshold=float(att_thresh_), attendance_high_risk=float(att_high_),
+            decline_pct=float(decline_pct_), max_attempts=int(max_attempts_ or DEFAULT_MAX_ATTEMPTS),
+            combine_mode=combine_mode_, weight_att=float(weight_att_), weight_test=float(weight_test_), weight_attempt=float(weight_attempt_),
+            notify=notify_, notify_threshold="High", notify_emails=notify_emails_, smtp_overrides=smtp_overrides
         )
+        if isinstance(meta, dict) and meta.get("error"):
+            return f"<pre style='color:red'>{meta['error']}</pre>", None, str(meta)
+        html = df_to_colored_html(df)
+        fig = build_plot(df)
+        # prepare CSV bytes for download
+        csv_bytes = df.to_csv(index=False).encode('utf-8')
+        file_obj = io.BytesIO(csv_bytes)
+        file_obj.name = f"risk_report_{datetime.utcnow().strftime('%Y%m%dT%H%M%SZ')}.csv"
+        return html, fig, str(meta), file_obj
+    run_btn.click(fn=on_run, inputs=[att_file, test_file, fee_file,
+                                     att_thresh, att_high, decline_pct, max_attempts,
+                                     combine_mode, weight_att, weight_test, weight_attempt,
+                                     notify, notify_emails, smtp_host, smtp_port, smtp_user, smtp_pass],
+                  outputs=[result_html, risk_plot, summary_box, download])
+demo.launch()