Deevyankar committed on
Commit e414cdf · verified · 1 Parent(s): 91e93f9

Update app.py

Files changed (1)
  1. app.py +353 -542
app.py CHANGED
@@ -1,549 +1,360 @@
1
- import os
2
- import urllib.request
3
- import gzip
4
- import io
5
-
6
- import numpy as np
7
  import pandas as pd
8
- import networkx as nx
9
- from sklearn.cluster import KMeans
10
-
11
- import torch
12
- from torch import nn
13
- from torch.utils.data import TensorDataset, DataLoader
14
-
15
- import matplotlib.pyplot as plt
16
- import gradio as gr
17
-
18
- # -------------------------------------------------------
19
- # 1. Download and load SNAP Facebook combined graph
20
- # -------------------------------------------------------
21
-
22
- SNAP_URL = "https://snap.stanford.edu/data/facebook_combined.txt.gz"
23
- DATA_DIR = "data"
24
- os.makedirs(DATA_DIR, exist_ok=True)
25
- LOCAL_PATH = os.path.join(DATA_DIR, "facebook_combined.txt.gz")
26
-
27
- if not os.path.exists(LOCAL_PATH):
28
- print("Downloading SNAP Facebook dataset...")
29
- urllib.request.urlretrieve(SNAP_URL, LOCAL_PATH)
30
  else:
31
- print("Using cached SNAP dataset.")
32
-
33
- print("Loading graph...")
34
- with gzip.open(LOCAL_PATH, "rt") as f:
35
- G = nx.read_edgelist(f, nodetype=int)
36
-
37
- print(f"Graph loaded: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")
38
-
39
- # Ensure largest connected component (should already be connected in this dataset)
40
- if not nx.is_connected(G):
41
- largest_cc = max(nx.connected_components(G), key=len)
42
- G = G.subgraph(largest_cc).copy()
43
- print(f"After LCC: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")
44
-
45
- nodes = list(G.nodes())
46
- node_index = {n: i for i, n in enumerate(nodes)}
47
- N = len(nodes)
48
-
49
- # -------------------------------------------------------
50
- # 2. Real structural features from SNAP graph
51
- # -------------------------------------------------------
52
-
53
- # Degree
54
- deg = np.array([G.degree(n) for n in nodes], dtype=float)
55
-
56
- # Clustering coefficient
57
- cc_dict = nx.clustering(G)
58
- cc = np.array([cc_dict[n] for n in nodes], dtype=float)
59
-
60
- # Average neighbor degree
61
- avg_nd_dict = nx.average_neighbor_degree(G)
62
- avg_nd = np.array([avg_nd_dict[n] for n in nodes], dtype=float)
63
-
64
- # PageRank
65
- pr_dict = nx.pagerank(G, alpha=0.85)
66
- pr = np.array([pr_dict[n] for n in nodes], dtype=float)
67
-
68
- def minmax(x):
69
- x = np.asarray(x, dtype=float)
70
- return (x - x.min()) / (x.max() - x.min() + 1e-8)
71
-
72
- deg_norm = minmax(deg)
73
- cc_norm = minmax(cc)
74
- avg_nd_norm = minmax(avg_nd)
75
- pr_norm = minmax(pr)
76
-
77
- print("Sample structural features for first 5 nodes:")
78
- for i in range(5):
79
- print(
80
- nodes[i],
81
- "deg=", deg[i],
82
- "deg_norm=", round(deg_norm[i], 3),
83
- "cc_norm=", round(cc_norm[i], 3),
84
- "avg_nd_norm=", round(avg_nd_norm[i], 3),
85
- "pr_norm=", round(pr_norm[i], 3),
86
- )
87
-
88
- # -------------------------------------------------------
89
- # 3. Paper-style behavioural features (synthetic but graph-driven)
90
- # -------------------------------------------------------
91
-
92
- rng = np.random.default_rng(42)
93
-
94
- # Engagement: central users are more "engaged"
95
- engagement = 50 * (0.6 * deg_norm + 0.4 * avg_nd_norm) + rng.normal(0, 3, size=N)
96
- engagement = np.clip(engagement, 0, None)
97
- eng_norm = minmax(engagement)
98
-
99
- # Trust base: users with higher PageRank and clustering are more trusted
100
- trust_base = (pr_norm + cc_norm) / 2.0
101
-
102
- # Suspicious: high degree but low clustering and low PageRank
103
- suspicious_raw = deg_norm * (1.0 - cc_norm) * (1.0 - pr_norm)
104
- suspicious_raw += 0.1 * rng.random(N)
105
- susp_norm = minmax(suspicious_raw)
106
-
107
- # Activity regularity: more regular if clustering is high (stable community)
108
- activity_reg = cc_norm + rng.normal(0, 0.05, size=N)
109
- activity_reg = np.clip(activity_reg, 0.0, 1.0)
110
- act_norm = minmax(activity_reg)
111
-
112
- # Friend requests sent: more for high degree, but bounded
113
- sent_requests = rng.poisson(lam=2 + 15 * deg_norm)
114
- sent_requests = np.maximum(sent_requests, 1)
115
-
116
- # Acceptance probability: higher for trusted, lower for suspicious
117
- accepted_prob = 0.1 + 0.7 * ((trust_base + (1.0 - susp_norm)) / 2.0)
118
- accepted_prob = np.clip(accepted_prob, 0.0, 1.0)
119
- accepted_requests = rng.binomial(sent_requests, accepted_prob)
120
- friend_request_ratio = accepted_requests / (sent_requests + 1e-8)
121
- frr_norm = minmax(friend_request_ratio)
122
-
123
- # Mutual friends ratio (approx): we use clustering coefficient as a proxy
124
- # because high clustering means many mutual connections among friends.
125
- mutual_friends_ratio = cc_norm.copy()
126
- mfr_norm = minmax(mutual_friends_ratio)
127
-
128
- friends_norm = minmax(deg) # total friends ≈ degree
129
-
130
- # -------------------------------------------------------
131
- # 4. Build S, T, B scores (in spirit of your paper)
132
- # -------------------------------------------------------
133
-
134
- # S: social / structural (FRR, MFR, friends)
135
- S_score = (frr_norm + mfr_norm + friends_norm) / 3.0
136
-
137
- # T: trust (trust_base, FRR, inverse suspiciousness)
138
- T_score = (trust_base + frr_norm + (1.0 - susp_norm)) / 3.0
139
-
140
- # B: behaviour (engagement, regularity, suspiciousness)
141
- B_score = (eng_norm + act_norm + susp_norm) / 3.0
142
-
143
- # -------------------------------------------------------
144
- # 5. Fuse S, T, B with variance-based weights
145
- # -------------------------------------------------------
146
-
147
- varS = np.var(S_score)
148
- varT = np.var(T_score)
149
- varB = np.var(B_score)
150
- den = varS + varT + varB + 1e-8
151
- wS, wT, wB = varS / den, varT / den, varB / den
152
-
153
- F = np.vstack([
154
- wS * S_score,
155
- wT * T_score,
156
- wB * B_score
157
- ]).T # shape (N, 3)
158
-
159
- print("Fusion weights:", wS, wT, wB)
160
- print("F shape:", F.shape)
161
-
162
- # -------------------------------------------------------
163
- # 6. KMeans clustering -> pseudo labels
164
- # (0 = Trusted, 1 = Under Observation, 2 = Intruder)
165
- # -------------------------------------------------------
166
-
167
- kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
168
- cluster_raw = kmeans.fit_predict(F)
169
-
170
- cluster_means = []
171
- for c in range(3):
172
- cluster_means.append((c, T_score[cluster_raw == c].mean()))
173
- cluster_means_sorted = sorted(cluster_means, key=lambda x: x[1])
174
-
175
- label_map = {
176
- cluster_means_sorted[0][0]: 2, # lowest trust → Intruder
177
- cluster_means_sorted[1][0]: 1, # medium → Under Observation
178
- cluster_means_sorted[2][0]: 0 # highest → Trusted
179
- }
180
-
181
- cluster_labels = np.array([label_map[c] for c in cluster_raw], dtype=int)
182
-
183
- label_names = {
184
- 0: "Trusted",
185
- 1: "Under Observation",
186
- 2: "Intruder"
187
- }
188
-
189
- status_counts = np.bincount(cluster_labels, minlength=3)
190
-
191
- def make_status_bar_plot():
192
- fig, ax = plt.subplots()
193
- labels_txt = ["Trusted", "Under Observation", "Intruder"]
194
- ax.bar(labels_txt, status_counts)
195
- ax.set_ylabel("Number of users")
196
- ax.set_title("Global distribution of user statuses (SNAP graph)")
197
- fig.tight_layout()
198
- return fig
199
-
200
- # -------------------------------------------------------
201
- # 7. Train small MLP on fused features -> status
202
- # -------------------------------------------------------
203
-
204
- X = torch.tensor(F, dtype=torch.float32)
205
- y = torch.tensor(cluster_labels, dtype=torch.long)
206
-
207
- dataset = TensorDataset(X, y)
208
- loader = DataLoader(dataset, batch_size=128, shuffle=True)
209
-
210
- class MLPClassifier(nn.Module):
211
- def __init__(self, in_dim, hidden_dim=32, num_classes=3):
212
- super().__init__()
213
- self.net = nn.Sequential(
214
- nn.Linear(in_dim, hidden_dim),
215
- nn.ReLU(),
216
- nn.Linear(hidden_dim, hidden_dim),
217
- nn.ReLU(),
218
- nn.Linear(hidden_dim, num_classes)
219
- )
220
- def forward(self, x):
221
- return self.net(x)
222
-
223
- model = MLPClassifier(in_dim=3)
224
- criterion = nn.CrossEntropyLoss()
225
- optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
226
-
227
- for epoch in range(20):
228
- model.train()
229
- total_loss = 0.0
230
- for xb, yb in loader:
231
- optimizer.zero_grad()
232
- logits = model(xb)
233
- loss = criterion(logits, yb)
234
- loss.backward()
235
- optimizer.step()
236
- total_loss += loss.item() * xb.size(0)
237
- # optional print, can be commented on HF to reduce logs
238
- print(f"Epoch {epoch+1:02d} - loss = {total_loss / len(dataset):.4f}")
239
-
240
- model.eval()
241
- with torch.no_grad():
242
- preds = model(X).argmax(dim=1)
243
- acc = (preds == y).float().mean().item()
244
- print(f"Training accuracy vs pseudo-labels: {acc:.3f}")
245
-
246
- def predict_from_fused(S_val, T_val, B_val):
247
- vec3 = np.array([wS * S_val, wT * T_val, wB * B_val], dtype=np.float32)
248
- x = torch.tensor(vec3.reshape(1, -1), dtype=torch.float32)
249
- model.eval()
250
- with torch.no_grad():
251
- logits = model(x)
252
- probs = torch.softmax(logits, dim=1).cpu().numpy()[0]
253
- pred = int(np.argmax(probs))
254
- return pred, probs
255
-
256
- eng_min = engagement.min()
257
- eng_max = engagement.max()
258
-
259
- # -------------------------------------------------------
260
- # 8. Map UI sliders -> S/T/B (paper-style logic)
261
- # -------------------------------------------------------
262
-
263
- def build_scores_from_user_input(
264
- engagement_input,
265
- suspicious_input,
266
- activity_input,
267
- frr_input,
268
- mfr_input
269
- ):
270
- # Normalize engagement using dataset range
271
- eng_norm_ui = (engagement_input - eng_min) / (eng_max - eng_min + 1e-8)
272
- eng_norm_ui = float(np.clip(eng_norm_ui, 0.0, 1.0))
273
-
274
- susp_norm_ui = float(np.clip(suspicious_input, 0.0, 1.0))
275
- act_norm_ui = float(np.clip(activity_input, 0.0, 1.0))
276
- frr_norm_ui = float(np.clip(frr_input, 0.0, 1.0))
277
- mfr_norm_ui = float(np.clip(mfr_input, 0.0, 1.0))
278
-
279
- # Assume average number of friends ~ 0.5 normalized
280
- friends_norm_ui = 0.5
281
-
282
- # Trust estimate from engagement & suspiciousness
283
- trust_norm_ui = (eng_norm_ui + (1.0 - susp_norm_ui)) / 2.0
284
-
285
- # Construct S / T / B
286
- S_ui = (frr_norm_ui + mfr_norm_ui + friends_norm_ui) / 3.0
287
- T_ui = (trust_norm_ui + frr_norm_ui + (1.0 - susp_norm_ui)) / 3.0
288
- B_ui = (eng_norm_ui + act_norm_ui + susp_norm_ui) / 3.0
289
-
290
- return S_ui, T_ui, B_ui, eng_norm_ui, susp_norm_ui, act_norm_ui
291
-
292
- # -------------------------------------------------------
293
- # 9. Timeline (T1–T5) helpers
294
- # -------------------------------------------------------
295
-
296
- def make_timeline_plot(timeline_state):
297
- fig, ax = plt.subplots()
298
- if not timeline_state:
299
- ax.text(0.5, 0.5, "No timeline yet", ha="center", va="center")
300
- ax.set_axis_off()
301
- fig.tight_layout()
302
- return fig
303
-
304
- steps = [i + 1 for i in range(len(timeline_state))]
305
- trusted = [entry["probs"][0] for entry in timeline_state]
306
- obs = [entry["probs"][1] for entry in timeline_state]
307
- intr = [entry["probs"][2] for entry in timeline_state]
308
-
309
- ax.plot(steps, trusted, marker="o", label="Trusted")
310
- ax.plot(steps, obs, marker="o", label="Under Observation")
311
- ax.plot(steps, intr, marker="o", label="Intruder")
312
-
313
- ax.set_xticks(steps)
314
- ax.set_xlabel("Time step (T1–T5)")
315
- ax.set_ylabel("Probability")
316
- ax.set_ylim(0, 1)
317
- ax.set_title("User status probabilities over time")
318
- ax.legend()
319
- fig.tight_layout()
320
- return fig
321
-
322
- def simulate_week(
323
- engagement_input,
324
- suspicious_input,
325
- activity_input,
326
- frr_input,
327
- mfr_input,
328
- timeline_state
329
- ):
330
- if timeline_state is None:
331
- timeline_state = []
332
-
333
- S_ui, T_ui, B_ui, eng_n, susp_n, act_n = build_scores_from_user_input(
334
- engagement_input,
335
- suspicious_input,
336
- activity_input,
337
- frr_input,
338
- mfr_input
339
- )
340
 
341
- pred, probs = predict_from_fused(S_ui, T_ui, B_ui)
342
- status = label_names[pred]
343
-
344
- # Keep only last 5 time steps (T1–T5)
345
- if len(timeline_state) >= 5:
346
- timeline_state = timeline_state[1:]
347
- timeline_state.append({
348
- "status": status,
349
- "probs": probs.tolist(),
350
- "S": float(S_ui),
351
- "T": float(T_ui),
352
- "B": float(B_ui)
353
- })
354
-
355
- step_num = len(timeline_state)
356
-
357
- # Current week summary
358
- lines = []
359
- lines.append(f"### Current Time Step: T{step_num}")
360
- lines.append(f"**Predicted Status:** **{status}**")
361
- lines.append("")
362
- lines.append("**Probabilities:**")
363
- lines.append(f"- Trusted: {probs[0]:.2f}")
364
- lines.append(f"- Under Observation: {probs[1]:.2f}")
365
- lines.append(f"- Intruder: {probs[2]:.2f}")
366
- lines.append("")
367
- lines.append("**Aggregated scores (0–1):**")
368
- lines.append(f"- S (Social / Structural): `{S_ui:.2f}`")
369
- lines.append(f"- T (Trust): `{T_ui:.2f}`")
370
- lines.append(f"- B (Behaviour): `{B_ui:.2f}`")
371
- lines.append("")
372
- lines.append("**Inputs (normalized):**")
373
- lines.append(f"- Engagement: `{eng_n:.2f}`")
374
- lines.append(f"- Suspiciousness: `{susp_n:.2f}`")
375
- lines.append(f"- Activity regularity: `{act_n:.2f}`")
376
-
377
- current_md = "\n".join(lines)
378
-
379
- # Timeline text
380
- tl_lines = ["## Timeline (T1–T5)"]
381
- for i, entry in enumerate(timeline_state):
382
- p = entry["probs"]
383
- tl_lines.append(
384
- f"- **T{i+1}**: {entry['status']} | "
385
- f"Trusted={p[0]:.2f}, Obs={p[1]:.2f}, Intruder={p[2]:.2f}"
386
- )
387
- timeline_md = "\n".join(tl_lines)
388
-
389
- tl_fig = make_timeline_plot(timeline_state)
390
-
391
- return current_md, timeline_md, tl_fig, timeline_state
392
-
393
- def reset_timeline():
394
- empty_fig = make_timeline_plot([])
395
- return (
396
- "Timeline reset. Adjust sliders and click **Next week (T+1)** to start from T1.",
397
- "## Timeline (T1–T5)\n(No entries yet)",
398
- empty_fig,
399
- []
400
- )
401
 
402
- # -------------------------------------------------------
403
- # 10. Example table: real Trusted vs Intruder-like nodes
404
- # -------------------------------------------------------
405
-
406
- def build_example_table(n_per_class=5):
407
- rows = []
408
- for lbl in [0, 2]: # 0 = Trusted, 2 = Intruder
409
- idxs = np.where(cluster_labels == lbl)[0]
410
- if len(idxs) == 0:
411
- continue
412
- sel = rng.choice(idxs, size=min(n_per_class, len(idxs)), replace=False)
413
- tmp = pd.DataFrame({
414
- "NodeID": [nodes[i] for i in sel],
415
- "Status": [label_names[lbl]] * len(sel),
416
- "Degree": deg[sel],
417
- "Clustering": cc[sel],
418
- "S_score": S_score[sel],
419
- "T_score": T_score[sel],
420
- "B_score": B_score[sel]
421
- })
422
- rows.append(tmp)
423
- if rows:
424
- return pd.concat(rows, ignore_index=True)
425
  else:
426
- return pd.DataFrame(columns=[
427
- "NodeID", "Status", "Degree", "Clustering",
428
- "S_score", "T_score", "B_score"
429
- ])
430
-
431
- examples_df = build_example_table()
432
-
433
- def refresh_examples():
434
- return build_example_table()
435
-
436
- global_status_fig = make_status_bar_plot()
437
-
438
- # -------------------------------------------------------
439
- # 11. Gradio app
440
- # -------------------------------------------------------
441
-
442
- with gr.Blocks() as demo:
443
- gr.Markdown("# Trust-Based Intrusion Detection on SNAP Facebook Graph")
444
- gr.Markdown(
445
- "This demo uses the **SNAP Facebook combined graph** as a real online social network.\n\n"
446
- "- Structural features (degree, clustering, PageRank, neighbour degree) come from the real graph.\n"
447
- "- Behavioural features (engagement, suspiciousness, activity regularity, friend-request ratio, "
448
- "mutual-friends ratio) are generated **synthetically but guided by the graph structure**, following the "
449
- "spirit of your paper.\n\n"
450
- "We fuse these into **S (Social)**, **T (Trust)** and **B (Behaviour)** scores, cluster users into "
451
- "**Trusted / Under Observation / Intruder**, and train a small neural network to replicate this.\n\n"
452
- "**Use the sliders** to simulate how a user changes behaviour over time. Each click on "
453
- "**Next week (T+1)** advances the time step T1..T5 and updates the model's judgement."
454
- )
455
-
456
- with gr.Row():
457
- with gr.Column():
458
- gr.Markdown("### Behaviour Inputs (for one user)")
459
- engagement_slider = gr.Slider(
460
- minimum=float(eng_min),
461
- maximum=float(eng_max),
462
- value=float((eng_min + eng_max) / 2.0),
463
- step=1.0,
464
- label="Engagement level (synthetic, based on graph centrality)"
465
- )
466
- suspicious_slider = gr.Slider(
467
- minimum=0.0,
468
- maximum=1.0,
469
- value=0.2,
470
- step=0.01,
471
- label="Suspiciousness (0 = clean, 1 = very suspicious)"
472
- )
473
- activity_slider = gr.Slider(
474
- minimum=0.0,
475
- maximum=1.0,
476
- value=0.7,
477
- step=0.01,
478
- label="Activity regularity (1 = very regular, 0 = random)"
479
- )
480
- frr_slider = gr.Slider(
481
- minimum=0.0,
482
- maximum=1.0,
483
- value=0.8,
484
- step=0.01,
485
- label="Friend Request Ratio (accepted / sent)"
486
- )
487
- mfr_slider = gr.Slider(
488
- minimum=0.0,
489
- maximum=1.0,
490
- value=0.6,
491
- step=0.01,
492
- label="Mutual Friends Ratio (proxy)"
493
- )
494
-
495
- next_button = gr.Button("Next week (T+1)")
496
- reset_button = gr.Button("Reset timeline")
497
-
498
- with gr.Column():
499
- current_box = gr.Markdown(
500
- "Current time-step status will appear here after you click **Next week (T+1)**."
501
- )
502
- timeline_box = gr.Markdown(
503
- "## Timeline (T1–T5)\n(No entries yet)"
504
- )
505
- timeline_plot = gr.Plot(
506
- value=make_timeline_plot([]),
507
- label="Timeline probabilities (T1–T5)"
508
- )
509
-
510
- gr.Markdown("### Global Status Distribution on the SNAP Graph")
511
- status_plot = gr.Plot(value=global_status_fig)
512
-
513
- gr.Markdown("### Example Users (Real graph nodes: Trusted vs Intruder-like)")
514
- examples_table = gr.Dataframe(
515
- value=examples_df,
516
- label="Sample nodes from SNAP Facebook",
517
- interactive=False
518
- )
519
- refresh_button = gr.Button("Refresh examples")
520
-
521
- timeline_state = gr.State([])
522
-
523
- next_button.click(
524
- fn=simulate_week,
525
- inputs=[
526
- engagement_slider,
527
- suspicious_slider,
528
- activity_slider,
529
- frr_slider,
530
- mfr_slider,
531
- timeline_state
532
- ],
533
- outputs=[current_box, timeline_box, timeline_plot, timeline_state]
534
- )
535
-
536
- reset_button.click(
537
- fn=reset_timeline,
538
- inputs=None,
539
- outputs=[current_box, timeline_box, timeline_plot, timeline_state]
540
- )
541
-
542
- refresh_button.click(
543
- fn=refresh_examples,
544
- inputs=None,
545
- outputs=[examples_table]
546
  )
547
 
548
- if __name__ == "__main__":
549
- demo.launch()
1
+ # app.py
2
+ import streamlit as st
3
  import pandas as pd
4
+ import numpy as np
5
+ import plotly.express as px
6
+
7
+ st.set_page_config(page_title="Excel → Management Insights (Power BI style)", layout="wide")
8
+
9
+ st.title("📊 Excel → Interactive Management Dashboard (Power BI style)")
10
+ st.caption("Grade-based decision rule: **PASS if Grade ≥ C (including C, C+, B-, etc.)** and **FAIL if below C (C-, D, F, etc.)**. Marks thresholds are not used.")
11
+
12
+ # -----------------------------
13
+ # Grade logic (final rule: pass for C and above, fail for C- and below)
14
+ # -----------------------------
15
+ def grade_pass_fail(g):
16
+ if pd.isna(g):
17
+ return "Unknown"
18
+ g = str(g).strip().upper()
19
+
20
+ # Explicit FAIL
21
+ if g.startswith(("D", "E", "F")):
22
+ return "Fail"
23
+
24
+ # C- is FAIL, all other C variants are PASS
25
+ if g.startswith("C"):
26
+ if g == "C-" or g.startswith("C-"):
27
+ return "Fail"
28
+ return "Pass"
29
+
30
+ # A/B (with any +/-) are PASS
31
+ if g.startswith(("A", "B")):
32
+ return "Pass"
33
+
34
+ return "Unknown"
35
+
36
+ def pick_grade_column(df: pd.DataFrame) -> str:
37
+ # The grade is expected in the last column; prefer any column whose name contains "grade".
38
+ candidates = [c for c in df.columns if "grade" in str(c).lower()]
39
+ if candidates:
40
+ return candidates[-1]
41
+ return df.columns[-1]
42
+
43
+ def normalize_headers(df: pd.DataFrame) -> pd.DataFrame:
44
+ # Clean common trailing spaces
45
+ df = df.copy()
46
+ df.columns = [str(c).strip() for c in df.columns]
47
+ return df
48
+
49
+ def coerce_numeric(df: pd.DataFrame, cols):
50
+ for c in cols:
51
+ if c in df.columns:
52
+ df[c] = pd.to_numeric(df[c], errors="coerce")
53
+ return df
54
+
55
+ def describe_fail_reason(row, components):
56
+ # Human-readable reason (simple, management-friendly)
57
+ if row.get("PassFail") != "Fail":
58
+ return ""
59
+ hints = []
60
+ for c in components:
61
+ v = row.get(c)
62
+ if pd.notna(v):
63
+ # rough hinting only; thresholds are never used for the pass/fail decision
64
+ if c.lower().find("final") >= 0 and v < np.nanpercentile(components_df[c].dropna(), 25):
65
+ hints.append("Final exam is in the lower quartile")
66
+ if c.lower().find("lab") >= 0 and v < np.nanpercentile(components_df[c].dropna(), 25):
67
+ hints.append("Lab total is in the lower quartile")
68
+ if c.lower().find("mid") >= 0 and v < np.nanpercentile(components_df[c].dropna(), 25):
69
+ hints.append("Mid exam is in the lower quartile")
70
+ if c.lower().find("test") >= 0 and v < np.nanpercentile(components_df[c].dropna(), 25):
71
+ hints.append("Test score is in the lower quartile")
72
+ if not hints:
73
+ return "Grade below C (check component performance & attendance/assessment issues)."
74
+ return " | ".join(hints)
75
+
76
+ # -----------------------------
77
+ # Upload + read
78
+ # -----------------------------
79
+ uploaded = st.file_uploader("Upload Excel (.xlsx)", type=["xlsx"])
80
+
81
+ if uploaded is None:
82
+ st.info("Upload an Excel file to begin.")
83
+ st.stop()
84
+
85
+ xls = pd.ExcelFile(uploaded)
86
+ sheet = st.selectbox("Select sheet", xls.sheet_names, index=0)
87
+ raw = pd.read_excel(uploaded, sheet_name=sheet)
88
+ raw = normalize_headers(raw)
89
+
90
+ # Try to remove non-student rows (robust: keep rows with any numeric marks OR any grade-like text)
91
+ grade_col_name = pick_grade_column(raw)
92
+ tmp_grade = raw[grade_col_name].astype(str).str.strip()
93
+ grade_like = tmp_grade.str.match(r"^[A-Fa-f][\+\-]?$", na=False)
94
+
95
+ numeric_cols_guess = [c for c in raw.columns if c != grade_col_name]
96
+ numeric_signal = raw[numeric_cols_guess].apply(pd.to_numeric, errors="coerce").notna().sum(axis=1) > 0
97
+
98
+ df = raw[grade_like | numeric_signal].copy()
99
+
100
+ # Add Sno if exists, else create row id
101
+ sno_col = None
102
+ for c in df.columns:
103
+ if str(c).strip().lower() in ["sno", "sno.", "sr", "sr.", "id", "studentid", "student id"]:
104
+ sno_col = c
105
+ break
106
+ if sno_col is None:
107
+ df.insert(0, "Sno", range(1, len(df) + 1))
108
+ sno_col = "Sno"
109
+
110
+ # Grade column
111
+ df["Grade"] = df[grade_col_name].astype(str).str.strip().str.upper()
112
+ df["PassFail"] = df["Grade"].apply(grade_pass_fail)
113
+ df["Pass"] = df["PassFail"].eq("Pass")
114
+ df["Fail"] = df["PassFail"].eq("Fail")
115
+
116
+ # Identify likely mark columns (common names; if not found, pick numeric ones)
117
+ common_components = ["Test -1", "Test-1", "Test 1", "Mid Exam", "Mid", "Lab Total", "Final Exam", "Total"]
118
+ component_cols = [c for c in df.columns if c in common_components]
119
+ if not component_cols:
120
+ # fallback: all numeric columns except Sno
121
+ num_cols = df.columns[df.apply(lambda s: pd.to_numeric(s, errors="coerce").notna().mean() > 0.4)]
122
+ component_cols = [c for c in num_cols if c != sno_col]
123
+
124
+ # Coerce numerics (if present)
125
+ df = coerce_numeric(df, component_cols)
126
+
127
+ # Consistency score (std across available components)
128
+ if len(component_cols) >= 2:
129
+ df["Consistency_SD"] = df[component_cols].std(axis=1, skipna=True)
130
  else:
131
+ df["Consistency_SD"] = np.nan
132
 
133
+ # Global for hinting
134
+ components_df = df.copy()
135
 
136
+ # Optional "Fail reason" hint (for drilldown / risk view)
137
+ if component_cols:
138
+ df["FailReasonHint"] = df.apply(lambda r: describe_fail_reason(r, component_cols), axis=1)
139
+ else:
140
+ df["FailReasonHint"] = np.where(df["Fail"], "Grade below C.", "")
141
+
142
+ # -----------------------------
143
+ # Sidebar: "Power BI pages"
144
+ # -----------------------------
145
+ st.sidebar.header("Perspective")
146
+ view = st.sidebar.radio(
147
+ "Choose a view",
148
+ ["Executive (Management)", "Risk & Intervention", "Assessment Quality", "Student Drill-down", "Export for Power BI"],
149
+ index=0
150
+ )
151
+
152
+ st.sidebar.header("Filters")
153
+ pf = st.sidebar.multiselect("Pass/Fail", ["Pass", "Fail", "Unknown"], default=["Pass", "Fail", "Unknown"])
154
+ grade_unique = sorted([g for g in df["Grade"].dropna().unique()])
155
+ sel_grades = st.sidebar.multiselect("Grades", grade_unique, default=grade_unique)
156
+
157
+ filtered = df[df["PassFail"].isin(pf)]
158
+ filtered = filtered[filtered["Grade"].isin(sel_grades)]
159
+
160
+ # -----------------------------
161
+ # KPI Row
162
+ # -----------------------------
163
+ k1, k2, k3, k4, k5 = st.columns(5)
164
+ with k1: st.metric("Students", int(filtered.shape[0]))
165
+ with k2: st.metric("Pass", int(filtered["Pass"].sum()))
166
+ with k3: st.metric("Fail", int(filtered["Fail"].sum()))
167
+ with k4:
168
+ pr = (filtered["Pass"].mean() * 100) if filtered.shape[0] else 0
169
+ st.metric("Pass Rate", f"{pr:.1f}%")
170
+ with k5:
171
+ if "Total" in filtered.columns and pd.api.types.is_numeric_dtype(filtered["Total"]):
172
+ st.metric("Average Total", f"{filtered['Total'].mean():.2f}")
173
  else:
174
+ st.metric("Average Total", "β€”")
175
+
176
+ st.divider()
177
+
178
+ # -----------------------------
179
+ # Views
180
+ # -----------------------------
181
+ def executive_view(d):
182
+ left, right = st.columns([1, 1])
183
+
184
+ with left:
185
+ st.subheader("Grade Distribution")
186
+ grade_counts = d["Grade"].value_counts(dropna=False).reset_index()
187
+ grade_counts.columns = ["Grade", "Count"]
188
+ fig = px.bar(grade_counts, x="Grade", y="Count")
189
+ st.plotly_chart(fig, use_container_width=True)
190
+
191
+ with right:
192
+ st.subheader("Pass/Fail Distribution")
193
+ pf_counts = d["PassFail"].value_counts(dropna=False).reset_index()
194
+ pf_counts.columns = ["Status", "Count"]
195
+ fig = px.pie(pf_counts, names="Status", values="Count")
196
+ st.plotly_chart(fig, use_container_width=True)
197
+
198
+ st.subheader("Hidden Patterns (Quick Signals)")
199
+ c1, c2, c3 = st.columns(3)
200
+
201
+ # Pattern: Strong Lab but Fail (if lab exists)
202
+ if any("Lab" in c for c in component_cols):
203
+ lab_col = [c for c in component_cols if "Lab" in c][0]
204
+ strong_lab_fail = d[(d["Fail"]) & (d[lab_col].notna()) & (d[lab_col] >= d[lab_col].quantile(0.75))]
205
+ with c1:
206
+ st.metric("Fail with Strong Lab", int(strong_lab_fail.shape[0]))
207
+ else:
208
+ with c1:
209
+ st.metric("Fail with Strong Lab", "β€”")
210
+
211
+ # Pattern: Inconsistent high SD
212
+ if "Consistency_SD" in d.columns and d["Consistency_SD"].notna().any():
213
+ top_incons = d["Consistency_SD"].quantile(0.90)
214
+ with c2:
215
+ st.metric("High Inconsistency (Top 10%)", int((d["Consistency_SD"] >= top_incons).sum()))
216
+ else:
217
+ with c2:
218
+ st.metric("High Inconsistency (Top 10%)", "β€”")
219
+
220
+ # Pattern: Fail with good Total (if Total exists)
221
+ if "Total" in d.columns and pd.api.types.is_numeric_dtype(d["Total"]) and d["Total"].notna().any():
222
+ good_total_fail = d[(d["Fail"]) & (d["Total"] >= d["Total"].quantile(0.75))]
223
+ with c3:
224
+ st.metric("Fail with High Total", int(good_total_fail.shape[0]))
225
+ else:
226
+ with c3:
227
+ st.metric("Fail with High Total", "β€”")
228
+
229
+ if component_cols and "Total" in d.columns and pd.api.types.is_numeric_dtype(d["Total"]):
230
+ st.subheader("What Drives Total? (Correlation)")
231
+ corr_cols = [c for c in component_cols if c in d.columns] + ["Total"]
232
+ corr = d[corr_cols].corr(numeric_only=True)
233
+ fig = px.imshow(corr, text_auto=True, aspect="auto")
234
+ st.plotly_chart(fig, use_container_width=True)
235
+
236
+ def risk_view(d):
237
+ st.subheader("Fail List (Grade < C)")
238
+ fails = d[d["Fail"]].copy()
239
+
240
+ # Bucket: C- vs D/F etc.
241
+ fails["FailType"] = np.where(fails["Grade"].str.startswith("C-"), "C- (Borderline Fail)", "Below C")
242
+ bucket = fails["FailType"].value_counts().reset_index()
243
+ bucket.columns = ["Fail Type", "Count"]
244
+ c1, c2 = st.columns([1, 2])
245
+ with c1:
246
+ fig = px.bar(bucket, x="Fail Type", y="Count")
247
+ st.plotly_chart(fig, use_container_width=True)
248
+ with c2:
249
+ show_cols = [sno_col, "Grade", "PassFail"]
250
+ for c in ["Total"] + component_cols:
251
+ if c in fails.columns and c not in show_cols:
252
+ show_cols.append(c)
253
+ show_cols += ["FailReasonHint"]
254
+ st.dataframe(fails[show_cols].sort_values(by=["Grade", sno_col]), use_container_width=True, height=420)
255
+
256
+ st.subheader("Intervention Suggestions (Management-friendly)")
257
+ st.markdown(
258
+ """
259
+ - **Many C- failures** → run targeted revision + re-assessment readiness support (borderline group).
260
+ - **Failures concentrated with low Final** → strengthen exam preparation (mock exams + feedback).
261
+ - **Failures with strong Lab** → review exam alignment, study strategy, and assessment balance.
262
+ """
263
  )
264
 
265
+ def assessment_quality_view(d):
266
+ st.subheader("Assessment Component Overview")
267
+ if not component_cols:
268
+ st.warning("No numeric component columns detected. Add columns like Test/Mid/Lab/Final/Total for deeper assessment analysis.")
269
+ return
270
+
271
+ # Component distributions
272
+ comp = st.selectbox("Choose component", component_cols, index=0)
273
+ fig = px.histogram(d, x=comp, nbins=20)
274
+ st.plotly_chart(fig, use_container_width=True)
275
+
276
+ # Component vs Grade
277
+ st.subheader("Component vs Grade (Boxplot)")
278
+ fig = px.box(d, x="Grade", y=comp)
279
+ st.plotly_chart(fig, use_container_width=True)
280
+
281
+ # Zero / missing checks
282
+ st.subheader("Data Quality Flags")
283
+ flags = []
284
+ for c in component_cols:
285
+ series = d[c]
286
+ if pd.api.types.is_numeric_dtype(series):
287
+ missing = int(series.isna().sum())
288
+ zeros = int((series == 0).sum())
289
+ flags.append({"Component": c, "Missing": missing, "Zeros": zeros})
290
+ st.dataframe(pd.DataFrame(flags), use_container_width=True)
291
+
292
+ # If Total exists: correlation heatmap
293
+ if "Total" in d.columns and pd.api.types.is_numeric_dtype(d["Total"]):
294
+ st.subheader("Correlation Heatmap")
295
+ corr_cols = [c for c in component_cols if c in d.columns] + ["Total"]
296
+ corr = d[corr_cols].corr(numeric_only=True)
297
+ fig = px.imshow(corr, text_auto=True, aspect="auto")
298
+ st.plotly_chart(fig, use_container_width=True)
299
+
300
+ def student_drilldown_view(d):
301
+ st.subheader("Student Drill-down")
302
+ st.caption("Pick a student to view component breakdown and the grade-based decision.")
303
+ sid = st.selectbox("Select student (Sno)", sorted(d[sno_col].unique()))
304
+ row = d[d[sno_col] == sid].iloc[0]
305
+
306
+ c1, c2, c3 = st.columns(3)
307
+ with c1: st.metric("Grade", str(row.get("Grade", "β€”")))
308
+ with c2: st.metric("Status", str(row.get("PassFail", "β€”")))
309
+ with c3:
310
+ if "Total" in d.columns and pd.notna(row.get("Total", np.nan)):
311
+ st.metric("Total", f"{row['Total']:.2f}")
312
+ else:
313
+ st.metric("Total", "β€”")
314
+
315
+ st.write("**Reason (simple hint):**", row.get("FailReasonHint", ""))
316
+
317
+ # Component bar
318
+ if component_cols:
319
+ comp_vals = {c: row.get(c) for c in component_cols if c in d.columns}
320
+ comp_df = pd.DataFrame({"Component": list(comp_vals.keys()), "Score": list(comp_vals.values())})
321
+ fig = px.bar(comp_df, x="Component", y="Score")
322
+ st.plotly_chart(fig, use_container_width=True)
323
+
324
+ st.subheader("Raw record")
325
+ st.dataframe(pd.DataFrame(row).T, use_container_width=True)
326
+
327
+ def export_view(d):
328
+ st.subheader("Export for Power BI")
329
+ st.caption("Download cleaned data with the computed PassFail fields. Load into Power BI (Get Data β†’ Text/CSV).")
330
+
331
+ clean_csv = d.to_csv(index=False).encode("utf-8")
332
+ st.download_button("⬇️ Download Cleaned Data (CSV)", clean_csv, file_name="cleaned_marks_with_passfail.csv", mime="text/csv")
333
+
334
+ st.subheader("Recommended Power BI Measures (DAX)")
335
+ st.code(r"""
336
+ Pass Count = CALCULATE(COUNTROWS(cleaned_marks), cleaned_marks[PassFail] = "Pass")
337
+ Fail Count = CALCULATE(COUNTROWS(cleaned_marks), cleaned_marks[PassFail] = "Fail")
338
+ Pass Rate % = DIVIDE([Pass Count], COUNTROWS(cleaned_marks))
339
+ """, language="text")
340
+
341
+ st.subheader("Summary Tables")
342
+ grade_summary = d["Grade"].value_counts(dropna=False).reset_index()
343
+ grade_summary.columns = ["Grade", "Count"]
344
+ st.dataframe(grade_summary, use_container_width=True)
345
+
346
+ pf_summary = d["PassFail"].value_counts(dropna=False).reset_index()
347
+ pf_summary.columns = ["PassFail", "Count"]
348
+ st.dataframe(pf_summary, use_container_width=True)
349
+
350
+ # Render selected view
351
+ if view == "Executive (Management)":
352
+ executive_view(filtered)
353
+ elif view == "Risk & Intervention":
354
+ risk_view(filtered)
355
+ elif view == "Assessment Quality":
356
+ assessment_quality_view(filtered)
357
+ elif view == "Student Drill-down":
358
+ student_drilldown_view(filtered)
359
+ else:
360
+ export_view(filtered)
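
As a quick sanity check of the grade-based decision rule described in the caption above, a minimal standalone sketch; the function body mirrors grade_pass_fail from the new app.py, and the sample grades are purely illustrative:

import pandas as pd

def grade_pass_fail(g):
    # Same rule as the new app.py: C and above pass, C- and below fail.
    if pd.isna(g):
        return "Unknown"
    g = str(g).strip().upper()
    if g.startswith(("D", "E", "F")):
        return "Fail"
    if g.startswith("C"):
        return "Fail" if g.startswith("C-") else "Pass"
    if g.startswith(("A", "B")):
        return "Pass"
    return "Unknown"

for sample in ["A", "B-", "C+", "C", "C-", "D", "F", None]:
    print(sample, "->", grade_pass_fail(sample))
# Expected output: Pass for A, B-, C+, C; Fail for C-, D, F; Unknown for None.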