Deevyankar committed on
Commit e414cdf · verified · 1 Parent(s): 91e93f9

Update app.py

Files changed (1)
  1. app.py +353 -542
app.py CHANGED
@@ -1,549 +1,360 @@
1
- import os
2
- import urllib.request
3
- import gzip
4
- import io
5
-
6
- import numpy as np
7
  import pandas as pd
8
- import networkx as nx
9
- from sklearn.cluster import KMeans
10
-
11
- import torch
12
- from torch import nn
13
- from torch.utils.data import TensorDataset, DataLoader
14
-
15
- import matplotlib.pyplot as plt
16
- import gradio as gr
17
-
18
- # -------------------------------------------------------
19
- # 1. Download and load SNAP Facebook combined graph
20
- # -------------------------------------------------------
21
-
22
- SNAP_URL = "https://snap.stanford.edu/data/facebook_combined.txt.gz"
23
- DATA_DIR = "data"
24
- os.makedirs(DATA_DIR, exist_ok=True)
25
- LOCAL_PATH = os.path.join(DATA_DIR, "facebook_combined.txt.gz")
26
-
27
- if not os.path.exists(LOCAL_PATH):
28
- print("Downloading SNAP Facebook dataset...")
29
- urllib.request.urlretrieve(SNAP_URL, LOCAL_PATH)
30
  else:
31
- print("Using cached SNAP dataset.")
32
-
33
- print("Loading graph...")
34
- with gzip.open(LOCAL_PATH, "rt") as f:
35
- G = nx.read_edgelist(f, nodetype=int)
36
-
37
- print(f"Graph loaded: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")
38
-
39
- # Ensure largest connected component (should already be connected in this dataset)
40
- if not nx.is_connected(G):
41
- largest_cc = max(nx.connected_components(G), key=len)
42
- G = G.subgraph(largest_cc).copy()
43
- print(f"After LCC: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")
44
-
45
- nodes = list(G.nodes())
46
- node_index = {n: i for i, n in enumerate(nodes)}
47
- N = len(nodes)
48
-
49
- # -------------------------------------------------------
50
- # 2. Real structural features from SNAP graph
51
- # -------------------------------------------------------
52
-
53
- # Degree
54
- deg = np.array([G.degree(n) for n in nodes], dtype=float)
55
-
56
- # Clustering coefficient
57
- cc_dict = nx.clustering(G)
58
- cc = np.array([cc_dict[n] for n in nodes], dtype=float)
59
-
60
- # Average neighbor degree
61
- avg_nd_dict = nx.average_neighbor_degree(G)
62
- avg_nd = np.array([avg_nd_dict[n] for n in nodes], dtype=float)
63
-
64
- # PageRank
65
- pr_dict = nx.pagerank(G, alpha=0.85)
66
- pr = np.array([pr_dict[n] for n in nodes], dtype=float)
67
-
68
- def minmax(x):
69
- x = np.asarray(x, dtype=float)
70
- return (x - x.min()) / (x.max() - x.min() + 1e-8)
71
-
72
- deg_norm = minmax(deg)
73
- cc_norm = minmax(cc)
74
- avg_nd_norm = minmax(avg_nd)
75
- pr_norm = minmax(pr)
76
-
77
- print("Sample structural features for first 5 nodes:")
78
- for i in range(5):
79
- print(
80
- nodes[i],
81
- "deg=", deg[i],
82
- "deg_norm=", round(deg_norm[i], 3),
83
- "cc_norm=", round(cc_norm[i], 3),
84
- "avg_nd_norm=", round(avg_nd_norm[i], 3),
85
- "pr_norm=", round(pr_norm[i], 3),
86
- )
87
-
88
- # -------------------------------------------------------
89
- # 3. Paper-style behavioural features (synthetic but graph-driven)
90
- # -------------------------------------------------------
91
-
92
- rng = np.random.default_rng(42)
93
-
94
- # Engagement: central users are more "engaged"
95
- engagement = 50 * (0.6 * deg_norm + 0.4 * avg_nd_norm) + rng.normal(0, 3, size=N)
96
- engagement = np.clip(engagement, 0, None)
97
- eng_norm = minmax(engagement)
98
-
99
- # Trust base: users with higher PageRank and clustering are more trusted
100
- trust_base = (pr_norm + cc_norm) / 2.0
101
-
102
- # Suspicious: high degree but low clustering and low PageRank
103
- suspicious_raw = deg_norm * (1.0 - cc_norm) * (1.0 - pr_norm)
104
- suspicious_raw += 0.1 * rng.random(N)
105
- susp_norm = minmax(suspicious_raw)
106
-
107
- # Activity regularity: more regular if clustering is high (stable community)
108
- activity_reg = cc_norm + rng.normal(0, 0.05, size=N)
109
- activity_reg = np.clip(activity_reg, 0.0, 1.0)
110
- act_norm = minmax(activity_reg)
111
-
112
- # Friend requests sent: more for high degree, but bounded
113
- sent_requests = rng.poisson(lam=2 + 15 * deg_norm)
114
- sent_requests = np.maximum(sent_requests, 1)
115
-
116
- # Acceptance probability: higher for trusted, lower for suspicious
117
- accepted_prob = 0.1 + 0.7 * ((trust_base + (1.0 - susp_norm)) / 2.0)
118
- accepted_prob = np.clip(accepted_prob, 0.0, 1.0)
119
- accepted_requests = rng.binomial(sent_requests, accepted_prob)
120
- friend_request_ratio = accepted_requests / (sent_requests + 1e-8)
121
- frr_norm = minmax(friend_request_ratio)
122
-
123
- # Mutual friends ratio (approx): we use clustering coefficient as a proxy
124
- # because high clustering means many mutual connections among friends.
125
- mutual_friends_ratio = cc_norm.copy()
126
- mfr_norm = minmax(mutual_friends_ratio)
127
-
128
- friends_norm = minmax(deg) # total friends ≈ degree
129
-
130
- # -------------------------------------------------------
131
- # 4. Build S, T, B scores (in spirit of your paper)
132
- # -------------------------------------------------------
133
-
134
- # S: social / structural (FRR, MFR, friends)
135
- S_score = (frr_norm + mfr_norm + friends_norm) / 3.0
136
-
137
- # T: trust (trust_base, FRR, inverse suspiciousness)
138
- T_score = (trust_base + frr_norm + (1.0 - susp_norm)) / 3.0
139
-
140
- # B: behaviour (engagement, regularity, suspiciousness)
141
- B_score = (eng_norm + act_norm + susp_norm) / 3.0
142
-
143
- # -------------------------------------------------------
144
- # 5. Fuse S, T, B with variance-based weights
145
- # -------------------------------------------------------
146
-
147
- varS = np.var(S_score)
148
- varT = np.var(T_score)
149
- varB = np.var(B_score)
150
- den = varS + varT + varB + 1e-8
151
- wS, wT, wB = varS / den, varT / den, varB / den
152
-
153
- F = np.vstack([
154
- wS * S_score,
155
- wT * T_score,
156
- wB * B_score
157
- ]).T # shape (N, 3)
158
-
159
- print("Fusion weights:", wS, wT, wB)
160
- print("F shape:", F.shape)
161
-
162
- # -------------------------------------------------------
163
- # 6. KMeans clustering -> pseudo labels
164
- # (0 = Trusted, 1 = Under Observation, 2 = Intruder)
165
- # -------------------------------------------------------
166
-
167
- kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
168
- cluster_raw = kmeans.fit_predict(F)
169
-
170
- cluster_means = []
171
- for c in range(3):
172
- cluster_means.append((c, T_score[cluster_raw == c].mean()))
173
- cluster_means_sorted = sorted(cluster_means, key=lambda x: x[1])
174
-
175
- label_map = {
176
- cluster_means_sorted[0][0]: 2, # lowest trust → Intruder
177
- cluster_means_sorted[1][0]: 1, # medium → Under Observation
178
- cluster_means_sorted[2][0]: 0 # highest → Trusted
179
- }
180
-
181
- cluster_labels = np.array([label_map[c] for c in cluster_raw], dtype=int)
182
-
183
- label_names = {
184
- 0: "Trusted",
185
- 1: "Under Observation",
186
- 2: "Intruder"
187
- }
188
-
189
- status_counts = np.bincount(cluster_labels, minlength=3)
190
-
191
- def make_status_bar_plot():
192
- fig, ax = plt.subplots()
193
- labels_txt = ["Trusted", "Under Observation", "Intruder"]
194
- ax.bar(labels_txt, status_counts)
195
- ax.set_ylabel("Number of users")
196
- ax.set_title("Global distribution of user statuses (SNAP graph)")
197
- fig.tight_layout()
198
- return fig
199
-
200
- # -------------------------------------------------------
201
- # 7. Train small MLP on fused features -> status
202
- # -------------------------------------------------------
203
-
204
- X = torch.tensor(F, dtype=torch.float32)
205
- y = torch.tensor(cluster_labels, dtype=torch.long)
206
-
207
- dataset = TensorDataset(X, y)
208
- loader = DataLoader(dataset, batch_size=128, shuffle=True)
209
-
210
- class MLPClassifier(nn.Module):
211
- def __init__(self, in_dim, hidden_dim=32, num_classes=3):
212
- super().__init__()
213
- self.net = nn.Sequential(
214
- nn.Linear(in_dim, hidden_dim),
215
- nn.ReLU(),
216
- nn.Linear(hidden_dim, hidden_dim),
217
- nn.ReLU(),
218
- nn.Linear(hidden_dim, num_classes)
219
- )
220
- def forward(self, x):
221
- return self.net(x)
222
-
223
- model = MLPClassifier(in_dim=3)
224
- criterion = nn.CrossEntropyLoss()
225
- optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
226
-
227
- for epoch in range(20):
228
- model.train()
229
- total_loss = 0.0
230
- for xb, yb in loader:
231
- optimizer.zero_grad()
232
- logits = model(xb)
233
- loss = criterion(logits, yb)
234
- loss.backward()
235
- optimizer.step()
236
- total_loss += loss.item() * xb.size(0)
237
- # optional print, can be commented on HF to reduce logs
238
- print(f"Epoch {epoch+1:02d} - loss = {total_loss / len(dataset):.4f}")
239
-
240
- model.eval()
241
- with torch.no_grad():
242
- preds = model(X).argmax(dim=1)
243
- acc = (preds == y).float().mean().item()
244
- print(f"Training accuracy vs pseudo-labels: {acc:.3f}")
245
-
246
- def predict_from_fused(S_val, T_val, B_val):
247
- vec3 = np.array([wS * S_val, wT * T_val, wB * B_val], dtype=np.float32)
248
- x = torch.tensor(vec3.reshape(1, -1), dtype=torch.float32)
249
- model.eval()
250
- with torch.no_grad():
251
- logits = model(x)
252
- probs = torch.softmax(logits, dim=1).cpu().numpy()[0]
253
- pred = int(np.argmax(probs))
254
- return pred, probs
255
-
256
- eng_min = engagement.min()
257
- eng_max = engagement.max()
258
-
259
- # -------------------------------------------------------
260
- # 8. Map UI sliders -> S/T/B (paper-style logic)
261
- # -------------------------------------------------------
262
-
263
- def build_scores_from_user_input(
264
- engagement_input,
265
- suspicious_input,
266
- activity_input,
267
- frr_input,
268
- mfr_input
269
- ):
270
- # Normalize engagement using dataset range
271
- eng_norm_ui = (engagement_input - eng_min) / (eng_max - eng_min + 1e-8)
272
- eng_norm_ui = float(np.clip(eng_norm_ui, 0.0, 1.0))
273
-
274
- susp_norm_ui = float(np.clip(suspicious_input, 0.0, 1.0))
275
- act_norm_ui = float(np.clip(activity_input, 0.0, 1.0))
276
- frr_norm_ui = float(np.clip(frr_input, 0.0, 1.0))
277
- mfr_norm_ui = float(np.clip(mfr_input, 0.0, 1.0))
278
-
279
- # Assume average number of friends ~ 0.5 normalized
280
- friends_norm_ui = 0.5
281
-
282
- # Trust estimate from engagement & suspiciousness
283
- trust_norm_ui = (eng_norm_ui + (1.0 - susp_norm_ui)) / 2.0
284
-
285
- # Construct S / T / B
286
- S_ui = (frr_norm_ui + mfr_norm_ui + friends_norm_ui) / 3.0
287
- T_ui = (trust_norm_ui + frr_norm_ui + (1.0 - susp_norm_ui)) / 3.0
288
- B_ui = (eng_norm_ui + act_norm_ui + susp_norm_ui) / 3.0
289
-
290
- return S_ui, T_ui, B_ui, eng_norm_ui, susp_norm_ui, act_norm_ui
291
-
292
- # -------------------------------------------------------
293
- # 9. Timeline (T1–T5) helpers
294
- # -------------------------------------------------------
295
-
296
- def make_timeline_plot(timeline_state):
297
- fig, ax = plt.subplots()
298
- if not timeline_state:
299
- ax.text(0.5, 0.5, "No timeline yet", ha="center", va="center")
300
- ax.set_axis_off()
301
- fig.tight_layout()
302
- return fig
303
-
304
- steps = [i + 1 for i in range(len(timeline_state))]
305
- trusted = [entry["probs"][0] for entry in timeline_state]
306
- obs = [entry["probs"][1] for entry in timeline_state]
307
- intr = [entry["probs"][2] for entry in timeline_state]
308
-
309
- ax.plot(steps, trusted, marker="o", label="Trusted")
310
- ax.plot(steps, obs, marker="o", label="Under Observation")
311
- ax.plot(steps, intr, marker="o", label="Intruder")
312
-
313
- ax.set_xticks(steps)
314
- ax.set_xlabel("Time step (T1–T5)")
315
- ax.set_ylabel("Probability")
316
- ax.set_ylim(0, 1)
317
- ax.set_title("User status probabilities over time")
318
- ax.legend()
319
- fig.tight_layout()
320
- return fig
321
-
322
- def simulate_week(
323
- engagement_input,
324
- suspicious_input,
325
- activity_input,
326
- frr_input,
327
- mfr_input,
328
- timeline_state
329
- ):
330
- if timeline_state is None:
331
- timeline_state = []
332
-
333
- S_ui, T_ui, B_ui, eng_n, susp_n, act_n = build_scores_from_user_input(
334
- engagement_input,
335
- suspicious_input,
336
- activity_input,
337
- frr_input,
338
- mfr_input
339
- )
340
 
341
- pred, probs = predict_from_fused(S_ui, T_ui, B_ui)
342
- status = label_names[pred]
343
-
344
- # Keep only last 5 time steps (T1–T5)
345
- if len(timeline_state) >= 5:
346
- timeline_state = timeline_state[1:]
347
- timeline_state.append({
348
- "status": status,
349
- "probs": probs.tolist(),
350
- "S": float(S_ui),
351
- "T": float(T_ui),
352
- "B": float(B_ui)
353
- })
354
-
355
- step_num = len(timeline_state)
356
-
357
- # Current week summary
358
- lines = []
359
- lines.append(f"### Current Time Step: T{step_num}")
360
- lines.append(f"**Predicted Status:** **{status}**")
361
- lines.append("")
362
- lines.append("**Probabilities:**")
363
- lines.append(f"- Trusted: {probs[0]:.2f}")
364
- lines.append(f"- Under Observation: {probs[1]:.2f}")
365
- lines.append(f"- Intruder: {probs[2]:.2f}")
366
- lines.append("")
367
- lines.append("**Aggregated scores (0–1):**")
368
- lines.append(f"- S (Social / Structural): `{S_ui:.2f}`")
369
- lines.append(f"- T (Trust): `{T_ui:.2f}`")
370
- lines.append(f"- B (Behaviour): `{B_ui:.2f}`")
371
- lines.append("")
372
- lines.append("**Inputs (normalized):**")
373
- lines.append(f"- Engagement: `{eng_n:.2f}`")
374
- lines.append(f"- Suspiciousness: `{susp_n:.2f}`")
375
- lines.append(f"- Activity regularity: `{act_n:.2f}`")
376
-
377
- current_md = "\n".join(lines)
378
-
379
- # Timeline text
380
- tl_lines = ["## Timeline (T1–T5)"]
381
- for i, entry in enumerate(timeline_state):
382
- p = entry["probs"]
383
- tl_lines.append(
384
- f"- **T{i+1}**: {entry['status']} | "
385
- f"Trusted={p[0]:.2f}, Obs={p[1]:.2f}, Intruder={p[2]:.2f}"
386
- )
387
- timeline_md = "\n".join(tl_lines)
388
-
389
- tl_fig = make_timeline_plot(timeline_state)
390
-
391
- return current_md, timeline_md, tl_fig, timeline_state
392
-
393
- def reset_timeline():
394
- empty_fig = make_timeline_plot([])
395
- return (
396
- "Timeline reset. Adjust sliders and click **Next week (T+1)** to start from T1.",
397
- "## Timeline (T1–T5)\n(No entries yet)",
398
- empty_fig,
399
- []
400
- )
401
 
402
- # -------------------------------------------------------
403
- # 10. Example table: real Trusted vs Intruder-like nodes
404
- # -------------------------------------------------------
405
-
406
- def build_example_table(n_per_class=5):
407
- rows = []
408
- for lbl in [0, 2]: # 0 = Trusted, 2 = Intruder
409
- idxs = np.where(cluster_labels == lbl)[0]
410
- if len(idxs) == 0:
411
- continue
412
- sel = rng.choice(idxs, size=min(n_per_class, len(idxs)), replace=False)
413
- tmp = pd.DataFrame({
414
- "NodeID": [nodes[i] for i in sel],
415
- "Status": [label_names[lbl]] * len(sel),
416
- "Degree": deg[sel],
417
- "Clustering": cc[sel],
418
- "S_score": S_score[sel],
419
- "T_score": T_score[sel],
420
- "B_score": B_score[sel]
421
- })
422
- rows.append(tmp)
423
- if rows:
424
- return pd.concat(rows, ignore_index=True)
425
  else:
426
- return pd.DataFrame(columns=[
427
- "NodeID", "Status", "Degree", "Clustering",
428
- "S_score", "T_score", "B_score"
429
- ])
430
-
431
- examples_df = build_example_table()
432
-
433
- def refresh_examples():
434
- return build_example_table()
435
-
436
- global_status_fig = make_status_bar_plot()
437
-
438
- # -------------------------------------------------------
439
- # 11. Gradio app
440
- # -------------------------------------------------------
441
-
442
- with gr.Blocks() as demo:
443
- gr.Markdown("# Trust-Based Intrusion Detection on SNAP Facebook Graph")
444
- gr.Markdown(
445
- "This demo uses the **SNAP Facebook combined graph** as a real online social network.\n\n"
446
- "- Structural features (degree, clustering, PageRank, neighbour degree) come from the real graph.\n"
447
- "- Behavioural features (engagement, suspiciousness, activity regularity, friend-request ratio, "
448
- "mutual-friends ratio) are generated **synthetically but guided by the graph structure**, following the "
449
- "spirit of your paper.\n\n"
450
- "We fuse these into **S (Social)**, **T (Trust)** and **B (Behaviour)** scores, cluster users into "
451
- "**Trusted / Under Observation / Intruder**, and train a small neural network to replicate this.\n\n"
452
- "**Use the sliders** to simulate how a user changes behaviour over time. Each click on "
453
- "**Next week (T+1)** advances the time step T1..T5 and updates the model's judgement."
454
- )
455
-
456
- with gr.Row():
457
- with gr.Column():
458
- gr.Markdown("### Behaviour Inputs (for one user)")
459
- engagement_slider = gr.Slider(
460
- minimum=float(eng_min),
461
- maximum=float(eng_max),
462
- value=float((eng_min + eng_max) / 2.0),
463
- step=1.0,
464
- label="Engagement level (synthetic, based on graph centrality)"
465
- )
466
- suspicious_slider = gr.Slider(
467
- minimum=0.0,
468
- maximum=1.0,
469
- value=0.2,
470
- step=0.01,
471
- label="Suspiciousness (0 = clean, 1 = very suspicious)"
472
- )
473
- activity_slider = gr.Slider(
474
- minimum=0.0,
475
- maximum=1.0,
476
- value=0.7,
477
- step=0.01,
478
- label="Activity regularity (1 = very regular, 0 = random)"
479
- )
480
- frr_slider = gr.Slider(
481
- minimum=0.0,
482
- maximum=1.0,
483
- value=0.8,
484
- step=0.01,
485
- label="Friend Request Ratio (accepted / sent)"
486
- )
487
- mfr_slider = gr.Slider(
488
- minimum=0.0,
489
- maximum=1.0,
490
- value=0.6,
491
- step=0.01,
492
- label="Mutual Friends Ratio (proxy)"
493
- )
494
-
495
- next_button = gr.Button("Next week (T+1)")
496
- reset_button = gr.Button("Reset timeline")
497
-
498
- with gr.Column():
499
- current_box = gr.Markdown(
500
- "Current time-step status will appear here after you click **Next week (T+1)**."
501
- )
502
- timeline_box = gr.Markdown(
503
- "## Timeline (T1–T5)\n(No entries yet)"
504
- )
505
- timeline_plot = gr.Plot(
506
- value=make_timeline_plot([]),
507
- label="Timeline probabilities (T1–T5)"
508
- )
509
-
510
- gr.Markdown("### Global Status Distribution on the SNAP Graph")
511
- status_plot = gr.Plot(value=global_status_fig)
512
-
513
- gr.Markdown("### Example Users (Real graph nodes: Trusted vs Intruder-like)")
514
- examples_table = gr.Dataframe(
515
- value=examples_df,
516
- label="Sample nodes from SNAP Facebook",
517
- interactive=False
518
- )
519
- refresh_button = gr.Button("Refresh examples")
520
-
521
- timeline_state = gr.State([])
522
-
523
- next_button.click(
524
- fn=simulate_week,
525
- inputs=[
526
- engagement_slider,
527
- suspicious_slider,
528
- activity_slider,
529
- frr_slider,
530
- mfr_slider,
531
- timeline_state
532
- ],
533
- outputs=[current_box, timeline_box, timeline_plot, timeline_state]
534
- )
535
-
536
- reset_button.click(
537
- fn=reset_timeline,
538
- inputs=None,
539
- outputs=[current_box, timeline_box, timeline_plot, timeline_state]
540
- )
541
-
542
- refresh_button.click(
543
- fn=refresh_examples,
544
- inputs=None,
545
- outputs=[examples_table]
546
  )
547
 
548
- if __name__ == "__main__":
549
- demo.launch()
1
+ # app.py
2
+ import streamlit as st
3
  import pandas as pd
4
+ import numpy as np
5
+ import plotly.express as px
6
+
7
+ st.set_page_config(page_title="Excel → Management Insights (Power BI style)", layout="wide")
8
+
9
+ st.title("📊 Excel → Interactive Management Dashboard (Power BI style)")
10
+ st.caption("Grade-based decision rule: **PASS if Grade ≥ C (including C, C+, B-, etc.)** and **FAIL if below C (C-, D, F, etc.)**. Marks thresholds are not used.")
11
+
12
+ # -----------------------------
13
+ # Grade logic (final rule: pass for C and above, fail for C- and below)
14
+ # -----------------------------
15
+ def grade_pass_fail(g):
16
+ if pd.isna(g):
17
+ return "Unknown"
18
+ g = str(g).strip().upper()
19
+
20
+ # Explicit FAIL
21
+ if g.startswith(("D", "E", "F")):
22
+ return "Fail"
23
+
24
+ # C- is FAIL, all other C variants are PASS
25
+ if g.startswith("C"):
26
+ if g == "C-" or g.startswith("C-"):
27
+ return "Fail"
28
+ return "Pass"
29
+
30
+ # A/B (with any +/-) are PASS
31
+ if g.startswith(("A", "B")):
32
+ return "Pass"
33
+
34
+ return "Unknown"
35
+
36
+ def pick_grade_column(df: pd.DataFrame) -> str:
37
+ # The grade is expected in the last column; prefer any column whose name contains "grade".
38
+ candidates = [c for c in df.columns if "grade" in str(c).lower()]
39
+ if candidates:
40
+ return candidates[-1]
41
+ return df.columns[-1]
42
+
43
+ def normalize_headers(df: pd.DataFrame) -> pd.DataFrame:
44
+ # Clean common trailing spaces
45
+ df = df.copy()
46
+ df.columns = [str(c).strip() for c in df.columns]
47
+ return df
48
+
49
+ def coerce_numeric(df: pd.DataFrame, cols):
50
+ for c in cols:
51
+ if c in df.columns:
52
+ df[c] = pd.to_numeric(df[c], errors="coerce")
53
+ return df
54
+
55
+ def describe_fail_reason(row, components):
56
+ # Human-readable reason (simple, management-friendly)
57
+ if row.get("PassFail") != "Fail":
58
+ return ""
59
+ hints = []
60
+ for c in components:
61
+ v = row.get(c)
62
+ if pd.notna(v):
63
+ # rough hinting only; thresholds are never used for the pass/fail decision
64
+ if c.lower().find("final") >= 0 and v < np.nanpercentile(components_df[c].dropna(), 25):
65
+ hints.append("Final exam is in the lower quartile")
66
+ if c.lower().find("lab") >= 0 and v < np.nanpercentile(components_df[c].dropna(), 25):
67
+ hints.append("Lab total is in the lower quartile")
68
+ if c.lower().find("mid") >= 0 and v < np.nanpercentile(components_df[c].dropna(), 25):
69
+ hints.append("Mid exam is in the lower quartile")
70
+ if c.lower().find("test") >= 0 and v < np.nanpercentile(components_df[c].dropna(), 25):
71
+ hints.append("Test score is in the lower quartile")
72
+ if not hints:
73
+ return "Grade below C (check component performance & attendance/assessment issues)."
74
+ return " | ".join(hints)
75
+
76
+ # -----------------------------
77
+ # Upload + read
78
+ # -----------------------------
79
+ uploaded = st.file_uploader("Upload Excel (.xlsx)", type=["xlsx"])
80
+
81
+ if uploaded is None:
82
+ st.info("Upload an Excel file to begin.")
83
+ st.stop()
84
+
85
+ xls = pd.ExcelFile(uploaded)
86
+ sheet = st.selectbox("Select sheet", xls.sheet_names, index=0)
87
+ raw = pd.read_excel(uploaded, sheet_name=sheet)
88
+ raw = normalize_headers(raw)
89
+
90
+ # Try to remove non-student rows (robust: keep rows with any numeric marks OR any grade-like text)
91
+ grade_col_name = pick_grade_column(raw)
92
+ tmp_grade = raw[grade_col_name].astype(str).str.strip()
93
+ grade_like = tmp_grade.str.match(r"^[A-Fa-f][\+\-]?$", na=False)
94
+
95
+ numeric_cols_guess = [c for c in raw.columns if c != grade_col_name]
96
+ numeric_signal = raw[numeric_cols_guess].apply(pd.to_numeric, errors="coerce").notna().sum(axis=1) > 0
97
+
98
+ df = raw[grade_like | numeric_signal].copy()
99
+
100
+ # Add Sno if exists, else create row id
101
+ sno_col = None
102
+ for c in df.columns:
103
+ if str(c).strip().lower() in ["sno", "sno.", "sr", "sr.", "id", "studentid", "student id"]:
104
+ sno_col = c
105
+ break
106
+ if sno_col is None:
107
+ df.insert(0, "Sno", range(1, len(df) + 1))
108
+ sno_col = "Sno"
109
+
110
+ # Grade column
111
+ df["Grade"] = df[grade_col_name].astype(str).str.strip().str.upper()
112
+ df["PassFail"] = df["Grade"].apply(grade_pass_fail)
113
+ df["Pass"] = df["PassFail"].eq("Pass")
114
+ df["Fail"] = df["PassFail"].eq("Fail")
115
+
116
+ # Identify likely mark columns (common names; if not found, pick numeric ones)
117
+ common_components = ["Test -1", "Test-1", "Test 1", "Mid Exam", "Mid", "Lab Total", "Final Exam", "Total"]
118
+ component_cols = [c for c in df.columns if c in common_components]
119
+ if not component_cols:
120
+ # fallback: all numeric columns except Sno
121
+ num_cols = df.columns[df.apply(lambda s: pd.to_numeric(s, errors="coerce").notna().mean() > 0.4)]
122
+ component_cols = [c for c in num_cols if c != sno_col]
123
+
124
+ # Coerce numerics (if present)
125
+ df = coerce_numeric(df, component_cols)
126
+
127
+ # Consistency score (std across available components)
128
+ if len(component_cols) >= 2:
129
+ df["Consistency_SD"] = df[component_cols].std(axis=1, skipna=True)
130
  else:
131
+ df["Consistency_SD"] = np.nan
132
 
133
+ # Global for hinting
134
+ components_df = df.copy()
135
 
136
+ # Optional "Fail reason" hint (for drilldown / risk view)
137
+ if component_cols:
138
+ df["FailReasonHint"] = df.apply(lambda r: describe_fail_reason(r, component_cols), axis=1)
139
+ else:
140
+ df["FailReasonHint"] = np.where(df["Fail"], "Grade below C.", "")
141
+
142
+ # -----------------------------
143
+ # Sidebar: "Power BI pages"
144
+ # -----------------------------
145
+ st.sidebar.header("Perspective")
146
+ view = st.sidebar.radio(
147
+ "Choose a view",
148
+ ["Executive (Management)", "Risk & Intervention", "Assessment Quality", "Student Drill-down", "Export for Power BI"],
149
+ index=0
150
+ )
151
+
152
+ st.sidebar.header("Filters")
153
+ pf = st.sidebar.multiselect("Pass/Fail", ["Pass", "Fail", "Unknown"], default=["Pass", "Fail", "Unknown"])
154
+ grade_unique = sorted([g for g in df["Grade"].dropna().unique()])
155
+ sel_grades = st.sidebar.multiselect("Grades", grade_unique, default=grade_unique)
156
+
157
+ filtered = df[df["PassFail"].isin(pf)]
158
+ filtered = filtered[filtered["Grade"].isin(sel_grades)]
159
+
160
+ # -----------------------------
161
+ # KPI Row
162
+ # -----------------------------
163
+ k1, k2, k3, k4, k5 = st.columns(5)
164
+ with k1: st.metric("Students", int(filtered.shape[0]))
165
+ with k2: st.metric("Pass", int(filtered["Pass"].sum()))
166
+ with k3: st.metric("Fail", int(filtered["Fail"].sum()))
167
+ with k4:
168
+ pr = (filtered["Pass"].mean() * 100) if filtered.shape[0] else 0
169
+ st.metric("Pass Rate", f"{pr:.1f}%")
170
+ with k5:
171
+ if "Total" in filtered.columns and pd.api.types.is_numeric_dtype(filtered["Total"]):
172
+ st.metric("Average Total", f"{filtered['Total'].mean():.2f}")
173
  else:
174
+ st.metric("Average Total", "β€”")
175
+
176
+ st.divider()
177
+
178
+ # -----------------------------
179
+ # Views
180
+ # -----------------------------
181
+ def executive_view(d):
182
+ left, right = st.columns([1, 1])
183
+
184
+ with left:
185
+ st.subheader("Grade Distribution")
186
+ grade_counts = d["Grade"].value_counts(dropna=False).reset_index()
187
+ grade_counts.columns = ["Grade", "Count"]
188
+ fig = px.bar(grade_counts, x="Grade", y="Count")
189
+ st.plotly_chart(fig, use_container_width=True)
190
+
191
+ with right:
192
+ st.subheader("Pass/Fail Distribution")
193
+ pf_counts = d["PassFail"].value_counts(dropna=False).reset_index()
194
+ pf_counts.columns = ["Status", "Count"]
195
+ fig = px.pie(pf_counts, names="Status", values="Count")
196
+ st.plotly_chart(fig, use_container_width=True)
197
+
198
+ st.subheader("Hidden Patterns (Quick Signals)")
199
+ c1, c2, c3 = st.columns(3)
200
+
201
+ # Pattern: Strong Lab but Fail (if lab exists)
202
+ if any("Lab" in c for c in component_cols):
203
+ lab_col = [c for c in component_cols if "Lab" in c][0]
204
+ strong_lab_fail = d[(d["Fail"]) & (d[lab_col].notna()) & (d[lab_col] >= d[lab_col].quantile(0.75))]
205
+ with c1:
206
+ st.metric("Fail with Strong Lab", int(strong_lab_fail.shape[0]))
207
+ else:
208
+ with c1:
209
+ st.metric("Fail with Strong Lab", "β€”")
210
+
211
+ # Pattern: Inconsistent high SD
212
+ if "Consistency_SD" in d.columns and d["Consistency_SD"].notna().any():
213
+ top_incons = d["Consistency_SD"].quantile(0.90)
214
+ with c2:
215
+ st.metric("High Inconsistency (Top 10%)", int((d["Consistency_SD"] >= top_incons).sum()))
216
+ else:
217
+ with c2:
218
+ st.metric("High Inconsistency (Top 10%)", "β€”")
219
+
220
+ # Pattern: Fail with good Total (if Total exists)
221
+ if "Total" in d.columns and pd.api.types.is_numeric_dtype(d["Total"]) and d["Total"].notna().any():
222
+ good_total_fail = d[(d["Fail"]) & (d["Total"] >= d["Total"].quantile(0.75))]
223
+ with c3:
224
+ st.metric("Fail with High Total", int(good_total_fail.shape[0]))
225
+ else:
226
+ with c3:
227
+ st.metric("Fail with High Total", "β€”")
228
+
229
+ if component_cols and "Total" in d.columns and pd.api.types.is_numeric_dtype(d["Total"]):
230
+ st.subheader("What Drives Total? (Correlation)")
231
+ corr_cols = [c for c in component_cols if c in d.columns] + ["Total"]
232
+ corr = d[corr_cols].corr(numeric_only=True)
233
+ fig = px.imshow(corr, text_auto=True, aspect="auto")
234
+ st.plotly_chart(fig, use_container_width=True)
235
+
236
+ def risk_view(d):
237
+ st.subheader("Fail List (Grade < C)")
238
+ fails = d[d["Fail"]].copy()
239
+
240
+ # Bucket: C- vs D/F etc.
241
+ fails["FailType"] = np.where(fails["Grade"].str.startswith("C-"), "C- (Borderline Fail)", "Below C")
242
+ bucket = fails["FailType"].value_counts().reset_index()
243
+ bucket.columns = ["Fail Type", "Count"]
244
+ c1, c2 = st.columns([1, 2])
245
+ with c1:
246
+ fig = px.bar(bucket, x="Fail Type", y="Count")
247
+ st.plotly_chart(fig, use_container_width=True)
248
+ with c2:
249
+ show_cols = [sno_col, "Grade", "PassFail"]
250
+ for c in ["Total"] + component_cols:
251
+ if c in fails.columns and c not in show_cols:
252
+ show_cols.append(c)
253
+ show_cols += ["FailReasonHint"]
254
+ st.dataframe(fails[show_cols].sort_values(by=["Grade", sno_col]), use_container_width=True, height=420)
255
+
256
+ st.subheader("Intervention Suggestions (Management-friendly)")
257
+ st.markdown(
258
+ """
259
+ - **Many C- failures** → run targeted revision + re-assessment readiness support (borderline group).
260
+ - **Failures concentrated with low Final** → strengthen exam preparation (mock exams + feedback).
261
+ - **Failures with strong Lab** → review exam alignment, study strategy, and assessment balance.
262
+ """
263
  )
264
 
265
+ def assessment_quality_view(d):
266
+ st.subheader("Assessment Component Overview")
267
+ if not component_cols:
268
+ st.warning("No numeric component columns detected. Add columns like Test/Mid/Lab/Final/Total for deeper assessment analysis.")
269
+ return
270
+
271
+ # Component distributions
272
+ comp = st.selectbox("Choose component", component_cols, index=0)
273
+ fig = px.histogram(d, x=comp, nbins=20)
274
+ st.plotly_chart(fig, use_container_width=True)
275
+
276
+ # Component vs Grade
277
+ st.subheader("Component vs Grade (Boxplot)")
278
+ fig = px.box(d, x="Grade", y=comp)
279
+ st.plotly_chart(fig, use_container_width=True)
280
+
281
+ # Zero / missing checks
282
+ st.subheader("Data Quality Flags")
283
+ flags = []
284
+ for c in component_cols:
285
+ series = d[c]
286
+ if pd.api.types.is_numeric_dtype(series):
287
+ missing = int(series.isna().sum())
288
+ zeros = int((series == 0).sum())
289
+ flags.append({"Component": c, "Missing": missing, "Zeros": zeros})
290
+ st.dataframe(pd.DataFrame(flags), use_container_width=True)
291
+
292
+ # If Total exists: correlation heatmap
293
+ if "Total" in d.columns and pd.api.types.is_numeric_dtype(d["Total"]):
294
+ st.subheader("Correlation Heatmap")
295
+ corr_cols = [c for c in component_cols if c in d.columns] + ["Total"]
296
+ corr = d[corr_cols].corr(numeric_only=True)
297
+ fig = px.imshow(corr, text_auto=True, aspect="auto")
298
+ st.plotly_chart(fig, use_container_width=True)
299
+
300
+ def student_drilldown_view(d):
301
+ st.subheader("Student Drill-down")
302
+ st.caption("Pick a student to view component breakdown and the grade-based decision.")
303
+ sid = st.selectbox("Select student (Sno)", sorted(d[sno_col].unique()))
304
+ row = d[d[sno_col] == sid].iloc[0]
305
+
306
+ c1, c2, c3 = st.columns(3)
307
+ with c1: st.metric("Grade", str(row.get("Grade", "β€”")))
308
+ with c2: st.metric("Status", str(row.get("PassFail", "β€”")))
309
+ with c3:
310
+ if "Total" in d.columns and pd.notna(row.get("Total", np.nan)):
311
+ st.metric("Total", f"{row['Total']:.2f}")
312
+ else:
313
+ st.metric("Total", "β€”")
314
+
315
+ st.write("**Reason (simple hint):**", row.get("FailReasonHint", ""))
316
+
317
+ # Component bar
318
+ if component_cols:
319
+ comp_vals = {c: row.get(c) for c in component_cols if c in d.columns}
320
+ comp_df = pd.DataFrame({"Component": list(comp_vals.keys()), "Score": list(comp_vals.values())})
321
+ fig = px.bar(comp_df, x="Component", y="Score")
322
+ st.plotly_chart(fig, use_container_width=True)
323
+
324
+ st.subheader("Raw record")
325
+ st.dataframe(pd.DataFrame(row).T, use_container_width=True)
326
+
327
+ def export_view(d):
328
+ st.subheader("Export for Power BI")
329
+ st.caption("Download cleaned data with the computed PassFail fields. Load into Power BI (Get Data β†’ Text/CSV).")
330
+
331
+ clean_csv = d.to_csv(index=False).encode("utf-8")
332
+ st.download_button("⬇️ Download Cleaned Data (CSV)", clean_csv, file_name="cleaned_marks_with_passfail.csv", mime="text/csv")
333
+
334
+ st.subheader("Recommended Power BI Measures (DAX)")
335
+ st.code(r"""
336
+ Pass Count = CALCULATE(COUNTROWS(cleaned_marks), cleaned_marks[PassFail] = "Pass")
337
+ Fail Count = CALCULATE(COUNTROWS(cleaned_marks), cleaned_marks[PassFail] = "Fail")
338
+ Pass Rate % = DIVIDE([Pass Count], COUNTROWS(cleaned_marks))
339
+ """, language="text")
340
+
341
+ st.subheader("Summary Tables")
342
+ grade_summary = d["Grade"].value_counts(dropna=False).reset_index()
343
+ grade_summary.columns = ["Grade", "Count"]
344
+ st.dataframe(grade_summary, use_container_width=True)
345
+
346
+ pf_summary = d["PassFail"].value_counts(dropna=False).reset_index()
347
+ pf_summary.columns = ["PassFail", "Count"]
348
+ st.dataframe(pf_summary, use_container_width=True)
349
+
350
+ # Render selected view
351
+ if view == "Executive (Management)":
352
+ executive_view(filtered)
353
+ elif view == "Risk & Intervention":
354
+ risk_view(filtered)
355
+ elif view == "Assessment Quality":
356
+ assessment_quality_view(filtered)
357
+ elif view == "Student Drill-down":
358
+ student_drilldown_view(filtered)
359
+ else:
360
+ export_view(filtered)
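
As a quick sanity check of the grade-based decision rule described in the caption above, a minimal standalone sketch; the function body mirrors grade_pass_fail from the new app.py, and the sample grades are purely illustrative:

import pandas as pd

def grade_pass_fail(g):
    # Same rule as the new app.py: C and above pass, C- and below fail.
    if pd.isna(g):
        return "Unknown"
    g = str(g).strip().upper()
    if g.startswith(("D", "E", "F")):
        return "Fail"
    if g.startswith("C"):
        return "Fail" if g.startswith("C-") else "Pass"
    if g.startswith(("A", "B")):
        return "Pass"
    return "Unknown"

for sample in ["A", "B-", "C+", "C", "C-", "D", "F", None]:
    print(sample, "->", grade_pass_fail(sample))
# Expected output: Pass for A, B-, C+, C; Fail for C-, D, F; Unknown for None.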