SanthiSastra committed on
Commit
9826f07
·
verified ·
1 Parent(s): 7c450bc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +211 -447
app.py CHANGED
@@ -1,521 +1,285 @@
 
 
 
 
 
1
  import os
2
- import io
3
  import tempfile
4
- from datetime import datetime
5
-
6
  import numpy as np
7
  import pandas as pd
8
-
9
  import gradio as gr
10
-
11
  import matplotlib.pyplot as plt
12
- import seaborn as sns
13
- from scipy import stats
14
 
15
- from docx import Document
16
- from docx.shared import Inches, Pt
17
- from docx.enum.text import WD_ALIGN_PARAGRAPH
18
 
19
 
20
- # ----------------------------
21
- # EDA helpers
22
- # ----------------------------
23
- def find_numeric_columns(df: pd.DataFrame):
24
- return df.select_dtypes(include=[np.number]).columns.tolist()
25
 
26
- def safe_skew(series: pd.Series):
27
- s = series.dropna()
28
- if len(s) < 3:
29
- return np.nan
30
- return float(s.skew())
31
 
32
- def safe_kurt(series: pd.Series):
33
- s = series.dropna()
34
- if len(s) < 4:
35
- return np.nan
36
- return float(s.kurt())
37
 
38
- def basic_numeric_stats(df: pd.DataFrame, num_cols: list[str]) -> pd.DataFrame:
39
- rows = []
40
- for c in num_cols:
41
- s = df[c]
42
- s2 = s.dropna()
43
- rows.append({
44
- "Attribute": c,
45
- "Count": int(s2.shape[0]),
46
- "Missing": int(s.isna().sum()),
47
- "Mean": float(s2.mean()) if len(s2) else np.nan,
48
- "Std": float(s2.std(ddof=1)) if len(s2) > 1 else np.nan,
49
- "Var": float(s2.var(ddof=1)) if len(s2) > 1 else np.nan,
50
- "Min": float(s2.min()) if len(s2) else np.nan,
51
- "25%": float(s2.quantile(0.25)) if len(s2) else np.nan,
52
- "Median": float(s2.median()) if len(s2) else np.nan,
53
- "75%": float(s2.quantile(0.75)) if len(s2) else np.nan,
54
- "Max": float(s2.max()) if len(s2) else np.nan,
55
- "Skewness": safe_skew(s),
56
- "Kurtosis": safe_kurt(s),
57
- })
58
- return pd.DataFrame(rows)
59
-
60
- def five_point_summary_table(df: pd.DataFrame, num_cols: list[str]) -> pd.DataFrame:
61
- rows = []
62
- for c in num_cols:
63
- s = df[c].dropna()
64
- if len(s) == 0:
65
- rows.append({"Attribute": c, "Min": np.nan, "Q1": np.nan, "Median": np.nan, "Q3": np.nan, "Max": np.nan})
66
- else:
67
- rows.append({
68
- "Attribute": c,
69
- "Min": float(s.min()),
70
- "Q1": float(s.quantile(0.25)),
71
- "Median": float(s.median()),
72
- "Q3": float(s.quantile(0.75)),
73
- "Max": float(s.max()),
74
- })
75
- return pd.DataFrame(rows)
76
-
77
- def interpretation_numeric(stats_df: pd.DataFrame) -> str:
78
- if stats_df.empty:
79
- return "No numeric attributes were detected in the uploaded dataset."
80
- tmp = stats_df[["Attribute", "Skewness"]].dropna()
81
- skew_top = tmp.reindex(tmp["Skewness"].abs().sort_values(ascending=False).index).head(3)
82
- lines = []
83
- lines.append(f"Numeric attributes detected: {len(stats_df)}.")
84
- if len(skew_top) > 0:
85
- parts = [f"{r.Attribute} (skew={r.Skewness:.2f})" for r in skew_top.itertuples(index=False)]
86
- lines.append("Most skewed attributes (absolute skewness): " + ", ".join(parts) + ".")
87
- miss_sorted = stats_df.sort_values("Missing", ascending=False).head(3)
88
- if miss_sorted["Missing"].max() > 0:
89
- parts = [f"{r.Attribute} (missing={int(r.Missing)})" for r in miss_sorted.itertuples(index=False)]
90
- lines.append("Attributes with higher missing values: " + ", ".join(parts) + ".")
91
- else:
92
- lines.append("No missing values were observed in numeric attributes.")
93
- return " ".join(lines)
94
-
95
- def correlation_interpretation(corr: pd.DataFrame) -> str:
96
- if corr is None or corr.empty:
97
- return "Correlation could not be computed (insufficient numeric attributes)."
98
- c = corr.copy()
99
- np.fill_diagonal(c.values, np.nan)
100
- stacked = c.stack().dropna()
101
- if stacked.empty:
102
- return "No meaningful pairwise correlations were found."
103
- top = stacked.abs().sort_values(ascending=False).head(3)
104
- lines = []
105
- for (a, b), _ in top.items():
106
- val = float(corr.loc[a, b])
107
- sign = "positive" if val >= 0 else "negative"
108
- lines.append(f"{a} vs {b}: {val:.2f} ({sign})")
109
- return "Strongest correlations: " + "; ".join(lines) + "."
110
-
111
- def fig_to_png_path(fig) -> str:
112
- tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
113
- fig.savefig(tmp.name, bbox_inches="tight", dpi=200)
114
- plt.close(fig)
115
- return tmp.name
116
-
117
- def plot_correlogram_annotated(corr: pd.DataFrame, title="Correlogram (Annotated)"):
118
- fig, ax = plt.subplots(figsize=(8, 6))
119
- data = corr.values
120
- im = ax.imshow(data, aspect="auto")
121
- ax.set_title(title)
122
- ax.set_xticks(range(len(corr.columns)))
123
- ax.set_xticklabels(corr.columns, rotation=45, ha="right")
124
- ax.set_yticks(range(len(corr.index)))
125
- ax.set_yticklabels(corr.index)
126
-
127
- for i in range(data.shape[0]):
128
- for j in range(data.shape[1]):
129
- val = data[i, j]
130
- ax.text(j, i, "" if np.isnan(val) else f"{val:.2f}", ha="center", va="center", fontsize=8)
131
-
132
- fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
133
- fig.tight_layout()
134
- return fig
135
 
136
- def plot_pairplot(df: pd.DataFrame, num_cols: list[str], max_cols=6):
137
- use_cols = num_cols[:max_cols]
138
- if len(use_cols) < 2:
139
- return None
140
- grid = sns.pairplot(df[use_cols].dropna(), corner=True, diag_kind="hist")
141
- grid.fig.suptitle("Pair Plot", y=1.02)
142
- return grid
143
-
144
-
145
- # ----------------------------
146
- # DOCX helpers
147
- # ----------------------------
148
- def add_heading_centered(doc: Document, text: str, font_size=16, bold=True, color_rgb="1E5AA8"):
149
- p = doc.add_paragraph()
150
- p.alignment = WD_ALIGN_PARAGRAPH.CENTER
151
- run = p.add_run(text)
152
- run.bold = bold
153
- run.font.size = Pt(font_size)
154
- # set run color
155
- rPr = run._element.get_or_add_rPr()
156
- color = rPr.get_or_add_color()
157
- color.val = color_rgb
158
-
159
- def add_image(doc: Document, image_path: str, width_inches=6.2):
160
- doc.add_picture(image_path, width=Inches(width_inches))
161
-
162
- def build_docx_report(df: pd.DataFrame, dataset_name: str, id_col: str | None,
163
- stats_df: pd.DataFrame, corr: pd.DataFrame,
164
- graph_paths: list[tuple[str, str]]) -> str:
165
- doc = Document()
166
-
167
- add_heading_centered(doc, "Amrita Manthana", font_size=20, bold=True)
168
- add_heading_centered(doc, "Prof.B.Santhi,SRC,SASTRA", font_size=14, bold=True)
169
-
170
- doc.add_paragraph("")
171
- p = doc.add_paragraph()
172
- p.alignment = WD_ALIGN_PARAGRAPH.CENTER
173
- p.add_run("EDA Report").bold = True
174
-
175
- doc.add_paragraph(f"Dataset: {dataset_name}")
176
- doc.add_paragraph(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
177
- if id_col:
178
- doc.add_paragraph(f"Record ID column: {id_col}")
179
-
180
- doc.add_paragraph("")
181
- doc.add_paragraph("1) Numeric Statistics").runs[0].bold = True
182
- doc.add_paragraph(interpretation_numeric(stats_df))
183
-
184
- if not stats_df.empty:
185
- tdf = stats_df.head(25) if len(stats_df) > 25 else stats_df
186
- table = doc.add_table(rows=1, cols=len(tdf.columns))
187
- for j, col in enumerate(tdf.columns):
188
- table.rows[0].cells[j].text = str(col)
189
- for _, row in tdf.iterrows():
190
- cells = table.add_row().cells
191
- for j, col in enumerate(tdf.columns):
192
- val = row[col]
193
- if isinstance(val, float):
194
- cells[j].text = "" if np.isnan(val) else f"{val:.4f}"
195
- else:
196
- cells[j].text = str(val)
197
-
198
- doc.add_paragraph("")
199
- doc.add_paragraph("2) Correlation").runs[0].bold = True
200
- doc.add_paragraph(correlation_interpretation(corr))
201
-
202
- doc.add_paragraph("")
203
- doc.add_paragraph("3) Graphs & Interpretation").runs[0].bold = True
204
- for title, path in graph_paths:
205
- doc.add_paragraph("")
206
- doc.add_paragraph(title).runs[0].bold = True
207
- add_image(doc, path)
208
- if "Correlogram" in title:
209
- doc.add_paragraph("Interpretation: Values near +1/-1 indicate strong positive/negative association.")
210
- elif "Bar" in title:
211
- doc.add_paragraph("Interpretation: Taller bars indicate larger frequency/aggregate value.")
212
- elif "Pie" in title:
213
- doc.add_paragraph("Interpretation: Slice proportions show relative contribution of categories.")
214
- elif "Scatter" in title:
215
- doc.add_paragraph("Interpretation: Patterns indicate linear/non-linear trend, clustering, or outliers.")
216
- elif "Pair Plot" in title:
217
- doc.add_paragraph("Interpretation: Diagonal shows distributions; others show pairwise relationships/outliers.")
218
-
219
- doc.add_paragraph("")
220
- doc.add_paragraph("4) Final Remarks").runs[0].bold = True
221
- doc.add_paragraph("This report consolidates numeric measures, distributions, and relationships among attributes.")
222
-
223
- out_path = tempfile.NamedTemporaryFile(delete=False, suffix=".docx").name
224
- doc.save(out_path)
225
- return out_path
226
-
227
-
228
- # ----------------------------
229
- # App logic
230
- # ----------------------------
231
- def load_csv(file_obj, sep, header_flag):
232
- if file_obj is None:
233
- return None, "Please upload a CSV.", None, [], None
234
 
235
- try:
236
- header = 0 if header_flag else None
237
- df = pd.read_csv(file_obj.name, sep=sep, header=header)
238
- if not header_flag:
239
- df.columns = [f"col_{i+1}" for i in range(df.shape[1])]
240
- num_cols = find_numeric_columns(df)
241
- cols = df.columns.tolist()
242
- info = f"Loaded: rows={df.shape[0]}, cols={df.shape[1]}. Numeric cols={len(num_cols)}."
243
- return df, info, df.head(30), cols, num_cols
244
- except Exception as e:
245
- return None, f"Could not read CSV: {e}", None, [], None
246
 
 
247
 
248
- def eda_compute(df: pd.DataFrame):
249
- # MUST return 6 outputs always: note, stats, fps, skew, corr_table, corr_note
250
- if df is None:
251
- return "Upload a CSV first.", pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), ""
252
 
253
- num_cols = find_numeric_columns(df)
254
- if len(num_cols) == 0:
255
- return "No numeric columns found.", pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), "Correlation not available."
 
 
 
 
 
 
 
 
256
 
257
- stats_df = basic_numeric_stats(df, num_cols)
258
- fps_df = five_point_summary_table(df, num_cols)
259
- skew_df = pd.DataFrame({"Attribute": num_cols, "Skewness": [safe_skew(df[c]) for c in num_cols]})
260
 
261
- if len(num_cols) >= 2:
262
- corr_df = df[num_cols].corr(numeric_only=True)
263
- corr_text = correlation_interpretation(corr_df)
264
- else:
265
- corr_df = pd.DataFrame()
266
- corr_text = "Correlation not available (need at least 2 numeric columns)."
267
 
268
- return interpretation_numeric(stats_df), stats_df, fps_df, skew_df, corr_df, corr_text
 
 
269
 
 
 
270
 
 
 
 
271
 
272
- def graph_make(df: pd.DataFrame, barpie_col: str, topn: int,
273
- scatter_x: str, scatter_y: str, pair_max: int):
274
- if df is None:
275
- return None, None, None, None, None
276
-
277
- paths = [None, None, None, None, None]
278
-
279
- # Bar / Pie data
280
- series = df[barpie_col]
281
- if pd.api.types.is_numeric_dtype(series):
282
- binned = pd.cut(series.dropna(), bins=10)
283
- counts = binned.value_counts().head(topn)
284
- labels = counts.index.astype(str).tolist()
285
- yvals = counts.values
286
- suffix = "(binned)"
287
- else:
288
- counts = series.astype(str).value_counts().head(topn)
289
- labels = counts.index.tolist()
290
- yvals = counts.values
291
- suffix = ""
292
-
293
- fig_bar, ax = plt.subplots(figsize=(7, 4))
294
- ax.bar(range(len(labels)), yvals)
295
- ax.set_xticks(range(len(labels)))
296
- ax.set_xticklabels(labels, rotation=45, ha="right")
297
- ax.set_title(f"Bar Chart: {barpie_col} {suffix}")
298
- ax.set_ylabel("Count")
299
- paths[0] = fig_to_png_path(fig_bar)
300
-
301
- fig_pie, ax2 = plt.subplots(figsize=(6, 4))
302
- ax2.pie(yvals, labels=labels, autopct="%1.1f%%")
303
- ax2.set_title(f"Pie Chart: {barpie_col} {suffix}")
304
- paths[1] = fig_to_png_path(fig_pie)
305
-
306
- # Scatter / Corr / Pair
307
- num_cols = find_numeric_columns(df)
308
- if len(num_cols) >= 2 and scatter_x in num_cols and scatter_y in num_cols:
309
- fig_sc, ax3 = plt.subplots(figsize=(7, 4))
310
- ax3.scatter(df[scatter_x], df[scatter_y], alpha=0.7)
311
- ax3.set_xlabel(scatter_x)
312
- ax3.set_ylabel(scatter_y)
313
- ax3.set_title(f"Scatter: {scatter_x} vs {scatter_y}")
314
- paths[2] = fig_to_png_path(fig_sc)
315
-
316
- corr = df[num_cols].corr(numeric_only=True)
317
- fig_corr = plot_correlogram_annotated(corr)
318
- paths[3] = fig_to_png_path(fig_corr)
319
-
320
- pair_max = max(2, min(pair_max, len(num_cols)))
321
- grid = plot_pairplot(df, num_cols, max_cols=pair_max)
322
- if grid is not None:
323
- tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".png").name
324
- grid.fig.savefig(tmp, bbox_inches="tight", dpi=200)
325
- plt.close(grid.fig)
326
- paths[4] = tmp
327
-
328
- return tuple(paths)
329
-
330
-
331
- def report_generate(df: pd.DataFrame, dataset_name: str, id_col: str,
332
- barpie_col: str, topn: int, scatter_x: str, scatter_y: str, pair_max: int):
333
- if df is None:
334
- return None, "Upload a CSV first."
335
 
336
- num_cols = find_numeric_columns(df)
337
- stats_df = basic_numeric_stats(df, num_cols) if len(num_cols) else pd.DataFrame()
338
- corr = df[num_cols].corr(numeric_only=True) if len(num_cols) >= 2 else pd.DataFrame()
 
 
339
 
340
- # build graphs (same as graph tab)
341
- bar_path, pie_path, sc_path, corr_path, pair_path = graph_make(df, barpie_col, topn, scatter_x, scatter_y, pair_max)
 
342
 
343
- graph_paths = []
344
- if bar_path: graph_paths.append((f"Bar Chart: {barpie_col}", bar_path))
345
- if pie_path: graph_paths.append((f"Pie Chart: {barpie_col}", pie_path))
346
- if sc_path: graph_paths.append((f"Scatter: {scatter_x} vs {scatter_y}", sc_path))
347
- if corr_path: graph_paths.append(("Correlogram (Annotated)", corr_path))
348
- if pair_path: graph_paths.append((f"Pair Plot (first {min(pair_max, len(num_cols))} numeric cols)", pair_path))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
349
 
350
- docx_path = build_docx_report(df, dataset_name or "uploaded.csv", id_col, stats_df, corr, graph_paths)
 
 
 
 
 
 
 
 
 
 
 
 
351
 
352
- # (optional) cleanup images later is fine; HF temp storage is ok for session
353
- return docx_path, "DOCX report generated successfully."
354
 
 
 
 
 
 
355
 
356
- def search_record(df: pd.DataFrame, id_col: str, query: str):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
357
  if df is None:
358
- return "Upload a CSV first.", None
359
- if not query:
360
- return "Enter a value to search.", None
361
- if id_col not in df.columns:
362
- return "Select a valid ID column.", None
363
-
364
- col = df[id_col]
365
- result = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
366
  if pd.api.types.is_numeric_dtype(col):
367
  try:
368
- q = float(query)
369
- result = df[df[id_col] == q]
370
- except:
371
- result = df[col.astype(str) == query]
372
  else:
373
- result = df[col.astype(str) == query]
374
 
375
- if result is None or result.empty:
376
  return "No matching record found.", pd.DataFrame()
377
- return f"Found {len(result)} record(s).", result
378
 
379
 
380
- # ----------------------------
381
- # UI
382
- # ----------------------------
383
  CSS = """
384
- #titleblock {text-align:center; margin-top: 5px; margin-bottom: 10px;}
385
  #t1 {font-size:30px; font-weight:800; color:#1E5AA8;}
386
  #t2 {font-size:18px; font-weight:800; color:#1E5AA8;}
387
  """
388
 
389
- def logo_html():
390
- # place logo.png in repo root
391
  if os.path.exists("logo.jpg"):
392
- # gradio serves files placed in root with relative path in HTML
393
- return f"""
394
- <div id="titleblock">
395
- <img src="file=logo.jpg" style="width:110px; display:block; margin:0 auto;" />
396
- <div id="t1">Amrita Manthana</div>
397
- <div id="t2">Prof.B.Santhi,SRC,SASTRA</div>
398
- </div>
399
- """
400
- else:
401
  return """
402
  <div id="titleblock">
403
- <div id="t1">Amrita Manthana</div>
404
- <div id="t2">Prof.B.Santhi,SRC,SASTRA</div>
 
405
  </div>
406
  """
 
 
 
 
 
 
 
407
 
408
- with gr.Blocks(css=CSS, title="Amrita Manthana - EDA (Gradio)") as demo:
409
- gr.HTML(logo_html())
410
 
411
  df_state = gr.State(None)
412
- cols_state = gr.State([])
413
- numcols_state = gr.State([])
414
 
415
  with gr.Row():
416
- with gr.Column(scale=1, min_width=320):
417
- gr.Markdown("### Data Upload")
418
  file_in = gr.File(label="Upload CSV", file_types=[".csv"])
419
- sep = gr.Dropdown(label="CSV Separator", choices=[",", ";", "\t", "|"], value=",")
420
- header_flag = gr.Checkbox(label="First row is header", value=True)
421
- load_btn = gr.Button("Load Data", variant="primary")
422
  load_msg = gr.Textbox(label="Status", interactive=False)
423
- preview = gr.Dataframe(label="Preview (first 30 rows)", interactive=False, wrap=True)
 
 
 
 
 
 
 
424
 
425
  with gr.Column(scale=2):
426
  with gr.Tabs():
427
- with gr.Tab("EDA"):
428
- eda_btn = gr.Button("Compute EDA")
429
- eda_note = gr.Textbox(label="Interpretation", lines=3, interactive=False)
430
- stats_table = gr.Dataframe(label="Descriptive Statistics", interactive=False, wrap=True)
431
- fps_table = gr.Dataframe(label="Five-Point Summary", interactive=False, wrap=True)
432
- skew_table = gr.Dataframe(label="Skewness", interactive=False, wrap=True)
433
- corr_table = gr.Dataframe(label="Correlation (numeric)", interactive=False, wrap=True)
434
- corr_note = gr.Textbox(label="Correlation Interpretation", lines=2, interactive=False)
435
-
436
- with gr.Tab("Graph"):
437
- gr.Markdown("#### Choose settings, then generate graphs")
438
- barpie_col = gr.Dropdown(label="Column for Bar/Pie", choices=[], value=None)
439
- topn = gr.Slider(label="Top-N categories", minimum=3, maximum=30, value=10, step=1)
440
-
441
- scatter_x = gr.Dropdown(label="Scatter X (numeric)", choices=[], value=None)
442
- scatter_y = gr.Dropdown(label="Scatter Y (numeric)", choices=[], value=None)
443
- pair_max = gr.Slider(label="Pair plot max numeric columns", minimum=2, maximum=10, value=6, step=1)
444
-
445
- graph_btn = gr.Button("Generate Graphs")
446
- with gr.Row():
447
- bar_img = gr.Image(label="Bar", type="filepath")
448
- pie_img = gr.Image(label="Pie", type="filepath")
449
- with gr.Row():
450
- sc_img = gr.Image(label="Scatter", type="filepath")
451
- corr_img = gr.Image(label="Correlogram (numbers inside)", type="filepath")
452
- pair_img = gr.Image(label="Pair Plot", type="filepath")
453
-
454
- with gr.Tab("Report"):
455
- gr.Markdown("#### DOCX report (includes all graphs + interpretations)")
456
- id_col_rep = gr.Dropdown(label="Record ID column (for report/search)", choices=[], value=None)
457
- rep_btn = gr.Button("Generate DOCX Report", variant="primary")
458
- rep_status = gr.Textbox(label="Report Status", interactive=False)
459
- rep_file = gr.File(label="Download Report (.docx)")
460
-
461
  with gr.Tab("Search"):
462
- id_col_search = gr.Dropdown(label="Select ID column", choices=[], value=None)
463
- query = gr.Textbox(label="Enter ID value (exact match)")
464
  search_btn = gr.Button("Search")
465
  search_msg = gr.Textbox(label="Search Status", interactive=False)
466
- search_out = gr.Dataframe(label="Matching Records", interactive=False, wrap=True)
467
-
468
- # --- events ---
469
- def after_load(file_obj, sep_val, header_val):
470
- df, msg, prev, cols, numcols = load_csv(file_obj, sep_val, header_val)
471
- # for dropdowns
472
- return (
473
- df, cols, numcols,
474
- msg, prev,
475
- gr.update(choices=cols, value=(cols[0] if cols else None)), # barpie_col
476
- gr.update(choices=numcols, value=(numcols[0] if len(numcols) else None)), # scatter_x
477
- gr.update(choices=numcols, value=(numcols[1] if len(numcols) > 1 else None)), # scatter_y
478
- gr.update(choices=cols, value=(cols[0] if cols else None)), # id_col_rep
479
- gr.update(choices=cols, value=(cols[0] if cols else None)), # id_col_search
480
- )
481
 
 
482
  load_btn.click(
483
- after_load,
484
- inputs=[file_in, sep, header_flag],
485
- outputs=[df_state, cols_state, numcols_state,
486
- load_msg, preview,
487
- barpie_col, scatter_x, scatter_y, id_col_rep, id_col_search]
488
- )
489
-
490
- eda_btn.click(
491
- eda_compute,
492
- inputs=[df_state],
493
- outputs=[eda_note, stats_table, fps_table, skew_table, corr_table, corr_note]
494
  )
495
 
496
- graph_btn.click(
497
- graph_make,
498
- inputs=[df_state, barpie_col, topn, scatter_x, scatter_y, pair_max],
499
- outputs=[bar_img, pie_img, sc_img, corr_img, pair_img]
500
- )
501
-
502
- def rep_run(df, idcol, barcol, topn_v, sx, sy, pmx, file_obj):
503
- name = file_obj.name if file_obj is not None else "uploaded.csv"
504
- path, status = report_generate(df, name, idcol, barcol, topn_v, sx, sy, pmx)
505
- return status, path
506
 
507
- rep_btn.click(
508
- rep_run,
509
- inputs=[df_state, id_col_rep, barpie_col, topn, scatter_x, scatter_y, pair_max, file_in],
510
- outputs=[rep_status, rep_file]
 
511
  )
512
 
513
  search_btn.click(
514
- search_record,
515
- inputs=[df_state, id_col_search, query],
516
  outputs=[search_msg, search_out]
517
  )
518
 
519
  if __name__ == "__main__":
520
  demo.launch()
521
-
 
1
+ # app.py (Fast-build Hugging Face Gradio)
2
+ # School Mark Analysis: RegNo, Name, Tamil, English, Maths, Science, Social
3
+ # Features: Total, Average, Rank, Remark, subject averages, fail-count (1..5), top-3 overall, top-3 per subject,
4
+ # search by RegNo, download result CSV
5
+
6
  import os
 
7
  import tempfile
 
 
8
  import numpy as np
9
  import pandas as pd
 
10
  import gradio as gr
 
11
  import matplotlib.pyplot as plt
 
 
12
 
13
+ SUBJECTS_DEFAULT = ["Tamil", "English", "Maths", "Science", "Social"]
14
+ ID_COL_DEFAULT = "RegNo"
15
+ NAME_COL_DEFAULT = "Name"
16
 
17
 
18
+ def _clean_columns(df: pd.DataFrame) -> pd.DataFrame:
19
+ df = df.copy()
20
+ df.columns = [c.strip() for c in df.columns]
21
+ return df
 
22
 
 
 
 
 
 
23
 
24
def _validate_and_prepare(df: pd.DataFrame, id_col: str, name_col: str, subjects: list[str]) -> pd.DataFrame:
    """Normalize headers, check required columns, and coerce marks to numbers.

    Args:
        df: Raw uploaded table.
        id_col: Register-number column name.
        name_col: Student-name column name.
        subjects: Subject mark column names.

    Returns:
        A cleaned copy of *df* with numeric subject columns (bad cells → NaN).

    Raises:
        ValueError: when a required column is absent, or when every subject
            value becomes NaN after coercion (i.e. the CSV data is unusable).
    """
    df = _clean_columns(df)

    required = [id_col, name_col] + subjects
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    # Force every subject column to numeric; non-numeric cells become NaN.
    for subject in subjects:
        df[subject] = pd.to_numeric(df[subject], errors="coerce")

    # Guard against a file whose mark columns held no numeric data at all.
    if df[subjects].isna().all(axis=None):
        raise ValueError("All subject columns became NaN after numeric conversion. Check CSV data.")

    return df
40
 
 
 
 
 
41
 
42
+ def _remark(avg: float, failed_subjects: int) -> str:
43
+ if failed_subjects > 0:
44
+ return "Fail"
45
+ # Only pass students reach here
46
+ if avg >= 80:
47
+ return "Distinction"
48
+ if 60 <= avg <= 79:
49
+ return "First Class"
50
+ if 35 <= avg <= 59:
51
+ return "Second Class"
52
+ return "Pass"
53
 
 
 
 
54
 
55
def compute_marks(df: pd.DataFrame, pass_mark: int = 35, id_col: str = ID_COL_DEFAULT,
                  name_col: str = NAME_COL_DEFAULT, subjects: list[str] = SUBJECTS_DEFAULT):
    """Compute per-student results and class-level summaries.

    Args:
        df: Raw marks table with id, name and subject columns.
        pass_mark: Minimum mark (per subject) needed to pass that subject.
        id_col: Column holding the student register number.
        name_col: Column holding the student name.
        subjects: Subject column names to analyse.

    Returns:
        Tuple of (result_df, subj_avg, fail_dist, top3_overall,
        top3_each_subject, summary_text).

    Raises:
        ValueError: propagated from validation when columns are missing or
            all subject data is non-numeric.
    """
    df = _validate_and_prepare(df, id_col, name_col, subjects)

    out = df.copy()
    out["Total"] = out[subjects].sum(axis=1)
    out["Average"] = out[subjects].mean(axis=1)

    # NOTE(review): NaN marks compare False against pass_mark, so a missing
    # mark is NOT counted as a failed subject — confirm this is intended.
    out["Failed_Subjects"] = (out[subjects] < pass_mark).sum(axis=1)
    out["Remark"] = out.apply(lambda r: _remark(float(r["Average"]), int(r["Failed_Subjects"])), axis=1)

    # Rank by Total (descending); ties share the same minimum rank.
    out["Rank"] = out["Total"].rank(method="min", ascending=False).astype(int)
    out = out.sort_values(["Rank", id_col], ascending=[True, True]).reset_index(drop=True)

    # Subject-wise class averages.
    subj_avg = pd.DataFrame({
        "Subject": subjects,
        "Class_Average": [float(out[s].mean()) for s in subjects],
    })

    # Fail distribution, generalized to however many subjects were supplied
    # (previously hard-coded to 1..5, which broke for non-default subject lists).
    fail_buckets = list(range(1, len(subjects) + 1))
    fail_dist = pd.DataFrame({
        "Failed_Subjects": fail_buckets,
        "Student_Count": [int((out["Failed_Subjects"] == k).sum()) for k in fail_buckets],
    })

    # Top 3 students overall (by rank).
    top3_overall_cols = [id_col, name_col, "Total", "Average", "Rank", "Remark"]
    top3_overall = out.nsmallest(3, "Rank")[top3_overall_cols]

    # Top 3 students in each subject, stacked into one table.
    rows = []
    for s in subjects:
        t = out.sort_values(s, ascending=False).head(3)[[id_col, name_col, s, "Total", "Average", "Rank", "Remark"]].copy()
        t.insert(0, "Subject", s)
        t.rename(columns={s: "Subject_Mark"}, inplace=True)
        rows.append(t)
    top3_each_subject = pd.concat(rows, ignore_index=True) if rows else pd.DataFrame()

    # One-line class summary for the UI status box.
    total_students = out.shape[0]
    pass_count = int((out["Failed_Subjects"] == 0).sum())
    fail_count = total_students - pass_count

    dist_count = int((out["Remark"] == "Distinction").sum())
    first_count = int((out["Remark"] == "First Class").sum())
    second_count = int((out["Remark"] == "Second Class").sum())

    summary = (
        f"Students: {total_students} | Pass: {pass_count} | Fail: {fail_count} | "
        f"Distinction: {dist_count} | First Class: {first_count} | Second Class: {second_count}"
    )

    return out, subj_avg, fail_dist, top3_overall, top3_each_subject, summary
110
+
111
+
112
def plot_subject_avg(subj_avg: pd.DataFrame):
    """Render a bar chart of per-subject class averages.

    The y-axis is pinned to 0-100 so charts from different uploads are
    directly comparable.
    """
    figure, axis = plt.subplots(figsize=(7, 4))
    axis.bar(subj_avg["Subject"], subj_avg["Class_Average"])
    axis.set(
        title="Subject-wise Class Average",
        xlabel="Subject",
        ylabel="Average Marks",
        ylim=(0, 100),
    )
    plt.xticks(rotation=25, ha="right")
    plt.tight_layout()
    return figure
122
 
 
 
123
 
124
def plot_remark_distribution(result_df: pd.DataFrame):
    """Render a bar chart of how many students fall in each remark band.

    Bands appear in a fixed order; bands with zero students are omitted.
    """
    band_order = ["Distinction", "First Class", "Second Class", "Fail", "Pass"]
    tally = result_df["Remark"].value_counts()
    present = [band for band in band_order if band in tally.index]
    student_counts = [int(tally[band]) for band in present]

    figure, axis = plt.subplots(figsize=(7, 4))
    axis.bar(present, student_counts)
    axis.set(
        title="Remark Distribution",
        xlabel="Remark",
        ylabel="Number of Students",
    )
    plt.xticks(rotation=20, ha="right")
    plt.tight_layout()
    return figure
138
+
139
+
140
def load_csv(file_obj):
    """Read an uploaded CSV into a DataFrame.

    Returns a (dataframe, status_message, preview) triple. On any failure
    the dataframe and preview slots are None and the message explains why.
    """
    if file_obj is None:
        return None, "Please upload a CSV.", None

    try:
        frame = _clean_columns(pd.read_csv(file_obj.name))
        status = (f"Loaded: {os.path.basename(file_obj.name)} | "
                  f"Rows={frame.shape[0]} | Cols={frame.shape[1]}")
        return frame, status, frame.head(20)
    except Exception as e:
        return None, f"Could not read CSV: {e}", None
151
+
152
+
153
def run_all(df, pass_mark):
    """Run the full mark analysis and package every UI output.

    Returns a 9-tuple: (summary text, result table, subject averages, fail
    distribution, top-3 overall, top-3 per subject, averages figure, remarks
    figure, path of a downloadable result CSV). On error the summary slot
    carries the message and every other slot is None.
    """
    blanks = (None,) * 8
    if df is None:
        return ("Upload a CSV first.",) + blanks

    try:
        (result_df, subj_avg, fail_dist,
         top3_overall, top3_each_subject, summary) = compute_marks(df, pass_mark=int(pass_mark))

        fig_avg = plot_subject_avg(subj_avg)
        fig_remark = plot_remark_distribution(result_df)

        # Persist the full result table to a temp CSV for the download widget.
        out_path = tempfile.NamedTemporaryFile(delete=False, suffix=".csv").name
        result_df.to_csv(out_path, index=False)

        return (summary, result_df, subj_avg, fail_dist, top3_overall,
                top3_each_subject, fig_avg, fig_remark, out_path)
    except Exception as e:
        return (f"Error: {e}",) + blanks
171
+
172
+
173
def search_regno(result_df, regno_value):
    """Find result rows whose RegNo exactly matches *regno_value*.

    Returns (status_message, matching_rows). Numeric comparison is tried
    first when the RegNo column is numeric; otherwise (or on conversion
    failure) the match falls back to string equality.
    """
    if result_df is None or isinstance(result_df, str):
        return "Run analysis first.", pd.DataFrame()
    if not regno_value:
        return "Enter RegNo to search.", pd.DataFrame()

    ids = result_df[ID_COL_DEFAULT]

    def _by_string():
        # Exact match after stringifying both sides.
        return result_df[ids.astype(str) == str(regno_value)]

    if pd.api.types.is_numeric_dtype(ids):
        try:
            matches = result_df[ids == float(regno_value)]
        except Exception:
            matches = _by_string()
    else:
        matches = _by_string()

    if matches.empty:
        return "No matching record found.", pd.DataFrame()
    return f"Found {len(matches)} record(s).", matches
194
 
195
 
 
 
 
196
  CSS = """
197
+ #titleblock {text-align:center; margin-top: 6px; margin-bottom: 8px;}
198
  #t1 {font-size:30px; font-weight:800; color:#1E5AA8;}
199
  #t2 {font-size:18px; font-weight:800; color:#1E5AA8;}
200
  """
201
 
202
def header_html():
    """Build the page-header HTML; the logo image is included only when
    logo.jpg exists in the repo root (Gradio serves it via the file= path)."""
    if not os.path.exists("logo.jpg"):
        return """
    <div id="titleblock">
      <div id="t1">Amrita Manthana</div>
      <div id="t2">Prof.B.Santhi,SRC,SASTRA</div>
    </div>
    """
    return """
    <div id="titleblock">
      <img src="file=logo.jpg" style="width:110px; display:block; margin:0 auto;" />
      <div id="t1">Amrita Manthana</div>
      <div id="t2">Prof.B.Santhi,SRC,SASTRA</div>
    </div>
    """
217
+
218
 
219
# Gradio UI: left column handles upload + settings, right column shows
# results across tabs. Two State holders carry data between callbacks:
# df_state (raw upload) and result_state (computed result table for search).
with gr.Blocks(css=CSS, title="School Mark Analysis") as demo:
    gr.HTML(header_html())

    df_state = gr.State(None)
    result_state = gr.State(None)  # stores result_df for search

    with gr.Row():
        with gr.Column(scale=1, min_width=340):
            gr.Markdown("### Upload Marks CSV")
            file_in = gr.File(label="Upload CSV", file_types=[".csv"])
            load_btn = gr.Button("Load CSV", variant="primary")
            load_msg = gr.Textbox(label="Status", interactive=False)
            preview = gr.Dataframe(label="Preview", interactive=False, wrap=True)

            gr.Markdown("### Analysis Settings")
            pass_mark = gr.Slider(label="Pass mark (per subject)", minimum=0, maximum=100, value=35, step=1)
            run_btn = gr.Button("Run Mark Analysis", variant="primary")
            summary = gr.Textbox(label="Summary", interactive=False)

            download_file = gr.File(label="Download Result CSV")

        with gr.Column(scale=2):
            with gr.Tabs():
                with gr.Tab("Result Table"):
                    result_table = gr.Dataframe(label="Result (Total, Average, Rank, Remark, Failed_Subjects)", interactive=False, wrap=True)
                with gr.Tab("Subject Averages"):
                    subj_avg_table = gr.Dataframe(label="Subject-wise Averages", interactive=False, wrap=True)
                    avg_plot = gr.Plot(label="Bar Chart: Subject-wise Average")
                with gr.Tab("Fail Counts"):
                    fail_dist_table = gr.Dataframe(label="Students failed in 1/2/3/4/5 subjects", interactive=False, wrap=True)
                with gr.Tab("Toppers"):
                    top3_overall_table = gr.Dataframe(label="Overall Top 3", interactive=False, wrap=True)
                    top3_each_subject_table = gr.Dataframe(label="Top 3 in each subject", interactive=False, wrap=True)
                with gr.Tab("Remarks"):
                    remark_plot = gr.Plot(label="Remark Distribution")
                with gr.Tab("Search"):
                    regno_in = gr.Textbox(label="Enter RegNo (exact match)")
                    search_btn = gr.Button("Search")
                    search_msg = gr.Textbox(label="Search Status", interactive=False)
                    search_out = gr.Dataframe(label="Matching Record(s)", interactive=False, wrap=True)

    # Events
    load_btn.click(
        load_csv,
        inputs=[file_in],
        outputs=[df_state, load_msg, preview]
    )

    # Wraps run_all so the computed result table is ALSO written into
    # result_state (last output), which the Search tab reads later.
    def run_and_store(df, pm):
        s, res, subj, faild, t3, t3sub, f1, f2, fcsv = run_all(df, pm)
        return s, res, subj, faild, t3, t3sub, f1, f2, fcsv, res

    run_btn.click(
        run_and_store,
        inputs=[df_state, pass_mark],
        outputs=[summary, result_table, subj_avg_table, fail_dist_table, top3_overall_table, top3_each_subject_table,
                 avg_plot, remark_plot, download_file, result_state]
    )

    search_btn.click(
        search_regno,
        inputs=[result_state, regno_in],
        outputs=[search_msg, search_out]
    )

if __name__ == "__main__":
    demo.launch()