SanthiSastra committed on
Commit
09db78e
·
verified ·
1 Parent(s): a005bc9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +137 -275
app.py CHANGED
@@ -1,5 +1,6 @@
 
 
1
 
2
- import io
3
  import os
4
  import numpy as np
5
  import pandas as pd
@@ -9,14 +10,14 @@ import gradio as gr
9
  from docx import Document
10
  from docx.shared import Inches
11
 
12
- from sklearn.decomposition import PCA
13
- from sklearn.preprocessing import StandardScaler
14
  from sklearn.impute import SimpleImputer
15
-
 
16
 
17
  DOCX_OUT_PATH = "/tmp/EDA_Report.docx"
18
 
19
 
 
20
  def read_csv_safely(filepath: str) -> pd.DataFrame:
21
  try:
22
  return pd.read_csv(filepath)
@@ -24,12 +25,61 @@ def read_csv_safely(filepath: str) -> pd.DataFrame:
24
  return pd.read_csv(filepath, encoding="latin1")
25
 
26
 
 
 
 
 
 
 
 
 
 
27
  def save_plot(fig, out_path: str) -> str:
28
  fig.savefig(out_path, dpi=180, bbox_inches="tight")
29
  plt.close(fig)
30
  return out_path
31
 
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  def make_docx_report(df: pd.DataFrame, fig_paths: list, notes: list) -> str:
34
  doc = Document()
35
  doc.add_heading("EDA Report (Auto-generated)", level=1)
@@ -40,41 +90,36 @@ def make_docx_report(df: pd.DataFrame, fig_paths: list, notes: list) -> str:
40
 
41
  doc.add_heading("Column Types", level=2)
42
  dtypes = pd.DataFrame({"column": df.columns, "dtype": [str(df[c].dtype) for c in df.columns]})
43
- table = doc.add_table(rows=1, cols=2)
44
- table.rows[0].cells[0].text = "column"
45
- table.rows[0].cells[1].text = "dtype"
46
  for _, r in dtypes.head(100).iterrows():
47
- row = table.add_row().cells
48
  row[0].text = str(r["column"])
49
  row[1].text = str(r["dtype"])
50
 
51
  doc.add_heading("Missing Values", level=2)
52
  miss = (df.isna().mean() * 100).sort_values(ascending=False)
53
- doc.add_paragraph("Top columns by missing percentage:")
54
- table2 = doc.add_table(rows=1, cols=2)
55
- table2.rows[0].cells[0].text = "column"
56
- table2.rows[0].cells[1].text = "missing_%"
57
  for idx, val in miss.head(25).items():
58
- row = table2.add_row().cells
59
  row[0].text = str(idx)
60
  row[1].text = f"{val:.2f}"
61
 
62
- doc.add_paragraph(
63
- "Interpretation: Columns with high missing values may need imputation (median/mode) "
64
- "or removal depending on domain importance."
65
- )
66
 
67
  doc.add_heading("Summary Statistics (Numeric)", level=2)
68
  num_df = df.select_dtypes(include=[np.number])
69
  if num_df.shape[1] > 0:
70
  desc = num_df.describe().T.reset_index().rename(columns={"index": "feature"})
71
- cols = ["feature", "mean", "std", "min", "50%", "max"]
72
- cols = [c for c in cols if c in desc.columns]
73
- table3 = doc.add_table(rows=1, cols=len(cols))
74
  for j, c in enumerate(cols):
75
- table3.rows[0].cells[j].text = c
76
  for _, r in desc.head(30).iterrows():
77
- row = table3.add_row().cells
78
  for j, c in enumerate(cols):
79
  v = r[c]
80
  row[j].text = str(round(v, 6)) if isinstance(v, (int, float, np.floating)) else str(v)
@@ -82,34 +127,24 @@ def make_docx_report(df: pd.DataFrame, fig_paths: list, notes: list) -> str:
82
  doc.add_paragraph("No numeric columns found.")
83
 
84
  doc.add_heading("Charts + Interpretations", level=2)
85
- for fig_path, note in zip(fig_paths, notes):
86
  doc.add_paragraph(f"Interpretation: {note}")
87
- if os.path.exists(fig_path):
88
- doc.add_picture(fig_path, width=Inches(6.5))
89
 
90
  doc.save(DOCX_OUT_PATH)
91
  return DOCX_OUT_PATH
92
 
93
 
94
- def clean_df(df: pd.DataFrame) -> pd.DataFrame:
95
- df = df.copy()
96
- df.columns = [str(c).strip().replace(" ", "_") for c in df.columns]
97
- for c in list(df.columns):
98
- if c.lower().startswith("unnamed"):
99
- df = df.drop(columns=[c])
100
- return df
101
-
102
-
103
  def eda_pipeline(csv_path: str):
104
  if csv_path is None or str(csv_path).strip() == "":
105
- return "Please upload a CSV.", None, None, None, None, None, None, None
106
 
107
  try:
108
- df = read_csv_safely(csv_path)
109
  except Exception as e:
110
- return f"Could not read CSV: {e}", None, None, None, None, None, None, None
111
-
112
- df = clean_df(df)
113
 
114
  preview = df.head(25)
115
  dtypes_df = pd.DataFrame({"column": df.columns, "dtype": [str(df[c].dtype) for c in df.columns]})
@@ -118,24 +153,22 @@ def eda_pipeline(csv_path: str):
118
  num_df = df.select_dtypes(include=[np.number])
119
  desc = num_df.describe().T if num_df.shape[1] > 0 else pd.DataFrame()
120
 
121
- fig_paths = []
122
- notes = []
123
 
124
- # Missingness bar
125
  miss_series = (df.isna().mean() * 100).sort_values(ascending=False).head(15)
126
  fig1 = plt.figure(figsize=(10, 4))
127
  plt.bar(miss_series.index.astype(str), miss_series.values)
128
  plt.title("Missing Values (%): Top 15 Columns")
129
  plt.xticks(rotation=45, ha="right", fontsize=7)
130
  plt.ylabel("Missing (%)")
131
- p1 = save_plot(fig1, "/tmp/missingness.png")
132
- fig_paths.append(p1)
133
- notes.append("High-missing columns may need imputation (median/mode) or removal depending on usefulness.")
134
 
135
  corr_plot = None
136
  hist_plot = None
137
 
138
- # Correlation heatmap
139
  if num_df.shape[1] >= 2:
140
  corr = num_df.corr(numeric_only=True)
141
  fig2 = plt.figure(figsize=(10, 5))
@@ -145,11 +178,10 @@ def eda_pipeline(csv_path: str):
145
  plt.yticks(range(len(corr.index)), corr.index, fontsize=7)
146
  plt.colorbar()
147
  corr_plot = fig2
148
- p2 = save_plot(fig2, "/tmp/corr_heatmap.png")
149
- fig_paths.append(p2)
150
- notes.append("Strong correlations can indicate redundancy; consider regularization or feature selection.")
151
 
152
- # Histograms
153
  if num_df.shape[1] > 0:
154
  cols = list(num_df.columns)[:4]
155
  fig3 = plt.figure(figsize=(10, 6))
@@ -160,265 +192,95 @@ def eda_pipeline(csv_path: str):
160
  plt.suptitle("Histograms (first 4 numeric columns)", y=1.02)
161
  plt.tight_layout()
162
  hist_plot = fig3
163
- p3 = save_plot(fig3, "/tmp/histograms.png")
164
- fig_paths.append(p3)
165
- notes.append("Histograms show spread/outliers/skewness. Skewed features may need transforms.")
166
 
167
  # DOCX
168
  try:
169
  docx_path = make_docx_report(df, fig_paths, notes)
170
  except Exception as e:
171
- return f"Error while creating DOCX: {e}", preview, dtypes_df, miss_df, desc, None, corr_plot, hist_plot
 
172
 
 
173
  summary_text = f"Loaded CSV successfully. Rows: {df.shape[0]} | Columns: {df.shape[1]}"
174
- return summary_text, preview, dtypes_df, miss_df, desc, docx_path, corr_plot, hist_plot
175
-
176
-
177
- def get_columns_for_dropdowns(csv_path: str):
178
- if csv_path is None or str(csv_path).strip() == "":
179
- return gr.update(choices=[], value=None), gr.update(choices=["None"], value="None")
180
-
181
- try:
182
- df = read_csv_safely(csv_path)
183
- df = clean_df(df)
184
-
185
- num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
186
- all_cols = df.columns.tolist()
187
-
188
- feature_default = num_cols[0] if len(num_cols) else None
189
- target_choices = ["None"] + all_cols
190
- target_default = "None"
191
-
192
- return gr.update(choices=num_cols, value=feature_default), gr.update(choices=target_choices, value=target_default)
193
- except Exception:
194
- return gr.update(choices=[], value=None), gr.update(choices=["None"], value="None")
195
-
196
-
197
- def feature_analysis(csv_path: str, feature_col: str, target_col: str):
198
- if csv_path is None or str(csv_path).strip() == "":
199
- return None, None, pd.DataFrame({"message": ["Please upload a CSV first."]})
200
-
201
- try:
202
- df = clean_df(read_csv_safely(csv_path))
203
- except Exception as e:
204
- return None, None, pd.DataFrame({"error": [f"Could not read CSV: {e}"]})
205
-
206
- if feature_col is None or feature_col not in df.columns:
207
- return None, None, pd.DataFrame({"error": ["Please select a valid numeric feature."]})
208
-
209
- if not pd.api.types.is_numeric_dtype(df[feature_col]):
210
- return None, None, pd.DataFrame({"error": [f"Selected feature '{feature_col}' is not numeric."]})
211
-
212
- # Box plot
213
- box_fig = plt.figure(figsize=(7, 4))
214
- if target_col and target_col != "None" and target_col in df.columns:
215
- uniq = df[target_col].dropna().unique()
216
- if len(uniq) <= 20:
217
- groups, labels = [], []
218
- for u in sorted(uniq, key=lambda x: str(x)):
219
- vals = df.loc[df[target_col] == u, feature_col].dropna().values
220
- if len(vals):
221
- groups.append(vals)
222
- labels.append(str(u))
223
- if len(groups) >= 2:
224
- plt.boxplot(groups, labels=labels, showfliers=True)
225
- plt.title(f"Box Plot: {feature_col} by {target_col}")
226
- plt.xlabel(target_col)
227
- plt.ylabel(feature_col)
228
- else:
229
- plt.boxplot(df[feature_col].dropna().values)
230
- plt.title(f"Box Plot: {feature_col}")
231
- plt.ylabel(feature_col)
232
- else:
233
- plt.boxplot(df[feature_col].dropna().values)
234
- plt.title(f"Box Plot: {feature_col} (target too many groups)")
235
- plt.ylabel(feature_col)
236
- else:
237
- plt.boxplot(df[feature_col].dropna().values)
238
- plt.title(f"Box Plot: {feature_col}")
239
- plt.ylabel(feature_col)
240
- plt.tight_layout()
241
-
242
- # Skewness table
243
- num_df = df.select_dtypes(include=[np.number]).copy()
244
- if num_df.shape[1] == 0:
245
- skew_table = pd.DataFrame({"error": ["No numeric columns to compute skewness."]})
246
- else:
247
- skew_series = num_df.skew(numeric_only=True).sort_values(key=lambda s: s.abs(), ascending=False)
248
- skew_table = pd.DataFrame({
249
- "feature": skew_series.index,
250
- "skewness": skew_series.values,
251
- "abs_skewness": np.abs(skew_series.values)
252
- }).head(20)
253
- selected_skew = float(num_df[feature_col].skew()) if feature_col in num_df.columns else np.nan
254
- skew_table = pd.concat([
255
- pd.DataFrame({"feature": [feature_col], "skewness": [selected_skew], "abs_skewness": [abs(selected_skew)]}),
256
- skew_table
257
- ], ignore_index=True).drop_duplicates(subset=["feature"], keep="first")
258
-
259
- # PCA plot
260
- if num_df.shape[1] >= 2 and num_df.shape[0] >= 5:
261
- X = SimpleImputer(strategy="median").fit_transform(num_df.values)
262
- X = StandardScaler().fit_transform(X)
263
- pca = PCA(n_components=2, random_state=42)
264
- Z = pca.fit_transform(X)
265
-
266
- pca_fig = plt.figure(figsize=(7, 4))
267
- if target_col and target_col != "None" and target_col in df.columns:
268
- y = df[target_col].astype(str)
269
- uniq = y.dropna().unique()
270
- if len(uniq) <= 10:
271
- for u in sorted(uniq):
272
- mask = (y == u).values
273
- plt.scatter(Z[mask, 0], Z[mask, 1], s=18, label=u)
274
- plt.legend(fontsize=8)
275
- else:
276
- plt.scatter(Z[:, 0], Z[:, 1], s=18)
277
- plt.title(f"PCA (2D) colored by {target_col}")
278
- else:
279
- plt.scatter(Z[:, 0], Z[:, 1], s=18)
280
- plt.title("PCA (2D)")
281
 
282
- plt.xlabel(f"PC1 ({pca.explained_variance_ratio_[0]*100:.1f}%)")
283
- plt.ylabel(f"PC2 ({pca.explained_variance_ratio_[1]*100:.1f}%)")
284
- plt.tight_layout()
285
- else:
286
- pca_fig = plt.figure(figsize=(7, 2))
287
- plt.text(0.01, 0.5, "Not enough numeric columns/rows for PCA.", fontsize=10)
288
- plt.axis("off")
289
 
290
- return box_fig, pca_fig, skew_table
 
 
 
 
 
 
291
 
 
 
 
 
 
 
 
292
 
293
  with gr.Blocks(
294
- title="Samudramadanam-Amirthum1 | SASTRA",
295
- theme=gr.themes.Soft(
296
- primary_hue="blue",
297
- secondary_hue="slate",
298
- neutral_hue="gray",
299
- radius_size="lg",
300
- font=["Inter", "ui-sans-serif", "system-ui"]
301
- ),
302
- css="""
303
- .topbar {
304
- display:flex; align-items:center; gap:14px;
305
- padding:16px 18px; border-radius:16px;
306
- background: linear-gradient(90deg, rgba(15,23,42,1), rgba(30,58,138,1));
307
- color:white; margin-bottom:14px;
308
- box-shadow: 0 10px 24px rgba(2,6,23,0.25);
309
- }
310
- .topbar img { height:56px; width:auto; border-radius:10px; background:white; padding:6px; }
311
- .topbar .title { font-size:20px; font-weight:800; line-height:1.1; }
312
- .topbar .sub { font-size:12px; opacity:0.9; margin-top:2px; }
313
- .chiprow { margin-top:10px; display:flex; flex-wrap:wrap; gap:8px; }
314
- .chip {
315
- display:inline-block; padding:6px 10px; border-radius:999px;
316
- background: rgba(255,255,255,0.14); color:white;
317
- font-size:12px; border: 1px solid rgba(255,255,255,0.18);
318
- }
319
- .card {
320
- border-radius:18px; padding:14px 14px;
321
- border:1px solid rgba(148,163,184,0.35);
322
- box-shadow: 0 10px 22px rgba(15,23,42,0.06);
323
- background: rgba(255,255,255,0.88);
324
- }
325
- .hint { font-size:12px; color:#475569; }
326
- .stepbox {
327
- border-radius:14px;
328
- border:1px dashed rgba(100,116,139,0.55);
329
- padding:12px 12px;
330
- background: rgba(248,250,252,0.95);
331
- }
332
- """
333
  ) as demo:
334
- with gr.Row(variant="compact"):
335
- gr.Image(value="logo.jpg", show_label=False, height=80, container=False)
336
- gr.Markdown(
337
- "## **Samudramadanam-Amirthum1**\n"
338
- "**SASTRA • CSV EDA & Report Studio (Upload → EDA → Plots → DOCX)**"
339
- )
340
 
341
- # ---------- Header ----------
342
- # gr.Markdown("## Samudhramadanam-Amirtham1")
343
- #gr.Markdown("### SASTRA CSV EDA & Report Studio")
 
 
344
 
345
-
346
 
347
- # ---------- Main Layout ----------
348
  with gr.Row():
349
- # Left: Controls
350
- with gr.Column(scale=1, min_width=360):
351
  with gr.Group(elem_classes="card"):
352
- gr.Markdown("### 1) Upload Dataset")
353
- gr.Markdown("<div class='hint'>Upload a CSV file. The app processes it securely on the server.</div>")
354
  file_in = gr.File(label="Upload CSV", file_types=[".csv"], type="filepath")
 
 
355
 
356
- gr.Markdown("### 2) Run EDA")
357
- gr.Markdown(
358
- "<div class='stepbox'>"
359
- "<b>Procedure</b><br>"
360
- "• Upload CSV<br>"
361
- "• Click <b>Run EDA + Generate DOCX</b><br>"
362
- "• View preview, missing %, numeric summary<br>"
363
- "• Download the DOCX report<br>"
364
- "• Optional: feature analysis (boxplot, skewness, PCA)"
365
- "</div>"
366
- )
367
-
368
- run_btn = gr.Button("Run EDA + Generate DOCX", variant="primary")
369
- summary = gr.Textbox(label="Status", lines=2)
370
- docx_out = gr.File(label="Download EDA Report (.docx)")
371
-
372
- with gr.Group(elem_classes="card"):
373
- gr.Markdown("### 3) Feature Analysis (Optional)")
374
- gr.Markdown("<div class='hint'>Select a numeric feature. Choose a target column if you want grouping/color.</div>")
375
- with gr.Row():
376
- feature_dd = gr.Dropdown(label="Numeric Feature", choices=[], value=None)
377
- target_dd = gr.Dropdown(label="Target Column (optional)", choices=["None"], value="None")
378
- analyze_btn = gr.Button("Run Feature Analysis", variant="secondary")
379
-
380
- # Right: Outputs
381
  with gr.Column(scale=2, min_width=520):
382
  with gr.Tabs():
383
- with gr.TabItem("📄 EDA Tables"):
 
384
  with gr.Group(elem_classes="card"):
385
  preview_out = gr.Dataframe(label="Preview (first 25 rows)", interactive=False)
386
- dtypes_out = gr.Dataframe(label="Column types", interactive=False)
387
  with gr.Group(elem_classes="card"):
388
- miss_out = gr.Dataframe(label="Missing values (% top 25)", interactive=False)
389
- desc_out = gr.Dataframe(label="Numeric summary (describe)", interactive=False)
390
 
391
- with gr.TabItem("📈 EDA Plots"):
 
392
  with gr.Group(elem_classes="card"):
393
  with gr.Row():
394
- corr_plot_out = gr.Plot(label="Correlation Heatmap (numeric)")
395
- hist_plot_out = gr.Plot(label="Histograms (first 4 numeric columns)")
396
 
397
- with gr.TabItem("🔍 Feature Analysis"):
 
398
  with gr.Group(elem_classes="card"):
399
- with gr.Row():
400
- box_plot_out = gr.Plot(label="Box Plot")
401
- pca_plot_out = gr.Plot(label="PCA (2D)")
402
- skew_out = gr.Dataframe(label="Skewness (Top 20 numeric features)", interactive=False)
 
 
 
403
 
404
- # ---------- Wiring (uses your existing functions) ----------
405
  run_btn.click(
406
  fn=eda_pipeline,
407
  inputs=[file_in],
408
- outputs=[summary, preview_out, dtypes_out, miss_out, desc_out, docx_out, corr_plot_out, hist_plot_out]
409
- )
410
-
411
- file_in.change(
412
- fn=get_columns_for_dropdowns,
413
- inputs=[file_in],
414
- outputs=[feature_dd, target_dd]
415
- )
416
-
417
- analyze_btn.click(
418
- fn=feature_analysis,
419
- inputs=[file_in, feature_dd, target_dd],
420
- outputs=[box_plot_out, pca_plot_out, skew_out]
421
  )
422
 
423
  demo.launch()
424
-
 
1
+ # app.py (Hugging Face Spaces + Gradio)
2
+ # Requirements: gradio, pandas, numpy, matplotlib, python-docx, scikit-learn
3
 
 
4
  import os
5
  import numpy as np
6
  import pandas as pd
 
10
  from docx import Document
11
  from docx.shared import Inches
12
 
 
 
13
  from sklearn.impute import SimpleImputer
14
+ from sklearn.preprocessing import StandardScaler
15
+ from sklearn.decomposition import PCA
16
 
17
  DOCX_OUT_PATH = "/tmp/EDA_Report.docx"
18
 
19
 
20
+ # ----------------------------- Helpers -----------------------------
21
  def read_csv_safely(filepath: str) -> pd.DataFrame:
22
  try:
23
  return pd.read_csv(filepath)
 
25
  return pd.read_csv(filepath, encoding="latin1")
26
 
27
 
28
+ def clean_df(df: pd.DataFrame) -> pd.DataFrame:
29
+ df = df.copy()
30
+ df.columns = [str(c).strip().replace(" ", "_") for c in df.columns]
31
+ for c in list(df.columns):
32
+ if c.lower().startswith("unnamed"):
33
+ df = df.drop(columns=[c])
34
+ return df
35
+
36
+
37
  def save_plot(fig, out_path: str) -> str:
38
  fig.savefig(out_path, dpi=180, bbox_inches="tight")
39
  plt.close(fig)
40
  return out_path
41
 
42
 
43
+ def make_interpretation_notes(df: pd.DataFrame) -> str:
44
+ notes = []
45
+ notes.append(f"Dataset has {df.shape[0]} rows and {df.shape[1]} columns.")
46
+
47
+ miss = (df.isna().mean() * 100).sort_values(ascending=False)
48
+ top_miss = miss[miss > 0].head(5)
49
+ if len(top_miss) == 0:
50
+ notes.append("No missing values detected.")
51
+ else:
52
+ notes.append("Top missing columns (%): " + ", ".join([f"{k}={v:.1f}%" for k, v in top_miss.items()]))
53
+
54
+ num_df = df.select_dtypes(include=[np.number])
55
+ if num_df.shape[1] > 0:
56
+ skew = num_df.skew(numeric_only=True)
57
+ high_skew = skew[skew.abs() > 1].sort_values(key=lambda s: s.abs(), ascending=False).head(5)
58
+ if len(high_skew) > 0:
59
+ notes.append("Highly skewed numeric features (|skew|>1): " +
60
+ ", ".join([f"{k}={v:.2f}" for k, v in high_skew.items()]) +
61
+ ". Consider log/Box-Cox or robust scaling if needed.")
62
+ else:
63
+ notes.append("No strongly skewed numeric features (|skew|>1) detected among numeric columns.")
64
+
65
+ if num_df.shape[1] >= 2:
66
+ corr = num_df.corr(numeric_only=True)
67
+ # strongest correlations (excluding self)
68
+ pairs = []
69
+ cols = corr.columns
70
+ for i in range(len(cols)):
71
+ for j in range(i + 1, len(cols)):
72
+ pairs.append((cols[i], cols[j], corr.iloc[i, j]))
73
+ pairs = sorted(pairs, key=lambda x: abs(x[2]), reverse=True)[:5]
74
+ if pairs:
75
+ notes.append("Top correlations (absolute): " + ", ".join([f"{a}-{b}={c:.2f}" for a, b, c in pairs]))
76
+ else:
77
+ notes.append("No numeric columns detected; plots and numeric summary will be limited.")
78
+
79
+ return "\n• " + "\n• ".join(notes)
80
+
81
+
82
+ # ----------------------------- DOCX Report -----------------------------
83
  def make_docx_report(df: pd.DataFrame, fig_paths: list, notes: list) -> str:
84
  doc = Document()
85
  doc.add_heading("EDA Report (Auto-generated)", level=1)
 
90
 
91
  doc.add_heading("Column Types", level=2)
92
  dtypes = pd.DataFrame({"column": df.columns, "dtype": [str(df[c].dtype) for c in df.columns]})
93
+ t = doc.add_table(rows=1, cols=2)
94
+ t.rows[0].cells[0].text = "column"
95
+ t.rows[0].cells[1].text = "dtype"
96
  for _, r in dtypes.head(100).iterrows():
97
+ row = t.add_row().cells
98
  row[0].text = str(r["column"])
99
  row[1].text = str(r["dtype"])
100
 
101
  doc.add_heading("Missing Values", level=2)
102
  miss = (df.isna().mean() * 100).sort_values(ascending=False)
103
+ t2 = doc.add_table(rows=1, cols=2)
104
+ t2.rows[0].cells[0].text = "column"
105
+ t2.rows[0].cells[1].text = "missing_%"
 
106
  for idx, val in miss.head(25).items():
107
+ row = t2.add_row().cells
108
  row[0].text = str(idx)
109
  row[1].text = f"{val:.2f}"
110
 
111
+ doc.add_paragraph("Interpretation: Columns with high missing values may need imputation or removal.")
 
 
 
112
 
113
  doc.add_heading("Summary Statistics (Numeric)", level=2)
114
  num_df = df.select_dtypes(include=[np.number])
115
  if num_df.shape[1] > 0:
116
  desc = num_df.describe().T.reset_index().rename(columns={"index": "feature"})
117
+ cols = [c for c in ["feature", "mean", "std", "min", "50%", "max"] if c in desc.columns]
118
+ t3 = doc.add_table(rows=1, cols=len(cols))
 
119
  for j, c in enumerate(cols):
120
+ t3.rows[0].cells[j].text = c
121
  for _, r in desc.head(30).iterrows():
122
+ row = t3.add_row().cells
123
  for j, c in enumerate(cols):
124
  v = r[c]
125
  row[j].text = str(round(v, 6)) if isinstance(v, (int, float, np.floating)) else str(v)
 
127
  doc.add_paragraph("No numeric columns found.")
128
 
129
  doc.add_heading("Charts + Interpretations", level=2)
130
+ for fp, note in zip(fig_paths, notes):
131
  doc.add_paragraph(f"Interpretation: {note}")
132
+ if os.path.exists(fp):
133
+ doc.add_picture(fp, width=Inches(6.5))
134
 
135
  doc.save(DOCX_OUT_PATH)
136
  return DOCX_OUT_PATH
137
 
138
 
139
+ # ----------------------------- EDA Pipeline -----------------------------
 
 
 
 
 
 
 
 
140
  def eda_pipeline(csv_path: str):
141
  if csv_path is None or str(csv_path).strip() == "":
142
+ return "Please upload a CSV.", None, None, None, None, None, None, None, ""
143
 
144
  try:
145
+ df = clean_df(read_csv_safely(csv_path))
146
  except Exception as e:
147
+ return f"Could not read CSV: {e}", None, None, None, None, None, None, None, ""
 
 
148
 
149
  preview = df.head(25)
150
  dtypes_df = pd.DataFrame({"column": df.columns, "dtype": [str(df[c].dtype) for c in df.columns]})
 
153
  num_df = df.select_dtypes(include=[np.number])
154
  desc = num_df.describe().T if num_df.shape[1] > 0 else pd.DataFrame()
155
 
156
+ fig_paths, notes = [], []
 
157
 
158
+ # Plot 1: Missingness
159
  miss_series = (df.isna().mean() * 100).sort_values(ascending=False).head(15)
160
  fig1 = plt.figure(figsize=(10, 4))
161
  plt.bar(miss_series.index.astype(str), miss_series.values)
162
  plt.title("Missing Values (%): Top 15 Columns")
163
  plt.xticks(rotation=45, ha="right", fontsize=7)
164
  plt.ylabel("Missing (%)")
165
+ fig_paths.append(save_plot(fig1, "/tmp/missingness.png"))
166
+ notes.append("High-missing columns may need imputation (median/mode) or removal based on usefulness.")
 
167
 
168
  corr_plot = None
169
  hist_plot = None
170
 
171
+ # Plot 2: Correlation
172
  if num_df.shape[1] >= 2:
173
  corr = num_df.corr(numeric_only=True)
174
  fig2 = plt.figure(figsize=(10, 5))
 
178
  plt.yticks(range(len(corr.index)), corr.index, fontsize=7)
179
  plt.colorbar()
180
  corr_plot = fig2
181
+ fig_paths.append(save_plot(fig2, "/tmp/corr_heatmap.png"))
182
+ notes.append("Strong correlations may indicate redundant features; consider feature selection/regularization.")
 
183
 
184
+ # Plot 3: Histograms
185
  if num_df.shape[1] > 0:
186
  cols = list(num_df.columns)[:4]
187
  fig3 = plt.figure(figsize=(10, 6))
 
192
  plt.suptitle("Histograms (first 4 numeric columns)", y=1.02)
193
  plt.tight_layout()
194
  hist_plot = fig3
195
+ fig_paths.append(save_plot(fig3, "/tmp/histograms.png"))
196
+ notes.append("Histograms show distribution/outliers/skewness; consider transforms for highly skewed features.")
 
197
 
198
  # DOCX
199
  try:
200
  docx_path = make_docx_report(df, fig_paths, notes)
201
  except Exception as e:
202
+ interp = make_interpretation_notes(df)
203
+ return f"Error while creating DOCX: {e}", preview, dtypes_df, miss_df, desc, None, corr_plot, hist_plot, interp
204
 
205
+ interp = make_interpretation_notes(df)
206
  summary_text = f"Loaded CSV successfully. Rows: {df.shape[0]} | Columns: {df.shape[1]}"
207
+ return summary_text, preview, dtypes_df, miss_df, desc, docx_path, corr_plot, hist_plot, interp
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
208
 
 
 
 
 
 
 
 
209
 
210
+ # ----------------------------- App UI (Beautiful College View) -----------------------------
211
+ CSS = """
212
+ /* Center header */
213
+ #hdr {text-align:center; margin-top:8px; margin-bottom:6px;}
214
+ #appname {color:#0b3d91; font-weight:900; font-size:28px; margin:0;}
215
+ #appsub {color:#0b3d91; font-weight:700; font-size:16px; margin-top:4px;}
216
+ #appauth {color:#0b3d91; font-weight:700; font-size:14px; margin-top:2px;}
217
 
218
+ /* Ribbon tabs */
219
+ .gradio-container .tabs {border-radius:14px;}
220
+ .gradio-container .tabitem {font-weight:800;}
221
+ /* Card style */
222
+ .card {border:1px solid rgba(148,163,184,.35); border-radius:18px; padding:14px; background:rgba(255,255,255,.92);}
223
+ .hint {font-size:12px; color:#475569;}
224
+ """
225
 
226
  with gr.Blocks(
227
+ title="SAMUDHRAMADANAM-AMIRTHAM1 | SASTRA",
228
+ theme=gr.themes.Soft(primary_hue="blue", secondary_hue="slate", neutral_hue="gray"),
229
+ css=CSS
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
230
  ) as demo:
 
 
 
 
 
 
231
 
232
+ # ---------- Header (CENTER) ----------
233
+ with gr.Column(elem_id="hdr"):
234
+ gr.Image(value="logo.jpg", show_label=False, height=120, container=False)
235
+ gr.Markdown("<div id='appname'>SAMUDHRAMADANAM-AMIRTHAM1</div>")
236
+ gr.Markdown("<div id='appauth'>Prof.B.Santhi, SRC, SASTRA</div>")
237
 
238
+ gr.Markdown("<hr>")
239
 
240
+ # ---------- Left controls + Ribbon outputs ----------
241
  with gr.Row():
242
+ with gr.Column(scale=1, min_width=340):
 
243
  with gr.Group(elem_classes="card"):
244
+ gr.Markdown("### Upload CSV")
245
+ gr.Markdown("<div class='hint'>Upload your dataset (CSV). Then run EDA to view tables, graphs and download report.</div>")
246
  file_in = gr.File(label="Upload CSV", file_types=[".csv"], type="filepath")
247
+ run_btn = gr.Button("Run EDA", variant="primary")
248
+ status = gr.Textbox(label="Status", lines=2)
249
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
  with gr.Column(scale=2, min_width=520):
251
  with gr.Tabs():
252
+ # Ribbon 1: EDA
253
+ with gr.TabItem("EDA"):
254
  with gr.Group(elem_classes="card"):
255
  preview_out = gr.Dataframe(label="Preview (first 25 rows)", interactive=False)
256
+ dtypes_out = gr.Dataframe(label="Column Types", interactive=False)
257
  with gr.Group(elem_classes="card"):
258
+ miss_out = gr.Dataframe(label="Missing Values (% top 25)", interactive=False)
259
+ desc_out = gr.Dataframe(label="Numeric Summary (describe)", interactive=False)
260
 
261
+ # Ribbon 2: Graph
262
+ with gr.TabItem("Graph"):
263
  with gr.Group(elem_classes="card"):
264
  with gr.Row():
265
+ corr_plot_out = gr.Plot(label="Correlation Heatmap")
266
+ hist_plot_out = gr.Plot(label="Histograms")
267
 
268
+ # Ribbon 3: Report
269
+ with gr.TabItem("Report"):
270
  with gr.Group(elem_classes="card"):
271
+ gr.Markdown("### Download Report")
272
+ docx_out = gr.File(label="EDA Report (.docx)")
273
+
274
+ # Ribbon 4: Interpretation
275
+ with gr.TabItem("Interpretation"):
276
+ with gr.Group(elem_classes="card"):
277
+ interp_out = gr.Textbox(label="Auto Interpretation", lines=10)
278
 
279
+ # ---------- Wiring ----------
280
  run_btn.click(
281
  fn=eda_pipeline,
282
  inputs=[file_in],
283
+ outputs=[status, preview_out, dtypes_out, miss_out, desc_out, docx_out, corr_plot_out, hist_plot_out, interp_out]
 
 
 
 
 
 
 
 
 
 
 
 
284
  )
285
 
286
  demo.launch()