Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
|
|
|
|
|
| 1 |
|
| 2 |
-
import io
|
| 3 |
import os
|
| 4 |
import numpy as np
|
| 5 |
import pandas as pd
|
|
@@ -9,14 +10,14 @@ import gradio as gr
|
|
| 9 |
from docx import Document
|
| 10 |
from docx.shared import Inches
|
| 11 |
|
| 12 |
-
from sklearn.decomposition import PCA
|
| 13 |
-
from sklearn.preprocessing import StandardScaler
|
| 14 |
from sklearn.impute import SimpleImputer
|
| 15 |
-
|
|
|
|
| 16 |
|
| 17 |
DOCX_OUT_PATH = "/tmp/EDA_Report.docx"
|
| 18 |
|
| 19 |
|
|
|
|
| 20 |
def read_csv_safely(filepath: str) -> pd.DataFrame:
|
| 21 |
try:
|
| 22 |
return pd.read_csv(filepath)
|
|
@@ -24,12 +25,61 @@ def read_csv_safely(filepath: str) -> pd.DataFrame:
|
|
| 24 |
return pd.read_csv(filepath, encoding="latin1")
|
| 25 |
|
| 26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
def save_plot(fig, out_path: str) -> str:
|
| 28 |
fig.savefig(out_path, dpi=180, bbox_inches="tight")
|
| 29 |
plt.close(fig)
|
| 30 |
return out_path
|
| 31 |
|
| 32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
def make_docx_report(df: pd.DataFrame, fig_paths: list, notes: list) -> str:
|
| 34 |
doc = Document()
|
| 35 |
doc.add_heading("EDA Report (Auto-generated)", level=1)
|
|
@@ -40,41 +90,36 @@ def make_docx_report(df: pd.DataFrame, fig_paths: list, notes: list) -> str:
|
|
| 40 |
|
| 41 |
doc.add_heading("Column Types", level=2)
|
| 42 |
dtypes = pd.DataFrame({"column": df.columns, "dtype": [str(df[c].dtype) for c in df.columns]})
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
for _, r in dtypes.head(100).iterrows():
|
| 47 |
-
row =
|
| 48 |
row[0].text = str(r["column"])
|
| 49 |
row[1].text = str(r["dtype"])
|
| 50 |
|
| 51 |
doc.add_heading("Missing Values", level=2)
|
| 52 |
miss = (df.isna().mean() * 100).sort_values(ascending=False)
|
| 53 |
-
doc.
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
table2.rows[0].cells[1].text = "missing_%"
|
| 57 |
for idx, val in miss.head(25).items():
|
| 58 |
-
row =
|
| 59 |
row[0].text = str(idx)
|
| 60 |
row[1].text = f"{val:.2f}"
|
| 61 |
|
| 62 |
-
doc.add_paragraph(
|
| 63 |
-
"Interpretation: Columns with high missing values may need imputation (median/mode) "
|
| 64 |
-
"or removal depending on domain importance."
|
| 65 |
-
)
|
| 66 |
|
| 67 |
doc.add_heading("Summary Statistics (Numeric)", level=2)
|
| 68 |
num_df = df.select_dtypes(include=[np.number])
|
| 69 |
if num_df.shape[1] > 0:
|
| 70 |
desc = num_df.describe().T.reset_index().rename(columns={"index": "feature"})
|
| 71 |
-
cols = ["feature", "mean", "std", "min", "50%", "max"]
|
| 72 |
-
|
| 73 |
-
table3 = doc.add_table(rows=1, cols=len(cols))
|
| 74 |
for j, c in enumerate(cols):
|
| 75 |
-
|
| 76 |
for _, r in desc.head(30).iterrows():
|
| 77 |
-
row =
|
| 78 |
for j, c in enumerate(cols):
|
| 79 |
v = r[c]
|
| 80 |
row[j].text = str(round(v, 6)) if isinstance(v, (int, float, np.floating)) else str(v)
|
|
@@ -82,34 +127,24 @@ def make_docx_report(df: pd.DataFrame, fig_paths: list, notes: list) -> str:
|
|
| 82 |
doc.add_paragraph("No numeric columns found.")
|
| 83 |
|
| 84 |
doc.add_heading("Charts + Interpretations", level=2)
|
| 85 |
-
for
|
| 86 |
doc.add_paragraph(f"Interpretation: {note}")
|
| 87 |
-
if os.path.exists(
|
| 88 |
-
doc.add_picture(
|
| 89 |
|
| 90 |
doc.save(DOCX_OUT_PATH)
|
| 91 |
return DOCX_OUT_PATH
|
| 92 |
|
| 93 |
|
| 94 |
-
|
| 95 |
-
df = df.copy()
|
| 96 |
-
df.columns = [str(c).strip().replace(" ", "_") for c in df.columns]
|
| 97 |
-
for c in list(df.columns):
|
| 98 |
-
if c.lower().startswith("unnamed"):
|
| 99 |
-
df = df.drop(columns=[c])
|
| 100 |
-
return df
|
| 101 |
-
|
| 102 |
-
|
| 103 |
def eda_pipeline(csv_path: str):
|
| 104 |
if csv_path is None or str(csv_path).strip() == "":
|
| 105 |
-
return "Please upload a CSV.", None, None, None, None, None, None, None
|
| 106 |
|
| 107 |
try:
|
| 108 |
-
df = read_csv_safely(csv_path)
|
| 109 |
except Exception as e:
|
| 110 |
-
return f"Could not read CSV: {e}", None, None, None, None, None, None, None
|
| 111 |
-
|
| 112 |
-
df = clean_df(df)
|
| 113 |
|
| 114 |
preview = df.head(25)
|
| 115 |
dtypes_df = pd.DataFrame({"column": df.columns, "dtype": [str(df[c].dtype) for c in df.columns]})
|
|
@@ -118,24 +153,22 @@ def eda_pipeline(csv_path: str):
|
|
| 118 |
num_df = df.select_dtypes(include=[np.number])
|
| 119 |
desc = num_df.describe().T if num_df.shape[1] > 0 else pd.DataFrame()
|
| 120 |
|
| 121 |
-
fig_paths = []
|
| 122 |
-
notes = []
|
| 123 |
|
| 124 |
-
#
|
| 125 |
miss_series = (df.isna().mean() * 100).sort_values(ascending=False).head(15)
|
| 126 |
fig1 = plt.figure(figsize=(10, 4))
|
| 127 |
plt.bar(miss_series.index.astype(str), miss_series.values)
|
| 128 |
plt.title("Missing Values (%): Top 15 Columns")
|
| 129 |
plt.xticks(rotation=45, ha="right", fontsize=7)
|
| 130 |
plt.ylabel("Missing (%)")
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
notes.append("High-missing columns may need imputation (median/mode) or removal depending on usefulness.")
|
| 134 |
|
| 135 |
corr_plot = None
|
| 136 |
hist_plot = None
|
| 137 |
|
| 138 |
-
#
|
| 139 |
if num_df.shape[1] >= 2:
|
| 140 |
corr = num_df.corr(numeric_only=True)
|
| 141 |
fig2 = plt.figure(figsize=(10, 5))
|
|
@@ -145,11 +178,10 @@ def eda_pipeline(csv_path: str):
|
|
| 145 |
plt.yticks(range(len(corr.index)), corr.index, fontsize=7)
|
| 146 |
plt.colorbar()
|
| 147 |
corr_plot = fig2
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
notes.append("Strong correlations can indicate redundancy; consider regularization or feature selection.")
|
| 151 |
|
| 152 |
-
# Histograms
|
| 153 |
if num_df.shape[1] > 0:
|
| 154 |
cols = list(num_df.columns)[:4]
|
| 155 |
fig3 = plt.figure(figsize=(10, 6))
|
|
@@ -160,265 +192,95 @@ def eda_pipeline(csv_path: str):
|
|
| 160 |
plt.suptitle("Histograms (first 4 numeric columns)", y=1.02)
|
| 161 |
plt.tight_layout()
|
| 162 |
hist_plot = fig3
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
notes.append("Histograms show spread/outliers/skewness. Skewed features may need transforms.")
|
| 166 |
|
| 167 |
# DOCX
|
| 168 |
try:
|
| 169 |
docx_path = make_docx_report(df, fig_paths, notes)
|
| 170 |
except Exception as e:
|
| 171 |
-
|
|
|
|
| 172 |
|
|
|
|
| 173 |
summary_text = f"Loaded CSV successfully. Rows: {df.shape[0]} | Columns: {df.shape[1]}"
|
| 174 |
-
return summary_text, preview, dtypes_df, miss_df, desc, docx_path, corr_plot, hist_plot
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
def get_columns_for_dropdowns(csv_path: str):
|
| 178 |
-
if csv_path is None or str(csv_path).strip() == "":
|
| 179 |
-
return gr.update(choices=[], value=None), gr.update(choices=["None"], value="None")
|
| 180 |
-
|
| 181 |
-
try:
|
| 182 |
-
df = read_csv_safely(csv_path)
|
| 183 |
-
df = clean_df(df)
|
| 184 |
-
|
| 185 |
-
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
|
| 186 |
-
all_cols = df.columns.tolist()
|
| 187 |
-
|
| 188 |
-
feature_default = num_cols[0] if len(num_cols) else None
|
| 189 |
-
target_choices = ["None"] + all_cols
|
| 190 |
-
target_default = "None"
|
| 191 |
-
|
| 192 |
-
return gr.update(choices=num_cols, value=feature_default), gr.update(choices=target_choices, value=target_default)
|
| 193 |
-
except Exception:
|
| 194 |
-
return gr.update(choices=[], value=None), gr.update(choices=["None"], value="None")
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
def feature_analysis(csv_path: str, feature_col: str, target_col: str):
|
| 198 |
-
if csv_path is None or str(csv_path).strip() == "":
|
| 199 |
-
return None, None, pd.DataFrame({"message": ["Please upload a CSV first."]})
|
| 200 |
-
|
| 201 |
-
try:
|
| 202 |
-
df = clean_df(read_csv_safely(csv_path))
|
| 203 |
-
except Exception as e:
|
| 204 |
-
return None, None, pd.DataFrame({"error": [f"Could not read CSV: {e}"]})
|
| 205 |
-
|
| 206 |
-
if feature_col is None or feature_col not in df.columns:
|
| 207 |
-
return None, None, pd.DataFrame({"error": ["Please select a valid numeric feature."]})
|
| 208 |
-
|
| 209 |
-
if not pd.api.types.is_numeric_dtype(df[feature_col]):
|
| 210 |
-
return None, None, pd.DataFrame({"error": [f"Selected feature '{feature_col}' is not numeric."]})
|
| 211 |
-
|
| 212 |
-
# Box plot
|
| 213 |
-
box_fig = plt.figure(figsize=(7, 4))
|
| 214 |
-
if target_col and target_col != "None" and target_col in df.columns:
|
| 215 |
-
uniq = df[target_col].dropna().unique()
|
| 216 |
-
if len(uniq) <= 20:
|
| 217 |
-
groups, labels = [], []
|
| 218 |
-
for u in sorted(uniq, key=lambda x: str(x)):
|
| 219 |
-
vals = df.loc[df[target_col] == u, feature_col].dropna().values
|
| 220 |
-
if len(vals):
|
| 221 |
-
groups.append(vals)
|
| 222 |
-
labels.append(str(u))
|
| 223 |
-
if len(groups) >= 2:
|
| 224 |
-
plt.boxplot(groups, labels=labels, showfliers=True)
|
| 225 |
-
plt.title(f"Box Plot: {feature_col} by {target_col}")
|
| 226 |
-
plt.xlabel(target_col)
|
| 227 |
-
plt.ylabel(feature_col)
|
| 228 |
-
else:
|
| 229 |
-
plt.boxplot(df[feature_col].dropna().values)
|
| 230 |
-
plt.title(f"Box Plot: {feature_col}")
|
| 231 |
-
plt.ylabel(feature_col)
|
| 232 |
-
else:
|
| 233 |
-
plt.boxplot(df[feature_col].dropna().values)
|
| 234 |
-
plt.title(f"Box Plot: {feature_col} (target too many groups)")
|
| 235 |
-
plt.ylabel(feature_col)
|
| 236 |
-
else:
|
| 237 |
-
plt.boxplot(df[feature_col].dropna().values)
|
| 238 |
-
plt.title(f"Box Plot: {feature_col}")
|
| 239 |
-
plt.ylabel(feature_col)
|
| 240 |
-
plt.tight_layout()
|
| 241 |
-
|
| 242 |
-
# Skewness table
|
| 243 |
-
num_df = df.select_dtypes(include=[np.number]).copy()
|
| 244 |
-
if num_df.shape[1] == 0:
|
| 245 |
-
skew_table = pd.DataFrame({"error": ["No numeric columns to compute skewness."]})
|
| 246 |
-
else:
|
| 247 |
-
skew_series = num_df.skew(numeric_only=True).sort_values(key=lambda s: s.abs(), ascending=False)
|
| 248 |
-
skew_table = pd.DataFrame({
|
| 249 |
-
"feature": skew_series.index,
|
| 250 |
-
"skewness": skew_series.values,
|
| 251 |
-
"abs_skewness": np.abs(skew_series.values)
|
| 252 |
-
}).head(20)
|
| 253 |
-
selected_skew = float(num_df[feature_col].skew()) if feature_col in num_df.columns else np.nan
|
| 254 |
-
skew_table = pd.concat([
|
| 255 |
-
pd.DataFrame({"feature": [feature_col], "skewness": [selected_skew], "abs_skewness": [abs(selected_skew)]}),
|
| 256 |
-
skew_table
|
| 257 |
-
], ignore_index=True).drop_duplicates(subset=["feature"], keep="first")
|
| 258 |
-
|
| 259 |
-
# PCA plot
|
| 260 |
-
if num_df.shape[1] >= 2 and num_df.shape[0] >= 5:
|
| 261 |
-
X = SimpleImputer(strategy="median").fit_transform(num_df.values)
|
| 262 |
-
X = StandardScaler().fit_transform(X)
|
| 263 |
-
pca = PCA(n_components=2, random_state=42)
|
| 264 |
-
Z = pca.fit_transform(X)
|
| 265 |
-
|
| 266 |
-
pca_fig = plt.figure(figsize=(7, 4))
|
| 267 |
-
if target_col and target_col != "None" and target_col in df.columns:
|
| 268 |
-
y = df[target_col].astype(str)
|
| 269 |
-
uniq = y.dropna().unique()
|
| 270 |
-
if len(uniq) <= 10:
|
| 271 |
-
for u in sorted(uniq):
|
| 272 |
-
mask = (y == u).values
|
| 273 |
-
plt.scatter(Z[mask, 0], Z[mask, 1], s=18, label=u)
|
| 274 |
-
plt.legend(fontsize=8)
|
| 275 |
-
else:
|
| 276 |
-
plt.scatter(Z[:, 0], Z[:, 1], s=18)
|
| 277 |
-
plt.title(f"PCA (2D) colored by {target_col}")
|
| 278 |
-
else:
|
| 279 |
-
plt.scatter(Z[:, 0], Z[:, 1], s=18)
|
| 280 |
-
plt.title("PCA (2D)")
|
| 281 |
|
| 282 |
-
plt.xlabel(f"PC1 ({pca.explained_variance_ratio_[0]*100:.1f}%)")
|
| 283 |
-
plt.ylabel(f"PC2 ({pca.explained_variance_ratio_[1]*100:.1f}%)")
|
| 284 |
-
plt.tight_layout()
|
| 285 |
-
else:
|
| 286 |
-
pca_fig = plt.figure(figsize=(7, 2))
|
| 287 |
-
plt.text(0.01, 0.5, "Not enough numeric columns/rows for PCA.", fontsize=10)
|
| 288 |
-
plt.axis("off")
|
| 289 |
|
| 290 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 291 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 292 |
|
| 293 |
with gr.Blocks(
|
| 294 |
-
title="
|
| 295 |
-
theme=gr.themes.Soft(
|
| 296 |
-
|
| 297 |
-
secondary_hue="slate",
|
| 298 |
-
neutral_hue="gray",
|
| 299 |
-
radius_size="lg",
|
| 300 |
-
font=["Inter", "ui-sans-serif", "system-ui"]
|
| 301 |
-
),
|
| 302 |
-
css="""
|
| 303 |
-
.topbar {
|
| 304 |
-
display:flex; align-items:center; gap:14px;
|
| 305 |
-
padding:16px 18px; border-radius:16px;
|
| 306 |
-
background: linear-gradient(90deg, rgba(15,23,42,1), rgba(30,58,138,1));
|
| 307 |
-
color:white; margin-bottom:14px;
|
| 308 |
-
box-shadow: 0 10px 24px rgba(2,6,23,0.25);
|
| 309 |
-
}
|
| 310 |
-
.topbar img { height:56px; width:auto; border-radius:10px; background:white; padding:6px; }
|
| 311 |
-
.topbar .title { font-size:20px; font-weight:800; line-height:1.1; }
|
| 312 |
-
.topbar .sub { font-size:12px; opacity:0.9; margin-top:2px; }
|
| 313 |
-
.chiprow { margin-top:10px; display:flex; flex-wrap:wrap; gap:8px; }
|
| 314 |
-
.chip {
|
| 315 |
-
display:inline-block; padding:6px 10px; border-radius:999px;
|
| 316 |
-
background: rgba(255,255,255,0.14); color:white;
|
| 317 |
-
font-size:12px; border: 1px solid rgba(255,255,255,0.18);
|
| 318 |
-
}
|
| 319 |
-
.card {
|
| 320 |
-
border-radius:18px; padding:14px 14px;
|
| 321 |
-
border:1px solid rgba(148,163,184,0.35);
|
| 322 |
-
box-shadow: 0 10px 22px rgba(15,23,42,0.06);
|
| 323 |
-
background: rgba(255,255,255,0.88);
|
| 324 |
-
}
|
| 325 |
-
.hint { font-size:12px; color:#475569; }
|
| 326 |
-
.stepbox {
|
| 327 |
-
border-radius:14px;
|
| 328 |
-
border:1px dashed rgba(100,116,139,0.55);
|
| 329 |
-
padding:12px 12px;
|
| 330 |
-
background: rgba(248,250,252,0.95);
|
| 331 |
-
}
|
| 332 |
-
"""
|
| 333 |
) as demo:
|
| 334 |
-
with gr.Row(variant="compact"):
|
| 335 |
-
gr.Image(value="logo.jpg", show_label=False, height=80, container=False)
|
| 336 |
-
gr.Markdown(
|
| 337 |
-
"## **Samudramadanam-Amirthum1**\n"
|
| 338 |
-
"**SASTRA • CSV EDA & Report Studio (Upload → EDA → Plots → DOCX)**"
|
| 339 |
-
)
|
| 340 |
|
| 341 |
-
# ---------- Header ----------
|
| 342 |
-
|
| 343 |
-
|
|
|
|
|
|
|
| 344 |
|
| 345 |
-
|
| 346 |
|
| 347 |
-
# ----------
|
| 348 |
with gr.Row():
|
| 349 |
-
|
| 350 |
-
with gr.Column(scale=1, min_width=360):
|
| 351 |
with gr.Group(elem_classes="card"):
|
| 352 |
-
gr.Markdown("###
|
| 353 |
-
gr.Markdown("<div class='hint'>Upload
|
| 354 |
file_in = gr.File(label="Upload CSV", file_types=[".csv"], type="filepath")
|
|
|
|
|
|
|
| 355 |
|
| 356 |
-
gr.Markdown("### 2) Run EDA")
|
| 357 |
-
gr.Markdown(
|
| 358 |
-
"<div class='stepbox'>"
|
| 359 |
-
"<b>Procedure</b><br>"
|
| 360 |
-
"• Upload CSV<br>"
|
| 361 |
-
"• Click <b>Run EDA + Generate DOCX</b><br>"
|
| 362 |
-
"• View preview, missing %, numeric summary<br>"
|
| 363 |
-
"• Download the DOCX report<br>"
|
| 364 |
-
"• Optional: feature analysis (boxplot, skewness, PCA)"
|
| 365 |
-
"</div>"
|
| 366 |
-
)
|
| 367 |
-
|
| 368 |
-
run_btn = gr.Button("Run EDA + Generate DOCX", variant="primary")
|
| 369 |
-
summary = gr.Textbox(label="Status", lines=2)
|
| 370 |
-
docx_out = gr.File(label="Download EDA Report (.docx)")
|
| 371 |
-
|
| 372 |
-
with gr.Group(elem_classes="card"):
|
| 373 |
-
gr.Markdown("### 3) Feature Analysis (Optional)")
|
| 374 |
-
gr.Markdown("<div class='hint'>Select a numeric feature. Choose a target column if you want grouping/color.</div>")
|
| 375 |
-
with gr.Row():
|
| 376 |
-
feature_dd = gr.Dropdown(label="Numeric Feature", choices=[], value=None)
|
| 377 |
-
target_dd = gr.Dropdown(label="Target Column (optional)", choices=["None"], value="None")
|
| 378 |
-
analyze_btn = gr.Button("Run Feature Analysis", variant="secondary")
|
| 379 |
-
|
| 380 |
-
# Right: Outputs
|
| 381 |
with gr.Column(scale=2, min_width=520):
|
| 382 |
with gr.Tabs():
|
| 383 |
-
|
|
|
|
| 384 |
with gr.Group(elem_classes="card"):
|
| 385 |
preview_out = gr.Dataframe(label="Preview (first 25 rows)", interactive=False)
|
| 386 |
-
dtypes_out = gr.Dataframe(label="Column
|
| 387 |
with gr.Group(elem_classes="card"):
|
| 388 |
-
miss_out = gr.Dataframe(label="Missing
|
| 389 |
-
desc_out = gr.Dataframe(label="Numeric
|
| 390 |
|
| 391 |
-
|
|
|
|
| 392 |
with gr.Group(elem_classes="card"):
|
| 393 |
with gr.Row():
|
| 394 |
-
corr_plot_out = gr.Plot(label="Correlation Heatmap
|
| 395 |
-
hist_plot_out = gr.Plot(label="Histograms
|
| 396 |
|
| 397 |
-
|
|
|
|
| 398 |
with gr.Group(elem_classes="card"):
|
| 399 |
-
|
| 400 |
-
|
| 401 |
-
|
| 402 |
-
|
|
|
|
|
|
|
|
|
|
| 403 |
|
| 404 |
-
# ---------- Wiring
|
| 405 |
run_btn.click(
|
| 406 |
fn=eda_pipeline,
|
| 407 |
inputs=[file_in],
|
| 408 |
-
outputs=[
|
| 409 |
-
)
|
| 410 |
-
|
| 411 |
-
file_in.change(
|
| 412 |
-
fn=get_columns_for_dropdowns,
|
| 413 |
-
inputs=[file_in],
|
| 414 |
-
outputs=[feature_dd, target_dd]
|
| 415 |
-
)
|
| 416 |
-
|
| 417 |
-
analyze_btn.click(
|
| 418 |
-
fn=feature_analysis,
|
| 419 |
-
inputs=[file_in, feature_dd, target_dd],
|
| 420 |
-
outputs=[box_plot_out, pca_plot_out, skew_out]
|
| 421 |
)
|
| 422 |
|
| 423 |
demo.launch()
|
| 424 |
-
|
|
|
|
| 1 |
+
# app.py (Hugging Face Spaces + Gradio)
|
| 2 |
+
# Requirements: gradio, pandas, numpy, matplotlib, python-docx, scikit-learn
|
| 3 |
|
|
|
|
| 4 |
import os
|
| 5 |
import numpy as np
|
| 6 |
import pandas as pd
|
|
|
|
| 10 |
from docx import Document
|
| 11 |
from docx.shared import Inches
|
| 12 |
|
|
|
|
|
|
|
| 13 |
from sklearn.impute import SimpleImputer
|
| 14 |
+
from sklearn.preprocessing import StandardScaler
|
| 15 |
+
from sklearn.decomposition import PCA
|
| 16 |
|
| 17 |
DOCX_OUT_PATH = "/tmp/EDA_Report.docx"
|
| 18 |
|
| 19 |
|
| 20 |
+
# ----------------------------- Helpers -----------------------------
|
| 21 |
def read_csv_safely(filepath: str) -> pd.DataFrame:
|
| 22 |
try:
|
| 23 |
return pd.read_csv(filepath)
|
|
|
|
| 25 |
return pd.read_csv(filepath, encoding="latin1")
|
| 26 |
|
| 27 |
|
| 28 |
+
def clean_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with normalized column names and no "unnamed" columns.

    Every column label is converted to ``str``, stripped of surrounding
    whitespace, and has its spaces replaced by underscores.  Columns whose
    normalized name starts with "unnamed" (case-insensitive) are removed.

    Args:
        df: Input frame; it is not modified.

    Returns:
        A new DataFrame holding only the retained, renamed columns.
    """
    cleaned = df.copy()
    cleaned.columns = [str(label).strip().replace(" ", "_") for label in cleaned.columns]

    # Select by position with a boolean mask instead of dropping by label in a
    # loop: normalization can make two labels identical, and a second drop()
    # of a label that was already removed raises KeyError.
    keep = [not name.lower().startswith("unnamed") for name in cleaned.columns]
    return cleaned.loc[:, keep]
|
| 35 |
+
|
| 36 |
+
|
| 37 |
def save_plot(fig, out_path: str) -> str:
    """Write *fig* to *out_path* as an image, close the figure, and return the path.

    Args:
        fig: Figure object exposing ``savefig`` (created via ``plt.figure`` by
            the callers in this file).
        out_path: Destination file path for the rendered image.

    Returns:
        ``out_path`` unchanged, so the call can be used inline when collecting paths.

    Raises:
        Whatever ``fig.savefig`` raises; the figure is still closed in that case.
    """
    try:
        fig.savefig(out_path, dpi=180, bbox_inches="tight")
    finally:
        # Close even when saving fails so the figure is not left open.
        plt.close(fig)
    return out_path
|
| 41 |
|
| 42 |
|
| 43 |
+
def make_interpretation_notes(df: pd.DataFrame) -> str:
    """Build a short, bulleted plain-text interpretation of *df*.

    The notes cover, in order: dataset shape, the five columns with the
    highest share of missing values, the five most skewed numeric features
    (|skew| > 1), and the five strongest pairwise correlations between
    numeric columns.

    Args:
        df: Dataset to summarize.

    Returns:
        A single string in which every note is on its own line, prefixed
        with ``"\\n• "`` (so the string starts with a newline).
    """
    notes = [f"Dataset has {df.shape[0]} rows and {df.shape[1]} columns."]

    # Missing values, as a percentage per column.
    miss = (df.isna().mean() * 100).sort_values(ascending=False)
    top_miss = miss[miss > 0].head(5)
    if len(top_miss) == 0:
        notes.append("No missing values detected.")
    else:
        notes.append(
            "Top missing columns (%): "
            + ", ".join(f"{k}={v:.1f}%" for k, v in top_miss.items())
        )

    num_df = df.select_dtypes(include=[np.number])
    if num_df.shape[1] == 0:
        notes.append("No numeric columns detected; plots and numeric summary will be limited.")
        return "\n• " + "\n• ".join(notes)

    # Skewness of numeric features.
    skew = num_df.skew(numeric_only=True)
    high_skew = skew[skew.abs() > 1].sort_values(key=lambda s: s.abs(), ascending=False).head(5)
    if len(high_skew) > 0:
        notes.append(
            "Highly skewed numeric features (|skew|>1): "
            + ", ".join(f"{k}={v:.2f}" for k, v in high_skew.items())
            + ". Consider log/Box-Cox or robust scaling if needed."
        )
    else:
        notes.append("No strongly skewed numeric features (|skew|>1) detected among numeric columns.")

    # Strongest pairwise correlations (upper triangle only, so no self-pairs
    # and no mirrored duplicates).
    if num_df.shape[1] >= 2:
        corr = num_df.corr(numeric_only=True)
        names = list(corr.columns)
        # Skip NaN coefficients (e.g. a constant column): NaN compares false
        # against everything, which makes the sort order unreliable and would
        # surface meaningless "a-b=nan" entries in the note.
        pairs = [
            (names[i], names[j], corr.iloc[i, j])
            for i in range(len(names))
            for j in range(i + 1, len(names))
            if pd.notna(corr.iloc[i, j])
        ]
        pairs.sort(key=lambda p: abs(p[2]), reverse=True)
        top_pairs = pairs[:5]
        if top_pairs:
            notes.append(
                "Top correlations (absolute): "
                + ", ".join(f"{a}-{b}={c:.2f}" for a, b, c in top_pairs)
            )

    return "\n• " + "\n• ".join(notes)
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
# ----------------------------- DOCX Report -----------------------------
|
| 83 |
def make_docx_report(df: pd.DataFrame, fig_paths: list, notes: list) -> str:
|
| 84 |
doc = Document()
|
| 85 |
doc.add_heading("EDA Report (Auto-generated)", level=1)
|
|
|
|
| 90 |
|
| 91 |
doc.add_heading("Column Types", level=2)
|
| 92 |
dtypes = pd.DataFrame({"column": df.columns, "dtype": [str(df[c].dtype) for c in df.columns]})
|
| 93 |
+
t = doc.add_table(rows=1, cols=2)
|
| 94 |
+
t.rows[0].cells[0].text = "column"
|
| 95 |
+
t.rows[0].cells[1].text = "dtype"
|
| 96 |
for _, r in dtypes.head(100).iterrows():
|
| 97 |
+
row = t.add_row().cells
|
| 98 |
row[0].text = str(r["column"])
|
| 99 |
row[1].text = str(r["dtype"])
|
| 100 |
|
| 101 |
doc.add_heading("Missing Values", level=2)
|
| 102 |
miss = (df.isna().mean() * 100).sort_values(ascending=False)
|
| 103 |
+
t2 = doc.add_table(rows=1, cols=2)
|
| 104 |
+
t2.rows[0].cells[0].text = "column"
|
| 105 |
+
t2.rows[0].cells[1].text = "missing_%"
|
|
|
|
| 106 |
for idx, val in miss.head(25).items():
|
| 107 |
+
row = t2.add_row().cells
|
| 108 |
row[0].text = str(idx)
|
| 109 |
row[1].text = f"{val:.2f}"
|
| 110 |
|
| 111 |
+
doc.add_paragraph("Interpretation: Columns with high missing values may need imputation or removal.")
|
|
|
|
|
|
|
|
|
|
| 112 |
|
| 113 |
doc.add_heading("Summary Statistics (Numeric)", level=2)
|
| 114 |
num_df = df.select_dtypes(include=[np.number])
|
| 115 |
if num_df.shape[1] > 0:
|
| 116 |
desc = num_df.describe().T.reset_index().rename(columns={"index": "feature"})
|
| 117 |
+
cols = [c for c in ["feature", "mean", "std", "min", "50%", "max"] if c in desc.columns]
|
| 118 |
+
t3 = doc.add_table(rows=1, cols=len(cols))
|
|
|
|
| 119 |
for j, c in enumerate(cols):
|
| 120 |
+
t3.rows[0].cells[j].text = c
|
| 121 |
for _, r in desc.head(30).iterrows():
|
| 122 |
+
row = t3.add_row().cells
|
| 123 |
for j, c in enumerate(cols):
|
| 124 |
v = r[c]
|
| 125 |
row[j].text = str(round(v, 6)) if isinstance(v, (int, float, np.floating)) else str(v)
|
|
|
|
| 127 |
doc.add_paragraph("No numeric columns found.")
|
| 128 |
|
| 129 |
doc.add_heading("Charts + Interpretations", level=2)
|
| 130 |
+
for fp, note in zip(fig_paths, notes):
|
| 131 |
doc.add_paragraph(f"Interpretation: {note}")
|
| 132 |
+
if os.path.exists(fp):
|
| 133 |
+
doc.add_picture(fp, width=Inches(6.5))
|
| 134 |
|
| 135 |
doc.save(DOCX_OUT_PATH)
|
| 136 |
return DOCX_OUT_PATH
|
| 137 |
|
| 138 |
|
| 139 |
+
# ----------------------------- EDA Pipeline -----------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
def eda_pipeline(csv_path: str):
|
| 141 |
if csv_path is None or str(csv_path).strip() == "":
|
| 142 |
+
return "Please upload a CSV.", None, None, None, None, None, None, None, ""
|
| 143 |
|
| 144 |
try:
|
| 145 |
+
df = clean_df(read_csv_safely(csv_path))
|
| 146 |
except Exception as e:
|
| 147 |
+
return f"Could not read CSV: {e}", None, None, None, None, None, None, None, ""
|
|
|
|
|
|
|
| 148 |
|
| 149 |
preview = df.head(25)
|
| 150 |
dtypes_df = pd.DataFrame({"column": df.columns, "dtype": [str(df[c].dtype) for c in df.columns]})
|
|
|
|
| 153 |
num_df = df.select_dtypes(include=[np.number])
|
| 154 |
desc = num_df.describe().T if num_df.shape[1] > 0 else pd.DataFrame()
|
| 155 |
|
| 156 |
+
fig_paths, notes = [], []
|
|
|
|
| 157 |
|
| 158 |
+
# Plot 1: Missingness
|
| 159 |
miss_series = (df.isna().mean() * 100).sort_values(ascending=False).head(15)
|
| 160 |
fig1 = plt.figure(figsize=(10, 4))
|
| 161 |
plt.bar(miss_series.index.astype(str), miss_series.values)
|
| 162 |
plt.title("Missing Values (%): Top 15 Columns")
|
| 163 |
plt.xticks(rotation=45, ha="right", fontsize=7)
|
| 164 |
plt.ylabel("Missing (%)")
|
| 165 |
+
fig_paths.append(save_plot(fig1, "/tmp/missingness.png"))
|
| 166 |
+
notes.append("High-missing columns may need imputation (median/mode) or removal based on usefulness.")
|
|
|
|
| 167 |
|
| 168 |
corr_plot = None
|
| 169 |
hist_plot = None
|
| 170 |
|
| 171 |
+
# Plot 2: Correlation
|
| 172 |
if num_df.shape[1] >= 2:
|
| 173 |
corr = num_df.corr(numeric_only=True)
|
| 174 |
fig2 = plt.figure(figsize=(10, 5))
|
|
|
|
| 178 |
plt.yticks(range(len(corr.index)), corr.index, fontsize=7)
|
| 179 |
plt.colorbar()
|
| 180 |
corr_plot = fig2
|
| 181 |
+
fig_paths.append(save_plot(fig2, "/tmp/corr_heatmap.png"))
|
| 182 |
+
notes.append("Strong correlations may indicate redundant features; consider feature selection/regularization.")
|
|
|
|
| 183 |
|
| 184 |
+
# Plot 3: Histograms
|
| 185 |
if num_df.shape[1] > 0:
|
| 186 |
cols = list(num_df.columns)[:4]
|
| 187 |
fig3 = plt.figure(figsize=(10, 6))
|
|
|
|
| 192 |
plt.suptitle("Histograms (first 4 numeric columns)", y=1.02)
|
| 193 |
plt.tight_layout()
|
| 194 |
hist_plot = fig3
|
| 195 |
+
fig_paths.append(save_plot(fig3, "/tmp/histograms.png"))
|
| 196 |
+
notes.append("Histograms show distribution/outliers/skewness; consider transforms for highly skewed features.")
|
|
|
|
| 197 |
|
| 198 |
# DOCX
|
| 199 |
try:
|
| 200 |
docx_path = make_docx_report(df, fig_paths, notes)
|
| 201 |
except Exception as e:
|
| 202 |
+
interp = make_interpretation_notes(df)
|
| 203 |
+
return f"Error while creating DOCX: {e}", preview, dtypes_df, miss_df, desc, None, corr_plot, hist_plot, interp
|
| 204 |
|
| 205 |
+
interp = make_interpretation_notes(df)
|
| 206 |
summary_text = f"Loaded CSV successfully. Rows: {df.shape[0]} | Columns: {df.shape[1]}"
|
| 207 |
+
return summary_text, preview, dtypes_df, miss_df, desc, docx_path, corr_plot, hist_plot, interp
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 208 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 209 |
|
| 210 |
+
# ----------------------------- App UI (Beautiful College View) -----------------------------
|
| 211 |
+
CSS = """
|
| 212 |
+
/* Center header */
|
| 213 |
+
#hdr {text-align:center; margin-top:8px; margin-bottom:6px;}
|
| 214 |
+
#appname {color:#0b3d91; font-weight:900; font-size:28px; margin:0;}
|
| 215 |
+
#appsub {color:#0b3d91; font-weight:700; font-size:16px; margin-top:4px;}
|
| 216 |
+
#appauth {color:#0b3d91; font-weight:700; font-size:14px; margin-top:2px;}
|
| 217 |
|
| 218 |
+
/* Ribbon tabs */
|
| 219 |
+
.gradio-container .tabs {border-radius:14px;}
|
| 220 |
+
.gradio-container .tabitem {font-weight:800;}
|
| 221 |
+
/* Card style */
|
| 222 |
+
.card {border:1px solid rgba(148,163,184,.35); border-radius:18px; padding:14px; background:rgba(255,255,255,.92);}
|
| 223 |
+
.hint {font-size:12px; color:#475569;}
|
| 224 |
+
"""
|
| 225 |
|
| 226 |
# Gradio UI: a centered header, a left column with the upload controls, and a
# right column whose tabs show the EDA tables, plots, DOCX download and the
# auto-generated interpretation text.
# NOTE(review): the nesting below was reconstructed from a flattened diff view
# in which indentation was lost - confirm against the repository copy.
with gr.Blocks(
    title="SAMUDHRAMADANAM-AMIRTHAM1 | SASTRA",
    theme=gr.themes.Soft(primary_hue="blue", secondary_hue="slate", neutral_hue="gray"),
    css=CSS
) as demo:

    # ---------- Header (CENTER) ----------
    # elem_id="hdr" ties this column to the `#hdr` rule in CSS.
    with gr.Column(elem_id="hdr"):
        gr.Image(value="logo.jpg", show_label=False, height=120, container=False)
        gr.Markdown("<div id='appname'>SAMUDHRAMADANAM-AMIRTHAM1</div>")
        gr.Markdown("<div id='appauth'>Prof.B.Santhi, SRC, SASTRA</div>")

    gr.Markdown("<hr>")

    # ---------- Left controls + Ribbon outputs ----------
    with gr.Row():
        # Left column: file upload, run button and status line.
        with gr.Column(scale=1, min_width=340):
            with gr.Group(elem_classes="card"):
                gr.Markdown("### Upload CSV")
                gr.Markdown("<div class='hint'>Upload your dataset (CSV). Then run EDA to view tables, graphs and download report.</div>")
                # type="filepath" hands the callback a path string, which is
                # what eda_pipeline(csv_path) takes.
                file_in = gr.File(label="Upload CSV", file_types=[".csv"], type="filepath")
                run_btn = gr.Button("Run EDA", variant="primary")
                status = gr.Textbox(label="Status", lines=2)

        # Right column: one tab per kind of output.
        with gr.Column(scale=2, min_width=520):
            with gr.Tabs():
                # Ribbon 1: EDA
                with gr.TabItem("EDA"):
                    with gr.Group(elem_classes="card"):
                        preview_out = gr.Dataframe(label="Preview (first 25 rows)", interactive=False)
                        dtypes_out = gr.Dataframe(label="Column Types", interactive=False)
                    with gr.Group(elem_classes="card"):
                        miss_out = gr.Dataframe(label="Missing Values (% top 25)", interactive=False)
                        desc_out = gr.Dataframe(label="Numeric Summary (describe)", interactive=False)

                # Ribbon 2: Graph
                with gr.TabItem("Graph"):
                    with gr.Group(elem_classes="card"):
                        with gr.Row():
                            corr_plot_out = gr.Plot(label="Correlation Heatmap")
                            hist_plot_out = gr.Plot(label="Histograms")

                # Ribbon 3: Report
                with gr.TabItem("Report"):
                    with gr.Group(elem_classes="card"):
                        gr.Markdown("### Download Report")
                        docx_out = gr.File(label="EDA Report (.docx)")

                # Ribbon 4: Interpretation
                with gr.TabItem("Interpretation"):
                    with gr.Group(elem_classes="card"):
                        interp_out = gr.Textbox(label="Auto Interpretation", lines=10)

    # ---------- Wiring ----------
    # The nine output components are listed in the same order as the nine
    # values returned by eda_pipeline.
    run_btn.click(
        fn=eda_pipeline,
        inputs=[file_in],
        outputs=[status, preview_out, dtypes_out, miss_out, desc_out, docx_out, corr_plot_out, hist_plot_out, interp_out]
    )

# Start the app when this file is executed.
demo.launch()
|
|
|