Demo / app.py
SanthiSastra's picture
Update app.py
09db78e verified
# app.py (Hugging Face Spaces + Gradio)
# Requirements: gradio, pandas, numpy, matplotlib, python-docx, scikit-learn
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gradio as gr
from docx import Document
from docx.shared import Inches
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# Fixed path that make_docx_report() saves the Word report to and returns;
# because the path never changes, each run writes to the same file.
DOCX_OUT_PATH = "/tmp/EDA_Report.docx"
# ----------------------------- Helpers -----------------------------
def read_csv_safely(filepath: str) -> pd.DataFrame:
    """Load a CSV file, retrying as Latin-1 when the default decoding fails.

    Args:
        filepath: Location of the CSV file on disk.

    Returns:
        The parsed table.

    Raises:
        Any error raised by ``pd.read_csv`` other than a ``UnicodeDecodeError``
        on the first attempt (e.g. a missing file) propagates unchanged.
    """
    try:
        frame = pd.read_csv(filepath)
    except UnicodeDecodeError:
        # The bytes are not valid in pandas' default encoding; try again with
        # an explicit Latin-1 decoding.
        frame = pd.read_csv(filepath, encoding="latin1")
    return frame
def clean_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with normalised column names and no "unnamed" columns.

    Column labels are converted to ``str``, stripped of surrounding whitespace
    and have spaces replaced by underscores. Every column whose normalised
    label starts with "unnamed" (case-insensitive) is then removed.

    Args:
        df: Input table; it is not modified.

    Returns:
        A new DataFrame with the cleaned columns.
    """
    cleaned = df.copy()
    cleaned.columns = [str(c).strip().replace(" ", "_") for c in cleaned.columns]

    # Collect each junk label once and drop them in a single call. Dropping
    # label-by-label in a loop fails with KeyError when two columns share the
    # same "unnamed" label: the first drop removes both, the second finds none.
    junk = list(
        dict.fromkeys(
            name for name in cleaned.columns if name.lower().startswith("unnamed")
        )
    )
    if junk:
        cleaned = cleaned.drop(columns=junk)
    return cleaned
def save_plot(fig, out_path: str) -> str:
    """Write *fig* to *out_path* and close it in pyplot.

    Args:
        fig: Matplotlib figure to save.
        out_path: Destination image path.

    Returns:
        ``out_path``, so the call can be used inline when collecting paths.

    Raises:
        Whatever ``fig.savefig`` raises; the figure is closed regardless.
    """
    try:
        fig.savefig(out_path, dpi=180, bbox_inches="tight")
    finally:
        # Close even when saving fails so the figure does not stay registered
        # with pyplot for the lifetime of the process.
        plt.close(fig)
    return out_path
def make_interpretation_notes(df: pd.DataFrame) -> str:
    """Build a short bulleted, plain-text interpretation of *df*.

    The notes cover, in order: the table shape, the (up to five) columns with
    the highest share of missing values, the (up to five) numeric columns with
    ``|skew| > 1``, and the (up to five) strongest pairwise correlations
    between numeric columns.

    Args:
        df: Table to describe.

    Returns:
        One string in which every note is prefixed by ``"\\n• "``.
    """
    notes = [f"Dataset has {df.shape[0]} rows and {df.shape[1]} columns."]

    miss = (df.isna().mean() * 100).sort_values(ascending=False)
    top_miss = miss[miss > 0].head(5)
    if top_miss.empty:
        notes.append("No missing values detected.")
    else:
        notes.append(
            "Top missing columns (%): "
            + ", ".join(f"{k}={v:.1f}%" for k, v in top_miss.items())
        )

    num_df = df.select_dtypes(include=[np.number])
    if num_df.shape[1] == 0:
        notes.append("No numeric columns detected; plots and numeric summary will be limited.")
        return "\n• " + "\n• ".join(notes)

    skew = num_df.skew(numeric_only=True)
    high_skew = skew[skew.abs() > 1].sort_values(key=lambda s: s.abs(), ascending=False).head(5)
    if len(high_skew) > 0:
        notes.append(
            "Highly skewed numeric features (|skew|>1): "
            + ", ".join(f"{k}={v:.2f}" for k, v in high_skew.items())
            + ". Consider log/Box-Cox or robust scaling if needed."
        )
    else:
        notes.append("No strongly skewed numeric features (|skew|>1) detected among numeric columns.")

    if num_df.shape[1] >= 2:
        corr = num_df.corr(numeric_only=True)
        labels = list(corr.columns)
        values = corr.to_numpy()
        # Strict upper triangle: every unordered pair once, no self-pairs.
        upper_i, upper_j = np.triu_indices(len(labels), k=1)
        # Constant or empty columns give NaN correlations. They carry no
        # information and make the sort below ill-defined, so leave them out.
        pairs = [
            (labels[i], labels[j], values[i, j])
            for i, j in zip(upper_i, upper_j)
            if not np.isnan(values[i, j])
        ]
        pairs.sort(key=lambda pair: abs(pair[2]), reverse=True)
        top_pairs = pairs[:5]
        if top_pairs:
            notes.append(
                "Top correlations (absolute): "
                + ", ".join(f"{a}-{b}={c:.2f}" for a, b, c in top_pairs)
            )

    return "\n• " + "\n• ".join(notes)
# ----------------------------- DOCX Report -----------------------------
def make_docx_report(df: pd.DataFrame, fig_paths: list, notes: list) -> str:
    """Assemble the Word report for *df* and save it to ``DOCX_OUT_PATH``.

    Sections, in order: dataset overview, column types (first 100 columns),
    missing-value percentages (top 25), numeric summary statistics (first 30
    features) and one interpretation paragraph per chart, followed by the
    chart image when the file exists.

    Args:
        df: Table being reported on.
        fig_paths: Image paths of the charts to embed.
        notes: Interpretation text for each chart, paired with ``fig_paths``
            by position.

    Returns:
        ``DOCX_OUT_PATH``, the location the document was saved to.
    """
    report = Document()
    report.add_heading("EDA Report (Auto-generated)", level=1)

    # --- Overview ---
    n_rows, n_cols = df.shape[0], df.shape[1]
    report.add_heading("Dataset Overview", level=2)
    report.add_paragraph(f"Rows: {n_rows}")
    report.add_paragraph(f"Columns: {n_cols}")

    # --- Column types ---
    report.add_heading("Column Types", level=2)
    type_pairs = [(name, str(df[name].dtype)) for name in df.columns]
    type_table = report.add_table(rows=1, cols=2)
    type_header = type_table.rows[0].cells
    type_header[0].text = "column"
    type_header[1].text = "dtype"
    for name, dtype_name in type_pairs[:100]:
        cells = type_table.add_row().cells
        cells[0].text = str(name)
        cells[1].text = dtype_name

    # --- Missing values ---
    report.add_heading("Missing Values", level=2)
    missing_pct = df.isna().mean().mul(100).sort_values(ascending=False)
    missing_table = report.add_table(rows=1, cols=2)
    missing_header = missing_table.rows[0].cells
    missing_header[0].text = "column"
    missing_header[1].text = "missing_%"
    for column_name, pct in missing_pct.head(25).items():
        cells = missing_table.add_row().cells
        cells[0].text = str(column_name)
        cells[1].text = f"{pct:.2f}"
    report.add_paragraph("Interpretation: Columns with high missing values may need imputation or removal.")

    # --- Numeric summary ---
    report.add_heading("Summary Statistics (Numeric)", level=2)
    numeric = df.select_dtypes(include=[np.number])
    if numeric.shape[1] == 0:
        report.add_paragraph("No numeric columns found.")
    else:
        stats = numeric.describe().T.reset_index().rename(columns={"index": "feature"})
        wanted = ("feature", "mean", "std", "min", "50%", "max")
        shown = [label for label in wanted if label in stats.columns]
        stats_table = report.add_table(rows=1, cols=len(shown))
        for cell, label in zip(stats_table.rows[0].cells, shown):
            cell.text = label
        for _, record in stats.head(30).iterrows():
            cells = stats_table.add_row().cells
            for cell, label in zip(cells, shown):
                value = record[label]
                if isinstance(value, (int, float, np.floating)):
                    cell.text = str(round(value, 6))
                else:
                    cell.text = str(value)

    # --- Charts ---
    report.add_heading("Charts + Interpretations", level=2)
    for image_path, caption in zip(fig_paths, notes):
        report.add_paragraph(f"Interpretation: {caption}")
        # The caption is written even when the image file is absent.
        if os.path.exists(image_path):
            report.add_picture(image_path, width=Inches(6.5))

    report.save(DOCX_OUT_PATH)
    return DOCX_OUT_PATH
# ----------------------------- EDA Pipeline -----------------------------
def _plot_missingness(df: pd.DataFrame):
    """Bar chart of the 15 columns with the highest percentage of missing values."""
    miss_series = (df.isna().mean() * 100).sort_values(ascending=False).head(15)
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.bar(miss_series.index.astype(str), miss_series.values)
    ax.set_title("Missing Values (%): Top 15 Columns")
    ax.set_ylabel("Missing (%)")
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right", fontsize=7)
    return fig


def _plot_correlation(num_df: pd.DataFrame):
    """Heatmap of the pairwise correlation matrix of the numeric columns."""
    corr = num_df.corr(numeric_only=True)
    fig, ax = plt.subplots(figsize=(10, 5))
    image = ax.imshow(corr.values, aspect="auto")
    ax.set_title("Correlation Heatmap (Numeric)")
    ax.set_xticks(range(len(corr.columns)))
    ax.set_xticklabels(corr.columns, rotation=90, fontsize=7)
    ax.set_yticks(range(len(corr.index)))
    ax.set_yticklabels(corr.index, fontsize=7)
    fig.colorbar(image, ax=ax)
    return fig


def _plot_histograms(num_df: pd.DataFrame):
    """2x2 grid of histograms for the first (up to) four numeric columns."""
    fig = plt.figure(figsize=(10, 6))
    for position, name in enumerate(list(num_df.columns)[:4], start=1):
        ax = fig.add_subplot(2, 2, position)
        ax.hist(num_df[name].dropna().values, bins=25)
        ax.set_title(name, fontsize=9)
    fig.suptitle("Histograms (first 4 numeric columns)", y=1.02)
    fig.tight_layout()
    return fig


def eda_pipeline(csv_path: str):
    """Run the full EDA on an uploaded CSV and return everything the UI shows.

    Args:
        csv_path: Path of the uploaded CSV, or ``None``/blank when nothing was
            uploaded.

    Returns:
        A 9-tuple, in this order: status message, preview table (first 25
        rows), column-type table, missing-value table (top 25), numeric
        summary table, path of the DOCX report (``None`` if it could not be
        created), correlation figure (``None`` with fewer than two numeric
        columns), histogram figure (``None`` without numeric columns) and the
        interpretation text. When no file is given or it cannot be read, only
        the status message is filled in.
    """
    if csv_path is None or str(csv_path).strip() == "":
        return "Please upload a CSV.", None, None, None, None, None, None, None, ""

    try:
        df = clean_df(read_csv_safely(csv_path))
    except Exception as e:
        # UI boundary: report any read/parse failure as a status message.
        return f"Could not read CSV: {e}", None, None, None, None, None, None, None, ""

    preview = df.head(25)
    # df.dtypes is positional, so it also works when labels are duplicated.
    dtypes_df = pd.DataFrame({"column": df.columns, "dtype": [str(t) for t in df.dtypes]})

    # Gradio's Dataframe component does not display the pandas index, so the
    # column/feature labels are moved into a real column; otherwise the tables
    # would show bare numbers.
    miss_pct = (df.isna().mean() * 100).sort_values(ascending=False)
    miss_df = miss_pct.head(25).rename_axis("column").reset_index(name="missing_%")

    num_df = df.select_dtypes(include=[np.number])
    if num_df.shape[1] > 0:
        desc = num_df.describe().T.rename_axis("feature").reset_index()
    else:
        desc = pd.DataFrame()

    fig_paths, notes = [], []

    # Plot 1: Missingness (report only, not shown in the UI)
    fig_paths.append(save_plot(_plot_missingness(df), "/tmp/missingness.png"))
    notes.append("High-missing columns may need imputation (median/mode) or removal based on usefulness.")

    corr_plot = None
    hist_plot = None

    # Plot 2: Correlation
    if num_df.shape[1] >= 2:
        corr_plot = _plot_correlation(num_df)
        fig_paths.append(save_plot(corr_plot, "/tmp/corr_heatmap.png"))
        notes.append("Strong correlations may indicate redundant features; consider feature selection/regularization.")

    # Plot 3: Histograms
    if num_df.shape[1] > 0:
        hist_plot = _plot_histograms(num_df)
        fig_paths.append(save_plot(hist_plot, "/tmp/histograms.png"))
        notes.append("Histograms show distribution/outliers/skewness; consider transforms for highly skewed features.")

    # Needed on both the success and the DOCX-failure path, so compute it once.
    interp = make_interpretation_notes(df)

    # DOCX
    try:
        docx_path = make_docx_report(df, fig_paths, notes)
    except Exception as e:
        # Still return the tables and plots; only the download is missing.
        return f"Error while creating DOCX: {e}", preview, dtypes_df, miss_df, desc, None, corr_plot, hist_plot, interp

    summary_text = f"Loaded CSV successfully. Rows: {df.shape[0]} | Columns: {df.shape[1]}"
    return summary_text, preview, dtypes_df, miss_df, desc, docx_path, corr_plot, hist_plot, interp
# ----------------------------- App UI (Beautiful College View) -----------------------------
# Custom stylesheet handed to gr.Blocks(css=CSS) below. The #hdr id and the
# .card/.hint classes are attached via elem_id/elem_classes or inline HTML in
# the layout; #appname and #appauth are ids of <div>s inside gr.Markdown.
# NOTE(review): #appsub is styled here but no element in this file uses it.
CSS = """
/* Center header */
#hdr {text-align:center; margin-top:8px; margin-bottom:6px;}
#appname {color:#0b3d91; font-weight:900; font-size:28px; margin:0;}
#appsub {color:#0b3d91; font-weight:700; font-size:16px; margin-top:4px;}
#appauth {color:#0b3d91; font-weight:700; font-size:14px; margin-top:2px;}
/* Ribbon tabs */
.gradio-container .tabs {border-radius:14px;}
.gradio-container .tabitem {font-weight:800;}
/* Card style */
.card {border:1px solid rgba(148,163,184,.35); border-radius:18px; padding:14px; background:rgba(255,255,255,.92);}
.hint {font-size:12px; color:#475569;}
"""
# Builds the Gradio page: a header, an upload/run panel on the left and four
# result tabs on the right, then starts the server.
# NOTE(review): the nesting below is reconstructed from the order of the
# `with` statements; confirm it against the deployed layout.
with gr.Blocks(
    title="SAMUDHRAMADANAM-AMIRTHAM1 | SASTRA",
    theme=gr.themes.Soft(primary_hue="blue", secondary_hue="slate", neutral_hue="gray"),
    css=CSS
) as demo:
    # ---------- Header (CENTER) ----------
    with gr.Column(elem_id="hdr"):
        # "logo.jpg" is a relative path, so it is resolved against the
        # process's working directory.
        gr.Image(value="logo.jpg", show_label=False, height=120, container=False)
        gr.Markdown("<div id='appname'>SAMUDHRAMADANAM-AMIRTHAM1</div>")
        gr.Markdown("<div id='appauth'>Prof.B.Santhi, SRC, SASTRA</div>")
    gr.Markdown("<hr>")
    # ---------- Left controls + Ribbon outputs ----------
    with gr.Row():
        # Left column: upload, run button and status line.
        with gr.Column(scale=1, min_width=340):
            with gr.Group(elem_classes="card"):
                gr.Markdown("### Upload CSV")
                gr.Markdown("<div class='hint'>Upload your dataset (CSV). Then run EDA to view tables, graphs and download report.</div>")
                # type="filepath": the value is passed on as a path, which is
                # what eda_pipeline(csv_path) takes.
                file_in = gr.File(label="Upload CSV", file_types=[".csv"], type="filepath")
                run_btn = gr.Button("Run EDA", variant="primary")
                status = gr.Textbox(label="Status", lines=2)
        # Right column: tabbed results.
        with gr.Column(scale=2, min_width=520):
            with gr.Tabs():
                # Ribbon 1: EDA
                with gr.TabItem("EDA"):
                    with gr.Group(elem_classes="card"):
                        preview_out = gr.Dataframe(label="Preview (first 25 rows)", interactive=False)
                        dtypes_out = gr.Dataframe(label="Column Types", interactive=False)
                    with gr.Group(elem_classes="card"):
                        miss_out = gr.Dataframe(label="Missing Values (% top 25)", interactive=False)
                        desc_out = gr.Dataframe(label="Numeric Summary (describe)", interactive=False)
                # Ribbon 2: Graph
                with gr.TabItem("Graph"):
                    with gr.Group(elem_classes="card"):
                        with gr.Row():
                            corr_plot_out = gr.Plot(label="Correlation Heatmap")
                            hist_plot_out = gr.Plot(label="Histograms")
                # Ribbon 3: Report
                with gr.TabItem("Report"):
                    with gr.Group(elem_classes="card"):
                        gr.Markdown("### Download Report")
                        docx_out = gr.File(label="EDA Report (.docx)")
                # Ribbon 4: Interpretation
                with gr.TabItem("Interpretation"):
                    with gr.Group(elem_classes="card"):
                        interp_out = gr.Textbox(label="Auto Interpretation", lines=10)
    # ---------- Wiring ----------
    # The outputs list has nine entries in the same order as the 9-tuple
    # returned by eda_pipeline.
    run_btn.click(
        fn=eda_pipeline,
        inputs=[file_in],
        outputs=[status, preview_out, dtypes_out, miss_out, desc_out, docx_out, corr_plot_out, hist_plot_out, interp_out]
    )
# Runs at module level (no __main__ guard), so importing this file also
# starts the app.
demo.launch()