Spaces:
Sleeping
Sleeping
| # app.py (Hugging Face Spaces + Gradio) | |
| # Requirements: gradio, pandas, numpy, matplotlib, python-docx, scikit-learn | |
| import os | |
| import numpy as np | |
| import pandas as pd | |
| import matplotlib.pyplot as plt | |
| import gradio as gr | |
| from docx import Document | |
| from docx.shared import Inches | |
| from sklearn.impute import SimpleImputer | |
| from sklearn.preprocessing import StandardScaler | |
| from sklearn.decomposition import PCA | |
# Location the generated Word report is written to; make_docx_report saves
# to this fixed path, so each run replaces the previous report.
DOCX_OUT_PATH = "/tmp/EDA_Report.docx"
| # ----------------------------- Helpers ----------------------------- | |
def read_csv_safely(filepath: str) -> pd.DataFrame:
    """Load a CSV file, retrying as Latin-1 when the default decoding fails.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        The parsed DataFrame.

    Raises:
        Any non-decoding error raised by ``pandas.read_csv`` (missing file,
        empty file, parser errors) is propagated unchanged.
    """
    try:
        frame = pd.read_csv(filepath)
    except UnicodeDecodeError:
        # Latin-1 assigns a character to every byte value, so this second
        # attempt cannot fail on decoding.
        frame = pd.read_csv(filepath, encoding="latin1")
    return frame
def clean_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with normalised headers and index-artifact columns removed.

    Column labels are converted to ``str``, stripped of surrounding
    whitespace, and have inner spaces replaced by underscores. Every column
    whose normalised label starts with "unnamed" (case-insensitive) is
    dropped; pandas gives such names to header-less columns, e.g. a saved
    index.

    Args:
        df: Input frame; it is not modified.

    Returns:
        A new DataFrame containing only the kept columns.
    """
    cleaned = df.copy()
    cleaned.columns = [str(label).strip().replace(" ", "_") for label in cleaned.columns]

    # Select by boolean mask in one step. Dropping label-by-label raised
    # KeyError when two headers normalised to the same "unnamed..." label,
    # because the first drop already removed both columns.
    keep = [not label.lower().startswith("unnamed") for label in cleaned.columns]
    return cleaned.loc[:, keep]
def save_plot(fig, out_path: str) -> str:
    """Write *fig* to *out_path* as an image and release it from pyplot.

    Args:
        fig: Matplotlib figure to save.
        out_path: Destination file path; the image format follows its suffix.

    Returns:
        ``out_path``, so the call can be used inline when collecting paths.

    Raises:
        Whatever ``fig.savefig`` raises (e.g. an unwritable path). The figure
        is still closed in that case.
    """
    try:
        fig.savefig(out_path, dpi=180, bbox_inches="tight")
    finally:
        # Always deregister the figure from pyplot; otherwise a failed save
        # leaves it alive for the lifetime of the server process.
        plt.close(fig)
    return out_path
def make_interpretation_notes(df: pd.DataFrame) -> str:
    """Build a bullet list of plain-language observations about *df*.

    The notes cover, in order: dataset shape, the five columns with the most
    missing values, the five most skewed numeric columns (|skew| > 1), and
    the five strongest pairwise correlations between numeric columns.

    Args:
        df: Dataset to describe.

    Returns:
        A string in which each note is preceded by a newline and a bullet.
    """
    notes = [f"Dataset has {df.shape[0]} rows and {df.shape[1]} columns."]

    missing_pct = (df.isna().mean() * 100).sort_values(ascending=False)
    top_missing = missing_pct[missing_pct > 0].head(5)
    if top_missing.empty:
        notes.append("No missing values detected.")
    else:
        notes.append(
            "Top missing columns (%): "
            + ", ".join(f"{name}={pct:.1f}%" for name, pct in top_missing.items())
        )

    num_df = df.select_dtypes(include=[np.number])
    if num_df.shape[1] > 0:
        skew = num_df.skew(numeric_only=True)
        high_skew = (
            skew[skew.abs() > 1]
            .sort_values(key=lambda s: s.abs(), ascending=False)
            .head(5)
        )
        if len(high_skew) > 0:
            notes.append(
                "Highly skewed numeric features (|skew|>1): "
                + ", ".join(f"{name}={value:.2f}" for name, value in high_skew.items())
                + ". Consider log/Box-Cox or robust scaling if needed."
            )
        else:
            notes.append("No strongly skewed numeric features (|skew|>1) detected among numeric columns.")

        if num_df.shape[1] >= 2:
            corr = num_df.corr(numeric_only=True)
            labels = list(corr.columns)
            # Upper triangle only, so each unordered pair appears once and
            # self-correlations are excluded. NaN coefficients (e.g. from a
            # constant column) are skipped: they have no defined sort order
            # and would otherwise be reported as "a-b=nan".
            pairs = [
                (labels[i], labels[j], corr.iloc[i, j])
                for i in range(len(labels))
                for j in range(i + 1, len(labels))
                if not pd.isna(corr.iloc[i, j])
            ]
            pairs.sort(key=lambda pair: abs(pair[2]), reverse=True)
            strongest = pairs[:5]
            if strongest:
                notes.append(
                    "Top correlations (absolute): "
                    + ", ".join(f"{a}-{b}={c:.2f}" for a, b, c in strongest)
                )
    else:
        notes.append("No numeric columns detected; plots and numeric summary will be limited.")

    return "\n• " + "\n• ".join(notes)
| # ----------------------------- DOCX Report ----------------------------- | |
def _add_docx_table(doc, header, rows):
    """Append a table to *doc*: one header row, then one row per item of *rows*."""
    table = doc.add_table(rows=1, cols=len(header))
    for cell, label in zip(table.rows[0].cells, header):
        cell.text = str(label)
    for values in rows:
        for cell, value in zip(table.add_row().cells, values):
            cell.text = str(value)
    return table


def _format_stat(value):
    """Render a summary statistic: real numbers rounded to 6 places, anything else via str()."""
    if isinstance(value, (int, float, np.floating)):
        return str(round(value, 6))
    return str(value)


def make_docx_report(df: pd.DataFrame, fig_paths: list, notes: list, out_path=None) -> str:
    """Write a Word report summarising *df* and return the path of the saved file.

    Sections, in order: dataset overview, column types (first 100 columns),
    missing-value percentages (top 25), numeric summary statistics (first 30
    numeric columns), and the charts with their interpretation text.

    Args:
        df: Dataset being reported on.
        fig_paths: Image file paths of the charts; a path that does not exist
            is skipped, but its note is still written.
        notes: Interpretation text for each chart, paired positionally with
            ``fig_paths`` (extra entries on either side are ignored).
        out_path: Where to save the document. Defaults to ``DOCX_OUT_PATH``.

    Returns:
        The path the document was saved to.
    """
    target = DOCX_OUT_PATH if out_path is None else out_path

    doc = Document()
    doc.add_heading("EDA Report (Auto-generated)", level=1)

    doc.add_heading("Dataset Overview", level=2)
    doc.add_paragraph(f"Rows: {df.shape[0]}")
    doc.add_paragraph(f"Columns: {df.shape[1]}")

    doc.add_heading("Column Types", level=2)
    # Iterate df.dtypes positionally: df[label].dtype fails when a label is
    # duplicated because df[label] is then a DataFrame, not a Series.
    type_rows = list(zip(df.columns, df.dtypes))[:100]
    _add_docx_table(doc, ["column", "dtype"], type_rows)

    doc.add_heading("Missing Values", level=2)
    miss = (df.isna().mean() * 100).sort_values(ascending=False)
    _add_docx_table(
        doc,
        ["column", "missing_%"],
        ((name, f"{pct:.2f}") for name, pct in miss.head(25).items()),
    )
    doc.add_paragraph("Interpretation: Columns with high missing values may need imputation or removal.")

    doc.add_heading("Summary Statistics (Numeric)", level=2)
    num_df = df.select_dtypes(include=[np.number])
    if num_df.shape[1] > 0:
        desc = num_df.describe().T
        stat_cols = [c for c in ("mean", "std", "min", "50%", "max") if c in desc.columns]
        stat_rows = (
            [str(feature)] + [_format_stat(stats[c]) for c in stat_cols]
            for feature, stats in desc.head(30).iterrows()
        )
        _add_docx_table(doc, ["feature"] + stat_cols, stat_rows)
    else:
        doc.add_paragraph("No numeric columns found.")

    doc.add_heading("Charts + Interpretations", level=2)
    for fig_path, note in zip(fig_paths, notes):
        doc.add_paragraph(f"Interpretation: {note}")
        if os.path.exists(fig_path):
            doc.add_picture(fig_path, width=Inches(6.5))

    doc.save(target)
    return target
| # ----------------------------- EDA Pipeline ----------------------------- | |
def _plot_missingness(miss_pct):
    """Bar chart of the 15 columns with the highest missing-value percentage."""
    top = miss_pct.head(15)
    fig = plt.figure(figsize=(10, 4))
    ax = fig.add_subplot(1, 1, 1)
    ax.bar(top.index.astype(str), top.values)
    ax.set_title("Missing Values (%): Top 15 Columns")
    ax.set_ylabel("Missing (%)")
    for label in ax.get_xticklabels():
        label.set(rotation=45, ha="right", fontsize=7)
    return fig


def _plot_correlation(num_df):
    """Heatmap of the pairwise correlation matrix of the numeric columns."""
    corr = num_df.corr(numeric_only=True)
    fig = plt.figure(figsize=(10, 5))
    ax = fig.add_subplot(1, 1, 1)
    image = ax.imshow(corr.values, aspect="auto")
    ax.set_title("Correlation Heatmap (Numeric)")
    ax.set_xticks(range(len(corr.columns)))
    ax.set_xticklabels(corr.columns, rotation=90, fontsize=7)
    ax.set_yticks(range(len(corr.index)))
    ax.set_yticklabels(corr.index, fontsize=7)
    fig.colorbar(image, ax=ax)
    return fig


def _plot_histograms(num_df):
    """2x2 grid of histograms for the first (up to) four numeric columns."""
    fig = plt.figure(figsize=(10, 6))
    for position in range(min(4, num_df.shape[1])):
        # Positional access keeps this working when column labels repeat.
        series = num_df.iloc[:, position]
        ax = fig.add_subplot(2, 2, position + 1)
        ax.hist(series.dropna().values, bins=25)
        ax.set_title(series.name, fontsize=9)
    fig.suptitle("Histograms (first 4 numeric columns)", y=1.02)
    fig.tight_layout()
    return fig


def eda_pipeline(csv_path: str):
    """Run the full EDA for an uploaded CSV and return the values for the UI outputs.

    Args:
        csv_path: Path of the uploaded CSV file; ``None`` or blank means
            nothing was uploaded.

    Returns:
        A 9-tuple: status text, preview frame (first 25 rows), column-type
        frame, missing-value frame (top 25), numeric summary frame, path of
        the DOCX report (``None`` if it could not be created), correlation
        figure (``None`` with fewer than two numeric columns), histogram
        figure (``None`` without numeric columns), and the interpretation
        text. When no file is given or it cannot be read, only the status
        text is set; the other values are ``None`` and the interpretation
        is "".
    """
    no_outputs = (None,) * 7
    if csv_path is None or str(csv_path).strip() == "":
        return ("Please upload a CSV.", *no_outputs, "")

    try:
        df = clean_df(read_csv_safely(csv_path))
    except Exception as e:  # UI boundary: report the problem instead of crashing the handler
        return (f"Could not read CSV: {e}", *no_outputs, "")

    preview = df.head(25)
    dtypes_df = pd.DataFrame(
        {"column": list(df.columns), "dtype": [str(dtype) for dtype in df.dtypes]}
    )

    # Computed once and shared by the table and the chart.
    miss_pct = (df.isna().mean() * 100).sort_values(ascending=False)
    num_df = df.select_dtypes(include=[np.number])

    # Move the column/feature names out of the index into a regular column:
    # the Gradio table shows a frame's columns, not its index, so the names
    # would otherwise not be visible next to the numbers.
    miss_df = miss_pct.head(25).rename_axis("column").reset_index(name="missing_%")
    if num_df.shape[1] > 0:
        desc = num_df.describe().T.rename_axis("feature").reset_index()
    else:
        desc = pd.DataFrame()

    # Figures are built through explicit Figure/Axes objects rather than
    # pyplot's implicit "current figure", which is process-global state and
    # can be switched by another request running at the same time.
    fig_paths, notes = [], []

    # Plot 1: Missingness (report only)
    fig_paths.append(save_plot(_plot_missingness(miss_pct), "/tmp/missingness.png"))
    notes.append("High-missing columns may need imputation (median/mode) or removal based on usefulness.")

    corr_plot = None
    hist_plot = None

    # Plot 2: Correlation
    if num_df.shape[1] >= 2:
        corr_plot = _plot_correlation(num_df)
        fig_paths.append(save_plot(corr_plot, "/tmp/corr_heatmap.png"))
        notes.append("Strong correlations may indicate redundant features; consider feature selection/regularization.")

    # Plot 3: Histograms
    if num_df.shape[1] > 0:
        hist_plot = _plot_histograms(num_df)
        fig_paths.append(save_plot(hist_plot, "/tmp/histograms.png"))
        notes.append("Histograms show distribution/outliers/skewness; consider transforms for highly skewed features.")

    interp = make_interpretation_notes(df)

    # DOCX: a report failure still returns the tables and plots.
    try:
        docx_path = make_docx_report(df, fig_paths, notes)
    except Exception as e:
        return (
            f"Error while creating DOCX: {e}",
            preview, dtypes_df, miss_df, desc, None, corr_plot, hist_plot, interp,
        )

    summary_text = f"Loaded CSV successfully. Rows: {df.shape[0]} | Columns: {df.shape[1]}"
    return summary_text, preview, dtypes_df, miss_df, desc, docx_path, corr_plot, hist_plot, interp
| # ----------------------------- App UI (Beautiful College View) ----------------------------- | |
# Custom stylesheet passed to gr.Blocks(css=...). The #hdr / #appname /
# #appauth ids and the .card / .hint classes are the ones assigned to
# components and HTML snippets in the layout below.
CSS = """
/* Center header */
#hdr {text-align:center; margin-top:8px; margin-bottom:6px;}
#appname {color:#0b3d91; font-weight:900; font-size:28px; margin:0;}
#appsub {color:#0b3d91; font-weight:700; font-size:16px; margin-top:4px;}
#appauth {color:#0b3d91; font-weight:700; font-size:14px; margin-top:2px;}
/* Ribbon tabs */
.gradio-container .tabs {border-radius:14px;}
.gradio-container .tabitem {font-weight:800;}
/* Card style */
.card {border:1px solid rgba(148,163,184,.35); border-radius:18px; padding:14px; background:rgba(255,255,255,.92);}
.hint {font-size:12px; color:#475569;}
"""
# Page layout: a centred header, then a row with the upload controls on the
# left and four result tabs (EDA, Graph, Report, Interpretation) on the right.
# NOTE(review): the source had lost its indentation; the nesting below was
# reconstructed from the section comments and component order — confirm that
# each component sits in the intended container.
with gr.Blocks(
    title="SAMUDHRAMADANAM-AMIRTHAM1 | SASTRA",
    theme=gr.themes.Soft(primary_hue="blue", secondary_hue="slate", neutral_hue="gray"),
    css=CSS
) as demo:
    # ---------- Header (CENTER) ----------
    with gr.Column(elem_id="hdr"):
        # Logo is loaded from "logo.jpg", a path relative to the working directory.
        gr.Image(value="logo.jpg", show_label=False, height=120, container=False)
        gr.Markdown("<div id='appname'>SAMUDHRAMADANAM-AMIRTHAM1</div>")
        gr.Markdown("<div id='appauth'>Prof.B.Santhi, SRC, SASTRA</div>")
    gr.Markdown("<hr>")
    # ---------- Left controls + Ribbon outputs ----------
    with gr.Row():
        with gr.Column(scale=1, min_width=340):
            with gr.Group(elem_classes="card"):
                gr.Markdown("### Upload CSV")
                gr.Markdown("<div class='hint'>Upload your dataset (CSV). Then run EDA to view tables, graphs and download report.</div>")
                # type="filepath": the handler receives the upload as a path string.
                file_in = gr.File(label="Upload CSV", file_types=[".csv"], type="filepath")
                run_btn = gr.Button("Run EDA", variant="primary")
                status = gr.Textbox(label="Status", lines=2)
        with gr.Column(scale=2, min_width=520):
            with gr.Tabs():
                # Ribbon 1: EDA
                with gr.TabItem("EDA"):
                    with gr.Group(elem_classes="card"):
                        preview_out = gr.Dataframe(label="Preview (first 25 rows)", interactive=False)
                        dtypes_out = gr.Dataframe(label="Column Types", interactive=False)
                    with gr.Group(elem_classes="card"):
                        miss_out = gr.Dataframe(label="Missing Values (% top 25)", interactive=False)
                        desc_out = gr.Dataframe(label="Numeric Summary (describe)", interactive=False)
                # Ribbon 2: Graph
                with gr.TabItem("Graph"):
                    with gr.Group(elem_classes="card"):
                        with gr.Row():
                            corr_plot_out = gr.Plot(label="Correlation Heatmap")
                            hist_plot_out = gr.Plot(label="Histograms")
                # Ribbon 3: Report
                with gr.TabItem("Report"):
                    with gr.Group(elem_classes="card"):
                        gr.Markdown("### Download Report")
                        docx_out = gr.File(label="EDA Report (.docx)")
                # Ribbon 4: Interpretation
                with gr.TabItem("Interpretation"):
                    with gr.Group(elem_classes="card"):
                        interp_out = gr.Textbox(label="Auto Interpretation", lines=10)
    # ---------- Wiring ----------
    # The order of `outputs` must match the order of the 9 values returned
    # by eda_pipeline.
    run_btn.click(
        fn=eda_pipeline,
        inputs=[file_in],
        outputs=[status, preview_out, dtypes_out, miss_out, desc_out, docx_out, corr_plot_out, hist_plot_out, interp_out]
    )
demo.launch()