Spaces:

SanthiSastra
/

Demo

Sleeping

App Files Community

SanthiSastra committed on Feb 13

Commit

09db78e

verified ·

1 Parent(s): a005bc9

Update app.py

Browse files

Files changed (1) hide show

app.py +137 -275

app.py CHANGED Viewed

@@ -1,5 +1,6 @@
-import io
 import os
 import numpy as np
 import pandas as pd
@@ -9,14 +10,14 @@ import gradio as gr
 from docx import Document
 from docx.shared import Inches
-from sklearn.decomposition import PCA
-from sklearn.preprocessing import StandardScaler
 from sklearn.impute import SimpleImputer
 DOCX_OUT_PATH = "/tmp/EDA_Report.docx"
 def read_csv_safely(filepath: str) -> pd.DataFrame:
     try:
         return pd.read_csv(filepath)
@@ -24,12 +25,61 @@ def read_csv_safely(filepath: str) -> pd.DataFrame:
         return pd.read_csv(filepath, encoding="latin1")
 def save_plot(fig, out_path: str) -> str:
     fig.savefig(out_path, dpi=180, bbox_inches="tight")
     plt.close(fig)
     return out_path
 def make_docx_report(df: pd.DataFrame, fig_paths: list, notes: list) -> str:
     doc = Document()
     doc.add_heading("EDA Report (Auto-generated)", level=1)
@@ -40,41 +90,36 @@ def make_docx_report(df: pd.DataFrame, fig_paths: list, notes: list) -> str:
     doc.add_heading("Column Types", level=2)
     dtypes = pd.DataFrame({"column": df.columns, "dtype": [str(df[c].dtype) for c in df.columns]})
-    table = doc.add_table(rows=1, cols=2)
-    table.rows[0].cells[0].text = "column"
-    table.rows[0].cells[1].text = "dtype"
     for _, r in dtypes.head(100).iterrows():
-        row = table.add_row().cells
         row[0].text = str(r["column"])
         row[1].text = str(r["dtype"])
     doc.add_heading("Missing Values", level=2)
     miss = (df.isna().mean() * 100).sort_values(ascending=False)
-    doc.add_paragraph("Top columns by missing percentage:")
-    table2 = doc.add_table(rows=1, cols=2)
-    table2.rows[0].cells[0].text = "column"
-    table2.rows[0].cells[1].text = "missing_%"
     for idx, val in miss.head(25).items():
-        row = table2.add_row().cells
         row[0].text = str(idx)
         row[1].text = f"{val:.2f}"
-    doc.add_paragraph(
-        "Interpretation: Columns with high missing values may need imputation (median/mode) "
-        "or removal depending on domain importance."
-    )
     doc.add_heading("Summary Statistics (Numeric)", level=2)
     num_df = df.select_dtypes(include=[np.number])
     if num_df.shape[1] > 0:
         desc = num_df.describe().T.reset_index().rename(columns={"index": "feature"})
-        cols = ["feature", "mean", "std", "min", "50%", "max"]
-        cols = [c for c in cols if c in desc.columns]
-        table3 = doc.add_table(rows=1, cols=len(cols))
         for j, c in enumerate(cols):
-            table3.rows[0].cells[j].text = c
         for _, r in desc.head(30).iterrows():
-            row = table3.add_row().cells
             for j, c in enumerate(cols):
                 v = r[c]
                 row[j].text = str(round(v, 6)) if isinstance(v, (int, float, np.floating)) else str(v)
@@ -82,34 +127,24 @@ def make_docx_report(df: pd.DataFrame, fig_paths: list, notes: list) -> str:
         doc.add_paragraph("No numeric columns found.")
     doc.add_heading("Charts + Interpretations", level=2)
-    for fig_path, note in zip(fig_paths, notes):
         doc.add_paragraph(f"Interpretation: {note}")
-        if os.path.exists(fig_path):
-            doc.add_picture(fig_path, width=Inches(6.5))
     doc.save(DOCX_OUT_PATH)
     return DOCX_OUT_PATH
-def clean_df(df: pd.DataFrame) -> pd.DataFrame:
-    df = df.copy()
-    df.columns = [str(c).strip().replace(" ", "_") for c in df.columns]
-    for c in list(df.columns):
-        if c.lower().startswith("unnamed"):
-            df = df.drop(columns=[c])
-    return df
 def eda_pipeline(csv_path: str):
     if csv_path is None or str(csv_path).strip() == "":
-        return "Please upload a CSV.", None, None, None, None, None, None, None
     try:
-        df = read_csv_safely(csv_path)
     except Exception as e:
-        return f"Could not read CSV: {e}", None, None, None, None, None, None, None
-    df = clean_df(df)
     preview = df.head(25)
     dtypes_df = pd.DataFrame({"column": df.columns, "dtype": [str(df[c].dtype) for c in df.columns]})
@@ -118,24 +153,22 @@ def eda_pipeline(csv_path: str):
     num_df = df.select_dtypes(include=[np.number])
     desc = num_df.describe().T if num_df.shape[1] > 0 else pd.DataFrame()
-    fig_paths = []
-    notes = []
-    # Missingness bar
     miss_series = (df.isna().mean() * 100).sort_values(ascending=False).head(15)
     fig1 = plt.figure(figsize=(10, 4))
     plt.bar(miss_series.index.astype(str), miss_series.values)
     plt.title("Missing Values (%): Top 15 Columns")
     plt.xticks(rotation=45, ha="right", fontsize=7)
     plt.ylabel("Missing (%)")
-    p1 = save_plot(fig1, "/tmp/missingness.png")
-    fig_paths.append(p1)
-    notes.append("High-missing columns may need imputation (median/mode) or removal depending on usefulness.")
     corr_plot = None
     hist_plot = None
-    # Correlation heatmap
     if num_df.shape[1] >= 2:
         corr = num_df.corr(numeric_only=True)
         fig2 = plt.figure(figsize=(10, 5))
@@ -145,11 +178,10 @@ def eda_pipeline(csv_path: str):
         plt.yticks(range(len(corr.index)), corr.index, fontsize=7)
         plt.colorbar()
         corr_plot = fig2
-        p2 = save_plot(fig2, "/tmp/corr_heatmap.png")
-        fig_paths.append(p2)
-        notes.append("Strong correlations can indicate redundancy; consider regularization or feature selection.")
-    # Histograms
     if num_df.shape[1] > 0:
         cols = list(num_df.columns)[:4]
         fig3 = plt.figure(figsize=(10, 6))
@@ -160,265 +192,95 @@ def eda_pipeline(csv_path: str):
         plt.suptitle("Histograms (first 4 numeric columns)", y=1.02)
         plt.tight_layout()
         hist_plot = fig3
-        p3 = save_plot(fig3, "/tmp/histograms.png")
-        fig_paths.append(p3)
-        notes.append("Histograms show spread/outliers/skewness. Skewed features may need transforms.")
     # DOCX
     try:
         docx_path = make_docx_report(df, fig_paths, notes)
     except Exception as e:
-        return f"Error while creating DOCX: {e}", preview, dtypes_df, miss_df, desc, None, corr_plot, hist_plot
     summary_text = f"Loaded CSV successfully. Rows: {df.shape[0]} | Columns: {df.shape[1]}"
-    return summary_text, preview, dtypes_df, miss_df, desc, docx_path, corr_plot, hist_plot
-def get_columns_for_dropdowns(csv_path: str):
-    if csv_path is None or str(csv_path).strip() == "":
-        return gr.update(choices=[], value=None), gr.update(choices=["None"], value="None")
-    try:
-        df = read_csv_safely(csv_path)
-        df = clean_df(df)
-        num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
-        all_cols = df.columns.tolist()
-        feature_default = num_cols[0] if len(num_cols) else None
-        target_choices = ["None"] + all_cols
-        target_default = "None"
-        return gr.update(choices=num_cols, value=feature_default), gr.update(choices=target_choices, value=target_default)
-    except Exception:
-        return gr.update(choices=[], value=None), gr.update(choices=["None"], value="None")
-def feature_analysis(csv_path: str, feature_col: str, target_col: str):
-    if csv_path is None or str(csv_path).strip() == "":
-        return None, None, pd.DataFrame({"message": ["Please upload a CSV first."]})
-    try:
-        df = clean_df(read_csv_safely(csv_path))
-    except Exception as e:
-        return None, None, pd.DataFrame({"error": [f"Could not read CSV: {e}"]})
-    if feature_col is None or feature_col not in df.columns:
-        return None, None, pd.DataFrame({"error": ["Please select a valid numeric feature."]})
-    if not pd.api.types.is_numeric_dtype(df[feature_col]):
-        return None, None, pd.DataFrame({"error": [f"Selected feature '{feature_col}' is not numeric."]})
-    # Box plot
-    box_fig = plt.figure(figsize=(7, 4))
-    if target_col and target_col != "None" and target_col in df.columns:
-        uniq = df[target_col].dropna().unique()
-        if len(uniq) <= 20:
-            groups, labels = [], []
-            for u in sorted(uniq, key=lambda x: str(x)):
-                vals = df.loc[df[target_col] == u, feature_col].dropna().values
-                if len(vals):
-                    groups.append(vals)
-                    labels.append(str(u))
-            if len(groups) >= 2:
-                plt.boxplot(groups, labels=labels, showfliers=True)
-                plt.title(f"Box Plot: {feature_col} by {target_col}")
-                plt.xlabel(target_col)
-                plt.ylabel(feature_col)
-            else:
-                plt.boxplot(df[feature_col].dropna().values)
-                plt.title(f"Box Plot: {feature_col}")
-                plt.ylabel(feature_col)
-        else:
-            plt.boxplot(df[feature_col].dropna().values)
-            plt.title(f"Box Plot: {feature_col} (target too many groups)")
-            plt.ylabel(feature_col)
-    else:
-        plt.boxplot(df[feature_col].dropna().values)
-        plt.title(f"Box Plot: {feature_col}")
-        plt.ylabel(feature_col)
-    plt.tight_layout()
-    # Skewness table
-    num_df = df.select_dtypes(include=[np.number]).copy()
-    if num_df.shape[1] == 0:
-        skew_table = pd.DataFrame({"error": ["No numeric columns to compute skewness."]})
-    else:
-        skew_series = num_df.skew(numeric_only=True).sort_values(key=lambda s: s.abs(), ascending=False)
-        skew_table = pd.DataFrame({
-            "feature": skew_series.index,
-            "skewness": skew_series.values,
-            "abs_skewness": np.abs(skew_series.values)
-        }).head(20)
-        selected_skew = float(num_df[feature_col].skew()) if feature_col in num_df.columns else np.nan
-        skew_table = pd.concat([
-            pd.DataFrame({"feature": [feature_col], "skewness": [selected_skew], "abs_skewness": [abs(selected_skew)]}),
-            skew_table
-        ], ignore_index=True).drop_duplicates(subset=["feature"], keep="first")
-    # PCA plot
-    if num_df.shape[1] >= 2 and num_df.shape[0] >= 5:
-        X = SimpleImputer(strategy="median").fit_transform(num_df.values)
-        X = StandardScaler().fit_transform(X)
-        pca = PCA(n_components=2, random_state=42)
-        Z = pca.fit_transform(X)
-        pca_fig = plt.figure(figsize=(7, 4))
-        if target_col and target_col != "None" and target_col in df.columns:
-            y = df[target_col].astype(str)
-            uniq = y.dropna().unique()
-            if len(uniq) <= 10:
-                for u in sorted(uniq):
-                    mask = (y == u).values
-                    plt.scatter(Z[mask, 0], Z[mask, 1], s=18, label=u)
-                plt.legend(fontsize=8)
-            else:
-                plt.scatter(Z[:, 0], Z[:, 1], s=18)
-            plt.title(f"PCA (2D) colored by {target_col}")
-        else:
-            plt.scatter(Z[:, 0], Z[:, 1], s=18)
-            plt.title("PCA (2D)")
-        plt.xlabel(f"PC1 ({pca.explained_variance_ratio_[0]*100:.1f}%)")
-        plt.ylabel(f"PC2 ({pca.explained_variance_ratio_[1]*100:.1f}%)")
-        plt.tight_layout()
-    else:
-        pca_fig = plt.figure(figsize=(7, 2))
-        plt.text(0.01, 0.5, "Not enough numeric columns/rows for PCA.", fontsize=10)
-        plt.axis("off")
-    return box_fig, pca_fig, skew_table
 with gr.Blocks(
-    title="Samudramadanam-Amirthum1 | SASTRA",
-    theme=gr.themes.Soft(
-        primary_hue="blue",
-        secondary_hue="slate",
-        neutral_hue="gray",
-        radius_size="lg",
-        font=["Inter", "ui-sans-serif", "system-ui"]
-    ),
-    css="""
-    .topbar {
-        display:flex; align-items:center; gap:14px;
-        padding:16px 18px; border-radius:16px;
-        background: linear-gradient(90deg, rgba(15,23,42,1), rgba(30,58,138,1));
-        color:white; margin-bottom:14px;
-        box-shadow: 0 10px 24px rgba(2,6,23,0.25);
-    }
-    .topbar img { height:56px; width:auto; border-radius:10px; background:white; padding:6px; }
-    .topbar .title { font-size:20px; font-weight:800; line-height:1.1; }
-    .topbar .sub { font-size:12px; opacity:0.9; margin-top:2px; }
-    .chiprow { margin-top:10px; display:flex; flex-wrap:wrap; gap:8px; }
-    .chip {
-        display:inline-block; padding:6px 10px; border-radius:999px;
-        background: rgba(255,255,255,0.14); color:white;
-        font-size:12px; border: 1px solid rgba(255,255,255,0.18);
-    }
-    .card {
-        border-radius:18px; padding:14px 14px;
-        border:1px solid rgba(148,163,184,0.35);
-        box-shadow: 0 10px 22px rgba(15,23,42,0.06);
-        background: rgba(255,255,255,0.88);
-    }
-    .hint { font-size:12px; color:#475569; }
-    .stepbox {
-        border-radius:14px;
-        border:1px dashed rgba(100,116,139,0.55);
-        padding:12px 12px;
-        background: rgba(248,250,252,0.95);
-    }
-    """
 ) as demo:
-with gr.Row(variant="compact"):
-    gr.Image(value="logo.jpg", show_label=False, height=80, container=False)
-    gr.Markdown(
-        "## **Samudramadanam-Amirthum1**\n"
-        "**SASTRA • CSV EDA & Report Studio (Upload → EDA → Plots → DOCX)**"
-    )
-    # ---------- Header ----------
-   # gr.Markdown("## Samudhramadanam-Amirtham1")
-    #gr.Markdown("### SASTRA • CSV EDA & Report Studio")
-    # ---------- Main Layout ----------
     with gr.Row():
-        # Left: Controls
-        with gr.Column(scale=1, min_width=360):
             with gr.Group(elem_classes="card"):
-                gr.Markdown("### 1) Upload Dataset")
-                gr.Markdown("<div class='hint'>Upload a CSV file. The app processes it securely on the server.</div>")
                 file_in = gr.File(label="Upload CSV", file_types=[".csv"], type="filepath")
-                gr.Markdown("### 2) Run EDA")
-                gr.Markdown(
-                    "<div class='stepbox'>"
-                    "<b>Procedure</b><br>"
-                    "• Upload CSV<br>"
-                    "• Click <b>Run EDA + Generate DOCX</b><br>"
-                    "• View preview, missing %, numeric summary<br>"
-                    "• Download the DOCX report<br>"
-                    "• Optional: feature analysis (boxplot, skewness, PCA)"
-                    "</div>"
-                )
-                run_btn = gr.Button("Run EDA + Generate DOCX", variant="primary")
-                summary = gr.Textbox(label="Status", lines=2)
-                docx_out = gr.File(label="Download EDA Report (.docx)")
-            with gr.Group(elem_classes="card"):
-                gr.Markdown("### 3) Feature Analysis (Optional)")
-                gr.Markdown("<div class='hint'>Select a numeric feature. Choose a target column if you want grouping/color.</div>")
-                with gr.Row():
-                    feature_dd = gr.Dropdown(label="Numeric Feature", choices=[], value=None)
-                    target_dd = gr.Dropdown(label="Target Column (optional)", choices=["None"], value="None")
-                analyze_btn = gr.Button("Run Feature Analysis", variant="secondary")
-        # Right: Outputs
         with gr.Column(scale=2, min_width=520):
             with gr.Tabs():
-                with gr.TabItem("📄 EDA Tables"):
                     with gr.Group(elem_classes="card"):
                         preview_out = gr.Dataframe(label="Preview (first 25 rows)", interactive=False)
-                        dtypes_out = gr.Dataframe(label="Column types", interactive=False)
                     with gr.Group(elem_classes="card"):
-                        miss_out = gr.Dataframe(label="Missing values (% top 25)", interactive=False)
-                        desc_out = gr.Dataframe(label="Numeric summary (describe)", interactive=False)
-                with gr.TabItem("📈 EDA Plots"):
                     with gr.Group(elem_classes="card"):
                         with gr.Row():
-                            corr_plot_out = gr.Plot(label="Correlation Heatmap (numeric)")
-                            hist_plot_out = gr.Plot(label="Histograms (first 4 numeric columns)")
-                with gr.TabItem("🔍 Feature Analysis"):
                     with gr.Group(elem_classes="card"):
-                        with gr.Row():
-                            box_plot_out = gr.Plot(label="Box Plot")
-                            pca_plot_out = gr.Plot(label="PCA (2D)")
-                        skew_out = gr.Dataframe(label="Skewness (Top 20 numeric features)", interactive=False)
-    # ---------- Wiring (uses your existing functions) ----------
     run_btn.click(
         fn=eda_pipeline,
         inputs=[file_in],
-        outputs=[summary, preview_out, dtypes_out, miss_out, desc_out, docx_out, corr_plot_out, hist_plot_out]
-    )
-    file_in.change(
-        fn=get_columns_for_dropdowns,
-        inputs=[file_in],
-        outputs=[feature_dd, target_dd]
-    )
-    analyze_btn.click(
-        fn=feature_analysis,
-        inputs=[file_in, feature_dd, target_dd],
-        outputs=[box_plot_out, pca_plot_out, skew_out]
     )
 demo.launch()

+# app.py  (Hugging Face Spaces + Gradio)
+# Requirements: gradio, pandas, numpy, matplotlib, python-docx, scikit-learn
 import os
 import numpy as np
 import pandas as pd
 from docx import Document
 from docx.shared import Inches
 from sklearn.impute import SimpleImputer
+from sklearn.preprocessing import StandardScaler
+from sklearn.decomposition import PCA
 DOCX_OUT_PATH = "/tmp/EDA_Report.docx"
+# ----------------------------- Helpers -----------------------------
 def read_csv_safely(filepath: str) -> pd.DataFrame:
     try:
         return pd.read_csv(filepath)
         return pd.read_csv(filepath, encoding="latin1")
+def clean_df(df: pd.DataFrame) -> pd.DataFrame:
+    df = df.copy()
+    df.columns = [str(c).strip().replace(" ", "_") for c in df.columns]
+    for c in list(df.columns):
+        if c.lower().startswith("unnamed"):
+            df = df.drop(columns=[c])
+    return df
 def save_plot(fig, out_path: str) -> str:
     fig.savefig(out_path, dpi=180, bbox_inches="tight")
     plt.close(fig)
     return out_path
+def make_interpretation_notes(df: pd.DataFrame) -> str:
+    notes = []
+    notes.append(f"Dataset has {df.shape[0]} rows and {df.shape[1]} columns.")
+    miss = (df.isna().mean() * 100).sort_values(ascending=False)
+    top_miss = miss[miss > 0].head(5)
+    if len(top_miss) == 0:
+        notes.append("No missing values detected.")
+    else:
+        notes.append("Top missing columns (%): " + ", ".join([f"{k}={v:.1f}%" for k, v in top_miss.items()]))
+    num_df = df.select_dtypes(include=[np.number])
+    if num_df.shape[1] > 0:
+        skew = num_df.skew(numeric_only=True)
+        high_skew = skew[skew.abs() > 1].sort_values(key=lambda s: s.abs(), ascending=False).head(5)
+        if len(high_skew) > 0:
+            notes.append("Highly skewed numeric features (|skew|>1): " +
+                         ", ".join([f"{k}={v:.2f}" for k, v in high_skew.items()]) +
+                         ". Consider log/Box-Cox or robust scaling if needed.")
+        else:
+            notes.append("No strongly skewed numeric features (|skew|>1) detected among numeric columns.")
+        if num_df.shape[1] >= 2:
+            corr = num_df.corr(numeric_only=True)
+            # strongest correlations (excluding self)
+            pairs = []
+            cols = corr.columns
+            for i in range(len(cols)):
+                for j in range(i + 1, len(cols)):
+                    pairs.append((cols[i], cols[j], corr.iloc[i, j]))
+            pairs = sorted(pairs, key=lambda x: abs(x[2]), reverse=True)[:5]
+            if pairs:
+                notes.append("Top correlations (absolute): " + ", ".join([f"{a}-{b}={c:.2f}" for a, b, c in pairs]))
+    else:
+        notes.append("No numeric columns detected; plots and numeric summary will be limited.")
+    return "\n• " + "\n• ".join(notes)
+# ----------------------------- DOCX Report -----------------------------
 def make_docx_report(df: pd.DataFrame, fig_paths: list, notes: list) -> str:
     doc = Document()
     doc.add_heading("EDA Report (Auto-generated)", level=1)
     doc.add_heading("Column Types", level=2)
     dtypes = pd.DataFrame({"column": df.columns, "dtype": [str(df[c].dtype) for c in df.columns]})
+    t = doc.add_table(rows=1, cols=2)
+    t.rows[0].cells[0].text = "column"
+    t.rows[0].cells[1].text = "dtype"
     for _, r in dtypes.head(100).iterrows():
+        row = t.add_row().cells
         row[0].text = str(r["column"])
         row[1].text = str(r["dtype"])
     doc.add_heading("Missing Values", level=2)
     miss = (df.isna().mean() * 100).sort_values(ascending=False)
+    t2 = doc.add_table(rows=1, cols=2)
+    t2.rows[0].cells[0].text = "column"
+    t2.rows[0].cells[1].text = "missing_%"
     for idx, val in miss.head(25).items():
+        row = t2.add_row().cells
         row[0].text = str(idx)
         row[1].text = f"{val:.2f}"
+    doc.add_paragraph("Interpretation: Columns with high missing values may need imputation or removal.")
     doc.add_heading("Summary Statistics (Numeric)", level=2)
     num_df = df.select_dtypes(include=[np.number])
     if num_df.shape[1] > 0:
         desc = num_df.describe().T.reset_index().rename(columns={"index": "feature"})
+        cols = [c for c in ["feature", "mean", "std", "min", "50%", "max"] if c in desc.columns]
+        t3 = doc.add_table(rows=1, cols=len(cols))
         for j, c in enumerate(cols):
+            t3.rows[0].cells[j].text = c
         for _, r in desc.head(30).iterrows():
+            row = t3.add_row().cells
             for j, c in enumerate(cols):
                 v = r[c]
                 row[j].text = str(round(v, 6)) if isinstance(v, (int, float, np.floating)) else str(v)
         doc.add_paragraph("No numeric columns found.")
     doc.add_heading("Charts + Interpretations", level=2)
+    for fp, note in zip(fig_paths, notes):
         doc.add_paragraph(f"Interpretation: {note}")
+        if os.path.exists(fp):
+            doc.add_picture(fp, width=Inches(6.5))
     doc.save(DOCX_OUT_PATH)
     return DOCX_OUT_PATH
+# ----------------------------- EDA Pipeline -----------------------------
 def eda_pipeline(csv_path: str):
     if csv_path is None or str(csv_path).strip() == "":
+        return "Please upload a CSV.", None, None, None, None, None, None, None, ""
     try:
+        df = clean_df(read_csv_safely(csv_path))
     except Exception as e:
+        return f"Could not read CSV: {e}", None, None, None, None, None, None, None, ""
     preview = df.head(25)
     dtypes_df = pd.DataFrame({"column": df.columns, "dtype": [str(df[c].dtype) for c in df.columns]})
     num_df = df.select_dtypes(include=[np.number])
     desc = num_df.describe().T if num_df.shape[1] > 0 else pd.DataFrame()
+    fig_paths, notes = [], []
+    # Plot 1: Missingness
     miss_series = (df.isna().mean() * 100).sort_values(ascending=False).head(15)
     fig1 = plt.figure(figsize=(10, 4))
     plt.bar(miss_series.index.astype(str), miss_series.values)
     plt.title("Missing Values (%): Top 15 Columns")
     plt.xticks(rotation=45, ha="right", fontsize=7)
     plt.ylabel("Missing (%)")
+    fig_paths.append(save_plot(fig1, "/tmp/missingness.png"))
+    notes.append("High-missing columns may need imputation (median/mode) or removal based on usefulness.")
     corr_plot = None
     hist_plot = None
+    # Plot 2: Correlation
     if num_df.shape[1] >= 2:
         corr = num_df.corr(numeric_only=True)
         fig2 = plt.figure(figsize=(10, 5))
         plt.yticks(range(len(corr.index)), corr.index, fontsize=7)
         plt.colorbar()
         corr_plot = fig2
+        fig_paths.append(save_plot(fig2, "/tmp/corr_heatmap.png"))
+        notes.append("Strong correlations may indicate redundant features; consider feature selection/regularization.")
+    # Plot 3: Histograms
     if num_df.shape[1] > 0:
         cols = list(num_df.columns)[:4]
         fig3 = plt.figure(figsize=(10, 6))
         plt.suptitle("Histograms (first 4 numeric columns)", y=1.02)
         plt.tight_layout()
         hist_plot = fig3
+        fig_paths.append(save_plot(fig3, "/tmp/histograms.png"))
+        notes.append("Histograms show distribution/outliers/skewness; consider transforms for highly skewed features.")
     # DOCX
     try:
         docx_path = make_docx_report(df, fig_paths, notes)
     except Exception as e:
+        interp = make_interpretation_notes(df)
+        return f"Error while creating DOCX: {e}", preview, dtypes_df, miss_df, desc, None, corr_plot, hist_plot, interp
+    interp = make_interpretation_notes(df)
     summary_text = f"Loaded CSV successfully. Rows: {df.shape[0]} | Columns: {df.shape[1]}"
+    return summary_text, preview, dtypes_df, miss_df, desc, docx_path, corr_plot, hist_plot, interp
+# ----------------------------- App UI (Beautiful College View) -----------------------------
+CSS = """
+/* Center header */
+#hdr {text-align:center; margin-top:8px; margin-bottom:6px;}
+#appname {color:#0b3d91; font-weight:900; font-size:28px; margin:0;}
+#appsub {color:#0b3d91; font-weight:700; font-size:16px; margin-top:4px;}
+#appauth {color:#0b3d91; font-weight:700; font-size:14px; margin-top:2px;}
+/* Ribbon tabs */
+.gradio-container .tabs {border-radius:14px;}
+.gradio-container .tabitem {font-weight:800;}
+/* Card style */
+.card {border:1px solid rgba(148,163,184,.35); border-radius:18px; padding:14px; background:rgba(255,255,255,.92);}
+.hint {font-size:12px; color:#475569;}
+"""
 with gr.Blocks(
+    title="SAMUDHRAMADANAM-AMIRTHAM1 | SASTRA",
+    theme=gr.themes.Soft(primary_hue="blue", secondary_hue="slate", neutral_hue="gray"),
+    css=CSS
 ) as demo:
+    # ---------- Header (CENTER) ----------
+    with gr.Column(elem_id="hdr"):
+        gr.Image(value="logo.jpg", show_label=False, height=120, container=False)
+        gr.Markdown("<div id='appname'>SAMUDHRAMADANAM-AMIRTHAM1</div>")
+        gr.Markdown("<div id='appauth'>Prof.B.Santhi, SRC, SASTRA</div>")
+    gr.Markdown("<hr>")
+    # ---------- Left controls + Ribbon outputs ----------
     with gr.Row():
+        with gr.Column(scale=1, min_width=340):
             with gr.Group(elem_classes="card"):
+                gr.Markdown("### Upload CSV")
+                gr.Markdown("<div class='hint'>Upload your dataset (CSV). Then run EDA to view tables, graphs and download report.</div>")
                 file_in = gr.File(label="Upload CSV", file_types=[".csv"], type="filepath")
+                run_btn = gr.Button("Run EDA", variant="primary")
+                status = gr.Textbox(label="Status", lines=2)
         with gr.Column(scale=2, min_width=520):
             with gr.Tabs():
+                # Ribbon 1: EDA
+                with gr.TabItem("EDA"):
                     with gr.Group(elem_classes="card"):
                         preview_out = gr.Dataframe(label="Preview (first 25 rows)", interactive=False)
+                        dtypes_out = gr.Dataframe(label="Column Types", interactive=False)
                     with gr.Group(elem_classes="card"):
+                        miss_out = gr.Dataframe(label="Missing Values (% top 25)", interactive=False)
+                        desc_out = gr.Dataframe(label="Numeric Summary (describe)", interactive=False)
+                # Ribbon 2: Graph
+                with gr.TabItem("Graph"):
                     with gr.Group(elem_classes="card"):
                         with gr.Row():
+                            corr_plot_out = gr.Plot(label="Correlation Heatmap")
+                            hist_plot_out = gr.Plot(label="Histograms")
+                # Ribbon 3: Report
+                with gr.TabItem("Report"):
                     with gr.Group(elem_classes="card"):
+                        gr.Markdown("### Download Report")
+                        docx_out = gr.File(label="EDA Report (.docx)")
+                # Ribbon 4: Interpretation
+                with gr.TabItem("Interpretation"):
+                    with gr.Group(elem_classes="card"):
+                        interp_out = gr.Textbox(label="Auto Interpretation", lines=10)
+    # ---------- Wiring ----------
     run_btn.click(
         fn=eda_pipeline,
         inputs=[file_in],
+        outputs=[status, preview_out, dtypes_out, miss_out, desc_out, docx_out, corr_plot_out, hist_plot_out, interp_out]
     )
 demo.launch()