# (Build-log artifacts from the hosting page were removed here; they are not part of the source.)
| # -*- coding: utf-8 -*- | |
| """app.ipynb | |
| Automatically generated by Colab. | |
| Original file is located at | |
| https://colab.research.google.com/drive/1zR9cbA-JzknrRBL2Y35IE1FKmyHJ7_ME | |
| """ | |
# Standard library
import io
import os
import tempfile
from datetime import datetime

# Third-party
import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Inches, Pt, RGBColor
from scipy import stats
| # ---------------------------- | |
| # EDA helpers | |
| # ---------------------------- | |
def find_numeric_columns(df: pd.DataFrame):
    """Return the names of all numeric (int/float) columns, in dataframe order."""
    numeric = df.select_dtypes(include=[np.number])
    return list(numeric.columns)
def safe_skew(series: pd.Series):
    """Sample skewness of the non-null values; NaN when fewer than 3 points remain."""
    clean = series.dropna()
    if len(clean) >= 3:
        return float(clean.skew())
    return np.nan
def safe_kurt(series: pd.Series):
    """Sample (excess) kurtosis of the non-null values; NaN when fewer than 4 points remain."""
    clean = series.dropna()
    if len(clean) >= 4:
        return float(clean.kurt())
    return np.nan
def basic_numeric_stats(df: pd.DataFrame, num_cols: list[str]) -> pd.DataFrame:
    """Build a per-column table of descriptive statistics.

    One row per column in ``num_cols``: count of non-null values, missing
    count, central moments, quantiles, and (guarded) skewness/kurtosis.
    """
    records = []
    for name in num_cols:
        full = df[name]
        clean = full.dropna()
        n = len(clean)
        records.append({
            "Attribute": name,
            "Count": int(n),
            "Missing": int(full.isna().sum()),
            "Mean": float(clean.mean()) if n else np.nan,
            # ddof=1 -> sample (unbiased) std/var; needs at least 2 points.
            "Std": float(clean.std(ddof=1)) if n > 1 else np.nan,
            "Var": float(clean.var(ddof=1)) if n > 1 else np.nan,
            "Min": float(clean.min()) if n else np.nan,
            "25%": float(clean.quantile(0.25)) if n else np.nan,
            "Median": float(clean.median()) if n else np.nan,
            "75%": float(clean.quantile(0.75)) if n else np.nan,
            "Max": float(clean.max()) if n else np.nan,
            # The safe_* helpers return NaN for samples too small to estimate.
            "Skewness": safe_skew(full),
            "Kurtosis": safe_kurt(full),
        })
    return pd.DataFrame(records)
def five_point_summary_table(df: pd.DataFrame, num_cols: list[str]) -> pd.DataFrame:
    """Return the five-number summary (min, Q1, median, Q3, max) per column."""
    summaries = []
    for name in num_cols:
        clean = df[name].dropna()
        if clean.empty:
            # All-NaN column: keep the row but blank out every statistic.
            entry = dict.fromkeys(["Min", "Q1", "Median", "Q3", "Max"], np.nan)
        else:
            q1, med, q3 = (float(clean.quantile(q)) for q in (0.25, 0.5, 0.75))
            entry = {
                "Min": float(clean.min()),
                "Q1": q1,
                "Median": med,
                "Q3": q3,
                "Max": float(clean.max()),
            }
        summaries.append({"Attribute": name, **entry})
    return pd.DataFrame(summaries)
def interpretation_numeric(stats_df: pd.DataFrame) -> str:
    """Compose a short narrative summary of the numeric-statistics table.

    Mentions the attribute count, the (up to) three most skewed attributes,
    and either the top-missing attributes or a no-missing-values note.
    """
    if stats_df.empty:
        return "No numeric attributes were detected in the uploaded dataset."
    sentences = [f"Numeric attributes detected: {len(stats_df)}."]
    # Rank by absolute skewness, keep the worst three.
    skewed = stats_df[["Attribute", "Skewness"]].dropna()
    skewed = skewed.assign(_abs=skewed["Skewness"].abs())
    skewed = skewed.sort_values("_abs", ascending=False).head(3)
    if not skewed.empty:
        described = ", ".join(
            f"{row.Attribute} (skew={row.Skewness:.2f})"
            for row in skewed.itertuples(index=False)
        )
        sentences.append("Most skewed attributes (absolute skewness): " + described + ".")
    worst_missing = stats_df.sort_values("Missing", ascending=False).head(3)
    if worst_missing["Missing"].max() > 0:
        described = ", ".join(
            f"{row.Attribute} (missing={int(row.Missing)})"
            for row in worst_missing.itertuples(index=False)
        )
        sentences.append("Attributes with higher missing values: " + described + ".")
    else:
        sentences.append("No missing values were observed in numeric attributes.")
    return " ".join(sentences)
def correlation_interpretation(corr: pd.DataFrame) -> str:
    """Describe the (up to) three strongest pairwise correlations in ``corr``.

    Bug fix: the previous version stacked the full symmetric matrix, so the
    same pair could appear twice as (a, b) and (b, a). We now keep only the
    strict upper triangle so each unordered pair is considered once.
    """
    if corr is None or corr.empty:
        return "Correlation could not be computed (insufficient numeric attributes)."
    # k=1 excludes the diagonal (self-correlation) as well.
    upper = np.triu(np.ones(corr.shape, dtype=bool), k=1)
    stacked = corr.where(upper).stack().dropna()
    if stacked.empty:
        return "No meaningful pairwise correlations were found."
    top = stacked.abs().sort_values(ascending=False).head(3)
    lines = []
    for (a, b), _ in top.items():
        val = float(corr.loc[a, b])
        sign = "positive" if val >= 0 else "negative"
        lines.append(f"{a} vs {b}: {val:.2f} ({sign})")
    return "Strongest correlations: " + "; ".join(lines) + "."
def fig_to_png_path(fig) -> str:
    """Save a matplotlib figure as a PNG in a temp file and return its path.

    Bug fix: the NamedTemporaryFile handle is now closed before savefig
    writes to the path — on Windows a file still held open by the tempfile
    object cannot be reopened by another writer. delete=False keeps the file
    on disk so Gradio/docx can read it later.
    """
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
    tmp.close()
    fig.savefig(tmp.name, bbox_inches="tight", dpi=200)
    # Free the figure immediately; these are one-shot renders.
    plt.close(fig)
    return tmp.name
def plot_correlogram_annotated(corr: pd.DataFrame, title="Correlogram (Annotated)"):
    """Render the correlation matrix as a heatmap with each cell annotated."""
    n_rows, n_cols = corr.shape
    values = corr.values
    fig, ax = plt.subplots(figsize=(8, 6))
    image = ax.imshow(values, aspect="auto")
    ax.set_title(title)
    ax.set_xticks(range(n_cols))
    ax.set_xticklabels(corr.columns, rotation=45, ha="right")
    ax.set_yticks(range(n_rows))
    ax.set_yticklabels(corr.index)
    # Write the coefficient into every cell; NaNs render as blank text.
    for row in range(n_rows):
        for col in range(n_cols):
            cell = values[row, col]
            label = "" if np.isnan(cell) else f"{cell:.2f}"
            ax.text(col, row, label, ha="center", va="center", fontsize=8)
    fig.colorbar(image, ax=ax, fraction=0.046, pad=0.04)
    fig.tight_layout()
    return fig
def plot_pairplot(df: pd.DataFrame, num_cols: list[str], max_cols=6):
    """Seaborn pair plot over the first ``max_cols`` numeric columns.

    Returns the PairGrid, or None when fewer than two numeric columns exist.
    """
    selected = num_cols[:max_cols]
    if len(selected) < 2:
        return None
    clean = df[selected].dropna()
    grid = sns.pairplot(clean, corner=True, diag_kind="hist")
    grid.fig.suptitle("Pair Plot", y=1.02)
    return grid
| # ---------------------------- | |
| # DOCX helpers | |
| # ---------------------------- | |
def add_heading_centered(doc: Document, text: str, font_size=16, bold=True, color_rgb="1E5AA8"):
    """Append a centered, colored heading paragraph to ``doc``.

    ``color_rgb`` is a 6-digit hex string (no leading '#').

    Bug fix: the previous code assigned a raw hex string to the low-level
    ``w:color`` element's ``val``, which python-docx validates as an RGBColor
    and rejects. Use the documented high-level Font.color API instead.
    (Requires ``RGBColor`` from docx.shared — see the import block.)
    """
    p = doc.add_paragraph()
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    run = p.add_run(text)
    run.bold = bold
    run.font.size = Pt(font_size)
    run.font.color.rgb = RGBColor.from_string(color_rgb)
def add_image(doc: Document, image_path: str, width_inches=6.2):
    """Insert a picture into the document, scaled to ``width_inches`` wide."""
    width = Inches(width_inches)
    doc.add_picture(image_path, width=width)
def build_docx_report(df: pd.DataFrame, dataset_name: str, id_col: str | None,
                      stats_df: pd.DataFrame, corr: pd.DataFrame,
                      graph_paths: list[tuple[str, str]]) -> str:
    """Assemble the full EDA report as a .docx file and return its path.

    Parameters:
        df: the loaded dataset (not read directly here; kept for symmetry
            with the other report inputs).
        dataset_name: display name printed in the report header.
        id_col: optional record-ID column name, echoed in the header when set.
        stats_df: output of basic_numeric_stats() (may be empty).
        corr: numeric correlation matrix (may be empty).
        graph_paths: (title, png_path) pairs embedded in order; the canned
            interpretation under each image is keyed off its title.

    Returns:
        Path of the saved .docx temp file (delete=False, so it survives
        until the caller/host cleans it up).
    """
    doc = Document()
    # Branded header block.
    add_heading_centered(doc, "Amrita Manthana", font_size=20, bold=True)
    add_heading_centered(doc, "Prof.B.Santhi,SRC,SASTRA", font_size=14, bold=True)
    doc.add_paragraph("")
    p = doc.add_paragraph()
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    p.add_run("EDA Report").bold = True
    doc.add_paragraph(f"Dataset: {dataset_name}")
    doc.add_paragraph(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    if id_col:
        doc.add_paragraph(f"Record ID column: {id_col}")
    doc.add_paragraph("")
    # Section 1: numeric statistics (table capped at 25 rows to keep the doc small).
    doc.add_paragraph("1) Numeric Statistics").runs[0].bold = True
    doc.add_paragraph(interpretation_numeric(stats_df))
    if not stats_df.empty:
        tdf = stats_df.head(25) if len(stats_df) > 25 else stats_df
        table = doc.add_table(rows=1, cols=len(tdf.columns))
        for j, col in enumerate(tdf.columns):
            table.rows[0].cells[j].text = str(col)
        for _, row in tdf.iterrows():
            cells = table.add_row().cells
            for j, col in enumerate(tdf.columns):
                val = row[col]
                if isinstance(val, float):
                    # Blank cell for NaN; otherwise fixed 4-decimal formatting.
                    cells[j].text = "" if np.isnan(val) else f"{val:.4f}"
                else:
                    cells[j].text = str(val)
    doc.add_paragraph("")
    # Section 2: correlation narrative.
    doc.add_paragraph("2) Correlation").runs[0].bold = True
    doc.add_paragraph(correlation_interpretation(corr))
    doc.add_paragraph("")
    # Section 3: each graph image followed by a canned interpretation line.
    doc.add_paragraph("3) Graphs & Interpretation").runs[0].bold = True
    for title, path in graph_paths:
        doc.add_paragraph("")
        doc.add_paragraph(title).runs[0].bold = True
        add_image(doc, path)
        # NOTE: checked in order — "Bar" matches before "Pie" etc.; titles
        # produced by report_generate() are distinct enough for this.
        if "Correlogram" in title:
            doc.add_paragraph("Interpretation: Values near +1/-1 indicate strong positive/negative association.")
        elif "Bar" in title:
            doc.add_paragraph("Interpretation: Taller bars indicate larger frequency/aggregate value.")
        elif "Pie" in title:
            doc.add_paragraph("Interpretation: Slice proportions show relative contribution of categories.")
        elif "Scatter" in title:
            doc.add_paragraph("Interpretation: Patterns indicate linear/non-linear trend, clustering, or outliers.")
        elif "Pair Plot" in title:
            doc.add_paragraph("Interpretation: Diagonal shows distributions; others show pairwise relationships/outliers.")
    doc.add_paragraph("")
    doc.add_paragraph("4) Final Remarks").runs[0].bold = True
    doc.add_paragraph("This report consolidates numeric measures, distributions, and relationships among attributes.")
    out_path = tempfile.NamedTemporaryFile(delete=False, suffix=".docx").name
    doc.save(out_path)
    return out_path
| # ---------------------------- | |
| # App logic | |
| # ---------------------------- | |
def load_csv(file_obj, sep, header_flag):
    """Read the uploaded CSV and describe it.

    Returns a 5-tuple: (dataframe-or-None, status message, 30-row preview,
    all column names, numeric column names).
    """
    if file_obj is None:
        return None, "Please upload a CSV.", None, [], None
    try:
        df = pd.read_csv(file_obj.name, sep=sep, header=0 if header_flag else None)
        if not header_flag:
            # Synthesize stable names when the file ships without a header row.
            df.columns = [f"col_{idx + 1}" for idx in range(df.shape[1])]
        numeric = find_numeric_columns(df)
        all_columns = list(df.columns)
        status = f"Loaded: rows={df.shape[0]}, cols={df.shape[1]}. Numeric cols={len(numeric)}."
        return df, status, df.head(30), all_columns, numeric
    except Exception as e:
        # Surface the parser error to the UI rather than crashing the app.
        return None, f"Could not read CSV: {e}", None, [], None
def eda_compute(df: pd.DataFrame):
    """Compute all EDA artifacts for the EDA tab.

    Always returns a 6-tuple matching the callback's wired outputs:
    (interpretation text, stats table, five-point table, skewness table,
    correlation table, correlation text).

    Bug fix: both early returns previously yielded only 5 values while the
    Gradio click handler is wired to 6 output components, which misaligned
    (or broke) the outputs whenever they fired.
    """
    if df is None:
        return "Upload a CSV first.", None, None, None, None, None
    num_cols = find_numeric_columns(df)
    if len(num_cols) == 0:
        return ("No numeric columns found.", pd.DataFrame(), pd.DataFrame(),
                pd.DataFrame(), pd.DataFrame(), "Correlation not available.")
    stats_df = basic_numeric_stats(df, num_cols)
    fps_df = five_point_summary_table(df, num_cols)
    corr_df = pd.DataFrame()
    corr_text = "Correlation not available (need at least 2 numeric columns)."
    if len(num_cols) >= 2:
        corr_df = df[num_cols].corr(numeric_only=True)
        corr_text = correlation_interpretation(corr_df)
    skew_df = pd.DataFrame({"Attribute": num_cols, "Skewness": [safe_skew(df[c]) for c in num_cols]})
    return interpretation_numeric(stats_df), stats_df, fps_df, skew_df, corr_df, corr_text
def graph_make(df: pd.DataFrame, barpie_col: str, topn: int,
               scatter_x: str, scatter_y: str, pair_max: int):
    """Render the Graph-tab figures and return their PNG paths.

    Returns a 5-tuple of file paths in the order
    (bar, pie, scatter, correlogram, pair plot); entries are None when the
    corresponding figure was not produced.
    """
    if df is None:
        return None, None, None, None, None
    # Slots: [bar, pie, scatter, correlogram, pair plot]
    paths = [None, None, None, None, None]
    # Bar / Pie data
    series = df[barpie_col]
    if pd.api.types.is_numeric_dtype(series):
        # Numeric columns are bucketed into 10 bins so bar/pie stay readable.
        binned = pd.cut(series.dropna(), bins=10)
        counts = binned.value_counts().head(topn)
        labels = counts.index.astype(str).tolist()
        yvals = counts.values
        suffix = "(binned)"
    else:
        counts = series.astype(str).value_counts().head(topn)
        labels = counts.index.tolist()
        yvals = counts.values
        suffix = ""
    fig_bar, ax = plt.subplots(figsize=(7, 4))
    ax.bar(range(len(labels)), yvals)
    ax.set_xticks(range(len(labels)))
    ax.set_xticklabels(labels, rotation=45, ha="right")
    ax.set_title(f"Bar Chart: {barpie_col} {suffix}")
    ax.set_ylabel("Count")
    paths[0] = fig_to_png_path(fig_bar)
    fig_pie, ax2 = plt.subplots(figsize=(6, 4))
    ax2.pie(yvals, labels=labels, autopct="%1.1f%%")
    ax2.set_title(f"Pie Chart: {barpie_col} {suffix}")
    paths[1] = fig_to_png_path(fig_pie)
    # Scatter / Corr / Pair
    num_cols = find_numeric_columns(df)
    if len(num_cols) >= 2 and scatter_x in num_cols and scatter_y in num_cols:
        fig_sc, ax3 = plt.subplots(figsize=(7, 4))
        ax3.scatter(df[scatter_x], df[scatter_y], alpha=0.7)
        ax3.set_xlabel(scatter_x)
        ax3.set_ylabel(scatter_y)
        ax3.set_title(f"Scatter: {scatter_x} vs {scatter_y}")
        paths[2] = fig_to_png_path(fig_sc)
        # NOTE(review): correlogram and pair plot are gated on valid scatter
        # column selections as well as >=2 numeric columns — confirm this
        # coupling is intended rather than an indentation accident.
        corr = df[num_cols].corr(numeric_only=True)
        fig_corr = plot_correlogram_annotated(corr)
        paths[3] = fig_to_png_path(fig_corr)
        # Clamp to at least 2 and at most the available numeric columns.
        pair_max = max(2, min(pair_max, len(num_cols)))
        grid = plot_pairplot(df, num_cols, max_cols=pair_max)
        if grid is not None:
            tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".png").name
            grid.fig.savefig(tmp, bbox_inches="tight", dpi=200)
            plt.close(grid.fig)
            paths[4] = tmp
    return tuple(paths)
def report_generate(df: pd.DataFrame, dataset_name: str, id_col: str,
                    barpie_col: str, topn: int, scatter_x: str, scatter_y: str, pair_max: int):
    """Render all graphs, assemble the DOCX report, and return (path, status)."""
    if df is None:
        return None, "Upload a CSV first."
    num_cols = find_numeric_columns(df)
    stats_df = basic_numeric_stats(df, num_cols) if num_cols else pd.DataFrame()
    corr = df[num_cols].corr(numeric_only=True) if len(num_cols) >= 2 else pd.DataFrame()
    # Reuse the exact figure pipeline from the Graph tab.
    bar_path, pie_path, sc_path, corr_path, pair_path = graph_make(
        df, barpie_col, topn, scatter_x, scatter_y, pair_max
    )
    titled = [
        (f"Bar Chart: {barpie_col}", bar_path),
        (f"Pie Chart: {barpie_col}", pie_path),
        (f"Scatter: {scatter_x} vs {scatter_y}", sc_path),
        ("Correlogram (Annotated)", corr_path),
        (f"Pair Plot (first {min(pair_max, len(num_cols))} numeric cols)", pair_path),
    ]
    # Keep only the graphs that were actually rendered.
    graph_paths = [(title, path) for title, path in titled if path]
    docx_path = build_docx_report(df, dataset_name or "uploaded.csv", id_col, stats_df, corr, graph_paths)
    # Temp images are left for the session; host temp storage handles cleanup.
    return docx_path, "DOCX report generated successfully."
def search_record(df: pd.DataFrame, id_col: str, query: str):
    """Exact-match lookup of ``query`` in column ``id_col``.

    Returns (status message, matching rows or None). Numeric columns are
    matched numerically when the query parses as a float, otherwise (and for
    non-numeric columns) by string equality.

    Bug fix: the bare ``except:`` around the float conversion caught every
    exception (including KeyboardInterrupt/SystemExit); narrowed to the only
    error float() raises for a bad string, ValueError.
    """
    if df is None:
        return "Upload a CSV first.", None
    if not query:
        return "Enter a value to search.", None
    if id_col not in df.columns:
        return "Select a valid ID column.", None
    col = df[id_col]
    if pd.api.types.is_numeric_dtype(col):
        try:
            q = float(query)
        except ValueError:
            # Query isn't numeric; fall back to string comparison.
            result = df[col.astype(str) == query]
        else:
            result = df[df[id_col] == q]
    else:
        result = df[col.astype(str) == query]
    if result.empty:
        return "No matching record found.", pd.DataFrame()
    return f"Found {len(result)} record(s).", result
| # ---------------------------- | |
| # UI | |
| # ---------------------------- | |
# Minimal CSS for the centered header; the #titleblock/#t1/#t2 ids are
# referenced by the HTML that logo_html() returns.
CSS = """
#titleblock {text-align:center; margin-top: 5px; margin-bottom: 10px;}
#t1 {font-size:30px; font-weight:800; color:#1E5AA8;}
#t2 {font-size:18px; font-weight:800; color:#1E5AA8;}
"""
def logo_html():
    """Return the header HTML for the title block.

    The logo image is included only when logo.png exists in the repo root;
    Gradio serves root files referenced with the "file=" prefix.
    """
    logo_tag = ""
    if os.path.exists("logo.png"):
        logo_tag = '<img src="file=logo.png" style="width:110px; display:block; margin:0 auto;" />'
    return f"""
<div id="titleblock">
    {logo_tag}
    <div id="t1">Amrita Manthana</div>
    <div id="t2">Prof.B.Santhi,SRC,SASTRA</div>
</div>
"""
# Build the Gradio UI: a left column for upload/preview and a right column
# with four tabs (EDA, Graph, Report, Search). Event wiring follows the layout.
with gr.Blocks(css=CSS, title="Amrita Manthana - EDA (Gradio)") as demo:
    gr.HTML(logo_html())
    # Session state shared across tabs: the loaded frame plus its column lists.
    df_state = gr.State(None)
    cols_state = gr.State([])
    numcols_state = gr.State([])
    with gr.Row():
        with gr.Column(scale=1, min_width=320):
            gr.Markdown("### Data Upload")
            file_in = gr.File(label="Upload CSV", file_types=[".csv"])
            sep = gr.Dropdown(label="CSV Separator", choices=[",", ";", "\t", "|"], value=",")
            header_flag = gr.Checkbox(label="First row is header", value=True)
            load_btn = gr.Button("Load Data", variant="primary")
            load_msg = gr.Textbox(label="Status", interactive=False)
            preview = gr.Dataframe(label="Preview (first 30 rows)", interactive=False, wrap=True)
        with gr.Column(scale=2):
            with gr.Tabs():
                with gr.Tab("EDA"):
                    eda_btn = gr.Button("Compute EDA")
                    eda_note = gr.Textbox(label="Interpretation", lines=3, interactive=False)
                    stats_table = gr.Dataframe(label="Descriptive Statistics", interactive=False, wrap=True)
                    fps_table = gr.Dataframe(label="Five-Point Summary", interactive=False, wrap=True)
                    skew_table = gr.Dataframe(label="Skewness", interactive=False, wrap=True)
                    corr_table = gr.Dataframe(label="Correlation (numeric)", interactive=False, wrap=True)
                    corr_note = gr.Textbox(label="Correlation Interpretation", lines=2, interactive=False)
                with gr.Tab("Graph"):
                    gr.Markdown("#### Choose settings, then generate graphs")
                    # Dropdown choices start empty; after_load() populates them.
                    barpie_col = gr.Dropdown(label="Column for Bar/Pie", choices=[], value=None)
                    topn = gr.Slider(label="Top-N categories", minimum=3, maximum=30, value=10, step=1)
                    scatter_x = gr.Dropdown(label="Scatter X (numeric)", choices=[], value=None)
                    scatter_y = gr.Dropdown(label="Scatter Y (numeric)", choices=[], value=None)
                    pair_max = gr.Slider(label="Pair plot max numeric columns", minimum=2, maximum=10, value=6, step=1)
                    graph_btn = gr.Button("Generate Graphs")
                    with gr.Row():
                        bar_img = gr.Image(label="Bar", type="filepath")
                        pie_img = gr.Image(label="Pie", type="filepath")
                    with gr.Row():
                        sc_img = gr.Image(label="Scatter", type="filepath")
                        corr_img = gr.Image(label="Correlogram (numbers inside)", type="filepath")
                        # NOTE(review): pair plot shares the second image row —
                        # confirm this matches the intended layout.
                        pair_img = gr.Image(label="Pair Plot", type="filepath")
                with gr.Tab("Report"):
                    gr.Markdown("#### DOCX report (includes all graphs + interpretations)")
                    id_col_rep = gr.Dropdown(label="Record ID column (for report/search)", choices=[], value=None)
                    rep_btn = gr.Button("Generate DOCX Report", variant="primary")
                    rep_status = gr.Textbox(label="Report Status", interactive=False)
                    rep_file = gr.File(label="Download Report (.docx)")
                with gr.Tab("Search"):
                    id_col_search = gr.Dropdown(label="Select ID column", choices=[], value=None)
                    query = gr.Textbox(label="Enter ID value (exact match)")
                    search_btn = gr.Button("Search")
                    search_msg = gr.Textbox(label="Search Status", interactive=False)
                    search_out = gr.Dataframe(label="Matching Records", interactive=False, wrap=True)
    # --- events ---
    def after_load(file_obj, sep_val, header_val):
        # Load the CSV and refresh every column-dependent dropdown in one shot.
        df, msg, prev, cols, numcols = load_csv(file_obj, sep_val, header_val)
        # for dropdowns
        return (
            df, cols, numcols,
            msg, prev,
            gr.update(choices=cols, value=(cols[0] if cols else None)),  # barpie_col
            gr.update(choices=numcols, value=(numcols[0] if len(numcols) else None)),  # scatter_x
            gr.update(choices=numcols, value=(numcols[1] if len(numcols) > 1 else None)),  # scatter_y
            gr.update(choices=cols, value=(cols[0] if cols else None)),  # id_col_rep
            gr.update(choices=cols, value=(cols[0] if cols else None)),  # id_col_search
        )
    load_btn.click(
        after_load,
        inputs=[file_in, sep, header_flag],
        outputs=[df_state, cols_state, numcols_state,
                 load_msg, preview,
                 barpie_col, scatter_x, scatter_y, id_col_rep, id_col_search]
    )
    eda_btn.click(
        eda_compute,
        inputs=[df_state],
        outputs=[eda_note, stats_table, fps_table, skew_table, corr_table, corr_note]
    )
    graph_btn.click(
        graph_make,
        inputs=[df_state, barpie_col, topn, scatter_x, scatter_y, pair_max],
        outputs=[bar_img, pie_img, sc_img, corr_img, pair_img]
    )
    def rep_run(df, idcol, barcol, topn_v, sx, sy, pmx, file_obj):
        # Use the uploaded file's name as the dataset label when available.
        name = file_obj.name if file_obj is not None else "uploaded.csv"
        path, status = report_generate(df, name, idcol, barcol, topn_v, sx, sy, pmx)
        return status, path
    rep_btn.click(
        rep_run,
        inputs=[df_state, id_col_rep, barpie_col, topn, scatter_x, scatter_y, pair_max, file_in],
        outputs=[rep_status, rep_file]
    )
    search_btn.click(
        search_record,
        inputs=[df_state, id_col_search, query],
        outputs=[search_msg, search_out]
    )
if __name__ == "__main__":
    demo.launch()
# --- requirements.txt (this belongs in a separate file, not in app.py) ---
# gradio==4.44.1
# pandas==2.2.1
# numpy==1.26.4
# matplotlib==3.8.3
# seaborn==0.13.2
# scipy==1.12.0
# python-docx==1.1.0