# ipa1 / app.py
# (Hugging Face page residue — original upload note: SanthiSastra, "Upload 3 files", commit ff6ae19 verified)
# -*- coding: utf-8 -*-
"""app.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1zR9cbA-JzknrRBL2Y35IE1FKmyHJ7_ME
"""
import os
import io
import tempfile
from datetime import datetime
import numpy as np
import pandas as pd
import gradio as gr
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from docx import Document
from docx.shared import Inches, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
# ----------------------------
# EDA helpers
# ----------------------------
def find_numeric_columns(df: pd.DataFrame):
    """Return the names of all numeric (int/float) columns of *df* as a list."""
    numeric = df.select_dtypes(include=[np.number])
    return list(numeric.columns)
def safe_skew(series: pd.Series):
    """Sample skewness of the non-null values; NaN when fewer than 3 points."""
    values = series.dropna()
    if len(values) >= 3:
        return float(values.skew())
    return np.nan
def safe_kurt(series: pd.Series):
    """Sample (excess) kurtosis of the non-null values; NaN when fewer than 4 points."""
    values = series.dropna()
    if len(values) >= 4:
        return float(values.kurt())
    return np.nan
def basic_numeric_stats(df: pd.DataFrame, num_cols: list[str]) -> pd.DataFrame:
    """Build one descriptive-statistics row per numeric column.

    Count/quantiles are computed on non-null values; statistics that need a
    minimum sample size (Std/Var need 2 points) come back as NaN otherwise.
    Skewness/Kurtosis delegate to safe_skew/safe_kurt, which apply their own
    minimum-size rules.
    """
    records = []
    for name in num_cols:
        raw = df[name]
        clean = raw.dropna()
        n = len(clean)
        has_data = n > 0
        records.append({
            "Attribute": name,
            "Count": int(n),
            "Missing": int(raw.isna().sum()),
            "Mean": float(clean.mean()) if has_data else np.nan,
            "Std": float(clean.std(ddof=1)) if n > 1 else np.nan,
            "Var": float(clean.var(ddof=1)) if n > 1 else np.nan,
            "Min": float(clean.min()) if has_data else np.nan,
            "25%": float(clean.quantile(0.25)) if has_data else np.nan,
            "Median": float(clean.median()) if has_data else np.nan,
            "75%": float(clean.quantile(0.75)) if has_data else np.nan,
            "Max": float(clean.max()) if has_data else np.nan,
            "Skewness": safe_skew(raw),
            "Kurtosis": safe_kurt(raw),
        })
    return pd.DataFrame(records)
def five_point_summary_table(df: pd.DataFrame, num_cols: list[str]) -> pd.DataFrame:
    """Return Min/Q1/Median/Q3/Max for each numeric column (NaN row when a column is all-null)."""
    summary_rows = []
    for name in num_cols:
        values = df[name].dropna()
        if values.empty:
            stats = dict.fromkeys(["Min", "Q1", "Median", "Q3", "Max"], np.nan)
        else:
            stats = {
                "Min": float(values.min()),
                "Q1": float(values.quantile(0.25)),
                "Median": float(values.quantile(0.5)),
                "Q3": float(values.quantile(0.75)),
                "Max": float(values.max()),
            }
        summary_rows.append({"Attribute": name, **stats})
    return pd.DataFrame(summary_rows)
def interpretation_numeric(stats_df: pd.DataFrame) -> str:
    """Turn the numeric-statistics table into a short narrative summary.

    Reports the attribute count, up to 3 attributes with the largest absolute
    skewness, and up to 3 attributes with the most missing values (or a
    "no missing values" note when there are none).
    """
    if stats_df.empty:
        return "No numeric attributes were detected in the uploaded dataset."
    sentences = [f"Numeric attributes detected: {len(stats_df)}."]
    skew = stats_df[["Attribute", "Skewness"]].dropna()
    ordering = skew["Skewness"].abs().sort_values(ascending=False).index
    top_skew = skew.reindex(ordering).head(3)
    if len(top_skew) > 0:
        described = ", ".join(
            f"{row.Attribute} (skew={row.Skewness:.2f})"
            for row in top_skew.itertuples(index=False)
        )
        sentences.append("Most skewed attributes (absolute skewness): " + described + ".")
    worst_missing = stats_df.sort_values("Missing", ascending=False).head(3)
    if worst_missing["Missing"].max() > 0:
        described = ", ".join(
            f"{row.Attribute} (missing={int(row.Missing)})"
            for row in worst_missing.itertuples(index=False)
        )
        sentences.append("Attributes with higher missing values: " + described + ".")
    else:
        sentences.append("No missing values were observed in numeric attributes.")
    return " ".join(sentences)
def correlation_interpretation(corr: pd.DataFrame) -> str:
    """Describe the strongest pairwise correlations in a correlation matrix.

    Fix: the previous implementation stacked the full (symmetric) matrix, so
    the "top 3" could contain both (a, b) and (b, a) — the same pair twice.
    Only the strict upper triangle is considered now, which also drops the
    diagonal without mutating the caller's DataFrame.
    """
    if corr is None or corr.empty:
        return "Correlation could not be computed (insufficient numeric attributes)."
    # Strict upper triangle: removes the diagonal and the mirrored duplicates.
    upper = np.triu(np.ones(corr.shape, dtype=bool), k=1)
    stacked = corr.where(upper).stack().dropna()
    if stacked.empty:
        return "No meaningful pairwise correlations were found."
    top = stacked.abs().sort_values(ascending=False).head(3)
    lines = []
    for (a, b), _ in top.items():
        val = float(corr.loc[a, b])
        sign = "positive" if val >= 0 else "negative"
        lines.append(f"{a} vs {b}: {val:.2f} ({sign})")
    return "Strongest correlations: " + "; ".join(lines) + "."
def fig_to_png_path(fig) -> str:
    """Save *fig* to a fresh temp PNG, close the figure, and return the file path."""
    with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as handle:
        out_path = handle.name
    fig.savefig(out_path, bbox_inches="tight", dpi=200)
    plt.close(fig)
    return out_path
def plot_correlogram_annotated(corr: pd.DataFrame, title="Correlogram (Annotated)"):
    """Draw a heatmap of *corr* with each coefficient printed inside its cell.

    NaN cells are left blank. Returns the matplotlib Figure (caller owns it).
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    values = corr.values
    heat = ax.imshow(values, aspect="auto")
    ax.set_title(title)
    ax.set_xticks(range(len(corr.columns)))
    ax.set_xticklabels(corr.columns, rotation=45, ha="right")
    ax.set_yticks(range(len(corr.index)))
    ax.set_yticklabels(corr.index)
    # Annotate every cell with its coefficient (blank for NaN).
    for row_idx, col_idx in np.ndindex(values.shape):
        cell = values[row_idx, col_idx]
        label = "" if np.isnan(cell) else f"{cell:.2f}"
        ax.text(col_idx, row_idx, label, ha="center", va="center", fontsize=8)
    fig.colorbar(heat, ax=ax, fraction=0.046, pad=0.04)
    fig.tight_layout()
    return fig
def plot_pairplot(df: pd.DataFrame, num_cols: list[str], max_cols=6):
    """Seaborn corner pair plot of up to *max_cols* numeric columns.

    Returns the PairGrid, or None when fewer than two usable columns exist.
    """
    selected = num_cols[:max_cols]
    if len(selected) < 2:
        return None
    grid = sns.pairplot(df[selected].dropna(), corner=True, diag_kind="hist")
    grid.fig.suptitle("Pair Plot", y=1.02)
    return grid
# ----------------------------
# DOCX helpers
# ----------------------------
def add_heading_centered(doc: Document, text: str, font_size=16, bold=True, color_rgb="1E5AA8"):
    """Append a centered heading paragraph to *doc* with the given size, weight and color.

    color_rgb is a hex RGB string without a leading '#'. The color is written
    through the run's low-level w:color XML element.
    NOTE(review): run.font.color.rgb = RGBColor.from_string(color_rgb) looks
    like the high-level equivalent — confirm before refactoring.
    """
    p = doc.add_paragraph()
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    run = p.add_run(text)
    run.bold = bold
    run.font.size = Pt(font_size)
    # set run color
    rPr = run._element.get_or_add_rPr()
    color = rPr.get_or_add_color()
    color.val = color_rgb
def add_image(doc: Document, image_path: str, width_inches=6.2):
    """Insert the image at *image_path* into *doc*, scaled to *width_inches* wide."""
    width = Inches(width_inches)
    doc.add_picture(image_path, width=width)
def build_docx_report(df: pd.DataFrame, dataset_name: str, id_col: str | None,
                      stats_df: pd.DataFrame, corr: pd.DataFrame,
                      graph_paths: list[tuple[str, str]]) -> str:
    """Assemble the EDA results into a .docx report and return its file path.

    graph_paths holds (section title, PNG path) pairs; the title text is
    keyword-matched below to attach a canned interpretation line to each image.
    NOTE(review): *df* is not read here — kept only for interface stability.
    """
    doc = Document()
    # Branded title block.
    add_heading_centered(doc, "Amrita Manthana", font_size=20, bold=True)
    add_heading_centered(doc, "Prof.B.Santhi,SRC,SASTRA", font_size=14, bold=True)
    doc.add_paragraph("")
    p = doc.add_paragraph()
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    p.add_run("EDA Report").bold = True
    # Report metadata.
    doc.add_paragraph(f"Dataset: {dataset_name}")
    doc.add_paragraph(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    if id_col:
        doc.add_paragraph(f"Record ID column: {id_col}")
    doc.add_paragraph("")
    # Section 1: numeric statistics narrative + table (table capped at 25 rows).
    doc.add_paragraph("1) Numeric Statistics").runs[0].bold = True
    doc.add_paragraph(interpretation_numeric(stats_df))
    if not stats_df.empty:
        tdf = stats_df.head(25) if len(stats_df) > 25 else stats_df
        table = doc.add_table(rows=1, cols=len(tdf.columns))
        for j, col in enumerate(tdf.columns):
            table.rows[0].cells[j].text = str(col)
        for _, row in tdf.iterrows():
            cells = table.add_row().cells
            for j, col in enumerate(tdf.columns):
                val = row[col]
                if isinstance(val, float):
                    # Blank cell for NaN, 4-decimal formatting otherwise.
                    cells[j].text = "" if np.isnan(val) else f"{val:.4f}"
                else:
                    cells[j].text = str(val)
    doc.add_paragraph("")
    # Section 2: correlation summary text.
    doc.add_paragraph("2) Correlation").runs[0].bold = True
    doc.add_paragraph(correlation_interpretation(corr))
    doc.add_paragraph("")
    # Section 3: each chart image followed by its keyword-matched interpretation.
    doc.add_paragraph("3) Graphs & Interpretation").runs[0].bold = True
    for title, path in graph_paths:
        doc.add_paragraph("")
        doc.add_paragraph(title).runs[0].bold = True
        add_image(doc, path)
        if "Correlogram" in title:
            doc.add_paragraph("Interpretation: Values near +1/-1 indicate strong positive/negative association.")
        elif "Bar" in title:
            doc.add_paragraph("Interpretation: Taller bars indicate larger frequency/aggregate value.")
        elif "Pie" in title:
            doc.add_paragraph("Interpretation: Slice proportions show relative contribution of categories.")
        elif "Scatter" in title:
            doc.add_paragraph("Interpretation: Patterns indicate linear/non-linear trend, clustering, or outliers.")
        elif "Pair Plot" in title:
            doc.add_paragraph("Interpretation: Diagonal shows distributions; others show pairwise relationships/outliers.")
    doc.add_paragraph("")
    # Section 4: closing remarks, then save to a temp file for download.
    doc.add_paragraph("4) Final Remarks").runs[0].bold = True
    doc.add_paragraph("This report consolidates numeric measures, distributions, and relationships among attributes.")
    out_path = tempfile.NamedTemporaryFile(delete=False, suffix=".docx").name
    doc.save(out_path)
    return out_path
# ----------------------------
# App logic
# ----------------------------
def load_csv(file_obj, sep, header_flag):
    """Read the uploaded CSV and return (df, status, preview, all_cols, numeric_cols).

    Fix: the no-file and error paths previously returned None for the
    numeric-column list while the success path returned a list; the caller
    (after_load) then crashed on len(None). All paths now return [] so the
    dropdown-refresh code is safe.
    """
    if file_obj is None:
        return None, "Please upload a CSV.", None, [], []
    try:
        header = 0 if header_flag else None
        df = pd.read_csv(file_obj.name, sep=sep, header=header)
        if not header_flag:
            # Synthesize stable column names when no header row is present.
            df.columns = [f"col_{i+1}" for i in range(df.shape[1])]
        num_cols = find_numeric_columns(df)
        cols = df.columns.tolist()
        info = f"Loaded: rows={df.shape[0]}, cols={df.shape[1]}. Numeric cols={len(num_cols)}."
        return df, info, df.head(30), cols, num_cols
    except Exception as e:
        # Surface the parse error to the Status textbox instead of raising.
        return None, f"Could not read CSV: {e}", None, [], []
def eda_compute(df: pd.DataFrame):
    """Compute all EDA tables for the loaded DataFrame.

    Returns a 6-tuple matching the click handler's outputs:
    (interpretation, stats table, five-point table, skew table,
     correlation table, correlation text).
    Fix: both early-return paths previously produced only 5 values against
    6 Gradio outputs, which made the EDA button error out; every path now
    returns exactly 6 values.
    """
    if df is None:
        return "Upload a CSV first.", None, None, None, None, None
    num_cols = find_numeric_columns(df)
    if len(num_cols) == 0:
        return ("No numeric columns found.", pd.DataFrame(), pd.DataFrame(),
                pd.DataFrame(), pd.DataFrame(), "Correlation not available.")
    stats_df = basic_numeric_stats(df, num_cols)
    fps_df = five_point_summary_table(df, num_cols)
    corr_df = pd.DataFrame()
    corr_text = "Correlation not available (need at least 2 numeric columns)."
    if len(num_cols) >= 2:
        corr_df = df[num_cols].corr(numeric_only=True)
        corr_text = correlation_interpretation(corr_df)
    skew_df = pd.DataFrame({"Attribute": num_cols, "Skewness": [safe_skew(df[c]) for c in num_cols]})
    return interpretation_numeric(stats_df), stats_df, fps_df, skew_df, corr_df, corr_text
def graph_make(df: pd.DataFrame, barpie_col: str, topn: int,
               scatter_x: str, scatter_y: str, pair_max: int):
    """Render the five charts and return their PNG file paths.

    Returns (bar, pie, scatter, correlogram, pairplot); any element may be
    None when its chart could not be produced.
    Fixes: (1) barpie_col can be None or stale before data is (re)loaded —
    previously df[barpie_col] raised KeyError and the whole tab errored; the
    bar/pie charts are now skipped instead. (2) The correlogram/pair plot
    only need >= 2 numeric columns, so they no longer depend on the scatter
    column selection being valid.
    """
    if df is None:
        return None, None, None, None, None
    paths = [None, None, None, None, None]
    # --- Bar / Pie ---
    if barpie_col in df.columns:
        series = df[barpie_col]
        if pd.api.types.is_numeric_dtype(series):
            # Bin continuous values into 10 intervals so the charts stay readable.
            binned = pd.cut(series.dropna(), bins=10)
            counts = binned.value_counts().head(topn)
            labels = counts.index.astype(str).tolist()
            yvals = counts.values
            suffix = "(binned)"
        else:
            counts = series.astype(str).value_counts().head(topn)
            labels = counts.index.tolist()
            yvals = counts.values
            suffix = ""
        fig_bar, ax = plt.subplots(figsize=(7, 4))
        ax.bar(range(len(labels)), yvals)
        ax.set_xticks(range(len(labels)))
        ax.set_xticklabels(labels, rotation=45, ha="right")
        ax.set_title(f"Bar Chart: {barpie_col} {suffix}")
        ax.set_ylabel("Count")
        paths[0] = fig_to_png_path(fig_bar)
        fig_pie, ax2 = plt.subplots(figsize=(6, 4))
        ax2.pie(yvals, labels=labels, autopct="%1.1f%%")
        ax2.set_title(f"Pie Chart: {barpie_col} {suffix}")
        paths[1] = fig_to_png_path(fig_pie)
    # --- Scatter (needs valid numeric X/Y selections) ---
    num_cols = find_numeric_columns(df)
    if len(num_cols) >= 2 and scatter_x in num_cols and scatter_y in num_cols:
        fig_sc, ax3 = plt.subplots(figsize=(7, 4))
        ax3.scatter(df[scatter_x], df[scatter_y], alpha=0.7)
        ax3.set_xlabel(scatter_x)
        ax3.set_ylabel(scatter_y)
        ax3.set_title(f"Scatter: {scatter_x} vs {scatter_y}")
        paths[2] = fig_to_png_path(fig_sc)
    # --- Correlogram / Pair plot (need only >= 2 numeric columns) ---
    if len(num_cols) >= 2:
        corr = df[num_cols].corr(numeric_only=True)
        fig_corr = plot_correlogram_annotated(corr)
        paths[3] = fig_to_png_path(fig_corr)
        # Clamp pair-plot size between 2 and the number of numeric columns.
        pair_max = max(2, min(pair_max, len(num_cols)))
        grid = plot_pairplot(df, num_cols, max_cols=pair_max)
        if grid is not None:
            tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".png").name
            grid.fig.savefig(tmp, bbox_inches="tight", dpi=200)
            plt.close(grid.fig)
            paths[4] = tmp
    return tuple(paths)
def report_generate(df: pd.DataFrame, dataset_name: str, id_col: str,
                    barpie_col: str, topn: int, scatter_x: str, scatter_y: str, pair_max: int):
    """Render every chart, then assemble the DOCX report; returns (path, status)."""
    if df is None:
        return None, "Upload a CSV first."
    num_cols = find_numeric_columns(df)
    stats_df = basic_numeric_stats(df, num_cols) if len(num_cols) else pd.DataFrame()
    corr = df[num_cols].corr(numeric_only=True) if len(num_cols) >= 2 else pd.DataFrame()
    # Reuse the Graph-tab pipeline so report images match what the UI shows.
    images = graph_make(df, barpie_col, topn, scatter_x, scatter_y, pair_max)
    bar_path, pie_path, sc_path, corr_path, pair_path = images
    candidates = [
        (f"Bar Chart: {barpie_col}", bar_path),
        (f"Pie Chart: {barpie_col}", pie_path),
        (f"Scatter: {scatter_x} vs {scatter_y}", sc_path),
        ("Correlogram (Annotated)", corr_path),
        (f"Pair Plot (first {min(pair_max, len(num_cols))} numeric cols)", pair_path),
    ]
    graph_paths = [(title, img) for title, img in candidates if img]
    docx_path = build_docx_report(df, dataset_name or "uploaded.csv", id_col, stats_df, corr, graph_paths)
    # (optional) cleanup images later is fine; HF temp storage is ok for session
    return docx_path, "DOCX report generated successfully."
def search_record(df: pd.DataFrame, id_col: str, query: str):
    """Find rows whose *id_col* exactly matches *query*.

    For numeric columns the query is parsed as a float first; if parsing
    fails we fall back to string comparison. Returns (status message,
    matching DataFrame or None).
    Fix: the bare `except:` (which also swallowed KeyboardInterrupt and
    SystemExit) is narrowed to ValueError — the only exception float() can
    raise for a non-empty string — and the dead `result is None` check is gone.
    """
    if df is None:
        return "Upload a CSV first.", None
    if not query:
        return "Enter a value to search.", None
    if id_col not in df.columns:
        return "Select a valid ID column.", None
    col = df[id_col]
    if pd.api.types.is_numeric_dtype(col):
        try:
            q = float(query)
        except ValueError:
            # Non-numeric query against a numeric column: compare as strings.
            result = df[col.astype(str) == query]
        else:
            result = df[col == q]
    else:
        result = df[col.astype(str) == query]
    if result.empty:
        return "No matching record found.", pd.DataFrame()
    return f"Found {len(result)} record(s).", result
# ----------------------------
# UI
# ----------------------------
# Inline CSS injected into the Gradio page: centered title block with the
# app's brand color (#1E5AA8) for both title lines.
CSS = """
#titleblock {text-align:center; margin-top: 5px; margin-bottom: 10px;}
#t1 {font-size:30px; font-weight:800; color:#1E5AA8;}
#t2 {font-size:18px; font-weight:800; color:#1E5AA8;}
"""
def logo_html():
    """Build the centered title banner; includes the logo image when logo.png exists in the repo root."""
    # Gradio serves root files referenced as src="file=..." inside raw HTML.
    logo_tag = ""
    if os.path.exists("logo.png"):
        logo_tag = '<img src="file=logo.png" style="width:110px; display:block; margin:0 auto;" />\n    '
    return f"""
    <div id="titleblock">
    {logo_tag}<div id="t1">Amrita Manthana</div>
    <div id="t2">Prof.B.Santhi,SRC,SASTRA</div>
    </div>
    """
# Gradio UI: left pane uploads/previews the CSV; right pane holds the
# EDA / Graph / Report / Search tabs. Session state carries the DataFrame
# and cached column lists between event handlers.
with gr.Blocks(css=CSS, title="Amrita Manthana - EDA (Gradio)") as demo:
    gr.HTML(logo_html())
    # Per-session state.
    df_state = gr.State(None)
    cols_state = gr.State([])
    numcols_state = gr.State([])
    with gr.Row():
        with gr.Column(scale=1, min_width=320):
            gr.Markdown("### Data Upload")
            file_in = gr.File(label="Upload CSV", file_types=[".csv"])
            sep = gr.Dropdown(label="CSV Separator", choices=[",", ";", "\t", "|"], value=",")
            header_flag = gr.Checkbox(label="First row is header", value=True)
            load_btn = gr.Button("Load Data", variant="primary")
            load_msg = gr.Textbox(label="Status", interactive=False)
            preview = gr.Dataframe(label="Preview (first 30 rows)", interactive=False, wrap=True)
        with gr.Column(scale=2):
            with gr.Tabs():
                with gr.Tab("EDA"):
                    eda_btn = gr.Button("Compute EDA")
                    eda_note = gr.Textbox(label="Interpretation", lines=3, interactive=False)
                    stats_table = gr.Dataframe(label="Descriptive Statistics", interactive=False, wrap=True)
                    fps_table = gr.Dataframe(label="Five-Point Summary", interactive=False, wrap=True)
                    skew_table = gr.Dataframe(label="Skewness", interactive=False, wrap=True)
                    corr_table = gr.Dataframe(label="Correlation (numeric)", interactive=False, wrap=True)
                    corr_note = gr.Textbox(label="Correlation Interpretation", lines=2, interactive=False)
                with gr.Tab("Graph"):
                    gr.Markdown("#### Choose settings, then generate graphs")
                    barpie_col = gr.Dropdown(label="Column for Bar/Pie", choices=[], value=None)
                    topn = gr.Slider(label="Top-N categories", minimum=3, maximum=30, value=10, step=1)
                    scatter_x = gr.Dropdown(label="Scatter X (numeric)", choices=[], value=None)
                    scatter_y = gr.Dropdown(label="Scatter Y (numeric)", choices=[], value=None)
                    pair_max = gr.Slider(label="Pair plot max numeric columns", minimum=2, maximum=10, value=6, step=1)
                    graph_btn = gr.Button("Generate Graphs")
                    with gr.Row():
                        bar_img = gr.Image(label="Bar", type="filepath")
                        pie_img = gr.Image(label="Pie", type="filepath")
                    with gr.Row():
                        sc_img = gr.Image(label="Scatter", type="filepath")
                        corr_img = gr.Image(label="Correlogram (numbers inside)", type="filepath")
                        pair_img = gr.Image(label="Pair Plot", type="filepath")
                with gr.Tab("Report"):
                    gr.Markdown("#### DOCX report (includes all graphs + interpretations)")
                    id_col_rep = gr.Dropdown(label="Record ID column (for report/search)", choices=[], value=None)
                    rep_btn = gr.Button("Generate DOCX Report", variant="primary")
                    rep_status = gr.Textbox(label="Report Status", interactive=False)
                    rep_file = gr.File(label="Download Report (.docx)")
                with gr.Tab("Search"):
                    id_col_search = gr.Dropdown(label="Select ID column", choices=[], value=None)
                    query = gr.Textbox(label="Enter ID value (exact match)")
                    search_btn = gr.Button("Search")
                    search_msg = gr.Textbox(label="Search Status", interactive=False)
                    search_out = gr.Dataframe(label="Matching Records", interactive=False, wrap=True)

    # --- events ---
    def after_load(file_obj, sep_val, header_val):
        # Load the CSV, then refresh state, preview, and every column dropdown
        # in one pass (the output order must match the click's outputs= list).
        df, msg, prev, cols, numcols = load_csv(file_obj, sep_val, header_val)
        # for dropdowns
        return (
            df, cols, numcols,
            msg, prev,
            gr.update(choices=cols, value=(cols[0] if cols else None)),  # barpie_col
            gr.update(choices=numcols, value=(numcols[0] if len(numcols) else None)),  # scatter_x
            gr.update(choices=numcols, value=(numcols[1] if len(numcols) > 1 else None)),  # scatter_y
            gr.update(choices=cols, value=(cols[0] if cols else None)),  # id_col_rep
            gr.update(choices=cols, value=(cols[0] if cols else None)),  # id_col_search
        )

    load_btn.click(
        after_load,
        inputs=[file_in, sep, header_flag],
        outputs=[df_state, cols_state, numcols_state,
                 load_msg, preview,
                 barpie_col, scatter_x, scatter_y, id_col_rep, id_col_search]
    )
    eda_btn.click(
        eda_compute,
        inputs=[df_state],
        outputs=[eda_note, stats_table, fps_table, skew_table, corr_table, corr_note]
    )
    graph_btn.click(
        graph_make,
        inputs=[df_state, barpie_col, topn, scatter_x, scatter_y, pair_max],
        outputs=[bar_img, pie_img, sc_img, corr_img, pair_img]
    )

    def rep_run(df, idcol, barcol, topn_v, sx, sy, pmx, file_obj):
        # Use the uploaded file's name as the dataset title in the report.
        name = file_obj.name if file_obj is not None else "uploaded.csv"
        path, status = report_generate(df, name, idcol, barcol, topn_v, sx, sy, pmx)
        return status, path

    rep_btn.click(
        rep_run,
        inputs=[df_state, id_col_rep, barpie_col, topn, scatter_x, scatter_y, pair_max, file_in],
        outputs=[rep_status, rep_file]
    )
    search_btn.click(
        search_record,
        inputs=[df_state, id_col_search, query],
        outputs=[search_msg, search_out]
    )

if __name__ == "__main__":
    demo.launch()
# Pinned dependencies — this list belongs in a separate requirements.txt file,
# not in app.py (left here, commented out, so the module stays importable):
# gradio==4.44.1
# pandas==2.2.1
# numpy==1.26.4
# matplotlib==3.8.3
# seaborn==0.13.2
# scipy==1.12.0
# python-docx==1.1.0