Spaces:

SanthiSastra
/

Demo

Sleeping

App Files Files Community

Demo / app.py

SanthiSastra

Update app.py

09db78e verified 2 months ago

raw

history blame contribute delete

11.9 kB

	# app.py (Hugging Face Spaces + Gradio)
	# Requirements: gradio, pandas, numpy, matplotlib, python-docx, scikit-learn

	import os
	import numpy as np
	import pandas as pd
	import matplotlib.pyplot as plt
	import gradio as gr

	from docx import Document
	from docx.shared import Inches

	from sklearn.impute import SimpleImputer
	from sklearn.preprocessing import StandardScaler
	from sklearn.decomposition import PCA

	DOCX_OUT_PATH = "/tmp/EDA_Report.docx"


	# ----------------------------- Helpers -----------------------------
	def read_csv_safely(filepath: str) -> pd.DataFrame:
	    """Load a CSV file, retrying as Latin-1 if the default decoding fails.

	    Args:
	        filepath: Path of the CSV file to read.

	    Returns:
	        The parsed table as a DataFrame.
	    """
	    try:
	        frame = pd.read_csv(filepath)
	    except UnicodeDecodeError:
	        # Latin-1 assigns a character to every byte value, so the retry
	        # cannot fail with another decode error.
	        frame = pd.read_csv(filepath, encoding="latin1")
	    return frame


	def clean_df(df: pd.DataFrame) -> pd.DataFrame:
	    """Return a copy of *df* with normalized column names and no "unnamed" columns.

	    Column labels are converted to strings, stripped of surrounding
	    whitespace, and have spaces replaced by underscores. Columns whose
	    normalized name starts with "unnamed" (case-insensitive) are removed.

	    Args:
	        df: Input table; it is not modified.

	    Returns:
	        A new DataFrame with the cleaned column set.
	    """
	    cleaned = df.copy()
	    cleaned.columns = [str(c).strip().replace(" ", "_") for c in cleaned.columns]
	    # Select by position with one boolean mask instead of dropping labels one
	    # at a time: a label-based drop removes every column sharing that label,
	    # so a second drop of a duplicated label would raise KeyError.
	    keep = [not name.lower().startswith("unnamed") for name in cleaned.columns]
	    return cleaned.loc[:, keep]


	def save_plot(fig, out_path: str) -> str:
	    """Save *fig* to *out_path* and close the figure.

	    Args:
	        fig: Figure object exposing ``savefig``.
	        out_path: Destination image path.

	    Returns:
	        ``out_path``, so the call can be used directly where a path is needed.

	    Raises:
	        Whatever ``fig.savefig`` raises; the figure is still closed first.
	    """
	    try:
	        fig.savefig(out_path, dpi=180, bbox_inches="tight")
	    finally:
	        # Close on the failure path as well, so a failed save does not leave
	        # the figure open.
	        plt.close(fig)
	    return out_path


	def make_interpretation_notes(df: pd.DataFrame) -> str:
	    """Build a bulleted plain-text summary of basic findings about *df*.

	    The summary covers the dataset shape, up to five columns with the highest
	    share of missing values, up to five numeric features with |skew| > 1, and
	    up to five numeric column pairs with the largest absolute correlation.

	    Args:
	        df: Dataset to describe.

	    Returns:
	        A single string in which every note sits on its own line prefixed
	        with "• " (the string therefore starts with a newline).
	    """
	    notes = [f"Dataset has {df.shape[0]} rows and {df.shape[1]} columns."]

	    missing_pct = (df.isna().mean() * 100).sort_values(ascending=False)
	    top_missing = missing_pct[missing_pct > 0].head(5)
	    if top_missing.empty:
	        notes.append("No missing values detected.")
	    else:
	        notes.append(
	            "Top missing columns (%): "
	            + ", ".join(f"{k}={v:.1f}%" for k, v in top_missing.items())
	        )

	    num_df = df.select_dtypes(include=[np.number])
	    if num_df.shape[1] == 0:
	        notes.append("No numeric columns detected; plots and numeric summary will be limited.")
	        return "\n• " + "\n• ".join(notes)

	    skew = num_df.skew(numeric_only=True)
	    high_skew = skew[skew.abs() > 1].sort_values(key=lambda s: s.abs(), ascending=False).head(5)
	    if len(high_skew) > 0:
	        notes.append(
	            "Highly skewed numeric features (|skew|>1): "
	            + ", ".join(f"{k}={v:.2f}" for k, v in high_skew.items())
	            + ". Consider log/Box-Cox or robust scaling if needed."
	        )
	    else:
	        notes.append("No strongly skewed numeric features (|skew|>1) detected among numeric columns.")

	    if num_df.shape[1] >= 2:
	        corr = num_df.corr(numeric_only=True)
	        names = corr.columns
	        values = corr.to_numpy()
	        # Upper triangle without the diagonal: every unordered pair once,
	        # self-correlations excluded.
	        upper_rows, upper_cols = np.triu_indices(len(names), k=1)
	        # Skip undefined correlations (e.g. a constant column yields NaN);
	        # NaN keys make the sort order meaningless and would print "=nan".
	        pairs = [
	            (names[i], names[j], values[i, j])
	            for i, j in zip(upper_rows, upper_cols)
	            if not np.isnan(values[i, j])
	        ]
	        pairs.sort(key=lambda p: abs(p[2]), reverse=True)
	        strongest = pairs[:5]
	        if strongest:
	            notes.append(
	                "Top correlations (absolute): "
	                + ", ".join(f"{a}-{b}={c:.2f}" for a, b, c in strongest)
	            )

	    return "\n• " + "\n• ".join(notes)


	# ----------------------------- DOCX Report -----------------------------
	def make_docx_report(df: pd.DataFrame, fig_paths: list, notes: list) -> str:
	    """Write the EDA report for *df* to ``DOCX_OUT_PATH`` and return that path.

	    Sections, in order: dataset overview, column types (first 100 columns),
	    missing-value percentages (top 25), numeric summary statistics (first 30
	    features), and one interpretation paragraph plus picture per chart.

	    Args:
	        df: Dataset being reported on.
	        fig_paths: Image file paths, paired positionally with ``notes``.
	            A path that does not exist is skipped; its note is still written.
	        notes: Interpretation text for each chart.

	    Returns:
	        ``DOCX_OUT_PATH``.
	    """
	    report = Document()

	    def add_table(headers, body_rows):
	        """Append a table made of a header row followed by *body_rows*."""
	        table = report.add_table(rows=1, cols=len(headers))
	        for cell, label in zip(table.rows[0].cells, headers):
	            cell.text = label
	        for values in body_rows:
	            for cell, text in zip(table.add_row().cells, values):
	                cell.text = text

	    def fmt_stat(value):
	        """Round numeric values to six decimals; stringify everything else."""
	        if isinstance(value, (int, float, np.floating)):
	            return str(round(value, 6))
	        return str(value)

	    report.add_heading("EDA Report (Auto-generated)", level=1)

	    report.add_heading("Dataset Overview", level=2)
	    n_rows, n_cols = df.shape[0], df.shape[1]
	    report.add_paragraph(f"Rows: {n_rows}")
	    report.add_paragraph(f"Columns: {n_cols}")

	    report.add_heading("Column Types", level=2)
	    # dtypes are looked up for every column before truncating to 100 rows.
	    type_rows = [(str(col), str(df[col].dtype)) for col in df.columns][:100]
	    add_table(["column", "dtype"], type_rows)

	    report.add_heading("Missing Values", level=2)
	    missing_pct = (df.isna().mean() * 100).sort_values(ascending=False)
	    add_table(
	        ["column", "missing_%"],
	        [(str(label), f"{pct:.2f}") for label, pct in missing_pct.head(25).items()],
	    )
	    report.add_paragraph("Interpretation: Columns with high missing values may need imputation or removal.")

	    report.add_heading("Summary Statistics (Numeric)", level=2)
	    numeric = df.select_dtypes(include=[np.number])
	    if numeric.shape[1] == 0:
	        report.add_paragraph("No numeric columns found.")
	    else:
	        stats = numeric.describe().T.reset_index().rename(columns={"index": "feature"})
	        wanted = ["feature", "mean", "std", "min", "50%", "max"]
	        stat_cols = [name for name in wanted if name in stats.columns]
	        stat_rows = [
	            [fmt_stat(record[name]) for name in stat_cols]
	            for _, record in stats.head(30).iterrows()
	        ]
	        add_table(stat_cols, stat_rows)

	    report.add_heading("Charts + Interpretations", level=2)
	    for image_path, note in zip(fig_paths, notes):
	        report.add_paragraph(f"Interpretation: {note}")
	        if not os.path.exists(image_path):
	            continue
	        report.add_picture(image_path, width=Inches(6.5))

	    report.save(DOCX_OUT_PATH)
	    return DOCX_OUT_PATH


	# ----------------------------- EDA Pipeline -----------------------------
	def eda_pipeline(csv_path: str):
	    """Run the full EDA flow for one uploaded CSV and return the UI outputs.

	    Args:
	        csv_path: Filesystem path of the uploaded CSV; ``None`` or a blank
	            string means nothing was uploaded.

	    Returns:
	        A 9-tuple in the order the click handler's outputs are wired:
	        ``(status, preview, dtypes_df, miss_df, desc, docx_path, corr_plot,
	        hist_plot, interpretation)``. When no file is given or it cannot be
	        read, only ``status`` is populated. When the DOCX step fails,
	        ``docx_path`` is ``None`` and ``status`` carries the error text.
	    """
	    if csv_path is None or str(csv_path).strip() == "":
	        return _status_only("Please upload a CSV.")

	    try:
	        df = clean_df(read_csv_safely(csv_path))
	    except Exception as e:  # UI boundary: report any read/parse failure as status text
	        return _status_only(f"Could not read CSV: {e}")

	    preview = df.head(25)
	    dtypes_df = pd.DataFrame({"column": df.columns, "dtype": [str(df[c].dtype) for c in df.columns]})

	    # Column/feature names are moved from the index into a real column so
	    # they appear in the rendered tables.
	    # NOTE(review): relies on gr.Dataframe not rendering the pandas index - confirm.
	    miss_df = (
	        (df.isna().mean() * 100)
	        .sort_values(ascending=False)
	        .head(25)
	        .rename_axis("column")
	        .reset_index(name="missing_%")
	    )
	    num_df = df.select_dtypes(include=[np.number])
	    if num_df.shape[1] > 0:
	        desc = num_df.describe().T.rename_axis("feature").reset_index()
	    else:
	        desc = pd.DataFrame()

	    # NOTE(review): the image paths below (and the DOCX path) are fixed, so
	    # concurrent runs write to the same files - consider per-run temp dirs.
	    fig_paths, notes = [], []

	    # Plot 1: Missingness
	    fig_paths.append(save_plot(_missingness_figure(df), "/tmp/missingness.png"))
	    notes.append("High-missing columns may need imputation (median/mode) or removal based on usefulness.")

	    corr_plot = None
	    hist_plot = None

	    # Plot 2: Correlation
	    if num_df.shape[1] >= 2:
	        corr_plot = _correlation_figure(num_df)
	        fig_paths.append(save_plot(corr_plot, "/tmp/corr_heatmap.png"))
	        notes.append("Strong correlations may indicate redundant features; consider feature selection/regularization.")

	    # Plot 3: Histograms
	    if num_df.shape[1] > 0:
	        hist_plot = _histogram_figure(num_df)
	        fig_paths.append(save_plot(hist_plot, "/tmp/histograms.png"))
	        notes.append("Histograms show distribution/outliers/skewness; consider transforms for highly skewed features.")

	    interp = make_interpretation_notes(df)

	    # DOCX
	    try:
	        docx_path = make_docx_report(df, fig_paths, notes)
	    except Exception as e:  # keep tables/plots usable even if the report fails
	        docx_path = None
	        summary_text = f"Error while creating DOCX: {e}"
	    else:
	        summary_text = f"Loaded CSV successfully. Rows: {df.shape[0]} | Columns: {df.shape[1]}"

	    return summary_text, preview, dtypes_df, miss_df, desc, docx_path, corr_plot, hist_plot, interp


	def _status_only(message: str):
	    """Return the 9-slot output tuple with only the status text filled in."""
	    return message, None, None, None, None, None, None, None, ""


	# The figure helpers draw through explicit Figure/Axes objects rather than the
	# implicit pyplot "current figure", so overlapping runs cannot draw onto each
	# other's figure.
	def _missingness_figure(df: pd.DataFrame):
	    """Bar chart of the 15 columns with the highest missing-value percentage."""
	    top = (df.isna().mean() * 100).sort_values(ascending=False).head(15)
	    fig, ax = plt.subplots(figsize=(10, 4))
	    ax.bar(top.index.astype(str), top.values)
	    ax.set_title("Missing Values (%): Top 15 Columns")
	    ax.set_ylabel("Missing (%)")
	    plt.setp(ax.get_xticklabels(), rotation=45, ha="right", fontsize=7)
	    return fig


	def _correlation_figure(num_df: pd.DataFrame):
	    """Heatmap of the pairwise correlations between numeric columns."""
	    corr = num_df.corr(numeric_only=True)
	    fig, ax = plt.subplots(figsize=(10, 5))
	    image = ax.imshow(corr.values, aspect="auto")
	    ax.set_title("Correlation Heatmap (Numeric)")
	    ax.set_xticks(np.arange(len(corr.columns)))
	    ax.set_xticklabels(corr.columns, rotation=90, fontsize=7)
	    ax.set_yticks(np.arange(len(corr.index)))
	    ax.set_yticklabels(corr.index, fontsize=7)
	    fig.colorbar(image, ax=ax)
	    return fig


	def _histogram_figure(num_df: pd.DataFrame):
	    """2x2 grid of histograms for the first four numeric columns."""
	    fig = plt.figure(figsize=(10, 6))
	    for position, name in enumerate(list(num_df.columns)[:4], start=1):
	        ax = fig.add_subplot(2, 2, position)
	        ax.hist(num_df[name].dropna().values, bins=25)
	        ax.set_title(name, fontsize=9)
	    fig.suptitle("Histograms (first 4 numeric columns)", y=1.02)
	    fig.tight_layout()
	    return fig


	# ----------------------------- App UI (Beautiful College View) -----------------------------
	# Custom stylesheet handed to gr.Blocks(css=CSS). The #hdr / #appname / #appauth
	# ids and the .card / .hint classes are referenced by the layout code in this
	# file; #appsub is styled here but not used by any element visible in this file.
	CSS = """
	/* Center header */
	#hdr {text-align:center; margin-top:8px; margin-bottom:6px;}
	#appname {color:#0b3d91; font-weight:900; font-size:28px; margin:0;}
	#appsub {color:#0b3d91; font-weight:700; font-size:16px; margin-top:4px;}
	#appauth {color:#0b3d91; font-weight:700; font-size:14px; margin-top:2px;}

	/* Ribbon tabs */
	.gradio-container .tabs {border-radius:14px;}
	.gradio-container .tabitem {font-weight:800;}
	/* Card style */
	.card {border:1px solid rgba(148,163,184,.35); border-radius:18px; padding:14px; background:rgba(255,255,255,.92);}
	.hint {font-size:12px; color:#475569;}
	"""

	# Page layout: centered header, a left column with the upload controls, and a
	# right column whose tabs show the tables, plots, report download and notes.
	# NOTE(review): container nesting below was inferred from the section comments
	# and widget order - confirm it matches the intended layout.
	with gr.Blocks(
	    # Plain "|" separator; "\|" is not a valid escape and would put a
	    # literal backslash in the title.
	    title="SAMUDHRAMADANAM-AMIRTHAM1 | SASTRA",
	    theme=gr.themes.Soft(primary_hue="blue", secondary_hue="slate", neutral_hue="gray"),
	    css=CSS
	) as demo:

	    # ---------- Header (CENTER) ----------
	    with gr.Column(elem_id="hdr"):
	        # NOTE(review): presumably expects logo.jpg next to app.py - verify it ships with the app.
	        gr.Image(value="logo.jpg", show_label=False, height=120, container=False)
	        gr.Markdown("<div id='appname'>SAMUDHRAMADANAM-AMIRTHAM1</div>")
	        gr.Markdown("<div id='appauth'>Prof.B.Santhi, SRC, SASTRA</div>")

	    gr.Markdown("<hr>")

	    # ---------- Left controls + Ribbon outputs ----------
	    with gr.Row():
	        with gr.Column(scale=1, min_width=340):
	            with gr.Group(elem_classes="card"):
	                gr.Markdown("### Upload CSV")
	                gr.Markdown("<div class='hint'>Upload your dataset (CSV). Then run EDA to view tables, graphs and download report.</div>")
	                # type="filepath" matches eda_pipeline's csv_path argument.
	                file_in = gr.File(label="Upload CSV", file_types=[".csv"], type="filepath")
	                run_btn = gr.Button("Run EDA", variant="primary")
	                status = gr.Textbox(label="Status", lines=2)

	        with gr.Column(scale=2, min_width=520):
	            with gr.Tabs():
	                # Ribbon 1: EDA
	                with gr.TabItem("EDA"):
	                    with gr.Group(elem_classes="card"):
	                        preview_out = gr.Dataframe(label="Preview (first 25 rows)", interactive=False)
	                        dtypes_out = gr.Dataframe(label="Column Types", interactive=False)
	                    with gr.Group(elem_classes="card"):
	                        miss_out = gr.Dataframe(label="Missing Values (% top 25)", interactive=False)
	                        desc_out = gr.Dataframe(label="Numeric Summary (describe)", interactive=False)

	                # Ribbon 2: Graph
	                with gr.TabItem("Graph"):
	                    with gr.Group(elem_classes="card"):
	                        with gr.Row():
	                            corr_plot_out = gr.Plot(label="Correlation Heatmap")
	                            hist_plot_out = gr.Plot(label="Histograms")

	                # Ribbon 3: Report
	                with gr.TabItem("Report"):
	                    with gr.Group(elem_classes="card"):
	                        gr.Markdown("### Download Report")
	                        docx_out = gr.File(label="EDA Report (.docx)")

	                # Ribbon 4: Interpretation
	                with gr.TabItem("Interpretation"):
	                    with gr.Group(elem_classes="card"):
	                        interp_out = gr.Textbox(label="Auto Interpretation", lines=10)

	    # ---------- Wiring ----------
	    # The outputs list must stay in the same order as eda_pipeline's return tuple.
	    run_btn.click(
	        fn=eda_pipeline,
	        inputs=[file_in],
	        outputs=[status, preview_out, dtypes_out, miss_out, desc_out, docx_out, corr_plot_out, hist_plot_out, interp_out]
	    )

	demo.launch()