# ipa1 / app.py
# (Hugging Face page residue — original upload note: SanthiSastra, "Upload 3 files", commit ff6ae19 verified)
# -*- coding: utf-8 -*-
"""app.ipynb
Automatically generated by Colab.
Original file is located at
https://colab.research.google.com/drive/1zR9cbA-JzknrRBL2Y35IE1FKmyHJ7_ME
"""
import os
import io
import tempfile
from datetime import datetime
import numpy as np
import pandas as pd
import gradio as gr
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from docx import Document
from docx.shared import Inches, Pt
from docx.enum.text import WD_ALIGN_PARAGRAPH
# ----------------------------
# EDA helpers
# ----------------------------
def find_numeric_columns(df: pd.DataFrame):
    """Return the names of all numeric (int/float) columns of *df* as a list."""
    numeric = df.select_dtypes(include=[np.number])
    return list(numeric.columns)
def safe_skew(series: pd.Series):
    """Sample skewness of the non-null values; NaN when fewer than 3 points."""
    values = series.dropna()
    if len(values) >= 3:
        return float(values.skew())
    return np.nan
def safe_kurt(series: pd.Series):
    """Sample (excess) kurtosis of the non-null values; NaN when fewer than 4 points."""
    values = series.dropna()
    if len(values) >= 4:
        return float(values.kurt())
    return np.nan
def basic_numeric_stats(df: pd.DataFrame, num_cols: list[str]) -> pd.DataFrame:
    """Build one descriptive-statistics row per numeric column.

    Count/quantiles are computed on non-null values; statistics that need a
    minimum sample size (Std/Var need 2 points) come back as NaN otherwise.
    Skewness/Kurtosis delegate to safe_skew/safe_kurt, which apply their own
    minimum-size rules.
    """
    records = []
    for name in num_cols:
        raw = df[name]
        clean = raw.dropna()
        n = len(clean)
        has_data = n > 0
        records.append({
            "Attribute": name,
            "Count": int(n),
            "Missing": int(raw.isna().sum()),
            "Mean": float(clean.mean()) if has_data else np.nan,
            "Std": float(clean.std(ddof=1)) if n > 1 else np.nan,
            "Var": float(clean.var(ddof=1)) if n > 1 else np.nan,
            "Min": float(clean.min()) if has_data else np.nan,
            "25%": float(clean.quantile(0.25)) if has_data else np.nan,
            "Median": float(clean.median()) if has_data else np.nan,
            "75%": float(clean.quantile(0.75)) if has_data else np.nan,
            "Max": float(clean.max()) if has_data else np.nan,
            "Skewness": safe_skew(raw),
            "Kurtosis": safe_kurt(raw),
        })
    return pd.DataFrame(records)
def five_point_summary_table(df: pd.DataFrame, num_cols: list[str]) -> pd.DataFrame:
    """Return Min/Q1/Median/Q3/Max for each numeric column (NaN row when a column is all-null)."""
    summary_rows = []
    for name in num_cols:
        values = df[name].dropna()
        if values.empty:
            stats = dict.fromkeys(["Min", "Q1", "Median", "Q3", "Max"], np.nan)
        else:
            stats = {
                "Min": float(values.min()),
                "Q1": float(values.quantile(0.25)),
                "Median": float(values.quantile(0.5)),
                "Q3": float(values.quantile(0.75)),
                "Max": float(values.max()),
            }
        summary_rows.append({"Attribute": name, **stats})
    return pd.DataFrame(summary_rows)
def interpretation_numeric(stats_df: pd.DataFrame) -> str:
    """Turn the numeric-statistics table into a short narrative summary.

    Reports the attribute count, up to 3 attributes with the largest absolute
    skewness, and up to 3 attributes with the most missing values (or a
    "no missing values" note when there are none).
    """
    if stats_df.empty:
        return "No numeric attributes were detected in the uploaded dataset."
    sentences = [f"Numeric attributes detected: {len(stats_df)}."]
    skew = stats_df[["Attribute", "Skewness"]].dropna()
    ordering = skew["Skewness"].abs().sort_values(ascending=False).index
    top_skew = skew.reindex(ordering).head(3)
    if len(top_skew) > 0:
        described = ", ".join(
            f"{row.Attribute} (skew={row.Skewness:.2f})"
            for row in top_skew.itertuples(index=False)
        )
        sentences.append("Most skewed attributes (absolute skewness): " + described + ".")
    worst_missing = stats_df.sort_values("Missing", ascending=False).head(3)
    if worst_missing["Missing"].max() > 0:
        described = ", ".join(
            f"{row.Attribute} (missing={int(row.Missing)})"
            for row in worst_missing.itertuples(index=False)
        )
        sentences.append("Attributes with higher missing values: " + described + ".")
    else:
        sentences.append("No missing values were observed in numeric attributes.")
    return " ".join(sentences)
def correlation_interpretation(corr: pd.DataFrame) -> str:
    """Describe the strongest pairwise correlations in a correlation matrix.

    Fix: the previous implementation stacked the full (symmetric) matrix, so
    the "top 3" could contain both (a, b) and (b, a) — the same pair twice.
    Only the strict upper triangle is considered now, which also drops the
    diagonal without mutating the caller's DataFrame.
    """
    if corr is None or corr.empty:
        return "Correlation could not be computed (insufficient numeric attributes)."
    # Strict upper triangle: removes the diagonal and the mirrored duplicates.
    upper = np.triu(np.ones(corr.shape, dtype=bool), k=1)
    stacked = corr.where(upper).stack().dropna()
    if stacked.empty:
        return "No meaningful pairwise correlations were found."
    top = stacked.abs().sort_values(ascending=False).head(3)
    lines = []
    for (a, b), _ in top.items():
        val = float(corr.loc[a, b])
        sign = "positive" if val >= 0 else "negative"
        lines.append(f"{a} vs {b}: {val:.2f} ({sign})")
    return "Strongest correlations: " + "; ".join(lines) + "."
def fig_to_png_path(fig) -> str:
    """Save *fig* to a fresh temp PNG, close the figure, and return the file path."""
    with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as handle:
        out_path = handle.name
    fig.savefig(out_path, bbox_inches="tight", dpi=200)
    plt.close(fig)
    return out_path
def plot_correlogram_annotated(corr: pd.DataFrame, title="Correlogram (Annotated)"):
    """Draw a heatmap of *corr* with each coefficient printed inside its cell.

    NaN cells are left blank. Returns the matplotlib Figure (caller owns it).
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    values = corr.values
    heat = ax.imshow(values, aspect="auto")
    ax.set_title(title)
    ax.set_xticks(range(len(corr.columns)))
    ax.set_xticklabels(corr.columns, rotation=45, ha="right")
    ax.set_yticks(range(len(corr.index)))
    ax.set_yticklabels(corr.index)
    # Annotate every cell with its coefficient (blank for NaN).
    for row_idx, col_idx in np.ndindex(values.shape):
        cell = values[row_idx, col_idx]
        label = "" if np.isnan(cell) else f"{cell:.2f}"
        ax.text(col_idx, row_idx, label, ha="center", va="center", fontsize=8)
    fig.colorbar(heat, ax=ax, fraction=0.046, pad=0.04)
    fig.tight_layout()
    return fig
def plot_pairplot(df: pd.DataFrame, num_cols: list[str], max_cols=6):
    """Seaborn corner pair plot of up to *max_cols* numeric columns.

    Returns the PairGrid, or None when fewer than two usable columns exist.
    """
    selected = num_cols[:max_cols]
    if len(selected) < 2:
        return None
    grid = sns.pairplot(df[selected].dropna(), corner=True, diag_kind="hist")
    grid.fig.suptitle("Pair Plot", y=1.02)
    return grid
# ----------------------------
# DOCX helpers
# ----------------------------
def add_heading_centered(doc: Document, text: str, font_size=16, bold=True, color_rgb="1E5AA8"):
    """Append a centered heading paragraph to *doc* with the given size, weight and color.

    color_rgb is a hex RGB string without a leading '#'. The color is written
    through the run's low-level w:color XML element.
    NOTE(review): run.font.color.rgb = RGBColor.from_string(color_rgb) looks
    like the high-level equivalent — confirm before refactoring.
    """
    p = doc.add_paragraph()
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    run = p.add_run(text)
    run.bold = bold
    run.font.size = Pt(font_size)
    # set run color
    rPr = run._element.get_or_add_rPr()
    color = rPr.get_or_add_color()
    color.val = color_rgb
def add_image(doc: Document, image_path: str, width_inches=6.2):
    """Insert the image at *image_path* into *doc*, scaled to *width_inches* wide."""
    width = Inches(width_inches)
    doc.add_picture(image_path, width=width)
def build_docx_report(df: pd.DataFrame, dataset_name: str, id_col: str | None,
                      stats_df: pd.DataFrame, corr: pd.DataFrame,
                      graph_paths: list[tuple[str, str]]) -> str:
    """Assemble the EDA results into a .docx report and return its file path.

    graph_paths holds (section title, PNG path) pairs; the title text is
    keyword-matched below to attach a canned interpretation line to each image.
    NOTE(review): *df* is not read here — kept only for interface stability.
    """
    doc = Document()
    # Branded title block.
    add_heading_centered(doc, "Amrita Manthana", font_size=20, bold=True)
    add_heading_centered(doc, "Prof.B.Santhi,SRC,SASTRA", font_size=14, bold=True)
    doc.add_paragraph("")
    p = doc.add_paragraph()
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    p.add_run("EDA Report").bold = True
    # Report metadata.
    doc.add_paragraph(f"Dataset: {dataset_name}")
    doc.add_paragraph(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    if id_col:
        doc.add_paragraph(f"Record ID column: {id_col}")
    doc.add_paragraph("")
    # Section 1: numeric statistics narrative + table (table capped at 25 rows).
    doc.add_paragraph("1) Numeric Statistics").runs[0].bold = True
    doc.add_paragraph(interpretation_numeric(stats_df))
    if not stats_df.empty:
        tdf = stats_df.head(25) if len(stats_df) > 25 else stats_df
        table = doc.add_table(rows=1, cols=len(tdf.columns))
        for j, col in enumerate(tdf.columns):
            table.rows[0].cells[j].text = str(col)
        for _, row in tdf.iterrows():
            cells = table.add_row().cells
            for j, col in enumerate(tdf.columns):
                val = row[col]
                if isinstance(val, float):
                    # Blank cell for NaN, 4-decimal formatting otherwise.
                    cells[j].text = "" if np.isnan(val) else f"{val:.4f}"
                else:
                    cells[j].text = str(val)
    doc.add_paragraph("")
    # Section 2: correlation summary text.
    doc.add_paragraph("2) Correlation").runs[0].bold = True
    doc.add_paragraph(correlation_interpretation(corr))
    doc.add_paragraph("")
    # Section 3: each chart image followed by its keyword-matched interpretation.
    doc.add_paragraph("3) Graphs & Interpretation").runs[0].bold = True
    for title, path in graph_paths:
        doc.add_paragraph("")
        doc.add_paragraph(title).runs[0].bold = True
        add_image(doc, path)
        if "Correlogram" in title:
            doc.add_paragraph("Interpretation: Values near +1/-1 indicate strong positive/negative association.")
        elif "Bar" in title:
            doc.add_paragraph("Interpretation: Taller bars indicate larger frequency/aggregate value.")
        elif "Pie" in title:
            doc.add_paragraph("Interpretation: Slice proportions show relative contribution of categories.")
        elif "Scatter" in title:
            doc.add_paragraph("Interpretation: Patterns indicate linear/non-linear trend, clustering, or outliers.")
        elif "Pair Plot" in title:
            doc.add_paragraph("Interpretation: Diagonal shows distributions; others show pairwise relationships/outliers.")
    doc.add_paragraph("")
    # Section 4: closing remarks, then save to a temp file for download.
    doc.add_paragraph("4) Final Remarks").runs[0].bold = True
    doc.add_paragraph("This report consolidates numeric measures, distributions, and relationships among attributes.")
    out_path = tempfile.NamedTemporaryFile(delete=False, suffix=".docx").name
    doc.save(out_path)
    return out_path
# ----------------------------
# App logic
# ----------------------------
def load_csv(file_obj, sep, header_flag):
    """Read the uploaded CSV and return (df, status, preview, all_cols, numeric_cols).

    Fix: the no-file and error paths previously returned None for the
    numeric-column list while the success path returned a list; the caller
    (after_load) then crashed on len(None). All paths now return [] so the
    dropdown-refresh code is safe.
    """
    if file_obj is None:
        return None, "Please upload a CSV.", None, [], []
    try:
        header = 0 if header_flag else None
        df = pd.read_csv(file_obj.name, sep=sep, header=header)
        if not header_flag:
            # Synthesize stable column names when no header row is present.
            df.columns = [f"col_{i+1}" for i in range(df.shape[1])]
        num_cols = find_numeric_columns(df)
        cols = df.columns.tolist()
        info = f"Loaded: rows={df.shape[0]}, cols={df.shape[1]}. Numeric cols={len(num_cols)}."
        return df, info, df.head(30), cols, num_cols
    except Exception as e:
        # Surface the parse error to the Status textbox instead of raising.
        return None, f"Could not read CSV: {e}", None, [], []
def eda_compute(df: pd.DataFrame):
    """Compute all EDA tables for the loaded DataFrame.

    Returns a 6-tuple matching the click handler's outputs:
    (interpretation, stats table, five-point table, skew table,
     correlation table, correlation text).
    Fix: both early-return paths previously produced only 5 values against
    6 Gradio outputs, which made the EDA button error out; every path now
    returns exactly 6 values.
    """
    if df is None:
        return "Upload a CSV first.", None, None, None, None, None
    num_cols = find_numeric_columns(df)
    if len(num_cols) == 0:
        return ("No numeric columns found.", pd.DataFrame(), pd.DataFrame(),
                pd.DataFrame(), pd.DataFrame(), "Correlation not available.")
    stats_df = basic_numeric_stats(df, num_cols)
    fps_df = five_point_summary_table(df, num_cols)
    corr_df = pd.DataFrame()
    corr_text = "Correlation not available (need at least 2 numeric columns)."
    if len(num_cols) >= 2:
        corr_df = df[num_cols].corr(numeric_only=True)
        corr_text = correlation_interpretation(corr_df)
    skew_df = pd.DataFrame({"Attribute": num_cols, "Skewness": [safe_skew(df[c]) for c in num_cols]})
    return interpretation_numeric(stats_df), stats_df, fps_df, skew_df, corr_df, corr_text
def graph_make(df: pd.DataFrame, barpie_col: str, topn: int,
               scatter_x: str, scatter_y: str, pair_max: int):
    """Render the five charts and return their PNG file paths.

    Returns (bar, pie, scatter, correlogram, pairplot); any element may be
    None when its chart could not be produced.
    Fixes: (1) barpie_col can be None or stale before data is (re)loaded —
    previously df[barpie_col] raised KeyError and the whole tab errored; the
    bar/pie charts are now skipped instead. (2) The correlogram/pair plot
    only need >= 2 numeric columns, so they no longer depend on the scatter
    column selection being valid.
    """
    if df is None:
        return None, None, None, None, None
    paths = [None, None, None, None, None]
    # --- Bar / Pie ---
    if barpie_col in df.columns:
        series = df[barpie_col]
        if pd.api.types.is_numeric_dtype(series):
            # Bin continuous values into 10 intervals so the charts stay readable.
            binned = pd.cut(series.dropna(), bins=10)
            counts = binned.value_counts().head(topn)
            labels = counts.index.astype(str).tolist()
            yvals = counts.values
            suffix = "(binned)"
        else:
            counts = series.astype(str).value_counts().head(topn)
            labels = counts.index.tolist()
            yvals = counts.values
            suffix = ""
        fig_bar, ax = plt.subplots(figsize=(7, 4))
        ax.bar(range(len(labels)), yvals)
        ax.set_xticks(range(len(labels)))
        ax.set_xticklabels(labels, rotation=45, ha="right")
        ax.set_title(f"Bar Chart: {barpie_col} {suffix}")
        ax.set_ylabel("Count")
        paths[0] = fig_to_png_path(fig_bar)
        fig_pie, ax2 = plt.subplots(figsize=(6, 4))
        ax2.pie(yvals, labels=labels, autopct="%1.1f%%")
        ax2.set_title(f"Pie Chart: {barpie_col} {suffix}")
        paths[1] = fig_to_png_path(fig_pie)
    # --- Scatter (needs valid numeric X/Y selections) ---
    num_cols = find_numeric_columns(df)
    if len(num_cols) >= 2 and scatter_x in num_cols and scatter_y in num_cols:
        fig_sc, ax3 = plt.subplots(figsize=(7, 4))
        ax3.scatter(df[scatter_x], df[scatter_y], alpha=0.7)
        ax3.set_xlabel(scatter_x)
        ax3.set_ylabel(scatter_y)
        ax3.set_title(f"Scatter: {scatter_x} vs {scatter_y}")
        paths[2] = fig_to_png_path(fig_sc)
    # --- Correlogram / Pair plot (need only >= 2 numeric columns) ---
    if len(num_cols) >= 2:
        corr = df[num_cols].corr(numeric_only=True)
        fig_corr = plot_correlogram_annotated(corr)
        paths[3] = fig_to_png_path(fig_corr)
        # Clamp pair-plot size between 2 and the number of numeric columns.
        pair_max = max(2, min(pair_max, len(num_cols)))
        grid = plot_pairplot(df, num_cols, max_cols=pair_max)
        if grid is not None:
            tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".png").name
            grid.fig.savefig(tmp, bbox_inches="tight", dpi=200)
            plt.close(grid.fig)
            paths[4] = tmp
    return tuple(paths)
def report_generate(df: pd.DataFrame, dataset_name: str, id_col: str,
                    barpie_col: str, topn: int, scatter_x: str, scatter_y: str, pair_max: int):
    """Render every chart, then assemble the DOCX report; returns (path, status)."""
    if df is None:
        return None, "Upload a CSV first."
    num_cols = find_numeric_columns(df)
    stats_df = basic_numeric_stats(df, num_cols) if len(num_cols) else pd.DataFrame()
    corr = df[num_cols].corr(numeric_only=True) if len(num_cols) >= 2 else pd.DataFrame()
    # Reuse the Graph-tab pipeline so report images match what the UI shows.
    images = graph_make(df, barpie_col, topn, scatter_x, scatter_y, pair_max)
    bar_path, pie_path, sc_path, corr_path, pair_path = images
    candidates = [
        (f"Bar Chart: {barpie_col}", bar_path),
        (f"Pie Chart: {barpie_col}", pie_path),
        (f"Scatter: {scatter_x} vs {scatter_y}", sc_path),
        ("Correlogram (Annotated)", corr_path),
        (f"Pair Plot (first {min(pair_max, len(num_cols))} numeric cols)", pair_path),
    ]
    graph_paths = [(title, img) for title, img in candidates if img]
    docx_path = build_docx_report(df, dataset_name or "uploaded.csv", id_col, stats_df, corr, graph_paths)
    # (optional) cleanup images later is fine; HF temp storage is ok for session
    return docx_path, "DOCX report generated successfully."
def search_record(df: pd.DataFrame, id_col: str, query: str):
    """Find rows whose *id_col* exactly matches *query*.

    For numeric columns the query is parsed as a float first; if parsing
    fails we fall back to string comparison. Returns (status message,
    matching DataFrame or None).
    Fix: the bare `except:` (which also swallowed KeyboardInterrupt and
    SystemExit) is narrowed to ValueError — the only exception float() can
    raise for a non-empty string — and the dead `result is None` check is gone.
    """
    if df is None:
        return "Upload a CSV first.", None
    if not query:
        return "Enter a value to search.", None
    if id_col not in df.columns:
        return "Select a valid ID column.", None
    col = df[id_col]
    if pd.api.types.is_numeric_dtype(col):
        try:
            q = float(query)
        except ValueError:
            # Non-numeric query against a numeric column: compare as strings.
            result = df[col.astype(str) == query]
        else:
            result = df[col == q]
    else:
        result = df[col.astype(str) == query]
    if result.empty:
        return "No matching record found.", pd.DataFrame()
    return f"Found {len(result)} record(s).", result
# ----------------------------
# UI
# ----------------------------
# Inline CSS injected into the Gradio page: centered title block with the
# app's brand color (#1E5AA8) for both title lines.
CSS = """
#titleblock {text-align:center; margin-top: 5px; margin-bottom: 10px;}
#t1 {font-size:30px; font-weight:800; color:#1E5AA8;}
#t2 {font-size:18px; font-weight:800; color:#1E5AA8;}
"""
def logo_html():
    """Build the centered title banner; includes the logo image when logo.png exists in the repo root."""
    # Gradio serves root files referenced as src="file=..." inside raw HTML.
    logo_tag = ""
    if os.path.exists("logo.png"):
        logo_tag = '<img src="file=logo.png" style="width:110px; display:block; margin:0 auto;" />\n    '
    return f"""
    <div id="titleblock">
    {logo_tag}<div id="t1">Amrita Manthana</div>
    <div id="t2">Prof.B.Santhi,SRC,SASTRA</div>
    </div>
    """
# Gradio UI: left pane uploads/previews the CSV; right pane holds the
# EDA / Graph / Report / Search tabs. Session state carries the DataFrame
# and cached column lists between event handlers.
with gr.Blocks(css=CSS, title="Amrita Manthana - EDA (Gradio)") as demo:
    gr.HTML(logo_html())
    # Per-session state.
    df_state = gr.State(None)
    cols_state = gr.State([])
    numcols_state = gr.State([])
    with gr.Row():
        with gr.Column(scale=1, min_width=320):
            gr.Markdown("### Data Upload")
            file_in = gr.File(label="Upload CSV", file_types=[".csv"])
            sep = gr.Dropdown(label="CSV Separator", choices=[",", ";", "\t", "|"], value=",")
            header_flag = gr.Checkbox(label="First row is header", value=True)
            load_btn = gr.Button("Load Data", variant="primary")
            load_msg = gr.Textbox(label="Status", interactive=False)
            preview = gr.Dataframe(label="Preview (first 30 rows)", interactive=False, wrap=True)
        with gr.Column(scale=2):
            with gr.Tabs():
                with gr.Tab("EDA"):
                    eda_btn = gr.Button("Compute EDA")
                    eda_note = gr.Textbox(label="Interpretation", lines=3, interactive=False)
                    stats_table = gr.Dataframe(label="Descriptive Statistics", interactive=False, wrap=True)
                    fps_table = gr.Dataframe(label="Five-Point Summary", interactive=False, wrap=True)
                    skew_table = gr.Dataframe(label="Skewness", interactive=False, wrap=True)
                    corr_table = gr.Dataframe(label="Correlation (numeric)", interactive=False, wrap=True)
                    corr_note = gr.Textbox(label="Correlation Interpretation", lines=2, interactive=False)
                with gr.Tab("Graph"):
                    gr.Markdown("#### Choose settings, then generate graphs")
                    barpie_col = gr.Dropdown(label="Column for Bar/Pie", choices=[], value=None)
                    topn = gr.Slider(label="Top-N categories", minimum=3, maximum=30, value=10, step=1)
                    scatter_x = gr.Dropdown(label="Scatter X (numeric)", choices=[], value=None)
                    scatter_y = gr.Dropdown(label="Scatter Y (numeric)", choices=[], value=None)
                    pair_max = gr.Slider(label="Pair plot max numeric columns", minimum=2, maximum=10, value=6, step=1)
                    graph_btn = gr.Button("Generate Graphs")
                    with gr.Row():
                        bar_img = gr.Image(label="Bar", type="filepath")
                        pie_img = gr.Image(label="Pie", type="filepath")
                    with gr.Row():
                        sc_img = gr.Image(label="Scatter", type="filepath")
                        corr_img = gr.Image(label="Correlogram (numbers inside)", type="filepath")
                        pair_img = gr.Image(label="Pair Plot", type="filepath")
                with gr.Tab("Report"):
                    gr.Markdown("#### DOCX report (includes all graphs + interpretations)")
                    id_col_rep = gr.Dropdown(label="Record ID column (for report/search)", choices=[], value=None)
                    rep_btn = gr.Button("Generate DOCX Report", variant="primary")
                    rep_status = gr.Textbox(label="Report Status", interactive=False)
                    rep_file = gr.File(label="Download Report (.docx)")
                with gr.Tab("Search"):
                    id_col_search = gr.Dropdown(label="Select ID column", choices=[], value=None)
                    query = gr.Textbox(label="Enter ID value (exact match)")
                    search_btn = gr.Button("Search")
                    search_msg = gr.Textbox(label="Search Status", interactive=False)
                    search_out = gr.Dataframe(label="Matching Records", interactive=False, wrap=True)

    # --- events ---
    def after_load(file_obj, sep_val, header_val):
        # Load the CSV, then refresh state, preview, and every column dropdown
        # in one pass (the output order must match the click's outputs= list).
        df, msg, prev, cols, numcols = load_csv(file_obj, sep_val, header_val)
        # for dropdowns
        return (
            df, cols, numcols,
            msg, prev,
            gr.update(choices=cols, value=(cols[0] if cols else None)),  # barpie_col
            gr.update(choices=numcols, value=(numcols[0] if len(numcols) else None)),  # scatter_x
            gr.update(choices=numcols, value=(numcols[1] if len(numcols) > 1 else None)),  # scatter_y
            gr.update(choices=cols, value=(cols[0] if cols else None)),  # id_col_rep
            gr.update(choices=cols, value=(cols[0] if cols else None)),  # id_col_search
        )

    load_btn.click(
        after_load,
        inputs=[file_in, sep, header_flag],
        outputs=[df_state, cols_state, numcols_state,
                 load_msg, preview,
                 barpie_col, scatter_x, scatter_y, id_col_rep, id_col_search]
    )
    eda_btn.click(
        eda_compute,
        inputs=[df_state],
        outputs=[eda_note, stats_table, fps_table, skew_table, corr_table, corr_note]
    )
    graph_btn.click(
        graph_make,
        inputs=[df_state, barpie_col, topn, scatter_x, scatter_y, pair_max],
        outputs=[bar_img, pie_img, sc_img, corr_img, pair_img]
    )

    def rep_run(df, idcol, barcol, topn_v, sx, sy, pmx, file_obj):
        # Use the uploaded file's name as the dataset title in the report.
        name = file_obj.name if file_obj is not None else "uploaded.csv"
        path, status = report_generate(df, name, idcol, barcol, topn_v, sx, sy, pmx)
        return status, path

    rep_btn.click(
        rep_run,
        inputs=[df_state, id_col_rep, barpie_col, topn, scatter_x, scatter_y, pair_max, file_in],
        outputs=[rep_status, rep_file]
    )
    search_btn.click(
        search_record,
        inputs=[df_state, id_col_search, query],
        outputs=[search_msg, search_out]
    )

if __name__ == "__main__":
    demo.launch()
# Pinned dependencies — this list belongs in a separate requirements.txt file,
# not in app.py (left here, commented out, so the module stays importable):
# gradio==4.44.1
# pandas==2.2.1
# numpy==1.26.4
# matplotlib==3.8.3
# seaborn==0.13.2
# scipy==1.12.0
# python-docx==1.1.0