Spaces:

venkatl
/

anemia_analysis

Sleeping

App Files Files Community

anemia_analysis / app.py

venkatl

Upload folder using huggingface_hub

5a377cd verified 2 months ago

raw

history blame contribute delete

11.7 kB

	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	import seaborn as sns
	import gradio as gr
	from io import BytesIO
	import base64
	import random
	import scipy.stats as ss

	from PIL import Image

	def fig_to_pil(fig):
	buf = BytesIO()
	fig.savefig(buf, format="png", bbox_inches="tight")
	buf.seek(0)
	return Image.open(buf)

	# -----------------------------
	# Load Data
	# -----------------------------
	url = "https://docs.google.com/spreadsheets/d/1SE2Q7nuL-xCp52V8w5onlzySU4udZoqNUY2kGT3WV2s/export?format=xlsx"
	df = pd.read_excel(url, sheet_name="Final")

	numeric_cols = df.select_dtypes(include=['number']).columns
	categorical_cols = df.select_dtypes(include=['object', 'category']).columns

	# ----------------------------------------------------
	# 1. Descriptive Statistics
	# ----------------------------------------------------
	def get_descriptive_stats():
	stats = df[numeric_cols].describe().T
	stats = stats.reset_index().rename(columns={"index": "Feature"})
	return stats

	def download_keyword_counts(df):
	r = random.randint(1,1000)
	path = f"keyword_counts_{r}.csv"
	df.to_csv(path, index=False)
	return path

	# ----------------------------------------------------
	# 2. Keyword Frequency Table + Plots
	# ----------------------------------------------------

	def keyword_frequency(column):
	series = df[column].dropna().astype(str).str.split(',').explode().str.strip()
	counts = series.value_counts().reset_index()
	counts.columns = ["Keyword", "Count"]

	# --- BAR CHART (matplotlib → PIL) ---
	fig_bar, ax_bar = plt.subplots(figsize=(8,4))
	ax_bar.bar(counts["Keyword"].head(15), counts["Count"].head(15))
	ax_bar.set_title(f"Top Keywords in {column}")
	ax_bar.set_xticklabels(counts["Keyword"].head(15), rotation=45, ha='right')
	bar_img = fig_to_pil(fig_bar)
	plt.close(fig_bar)

	# --- PIE CHART (matplotlib → PIL) ---
	fig_pie, ax_pie = plt.subplots(figsize=(6,6))
	ax_pie.pie(
	counts["Count"].head(10),
	labels=counts["Keyword"].head(10),
	autopct="%1.1f%%"
	)
	ax_pie.set_title(f"Pie Chart {column} (Distribution)")
	pie_img = fig_to_pil(fig_pie)
	plt.close(fig_pie)

	# --- HORIZONTAL BAR CHART ---
	fig_hbar, ax_hbar = plt.subplots(figsize=(8,6))
	ax_hbar.barh(counts["Keyword"].head(15), counts["Count"].head(15))
	ax_hbar.set_title(f"Top Keywords in {column} (Horizontal Bar)")
	plt.tight_layout()
	hbar_img = fig_to_pil(fig_hbar)
	plt.close(fig_hbar)

	# --- PARETO CHART (80/20) ---
	counts_sorted = counts.sort_values("Count", ascending=False)
	cum_percentage = (counts_sorted["Count"].cumsum() / counts_sorted["Count"].sum()) * 100

	fig_pareto, ax1 = plt.subplots(figsize=(8,4))
	ax1.bar(counts_sorted["Keyword"].head(15), counts_sorted["Count"].head(15), color='skyblue')
	ax2 = ax1.twinx()
	ax2.plot(counts_sorted["Keyword"].head(15), cum_percentage.head(15), color='red', marker="o")
	ax1.set_xticklabels(counts_sorted["Keyword"].head(15), rotation=45, ha='right')
	ax1.set_title(f"Pareto Analysis of {column}")
	pareto_img = fig_to_pil(fig_pareto)
	plt.close(fig_pareto)

	# --- SCATTER PLOT (Rank vs Frequency) ---
	counts["Rank"] = range(1, len(counts) + 1)

	fig_scatter, ax_scatter = plt.subplots(figsize=(6,4))
	ax_scatter.scatter(counts["Rank"], counts["Count"])
	ax_scatter.set_title(f"Rank vs Frequency for {column}")
	ax_scatter.set_xlabel("Rank (1 = most common)")
	ax_scatter.set_ylabel("Frequency")
	scatter_img = fig_to_pil(fig_scatter)
	plt.close(fig_scatter)

	# --- CUMULATIVE DISTRIBUTION PLOT ---
	fig_cum, ax_cum = plt.subplots(figsize=(6,4))
	ax_cum.plot(cum_percentage.values)
	ax_cum.set_title(f"Cumulative Distribution of {column}")
	ax_cum.set_ylabel("Cumulative %")
	ax_cum.set_xlabel("Keyword Rank")
	cum_img = fig_to_pil(fig_cum)
	plt.close(fig_cum)


	return counts, bar_img, pie_img, hbar_img, pareto_img, scatter_img, cum_img

	# ----------------------------------------------------
	# 3. Correlation Explorer
	# ----------------------------------------------------
	def explore_two_columns(col1, col2):
	c1 = df[col1]
	c2 = df[col2]

	images = []
	result_text = ""

	# NUMERIC vs NUMERIC
	if col1 in numeric_cols and col2 in numeric_cols:
	# Pearson
	corr = c1.corr(c2)
	result_text = f"Pearson Correlation = {corr:.4f}"

	# Scatter
	fig, ax = plt.subplots(figsize=(6,4))
	ax.scatter(c1, c2)
	ax.set_xlabel(col1)
	ax.set_ylabel(col2)
	ax.set_title(f"{col1} vs {col2} (Scatter)")
	images.append(fig_to_pil(fig))
	plt.close(fig)

	# Regression
	fig, ax = plt.subplots(figsize=(6,4))
	sns.regplot(x=c1, y=c2, ax=ax)
	ax.set_title("Regression Line")
	images.append(fig_to_pil(fig))
	plt.close(fig)

	# Distributions
	fig, ax = plt.subplots(figsize=(6,4))
	sns.histplot(c1, color="blue", kde=True, label=col1)
	sns.histplot(c2, color="orange", kde=True, label=col2)
	ax.legend()
	ax.set_title("Distribution Comparison")
	images.append(fig_to_pil(fig))
	plt.close(fig)

	print(result_text)
	return result_text, None, images[0], images[1], images[2]

	# CATEGORICAL vs CATEGORICAL
	if col1 in categorical_cols and col2 in categorical_cols:
	confusion = pd.crosstab(c1, c2)
	v = cramers_v(confusion)
	result_text = f"Cramér’s V = {v:.4f}"

	conf = pd.crosstab(c1,c2, margins=True, margins_name="Total")
	conf_table = conf.reset_index()
	conf_table.columns = ["Category_1"] + list(conf.columns)

	# Heatmap
	fig, ax = plt.subplots(figsize=(6,4))
	sns.heatmap(confusion, cmap="Blues", annot=True, fmt="d")
	ax.set_title("Crosstab Heatmap")
	images.append(fig_to_pil(fig))
	plt.close(fig)

	# Bar chart
	fig, ax = plt.subplots(figsize=(6,4))
	confusion.sum(axis=1).plot(kind='bar', ax=ax)
	ax.set_title(f"Correlation between {col1} and {col2}")
	images.append(fig_to_pil(fig))
	plt.close(fig)

	print(result_text)
	return result_text, conf_table, images[0], images[1], None

	# MIXED TYPES (numeric + categorical)
	# Ensure correct assignment
	if col1 in categorical_cols and col2 in numeric_cols:
	cat = col1; num = col2
	else:
	cat = col2; num = col1

	result_text = f"Numeric vs Categorical Analysis ({num} by {cat})"

	# Boxplot
	fig, ax = plt.subplots(figsize=(6,4))
	sns.boxplot(x=df[cat], y=df[num], ax=ax)
	ax.set_title("Boxplot")
	plt.xticks(rotation=45, ha='right')
	images.append(fig_to_pil(fig))
	plt.close(fig)

	# Violin plot
	fig, ax = plt.subplots(figsize=(6,4))
	sns.violinplot(x=df[cat], y=df[num], ax=ax)
	ax.set_title("Violin Plot")
	plt.xticks(rotation=45, ha='right')
	images.append(fig_to_pil(fig))
	plt.close(fig)

	print(result_text)
	return result_text, None, images[0], images[1], None

	def cramers_v(confusion_matrix):
	""" Cramér's V for categorical correlation """
	chi2 = ss.chi2_contingency(confusion_matrix)[0]
	n = confusion_matrix.sum().sum()
	r, k = confusion_matrix.shape
	return np.sqrt(chi2 / (n * (min(r, k) - 1)))


	def compute_correlation(col1, col2):
	c1 = df[col1]
	c2 = df[col2]

	# Case 1: numeric vs numeric
	if col1 in numeric_cols and col2 in numeric_cols:
	corr = c1.corr(c2)
	return f"Pearson Correlation = {corr:.4f}", None

	# Case 2: categorical vs categorical → Cramér’s V
	if col1 in categorical_cols and col2 in categorical_cols:
	confusion = pd.crosstab(c1, c2)
	v = cramers_v(confusion)
	return f"Cramér’s V = {v:.4f}", confusion

	# Case 3: keyword frequency vs numeric/categorical
	# Convert col1 or col2 (if comma-separated) into frequency counts
	def keyword_expand(col):
	return df[col].dropna().astype(str).str.split(',').explode().str.strip()

	if col1 in categorical_cols:
	k = keyword_expand(col1)
	k_counts = k.value_counts()
	df_k = df.assign(**{f"{col1}_KEYWORD_COUNTS": df[col1].fillna("").apply(
	lambda x: sum([k_counts.get(i.strip(), 0) for i in x.split(',') if i.strip()])
	)})
	c1 = df_k[f"{col1}_KEYWORD_COUNTS"]

	if col2 in categorical_cols:
	k = keyword_expand(col2)
	k_counts = k.value_counts()
	df_k = df.assign(**{f"{col2}_KEYWORD_COUNTS": df[col2].fillna("").apply(
	lambda x: sum([k_counts.get(i.strip(), 0) for i in x.split(',') if i.strip()])
	)})
	c2 = df_k[f"{col2}_KEYWORD_COUNTS"]

	corr = c1.corr(c2)
	return f"Keyword-Frequency Based Correlation = {corr:.4f}", None


	# ----------------------------------------------------
	# Gradio UI
	# ----------------------------------------------------
	with gr.Blocks(title="DATA ANALYSIS APP") as app:

	gr.Markdown("# 📊 Youth Nutritional Data Analysis System \nUpload → Analyse → Export\n Developed by Dr. Indira Priyadarsini")
	with gr.Tab("ℹ️ About & Citation"):
	from a import citation
	gr.Markdown(citation)
	with gr.Tab("1️⃣ Descriptive Statistics"):
	btn_stats = gr.Button("Generate Stats")
	stats_out = gr.Dataframe()
	btn_stats.click(get_descriptive_stats, outputs=stats_out)

	with gr.Tab("2️⃣ Keyword Frequency Explorer"):
	col_select = gr.Dropdown(choices=list(categorical_cols), label="Select Column")
	freq_table = gr.Dataframe(label="Keyword Counts")
	bar_plot = gr.Image(label="Bar Chart")
	pie_img = gr.Image(label="Pie Chart")
	hbar_img = gr.Image(label="Horizontal Bar Chart")
	pareto_img = gr.Image(label="Pareto Chart")
	scatter_img = gr.Image(label="Rank vs Frequency Scatter")
	cum_img = gr.Image(label="Cumulative Distribution")

	download_btn = gr.Button("Download as CSV")
	download_file = gr.File(label="Download File")
	col_select.change(keyword_frequency,
	inputs=col_select,
	outputs=[freq_table, bar_plot, pie_img,hbar_img, pareto_img, scatter_img, cum_img])
	download_btn.click(
	download_keyword_counts,
	inputs=freq_table,
	outputs=download_file
	)

	with gr.Tab("4️⃣ Two-Column Relationship Explorer"):
	colA = gr.Dropdown(choices=df.columns.tolist(), label="Column A")
	colB = gr.Dropdown(choices=df.columns.tolist(), label="Column B")
	btn_rel = gr.Button("Explore Relationship")

	rel_text = gr.Textbox(label="Summary")
	rel_table = gr.Dataframe(label="Crosstab (if categorical)")
	rel_img1 = gr.Image()
	rel_img2 = gr.Image()
	rel_img3 = gr.Image()

	btn_rel.click(
	explore_two_columns,
	inputs=[colA, colB],
	outputs=[rel_text, rel_table, rel_img1, rel_img2, rel_img3]
	)

	with gr.Tab("3️⃣ Correlation Explorer"):
	col_select = gr.Dropdown(choices=categorical_cols.tolist(), label="Select Column")
	col1 = gr.Dropdown(choices=df.columns.tolist(), label="Column 1")
	col2 = gr.Dropdown(choices=df.columns.tolist(), label="Column 2")
	corr_btn = gr.Button("Compute Correlation")
	corr_text = gr.Textbox(label="Correlation Result")
	confusion_out = gr.Dataframe(label="Categorical Crosstab (if applicable)")
	corr_btn.click(compute_correlation, inputs=[col1, col2], outputs=[corr_text, confusion_out])

	app.launch(theme=gr.themes.Monochrome())