# crime / app.py — Hugging Face Space source
# (uploaded by venkatl; commit 3a7ff6f "Update app.py", verified)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gradio as gr
from io import BytesIO
import base64
import random
import scipy.stats as ss
from PIL import Image
from cite import citation
def fig_to_pil(fig):
    """Render a matplotlib figure to an in-memory PNG and return it as a PIL image."""
    png_buffer = BytesIO()
    fig.savefig(png_buffer, format="png", bbox_inches="tight")
    png_buffer.seek(0)
    return Image.open(png_buffer)
def cramers_v(confusion_matrix):
    """Compute Cramér's V (categorical association strength) from a contingency table.

    Accepts anything with ``.sum()`` and ``.shape`` (pandas crosstab or a 2-D
    numpy array); returns a value in [0, 1].
    """
    chi2_stat = ss.chi2_contingency(confusion_matrix)[0]
    total = confusion_matrix.sum().sum()
    n_rows, n_cols = confusion_matrix.shape
    denominator = total * (min(n_rows, n_cols) - 1)
    return np.sqrt(chi2_stat / denominator)
def create_analysis_interface(df):
    """
    Creates the Gradio interface components for a given dataframe.
    This function returns a list of Tabs or renders them into the current context.

    NOTE(review): must be called inside an active ``gr.Blocks``/``gr.Tab``
    context so the tabs built below attach to the surrounding layout; as
    written it returns None.
    """
    # Column partitions used throughout: numeric columns feed the stats and
    # correlation paths, object/category columns feed the keyword explorer.
    numeric_cols = df.select_dtypes(include=['number']).columns
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns
    # ----------------------------------------------------
    # 1. Descriptive Statistics
    # ----------------------------------------------------
    def get_descriptive_stats():
        # describe().T gives one row per numeric feature (count/mean/std/
        # quartiles); the index is promoted to a regular "Feature" column so
        # it displays in a gr.Dataframe.
        stats = df[numeric_cols].describe().T
        stats = stats.reset_index().rename(columns={"index": "Feature"})
        return stats
def download_keyword_counts(df_in):
    """Write the currently displayed keyword-counts table to a CSV file.

    Gradio passes the ``gr.Dataframe`` value in as ``df_in``; the returned
    path is handed to a ``gr.File`` component for download.
    """
    import tempfile  # local import: only this export helper needs it
    # A unique temporary file replaces the old "keyword_counts_<1-1000>.csv"
    # scheme, whose tiny random range could collide between concurrent users
    # and left CSV files accumulating in the working directory.
    tmp = tempfile.NamedTemporaryFile(
        prefix="keyword_counts_", suffix=".csv", delete=False
    )
    tmp.close()
    df_in.to_csv(tmp.name, index=False)
    return tmp.name
    # ----------------------------------------------------
    # 2. Keyword Frequency Table + Plots
    # ----------------------------------------------------
    def keyword_frequency(column):
        """Tabulate comma-separated keywords in *column* and build six plots.

        Returns (counts DataFrame, bar, pie, horizontal bar, Pareto,
        rank-vs-frequency scatter, cumulative distribution) where each plot
        is a PIL image suitable for a gr.Image output.
        """
        # Cells may hold several comma-separated keywords; explode so every
        # keyword is counted individually.
        series = df[column].dropna().astype(str).str.split(',').explode().str.strip()
        counts = series.value_counts().reset_index()
        counts.columns = ["Keyword", "Count"]
        total = counts["Count"].sum()
        # Percentage as numeric (rounded to 1 decimal place)
        counts["Percentage"] = (counts["Count"] / total * 100).round(1)
        # --- BAR CHART (matplotlib → PIL) ---
        fig_bar, ax_bar = plt.subplots(figsize=(8,4))
        ax_bar.bar(counts["Keyword"].head(15), counts["Count"].head(15))
        ax_bar.set_title(f"Top Keywords in {column}")
        # NOTE(review): set_xticklabels without a matching set_xticks can warn
        # on newer matplotlib; labels line up here because bar() fixed one
        # tick per keyword — confirm against the deployed matplotlib version.
        ax_bar.set_xticklabels(counts["Keyword"].head(15), rotation=45, ha='right')
        bar_img = fig_to_pil(fig_bar)
        plt.close(fig_bar)  # free the figure; the PIL copy is kept
        # --- PIE CHART (matplotlib → PIL) ---
        fig_pie, ax_pie = plt.subplots(figsize=(6,6))
        # NOTE(review): only the top 10 slices are drawn, so the autopct
        # percentages are relative to that subset, not the whole column.
        ax_pie.pie(
            counts["Count"].head(10),
            labels=counts["Keyword"].head(10),
            autopct="%1.1f%%"
        )
        ax_pie.set_title(f"{column} (Distribution)") #Pie Chart
        pie_img = fig_to_pil(fig_pie)
        plt.close(fig_pie)
        # --- HORIZONTAL BAR CHART ---
        fig_hbar, ax_hbar = plt.subplots(figsize=(8,6))
        ax_hbar.barh(counts["Keyword"].head(15), counts["Count"].head(15))
        ax_hbar.set_title(f"Top Keywords in {column} (Horizontal Bar)")
        plt.tight_layout()
        hbar_img = fig_to_pil(fig_hbar)
        plt.close(fig_hbar)
        # --- PARETO CHART (80/20) ---
        # value_counts is already descending; the explicit sort documents the
        # ordering that the cumulative curve below depends on.
        counts_sorted = counts.sort_values("Count", ascending=False)
        cum_percentage = (counts_sorted["Count"].cumsum() / counts_sorted["Count"].sum()) * 100
        fig_pareto, ax1 = plt.subplots(figsize=(8,4))
        ax1.bar(counts_sorted["Keyword"].head(15), counts_sorted["Count"].head(15), color='skyblue')
        ax2 = ax1.twinx()  # secondary y-axis for the cumulative-% line
        ax2.plot(counts_sorted["Keyword"].head(15), cum_percentage.head(15), color='red', marker="o")
        ax1.set_xticklabels(counts_sorted["Keyword"].head(15), rotation=45, ha='right')
        ax1.set_title(f"Pareto Analysis of {column}")
        pareto_img = fig_to_pil(fig_pareto)
        plt.close(fig_pareto)
        # --- SCATTER PLOT (Rank vs Frequency) ---
        counts["Rank"] = range(1, len(counts) + 1)
        fig_scatter, ax_scatter = plt.subplots(figsize=(6,4))
        ax_scatter.scatter(counts["Rank"], counts["Count"])
        ax_scatter.set_title(f"Rank vs Frequency for {column}")
        ax_scatter.set_xlabel("Rank (1 = most common)")
        ax_scatter.set_ylabel("Frequency")
        scatter_img = fig_to_pil(fig_scatter)
        plt.close(fig_scatter)
        # --- CUMULATIVE DISTRIBUTION PLOT ---
        fig_cum, ax_cum = plt.subplots(figsize=(6,4))
        ax_cum.plot(cum_percentage.values)
        ax_cum.set_title(f"Cumulative Distribution of {column}")
        ax_cum.set_ylabel("Cumulative %")
        ax_cum.set_xlabel("Keyword Rank")
        cum_img = fig_to_pil(fig_cum)
        plt.close(fig_cum)
        return counts, bar_img, pie_img, hbar_img, pareto_img, scatter_img, cum_img
    # ----------------------------------------------------
    # 3. Correlation Explorer
    # ----------------------------------------------------
    def explore_two_columns(col1, col2):
        """Visual relationship summary for any pair of columns.

        Dispatches on dtype: numeric/numeric -> Pearson + scatter/regression/
        histograms; categorical/categorical -> Cramér's V + heatmap/bar chart;
        anything else is treated as mixed numeric+categorical -> box/violin.
        Returns (summary text, optional crosstab, img1, img2, img3); unused
        slots are None to match the fixed set of Gradio outputs.
        """
        c1 = df[col1]
        c2 = df[col2]
        images = []
        result_text = ""
        # NUMERIC vs NUMERIC
        if col1 in numeric_cols and col2 in numeric_cols:
            # Pearson
            corr = c1.corr(c2)
            result_text = f"Pearson Correlation = {corr:.4f}"
            # Scatter
            fig, ax = plt.subplots(figsize=(6,4))
            ax.scatter(c1, c2)
            ax.set_xlabel(col1)
            ax.set_ylabel(col2)
            ax.set_title(f"{col1} vs {col2} (Scatter)")
            images.append(fig_to_pil(fig))
            plt.close(fig)
            # Regression
            fig, ax = plt.subplots(figsize=(6,4))
            sns.regplot(x=c1, y=c2, ax=ax)
            ax.set_title("Regression Line")
            images.append(fig_to_pil(fig))
            plt.close(fig)
            # Distributions (histplot omits ax=; it draws on the current
            # axes, which is the figure just created above)
            fig, ax = plt.subplots(figsize=(6,4))
            sns.histplot(c1, color="blue", kde=True, label=col1)
            sns.histplot(c2, color="orange", kde=True, label=col2)
            ax.legend()
            ax.set_title("Distribution Comparison")
            images.append(fig_to_pil(fig))
            plt.close(fig)
            print(result_text)  # server-side log of the computed summary
            return result_text, None, images[0], images[1], images[2]
        # CATEGORICAL vs CATEGORICAL
        if col1 in categorical_cols and col2 in categorical_cols:
            confusion = pd.crosstab(c1, c2)
            v = cramers_v(confusion)
            result_text = f"Cramér’s V = {v:.4f}"
            # Second crosstab adds row/column totals for the table output.
            conf = pd.crosstab(c1,c2, margins=True, margins_name="Total")
            conf_table = conf.reset_index()
            conf_table.columns = ["Category_1"] + list(conf.columns)
            # Heatmap
            fig, ax = plt.subplots(figsize=(6,4))
            sns.heatmap(confusion, cmap="Blues", annot=True, fmt="d")
            ax.set_title("Crosstab Heatmap")
            images.append(fig_to_pil(fig))
            plt.close(fig)
            # Bar chart (row totals of the contingency table)
            fig, ax = plt.subplots(figsize=(6,4))
            confusion.sum(axis=1).plot(kind='bar', ax=ax)
            ax.set_title(f"Correlation between {col1} and {col2}")
            images.append(fig_to_pil(fig))
            plt.close(fig)
            print(result_text)
            return result_text, conf_table, images[0], images[1], None
        # MIXED TYPES (numeric + categorical)
        # Ensure correct assignment
        # NOTE(review): any pair that is neither numeric/numeric nor cat/cat
        # falls through to here; if both columns were e.g. datetime the
        # cat/num split below would be wrong — confirm inputs stay within
        # numeric + object/category dtypes.
        if col1 in categorical_cols and col2 in numeric_cols:
            cat = col1; num = col2
        else:
            cat = col2; num = col1
        result_text = f"Numeric vs Categorical Analysis ({num} by {cat})"
        # Boxplot
        fig, ax = plt.subplots(figsize=(6,4))
        sns.boxplot(x=df[cat], y=df[num], ax=ax)
        ax.set_title("Boxplot")
        plt.xticks(rotation=45, ha='right')
        images.append(fig_to_pil(fig))
        plt.close(fig)
        # Violin plot
        fig, ax = plt.subplots(figsize=(6,4))
        sns.violinplot(x=df[cat], y=df[num], ax=ax)
        ax.set_title("Violin Plot")
        plt.xticks(rotation=45, ha='right')
        images.append(fig_to_pil(fig))
        plt.close(fig)
        print(result_text)
        return result_text, None, images[0], images[1], None
def explore_two_columns_statistical(col1, col2):
c1 = df[col1]
c2 = df[col2]
images = []
result_text = ""
stat_results = ""
# NUMERIC vs NUMERIC
if col1 in numeric_cols and col2 in numeric_cols:
# Pearson
corr = c1.corr(c2)
result_text = f"Pearson Correlation = {corr:.4f}"
# Spearman
spearman_corr, spearman_p = ss.spearmanr(c1, c2)
stat_results += f"Spearman Correlation: {spearman_corr:.4f} (p={spearman_p:.4e})\n"
# Scatter
fig, ax = plt.subplots(figsize=(6,4))
ax.scatter(c1, c2)
ax.set_xlabel(col1)
ax.set_ylabel(col2)
ax.set_title(f"{col1} vs {col2} (Scatter)")
images.append(fig_to_pil(fig))
plt.close(fig)
# Regression
fig, ax = plt.subplots(figsize=(6,4))
sns.regplot(x=c1, y=c2, ax=ax)
ax.set_title("Regression Line")
images.append(fig_to_pil(fig))
plt.close(fig)
# Distributions
fig, ax = plt.subplots(figsize=(6,4))
sns.histplot(c1, color="blue", kde=True, label=col1)
sns.histplot(c2, color="orange", kde=True, label=col2)
ax.legend()
ax.set_title("Distribution Comparison")
images.append(fig_to_pil(fig))
plt.close(fig)
return result_text, stat_results, None, images[0], images[1], images[2]
# CATEGORICAL vs CATEGORICAL
if col1 in categorical_cols and col2 in categorical_cols:
confusion = pd.crosstab(c1, c2)
v = cramers_v(confusion)
result_text = f"Cramér’s V = {v:.4f}"
# Chi-Square Test
chi2, p, dof, expected = ss.chi2_contingency(confusion)
stat_results += f"Chi-Square Statistic: {chi2:.4f}\n"
stat_results += f"P-value: {p:.4e}\n"
stat_results += f"Degrees of Freedom: {dof}\n"
if p < 0.05:
stat_results += "Result: Significant Association (Reject H0)\n"
else:
stat_results += "Result: No Significant Association (Fail to reject H0)\n"
conf = pd.crosstab(c1,c2, margins=True, margins_name="Total")
conf_table = conf.reset_index()
conf_table.columns = ["Category_1"] + list(conf.columns)
# Heatmap
fig, ax = plt.subplots(figsize=(6,4))
sns.heatmap(confusion, cmap="Blues", annot=True, fmt="d")
ax.set_title("Crosstab Heatmap")
images.append(fig_to_pil(fig))
plt.close(fig)
# Bar chart
fig, ax = plt.subplots(figsize=(6,4))
confusion.sum(axis=1).plot(kind='bar', ax=ax)
ax.set_title(f"Correlation between {col1} and {col2}")
images.append(fig_to_pil(fig))
plt.close(fig)
return result_text, stat_results, conf_table, images[0], images[1], None
# MIXED TYPES (numeric + categorical)
# Ensure correct assignment
if col1 in categorical_cols and col2 in numeric_cols:
cat = col1; num = col2
else:
cat = col2; num = col1
result_text = f"Numeric vs Categorical Analysis ({num} by {cat})"
# Group data
groups = []
group_labels = []
for val in df[cat].unique():
# Drop NaNs for the test
data_group = df[df[cat] == val][num].dropna()
if len(data_group) > 0:
groups.append(data_group)
group_labels.append(val)
# Check number of groups
if len(groups) == 2:
# T-test
t_stat, p_val = ss.ttest_ind(groups[0], groups[1])
stat_results += f"T-test ({group_labels[0]} vs {group_labels[1]}):\n"
stat_results += f"T-statistic: {t_stat:.4f}\n"
stat_results += f"P-value: {p_val:.4e}\n"
if p_val < 0.05:
stat_results += "Result: Significant Difference (Reject H0)\n"
else:
stat_results += "Result: No Significant Difference (Fail to reject H0)\n"
elif len(groups) > 2:
# ANOVA
f_stat, p_val = ss.f_oneway(*groups)
stat_results += f"One-way ANOVA:\n"
stat_results += f"F-statistic: {f_stat:.4f}\n"
stat_results += f"P-value: {p_val:.4e}\n"
if p_val < 0.05:
stat_results += "Result: Significant Difference among groups (Reject H0)\n"
else:
stat_results += "Result: No Significant Difference among groups (Fail to reject H0) p value less than 0.05\n"
else:
stat_results += "Not enough groups for statistical testing.\n"
# Boxplot
fig, ax = plt.subplots(figsize=(6,4))
sns.boxplot(x=df[cat], y=df[num], ax=ax)
ax.set_title("Boxplot")
plt.xticks(rotation=45, ha='right')
images.append(fig_to_pil(fig))
plt.close(fig)
# Violin plot
fig, ax = plt.subplots(figsize=(6,4))
sns.violinplot(x=df[cat], y=df[num], ax=ax)
ax.set_title("Violin Plot")
plt.xticks(rotation=45, ha='right')
images.append(fig_to_pil(fig))
plt.close(fig)
return result_text, stat_results, None, images[0], images[1], None
    # ----------------------------------------------------
    # Layout
    # ----------------------------------------------------
    # Wires the helper functions above to Gradio widgets; this runs inside
    # the gr.Blocks context set up by the caller.
    with gr.Tab("ℹ️ About & Citation"):
        gr.Markdown(citation)
    with gr.Tab("1️⃣ Descriptive Statistics"):
        btn_stats = gr.Button("Generate Stats")
        stats_out = gr.Dataframe()
        btn_stats.click(get_descriptive_stats, outputs=stats_out)
    with gr.Tab("2️⃣ Keyword Frequency Explorer"):
        # Changing the column recomputes the table and all six plots
        # immediately (no explicit button).
        col_select = gr.Dropdown(choices=list(categorical_cols), label="Select Column")
        freq_table = gr.Dataframe(label="Keyword Counts")
        with gr.Row():
            bar_plot = gr.Image(label="Bar Chart")
            pie_img = gr.Image(label="Pie Chart")
        with gr.Row():
            hbar_img = gr.Image(label="Horizontal Bar Chart")
            pareto_img = gr.Image(label="Pareto Chart")
        with gr.Row():
            scatter_img = gr.Image(label="Rank vs Frequency Scatter")
            cum_img = gr.Image(label="Cumulative Distribution")
        download_btn = gr.Button("Download as CSV")
        download_file = gr.File(label="Download File")
        col_select.change(keyword_frequency,
            inputs=col_select,
            outputs=[freq_table, bar_plot, pie_img,hbar_img, pareto_img, scatter_img, cum_img])
        # Exports whatever table is currently shown in freq_table.
        download_btn.click(
            download_keyword_counts,
            inputs=freq_table,
            outputs=download_file
        )
    # NOTE(review): tab numbering jumps from 2 to 4 — there is no tab "3".
    with gr.Tab("4️⃣ Two-Column Relationship Explorer"):
        with gr.Row():
            colA = gr.Dropdown(choices=df.columns.tolist(), label="Column A")
            colB = gr.Dropdown(choices=df.columns.tolist(), label="Column B")
        btn_rel = gr.Button("Explore Relationship")
        rel_text = gr.Textbox(label="Summary")
        rel_table = gr.Dataframe(label="Crosstab (if categorical)")
        with gr.Row():
            rel_img1 = gr.Image()
            rel_img2 = gr.Image()
            rel_img3 = gr.Image()
        btn_rel.click(
            explore_two_columns,
            inputs=[colA, colB],
            outputs=[rel_text, rel_table, rel_img1, rel_img2, rel_img3]
        )
    with gr.Tab("5️⃣ Advanced Statistical Analysis"):
        gr.Markdown("### Statistical Tests: T-Test, ANOVA, Chi-Square")
        with gr.Row():
            colA_stat = gr.Dropdown(choices=df.columns.tolist(), label="Column A")
            colB_stat = gr.Dropdown(choices=df.columns.tolist(), label="Column B")
        btn_stat = gr.Button("Run Statistical Analysis")
        stat_summary = gr.Textbox(label="Summary")
        stat_details = gr.Textbox(label="Detailed Statistical Results", lines=10)
        stat_table = gr.Dataframe(label="Crosstab (if categorical)")
        with gr.Row():
            stat_img1 = gr.Image()
            stat_img2 = gr.Image()
            stat_img3 = gr.Image()
        btn_stat.click(
            explore_two_columns_statistical,
            inputs=[colA_stat, colB_stat],
            outputs=[stat_summary, stat_details, stat_table, stat_img1, stat_img2, stat_img3]
        )
# ----------------------------------------------------
# Main App Construction
# ----------------------------------------------------
# Load Data
# Exact Prison_Name values used to slice per-facility views of the dataset.
kadapa = 'Kadapa Special Prison for Women'
rajahmundry = 'Rajahmundry special prison for women'
# df_all = pd.read_excel("https://docs.google.com/spreadsheets/d/190HFE1HCdiFjeSyG45rpsg6DHxvtmt7swATedv2mXwY/export?format=xlsx",sheet_name='FinalSheet')
df_all = pd.read_csv("data.csv")  # local snapshot of the sheet commented above
df_kadapa = df_all[df_all['Prison_Name'] == kadapa]
df_rajahmundry = df_all[df_all['Prison_Name'] == rajahmundry]
# One analysis interface per data slice: all prisons, then each facility.
with gr.Blocks(title="DATA ANALYSIS APP") as app:
    gr.Markdown("# 📊 Criminology Data Analysis System \nUpload → Analyse → Export\n Developed by Neeraja")
    with gr.Tabs():
        with gr.Tab("🌍 All Prisons Data"):
            create_analysis_interface(df_all)
        with gr.Tab("🏢 Kadapa Prison"):
            create_analysis_interface(df_kadapa)
        with gr.Tab("🏢 Rajahmundry Prison"):
            create_analysis_interface(df_rajahmundry)
app.launch(share=False)