import base64
import random
import tempfile
from io import BytesIO

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as ss
import seaborn as sns
from PIL import Image

from cite import citation


def fig_to_pil(fig):
    """Render a Matplotlib figure to an in-memory PNG and open it with PIL.

    Args:
        fig: Object exposing ``savefig`` (a Matplotlib figure).

    Returns:
        The image returned by ``Image.open`` for the PNG bytes.
    """
    png_stream = BytesIO()
    fig.savefig(png_stream, bbox_inches="tight", format="png")
    # Rewind so PIL starts reading at the PNG header rather than at EOF.
    png_stream.seek(0)
    image = Image.open(png_stream)
    return image


def cramers_v(confusion_matrix):
    """Return Cramér's V, a 0-1 measure of association, for a contingency table.

    Args:
        confusion_matrix: Two-dimensional table of observed counts exposing
            ``.shape`` and ``.sum()`` (for example the DataFrame produced by
            ``pd.crosstab``).

    Returns:
        ``sqrt(chi2 / (n * (min(rows, cols) - 1)))``, or NaN when the
        statistic is undefined (fewer than two rows or columns, or no
        observations at all).
    """
    r, k = confusion_matrix.shape
    # With a single row/column the denominator is zero, and an empty table
    # makes chi2_contingency raise; report "undefined" explicitly instead.
    if min(r, k) < 2:
        return float("nan")
    n = confusion_matrix.sum().sum()
    if n == 0:
        return float("nan")

    # Cramér's V is defined on the uncorrected Pearson chi-square. SciPy
    # applies Yates' continuity correction to 2x2 tables by default, which
    # would cap a perfect association below 1.0.
    chi2 = ss.chi2_contingency(confusion_matrix, correction=False)[0]
    return np.sqrt(chi2 / (n * (min(r, k) - 1)))


def create_analysis_interface(df):
    """Render the analysis tabs for ``df`` into the active Gradio context.

    Call this inside an open ``gr.Blocks``/``gr.Tab`` context. It creates the
    About, Descriptive Statistics, Keyword Frequency, Two-Column Relationship
    and Advanced Statistical Analysis tabs and wires their callbacks, all of
    which close over ``df``.

    Args:
        df: DataFrame to analyse. Columns with a numeric dtype are treated as
            numeric; ``object``/``category`` columns are treated as
            categorical.

    Returns:
        None. The components are created as a side effect.
    """
    numeric_cols = df.select_dtypes(include=['number']).columns
    categorical_cols = df.select_dtypes(include=['object', 'category']).columns
    select_prompt = "Please select both Column A and Column B."

    # ------------------------------------------------------------------
    # Plot helpers
    # ------------------------------------------------------------------
    # Every helper draws on an explicit Axes. pyplot's "current figure" is
    # process-global state, so relying on it could draw on the wrong figure
    # if two callbacks ever overlap.

    def _render(fig):
        """Convert ``fig`` to a PIL image, closing it even if rendering fails."""
        try:
            return fig_to_pil(fig)
        finally:
            plt.close(fig)

    def _rotate_xlabels(ax):
        """Tilt the x tick labels of ``ax`` so long category names stay legible."""
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

    def _numeric_pair_images(c1, c2, col1, col2):
        """Return (scatter, regression, distribution) images for two numeric series."""
        fig, ax = plt.subplots(figsize=(6, 4))
        ax.scatter(c1, c2)
        ax.set_xlabel(col1)
        ax.set_ylabel(col2)
        ax.set_title(f"{col1} vs {col2} (Scatter)")
        scatter_img = _render(fig)

        fig, ax = plt.subplots(figsize=(6, 4))
        sns.regplot(x=c1, y=c2, ax=ax)
        ax.set_title("Regression Line")
        regression_img = _render(fig)

        fig, ax = plt.subplots(figsize=(6, 4))
        sns.histplot(c1, color="blue", kde=True, label=col1, ax=ax)
        sns.histplot(c2, color="orange", kde=True, label=col2, ax=ax)
        ax.legend()
        ax.set_title("Distribution Comparison")
        distribution_img = _render(fig)

        return scatter_img, regression_img, distribution_img

    def _categorical_pair_images(confusion, col1, col2):
        """Return (heatmap, row-total bar chart) images for a crosstab."""
        fig, ax = plt.subplots(figsize=(6, 4))
        sns.heatmap(confusion, cmap="Blues", annot=True, fmt="d", ax=ax)
        ax.set_title("Crosstab Heatmap")
        heatmap_img = _render(fig)

        fig, ax = plt.subplots(figsize=(6, 4))
        confusion.sum(axis=1).plot(kind='bar', ax=ax)
        ax.set_title(f"Correlation between {col1} and {col2}")
        totals_img = _render(fig)

        return heatmap_img, totals_img

    def _mixed_pair_images(cat, num):
        """Return (boxplot, violin plot) images of ``num`` grouped by ``cat``."""
        fig, ax = plt.subplots(figsize=(6, 4))
        sns.boxplot(x=df[cat], y=df[num], ax=ax)
        ax.set_title("Boxplot")
        _rotate_xlabels(ax)
        box_img = _render(fig)

        fig, ax = plt.subplots(figsize=(6, 4))
        sns.violinplot(x=df[cat], y=df[num], ax=ax)
        ax.set_title("Violin Plot")
        _rotate_xlabels(ax)
        violin_img = _render(fig)

        return box_img, violin_img

    def _crosstab_with_totals(c1, c2):
        """Return the crosstab of ``c1`` x ``c2`` with a "Total" margin as a flat table."""
        conf = pd.crosstab(c1, c2, margins=True, margins_name="Total")
        table = conf.reset_index()
        table.columns = ["Category_1"] + list(conf.columns)
        return table

    def _split_mixed(col1, col2):
        """Return ``(categorical, numeric)`` column names for a mixed pair."""
        if col1 in categorical_cols and col2 in numeric_cols:
            return col1, col2
        return col2, col1

    # ------------------------------------------------------------------
    # Callbacks
    # ------------------------------------------------------------------

    def get_descriptive_stats():
        """Return ``describe()`` for the numeric columns, one row per feature."""
        if len(numeric_cols) == 0:
            # describe() raises on a frame without columns.
            return pd.DataFrame(columns=["Feature"])
        stats = df[numeric_cols].describe().T
        return stats.reset_index().rename(columns={"index": "Feature"})

    def download_keyword_counts(df_in):
        """Write ``df_in`` to a uniquely named temporary CSV and return its path."""
        # A unique temp file cannot collide the way a random 1-1000 suffix in
        # the working directory could.
        with tempfile.NamedTemporaryFile(
            prefix="keyword_counts_", suffix=".csv", delete=False
        ) as handle:
            path = handle.name
        df_in.to_csv(path, index=False)
        return path

    def keyword_frequency(column):
        """Count comma-separated keywords in ``column`` and chart them.

        Returns:
            ``(counts, bar, pie, hbar, pareto, scatter, cumulative)`` where
            ``counts`` has Keyword/Count/Percentage/Rank columns and the rest
            are images (``None`` when nothing can be plotted).
        """
        if not column:
            # Dropdown was cleared: nothing to analyse.
            return (None,) * 7

        keywords = (
            df[column].dropna().astype(str).str.split(',').explode().str.strip()
        )
        counts = keywords.value_counts().reset_index()
        counts.columns = ["Keyword", "Count"]
        total = counts["Count"].sum()
        counts["Percentage"] = (counts["Count"] / total * 100).round(1)
        counts["Rank"] = range(1, len(counts) + 1)

        if counts.empty:
            return (counts,) + (None,) * 6

        # value_counts() is already sorted by descending count, so the running
        # share can be taken directly and stays aligned with the bar charts.
        cum_percentage = counts["Count"].cumsum() / total * 100
        top15 = counts.head(15)
        top10 = counts.head(10)

        fig, ax = plt.subplots(figsize=(8, 4))
        ax.bar(top15["Keyword"], top15["Count"])
        ax.set_title(f"Top Keywords in {column}")
        _rotate_xlabels(ax)
        bar_img = _render(fig)

        fig, ax = plt.subplots(figsize=(6, 6))
        ax.pie(top10["Count"], labels=top10["Keyword"], autopct="%1.1f%%")
        ax.set_title(f"{column} (Distribution)")
        pie_img = _render(fig)

        fig, ax = plt.subplots(figsize=(8, 6))
        ax.barh(top15["Keyword"], top15["Count"])
        ax.set_title(f"Top Keywords in {column} (Horizontal Bar)")
        fig.tight_layout()
        hbar_img = _render(fig)

        fig, ax_counts = plt.subplots(figsize=(8, 4))
        ax_counts.bar(top15["Keyword"], top15["Count"], color='skyblue')
        ax_share = ax_counts.twinx()
        ax_share.plot(
            top15["Keyword"], cum_percentage.head(15), color='red', marker="o"
        )
        _rotate_xlabels(ax_counts)
        ax_counts.set_title(f"Pareto Analysis of {column}")
        pareto_img = _render(fig)

        fig, ax = plt.subplots(figsize=(6, 4))
        ax.scatter(counts["Rank"], counts["Count"])
        ax.set_title(f"Rank vs Frequency for {column}")
        ax.set_xlabel("Rank (1 = most common)")
        ax.set_ylabel("Frequency")
        scatter_img = _render(fig)

        fig, ax = plt.subplots(figsize=(6, 4))
        ax.plot(cum_percentage.values)
        ax.set_title(f"Cumulative Distribution of {column}")
        ax.set_ylabel("Cumulative %")
        ax.set_xlabel("Keyword Rank")
        cum_img = _render(fig)

        return counts, bar_img, pie_img, hbar_img, pareto_img, scatter_img, cum_img

    def explore_two_columns(col1, col2):
        """Summarise and plot the relationship between two columns.

        Returns:
            ``(summary, crosstab_or_None, image1, image2, image3_or_None)``.
        """
        if not col1 or not col2:
            return select_prompt, None, None, None, None

        c1 = df[col1]
        c2 = df[col2]
        table = None

        if col1 in numeric_cols and col2 in numeric_cols:
            result_text = f"Pearson Correlation = {c1.corr(c2):.4f}"
            images = _numeric_pair_images(c1, c2, col1, col2)
        elif col1 in categorical_cols and col2 in categorical_cols:
            confusion = pd.crosstab(c1, c2)
            result_text = f"Cramér’s V = {cramers_v(confusion):.4f}"
            table = _crosstab_with_totals(c1, c2)
            images = _categorical_pair_images(confusion, col1, col2) + (None,)
        else:
            cat, num = _split_mixed(col1, col2)
            result_text = f"Numeric vs Categorical Analysis ({num} by {cat})"
            images = _mixed_pair_images(cat, num) + (None,)

        print(result_text)
        return (result_text, table, *images)

    def explore_two_columns_statistical(col1, col2):
        """Like ``explore_two_columns`` but also runs a significance test.

        Numeric pairs get a Spearman correlation, categorical pairs a
        chi-square test, and mixed pairs a t-test (two groups) or one-way
        ANOVA (more than two groups).

        Returns:
            ``(summary, details, crosstab_or_None, image1, image2,
            image3_or_None)``.
        """
        if not col1 or not col2:
            return select_prompt, "", None, None, None, None

        c1 = df[col1]
        c2 = df[col2]
        table = None
        details = []

        if col1 in numeric_cols and col2 in numeric_cols:
            result_text = f"Pearson Correlation = {c1.corr(c2):.4f}"
            # Drop NaN pairs, as pandas' corr() does; the default policy would
            # turn a single missing value into a NaN coefficient.
            spearman_corr, spearman_p = ss.spearmanr(c1, c2, nan_policy="omit")
            details.append(
                f"Spearman Correlation: {spearman_corr:.4f} (p={spearman_p:.4e})"
            )
            images = _numeric_pair_images(c1, c2, col1, col2)

        elif col1 in categorical_cols and col2 in categorical_cols:
            confusion = pd.crosstab(c1, c2)
            result_text = f"Cramér’s V = {cramers_v(confusion):.4f}"

            chi2, p, dof, _expected = ss.chi2_contingency(confusion)
            details.append(f"Chi-Square Statistic: {chi2:.4f}")
            details.append(f"P-value: {p:.4e}")
            details.append(f"Degrees of Freedom: {dof}")
            if p < 0.05:
                details.append("Result: Significant Association (Reject H0)")
            else:
                details.append(
                    "Result: No Significant Association (Fail to reject H0)"
                )

            table = _crosstab_with_totals(c1, c2)
            images = _categorical_pair_images(confusion, col1, col2) + (None,)

        else:
            cat, num = _split_mixed(col1, col2)
            result_text = f"Numeric vs Categorical Analysis ({num} by {cat})"

            # One sample per category value, skipping values with no data
            # (this also skips NaN, which never compares equal to itself).
            groups = []
            group_labels = []
            for val in df[cat].unique():
                data_group = df.loc[df[cat] == val, num].dropna()
                if len(data_group) > 0:
                    groups.append(data_group)
                    group_labels.append(val)

            if len(groups) == 2:
                t_stat, p_val = ss.ttest_ind(groups[0], groups[1])
                details.append(
                    f"T-test ({group_labels[0]} vs {group_labels[1]}):"
                )
                details.append(f"T-statistic: {t_stat:.4f}")
                details.append(f"P-value: {p_val:.4e}")
                if p_val < 0.05:
                    details.append("Result: Significant Difference (Reject H0)")
                else:
                    details.append(
                        "Result: No Significant Difference (Fail to reject H0)"
                    )
            elif len(groups) > 2:
                f_stat, p_val = ss.f_oneway(*groups)
                details.append("One-way ANOVA:")
                details.append(f"F-statistic: {f_stat:.4f}")
                details.append(f"P-value: {p_val:.4e}")
                if p_val < 0.05:
                    details.append(
                        "Result: Significant Difference among groups (Reject H0)"
                    )
                else:
                    # This branch means p >= 0.05; the message must not claim
                    # otherwise.
                    details.append(
                        "Result: No Significant Difference among groups "
                        "(Fail to reject H0)"
                    )
            else:
                details.append("Not enough groups for statistical testing.")

            images = _mixed_pair_images(cat, num) + (None,)

        stat_results = "\n".join(details) + "\n"
        return (result_text, stat_results, table, *images)

    # ------------------------------------------------------------------
    # Layout
    # ------------------------------------------------------------------

    with gr.Tab("ℹ️ About & Citation"):
        gr.Markdown(citation)

    with gr.Tab("1️⃣ Descriptive Statistics"):
        btn_stats = gr.Button("Generate Stats")
        stats_out = gr.Dataframe()
        btn_stats.click(get_descriptive_stats, outputs=stats_out)

    with gr.Tab("2️⃣ Keyword Frequency Explorer"):
        col_select = gr.Dropdown(choices=list(categorical_cols), label="Select Column")
        freq_table = gr.Dataframe(label="Keyword Counts")

        with gr.Row():
            bar_plot = gr.Image(label="Bar Chart")
            pie_img = gr.Image(label="Pie Chart")

        with gr.Row():
            hbar_img = gr.Image(label="Horizontal Bar Chart")
            pareto_img = gr.Image(label="Pareto Chart")

        with gr.Row():
            scatter_img = gr.Image(label="Rank vs Frequency Scatter")
            cum_img = gr.Image(label="Cumulative Distribution")

        download_btn = gr.Button("Download as CSV")
        download_file = gr.File(label="Download File")
        col_select.change(
            keyword_frequency,
            inputs=col_select,
            outputs=[
                freq_table, bar_plot, pie_img, hbar_img,
                pareto_img, scatter_img, cum_img,
            ],
        )
        download_btn.click(
            download_keyword_counts,
            inputs=freq_table,
            outputs=download_file,
        )

    with gr.Tab("4️⃣ Two-Column Relationship Explorer"):
        with gr.Row():
            colA = gr.Dropdown(choices=df.columns.tolist(), label="Column A")
            colB = gr.Dropdown(choices=df.columns.tolist(), label="Column B")
        btn_rel = gr.Button("Explore Relationship")

        rel_text = gr.Textbox(label="Summary")
        rel_table = gr.Dataframe(label="Crosstab (if categorical)")
        with gr.Row():
            rel_img1 = gr.Image()
            rel_img2 = gr.Image()
            rel_img3 = gr.Image()

        btn_rel.click(
            explore_two_columns,
            inputs=[colA, colB],
            outputs=[rel_text, rel_table, rel_img1, rel_img2, rel_img3],
        )

    with gr.Tab("5️⃣ Advanced Statistical Analysis"):
        gr.Markdown("### Statistical Tests: T-Test, ANOVA, Chi-Square")
        with gr.Row():
            colA_stat = gr.Dropdown(choices=df.columns.tolist(), label="Column A")
            colB_stat = gr.Dropdown(choices=df.columns.tolist(), label="Column B")
        btn_stat = gr.Button("Run Statistical Analysis")

        stat_summary = gr.Textbox(label="Summary")
        stat_details = gr.Textbox(label="Detailed Statistical Results", lines=10)
        stat_table = gr.Dataframe(label="Crosstab (if categorical)")
        with gr.Row():
            stat_img1 = gr.Image()
            stat_img2 = gr.Image()
            stat_img3 = gr.Image()

        btn_stat.click(
            explore_two_columns_statistical,
            inputs=[colA_stat, colB_stat],
            outputs=[
                stat_summary, stat_details, stat_table,
                stat_img1, stat_img2, stat_img3,
            ],
        )


# Values matched against the "Prison_Name" column to build per-prison views.
kadapa = 'Kadapa Special Prison for Women'
rajahmundry = 'Rajahmundry special prison for women'

# Load the full dataset once, then slice it per prison.
df_all = pd.read_csv("data.csv")
df_kadapa = df_all.loc[df_all["Prison_Name"] == kadapa]
df_rajahmundry = df_all.loc[df_all["Prison_Name"] == rajahmundry]

with gr.Blocks(title="DATA ANALYSIS APP") as app:
    gr.Markdown("# 📊 Criminology Data Analysis System \nUpload → Analyse → Export\n Developed by Neeraja")

    with gr.Tabs():
        # One top-level tab per dataset view, each holding the same set of
        # analysis tabs bound to its own frame.
        for tab_label, frame in (
            ("🌍 All Prisons Data", df_all),
            ("🏢 Kadapa Prison", df_kadapa),
            ("🏢 Rajahmundry Prison", df_rajahmundry),
        ):
            with gr.Tab(tab_label):
                create_analysis_interface(frame)

app.launch(share=False)