Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| import gradio as gr | |
| from io import BytesIO | |
| import base64 | |
| import random | |
| import scipy.stats as ss | |
| from PIL import Image | |
| def fig_to_pil(fig): | |
| buf = BytesIO() | |
| fig.savefig(buf, format="png", bbox_inches="tight") | |
| buf.seek(0) | |
| return Image.open(buf) | |
| # ----------------------------- | |
| # Load Data | |
| # ----------------------------- | |
| url = "https://docs.google.com/spreadsheets/d/1SE2Q7nuL-xCp52V8w5onlzySU4udZoqNUY2kGT3WV2s/export?format=xlsx" | |
| df = pd.read_excel(url, sheet_name="Final") | |
| numeric_cols = df.select_dtypes(include=['number']).columns | |
| categorical_cols = df.select_dtypes(include=['object', 'category']).columns | |
| # ---------------------------------------------------- | |
| # 1. Descriptive Statistics | |
| # ---------------------------------------------------- | |
| def get_descriptive_stats(): | |
| stats = df[numeric_cols].describe().T | |
| stats = stats.reset_index().rename(columns={"index": "Feature"}) | |
| return stats | |
| def download_keyword_counts(df): | |
| r = random.randint(1,1000) | |
| path = f"keyword_counts_{r}.csv" | |
| df.to_csv(path, index=False) | |
| return path | |
| # ---------------------------------------------------- | |
| # 2. Keyword Frequency Table + Plots | |
| # ---------------------------------------------------- | |
| def keyword_frequency(column): | |
| series = df[column].dropna().astype(str).str.split(',').explode().str.strip() | |
| counts = series.value_counts().reset_index() | |
| counts.columns = ["Keyword", "Count"] | |
| # --- BAR CHART (matplotlib → PIL) --- | |
| fig_bar, ax_bar = plt.subplots(figsize=(8,4)) | |
| ax_bar.bar(counts["Keyword"].head(15), counts["Count"].head(15)) | |
| ax_bar.set_title(f"Top Keywords in {column}") | |
| ax_bar.set_xticklabels(counts["Keyword"].head(15), rotation=45, ha='right') | |
| bar_img = fig_to_pil(fig_bar) | |
| plt.close(fig_bar) | |
| # --- PIE CHART (matplotlib → PIL) --- | |
| fig_pie, ax_pie = plt.subplots(figsize=(6,6)) | |
| ax_pie.pie( | |
| counts["Count"].head(10), | |
| labels=counts["Keyword"].head(10), | |
| autopct="%1.1f%%" | |
| ) | |
| ax_pie.set_title(f"Pie Chart {column} (Distribution)") | |
| pie_img = fig_to_pil(fig_pie) | |
| plt.close(fig_pie) | |
| # --- HORIZONTAL BAR CHART --- | |
| fig_hbar, ax_hbar = plt.subplots(figsize=(8,6)) | |
| ax_hbar.barh(counts["Keyword"].head(15), counts["Count"].head(15)) | |
| ax_hbar.set_title(f"Top Keywords in {column} (Horizontal Bar)") | |
| plt.tight_layout() | |
| hbar_img = fig_to_pil(fig_hbar) | |
| plt.close(fig_hbar) | |
| # --- PARETO CHART (80/20) --- | |
| counts_sorted = counts.sort_values("Count", ascending=False) | |
| cum_percentage = (counts_sorted["Count"].cumsum() / counts_sorted["Count"].sum()) * 100 | |
| fig_pareto, ax1 = plt.subplots(figsize=(8,4)) | |
| ax1.bar(counts_sorted["Keyword"].head(15), counts_sorted["Count"].head(15), color='skyblue') | |
| ax2 = ax1.twinx() | |
| ax2.plot(counts_sorted["Keyword"].head(15), cum_percentage.head(15), color='red', marker="o") | |
| ax1.set_xticklabels(counts_sorted["Keyword"].head(15), rotation=45, ha='right') | |
| ax1.set_title(f"Pareto Analysis of {column}") | |
| pareto_img = fig_to_pil(fig_pareto) | |
| plt.close(fig_pareto) | |
| # --- SCATTER PLOT (Rank vs Frequency) --- | |
| counts["Rank"] = range(1, len(counts) + 1) | |
| fig_scatter, ax_scatter = plt.subplots(figsize=(6,4)) | |
| ax_scatter.scatter(counts["Rank"], counts["Count"]) | |
| ax_scatter.set_title(f"Rank vs Frequency for {column}") | |
| ax_scatter.set_xlabel("Rank (1 = most common)") | |
| ax_scatter.set_ylabel("Frequency") | |
| scatter_img = fig_to_pil(fig_scatter) | |
| plt.close(fig_scatter) | |
| # --- CUMULATIVE DISTRIBUTION PLOT --- | |
| fig_cum, ax_cum = plt.subplots(figsize=(6,4)) | |
| ax_cum.plot(cum_percentage.values) | |
| ax_cum.set_title(f"Cumulative Distribution of {column}") | |
| ax_cum.set_ylabel("Cumulative %") | |
| ax_cum.set_xlabel("Keyword Rank") | |
| cum_img = fig_to_pil(fig_cum) | |
| plt.close(fig_cum) | |
| return counts, bar_img, pie_img, hbar_img, pareto_img, scatter_img, cum_img | |
| # ---------------------------------------------------- | |
| # 3. Correlation Explorer | |
| # ---------------------------------------------------- | |
| def explore_two_columns(col1, col2): | |
| c1 = df[col1] | |
| c2 = df[col2] | |
| images = [] | |
| result_text = "" | |
| # NUMERIC vs NUMERIC | |
| if col1 in numeric_cols and col2 in numeric_cols: | |
| # Pearson | |
| corr = c1.corr(c2) | |
| result_text = f"Pearson Correlation = {corr:.4f}" | |
| # Scatter | |
| fig, ax = plt.subplots(figsize=(6,4)) | |
| ax.scatter(c1, c2) | |
| ax.set_xlabel(col1) | |
| ax.set_ylabel(col2) | |
| ax.set_title(f"{col1} vs {col2} (Scatter)") | |
| images.append(fig_to_pil(fig)) | |
| plt.close(fig) | |
| # Regression | |
| fig, ax = plt.subplots(figsize=(6,4)) | |
| sns.regplot(x=c1, y=c2, ax=ax) | |
| ax.set_title("Regression Line") | |
| images.append(fig_to_pil(fig)) | |
| plt.close(fig) | |
| # Distributions | |
| fig, ax = plt.subplots(figsize=(6,4)) | |
| sns.histplot(c1, color="blue", kde=True, label=col1) | |
| sns.histplot(c2, color="orange", kde=True, label=col2) | |
| ax.legend() | |
| ax.set_title("Distribution Comparison") | |
| images.append(fig_to_pil(fig)) | |
| plt.close(fig) | |
| print(result_text) | |
| return result_text, None, images[0], images[1], images[2] | |
| # CATEGORICAL vs CATEGORICAL | |
| if col1 in categorical_cols and col2 in categorical_cols: | |
| confusion = pd.crosstab(c1, c2) | |
| v = cramers_v(confusion) | |
| result_text = f"Cramér’s V = {v:.4f}" | |
| conf = pd.crosstab(c1,c2, margins=True, margins_name="Total") | |
| conf_table = conf.reset_index() | |
| conf_table.columns = ["Category_1"] + list(conf.columns) | |
| # Heatmap | |
| fig, ax = plt.subplots(figsize=(6,4)) | |
| sns.heatmap(confusion, cmap="Blues", annot=True, fmt="d") | |
| ax.set_title("Crosstab Heatmap") | |
| images.append(fig_to_pil(fig)) | |
| plt.close(fig) | |
| # Bar chart | |
| fig, ax = plt.subplots(figsize=(6,4)) | |
| confusion.sum(axis=1).plot(kind='bar', ax=ax) | |
| ax.set_title(f"Correlation between {col1} and {col2}") | |
| images.append(fig_to_pil(fig)) | |
| plt.close(fig) | |
| print(result_text) | |
| return result_text, conf_table, images[0], images[1], None | |
| # MIXED TYPES (numeric + categorical) | |
| # Ensure correct assignment | |
| if col1 in categorical_cols and col2 in numeric_cols: | |
| cat = col1; num = col2 | |
| else: | |
| cat = col2; num = col1 | |
| result_text = f"Numeric vs Categorical Analysis ({num} by {cat})" | |
| # Boxplot | |
| fig, ax = plt.subplots(figsize=(6,4)) | |
| sns.boxplot(x=df[cat], y=df[num], ax=ax) | |
| ax.set_title("Boxplot") | |
| plt.xticks(rotation=45, ha='right') | |
| images.append(fig_to_pil(fig)) | |
| plt.close(fig) | |
| # Violin plot | |
| fig, ax = plt.subplots(figsize=(6,4)) | |
| sns.violinplot(x=df[cat], y=df[num], ax=ax) | |
| ax.set_title("Violin Plot") | |
| plt.xticks(rotation=45, ha='right') | |
| images.append(fig_to_pil(fig)) | |
| plt.close(fig) | |
| print(result_text) | |
| return result_text, None, images[0], images[1], None | |
| def cramers_v(confusion_matrix): | |
| """ Cramér's V for categorical correlation """ | |
| chi2 = ss.chi2_contingency(confusion_matrix)[0] | |
| n = confusion_matrix.sum().sum() | |
| r, k = confusion_matrix.shape | |
| return np.sqrt(chi2 / (n * (min(r, k) - 1))) | |
| def compute_correlation(col1, col2): | |
| c1 = df[col1] | |
| c2 = df[col2] | |
| # Case 1: numeric vs numeric | |
| if col1 in numeric_cols and col2 in numeric_cols: | |
| corr = c1.corr(c2) | |
| return f"Pearson Correlation = {corr:.4f}", None | |
| # Case 2: categorical vs categorical → Cramér’s V | |
| if col1 in categorical_cols and col2 in categorical_cols: | |
| confusion = pd.crosstab(c1, c2) | |
| v = cramers_v(confusion) | |
| return f"Cramér’s V = {v:.4f}", confusion | |
| # Case 3: keyword frequency vs numeric/categorical | |
| # Convert col1 or col2 (if comma-separated) into frequency counts | |
| def keyword_expand(col): | |
| return df[col].dropna().astype(str).str.split(',').explode().str.strip() | |
| if col1 in categorical_cols: | |
| k = keyword_expand(col1) | |
| k_counts = k.value_counts() | |
| df_k = df.assign(**{f"{col1}_KEYWORD_COUNTS": df[col1].fillna("").apply( | |
| lambda x: sum([k_counts.get(i.strip(), 0) for i in x.split(',') if i.strip()]) | |
| )}) | |
| c1 = df_k[f"{col1}_KEYWORD_COUNTS"] | |
| if col2 in categorical_cols: | |
| k = keyword_expand(col2) | |
| k_counts = k.value_counts() | |
| df_k = df.assign(**{f"{col2}_KEYWORD_COUNTS": df[col2].fillna("").apply( | |
| lambda x: sum([k_counts.get(i.strip(), 0) for i in x.split(',') if i.strip()]) | |
| )}) | |
| c2 = df_k[f"{col2}_KEYWORD_COUNTS"] | |
| corr = c1.corr(c2) | |
| return f"Keyword-Frequency Based Correlation = {corr:.4f}", None | |
| # ---------------------------------------------------- | |
| # Gradio UI | |
| # ---------------------------------------------------- | |
| with gr.Blocks(title="DATA ANALYSIS APP") as app: | |
| gr.Markdown("# 📊 Youth Nutritional Data Analysis System \nUpload → Analyse → Export\n Developed by Dr. Indira Priyadarsini") | |
| with gr.Tab("ℹ️ About & Citation"): | |
| from a import citation | |
| gr.Markdown(citation) | |
| with gr.Tab("1️⃣ Descriptive Statistics"): | |
| btn_stats = gr.Button("Generate Stats") | |
| stats_out = gr.Dataframe() | |
| btn_stats.click(get_descriptive_stats, outputs=stats_out) | |
| with gr.Tab("2️⃣ Keyword Frequency Explorer"): | |
| col_select = gr.Dropdown(choices=list(categorical_cols), label="Select Column") | |
| freq_table = gr.Dataframe(label="Keyword Counts") | |
| bar_plot = gr.Image(label="Bar Chart") | |
| pie_img = gr.Image(label="Pie Chart") | |
| hbar_img = gr.Image(label="Horizontal Bar Chart") | |
| pareto_img = gr.Image(label="Pareto Chart") | |
| scatter_img = gr.Image(label="Rank vs Frequency Scatter") | |
| cum_img = gr.Image(label="Cumulative Distribution") | |
| download_btn = gr.Button("Download as CSV") | |
| download_file = gr.File(label="Download File") | |
| col_select.change(keyword_frequency, | |
| inputs=col_select, | |
| outputs=[freq_table, bar_plot, pie_img,hbar_img, pareto_img, scatter_img, cum_img]) | |
| download_btn.click( | |
| download_keyword_counts, | |
| inputs=freq_table, | |
| outputs=download_file | |
| ) | |
| with gr.Tab("4️⃣ Two-Column Relationship Explorer"): | |
| colA = gr.Dropdown(choices=df.columns.tolist(), label="Column A") | |
| colB = gr.Dropdown(choices=df.columns.tolist(), label="Column B") | |
| btn_rel = gr.Button("Explore Relationship") | |
| rel_text = gr.Textbox(label="Summary") | |
| rel_table = gr.Dataframe(label="Crosstab (if categorical)") | |
| rel_img1 = gr.Image() | |
| rel_img2 = gr.Image() | |
| rel_img3 = gr.Image() | |
| btn_rel.click( | |
| explore_two_columns, | |
| inputs=[colA, colB], | |
| outputs=[rel_text, rel_table, rel_img1, rel_img2, rel_img3] | |
| ) | |
| with gr.Tab("3️⃣ Correlation Explorer"): | |
| col_select = gr.Dropdown(choices=categorical_cols.tolist(), label="Select Column") | |
| col1 = gr.Dropdown(choices=df.columns.tolist(), label="Column 1") | |
| col2 = gr.Dropdown(choices=df.columns.tolist(), label="Column 2") | |
| corr_btn = gr.Button("Compute Correlation") | |
| corr_text = gr.Textbox(label="Correlation Result") | |
| confusion_out = gr.Dataframe(label="Categorical Crosstab (if applicable)") | |
| corr_btn.click(compute_correlation, inputs=[col1, col2], outputs=[corr_text, confusion_out]) | |
| app.launch(theme=gr.themes.Monochrome()) | |