""" Gradio Q&A app for your Tokyo-Olympic-Sample-Data.xlsx Features: - Loads the Excel file (path configured below) - Parses plain-language questions into simple intents (comparison, ranking, distribution, trend) - Runs Pandas aggregation queries against the table - Produces summary, 3 insights, a chart, the result table, and a generated SQL-like string - Minimal dependencies: gradio, pandas, matplotlib, openpyxl """ import os import re import math import pandas as pd import matplotlib.pyplot as plt import io import gradio as gr from typing import Tuple, Dict, Any # ---------- CONFIG ---------- EXCEL_PATH = "/mnt/data/Tokyo-Olympic-Sample-Data.xlsx" # change this if your file is in a different location SHEET_NAME = None # None loads first sheet MAX_CATEGORIES_FOR_PIE = 6 # ---------------------------- # Load dataset once def load_dataset(path: str, sheet_name=None) -> pd.DataFrame: if not os.path.exists(path): raise FileNotFoundError(f"Excel file not found at: {path}") df = pd.read_excel(path, sheet_name=sheet_name) # Normalize column names (strip) df.columns = [c.strip() for c in df.columns] return df # Very small whitelist to prevent arbitrary column injection ALLOWED_COLUMNS = {"Athlete","Country","Sport","Event","Medal","Gender","Age","Height","Weight","Year"} # Simple NLU parser: classify intent & extract entities def parse_question(question: str) -> Dict[str, Any]: q = question.strip().lower() intent = None entities = {"group_by": None, "metric": None, "filters": {}, "limit": None} # Intent detection: ranking/comparison/distribution/trend/summary if any(w in q for w in ["top", "topping", "highest", "most", "lead", "leading"]): intent = "ranking" elif any(w in q for w in ["over years", "over the years", "trend", "trend of", "year-wise", "by year"]): intent = "trend" elif any(w in q for w in ["by gender", "gender", "distribution", "share"]): intent = "distribution" elif any(w in q for w in ["summary", "overview", "insight"]): intent = "summary" elif any(w in q 
for w in ["which country", "which country is topping", "which country is on top", "country topping", "country top"]): intent = "ranking" else: # fallback heuristics if "country" in q: intent = "comparison" elif "medal" in q: intent = "comparison" else: intent = "summary" # Extract group_by if "country" in q: entities["group_by"] = "Country" elif "sport" in q: entities["group_by"] = "Sport" elif "athlete" in q: entities["group_by"] = "Athlete" elif "gender" in q: entities["group_by"] = "Gender" elif "year" in q: entities["group_by"] = "Year" # Metric detection if "gold" in q: entities["metric"] = ("Medal", "Gold") elif "silver" in q: entities["metric"] = ("Medal", "Silver") elif "bronze" in q: entities["metric"] = ("Medal", "Bronze") else: # default metric = count of Medal (number of medals) entities["metric"] = ("Medal", None) # Limit extraction (e.g., top 5) m = re.search(r"top\s+(\d+)", q) if m: entities["limit"] = int(m.group(1)) elif intent == "ranking": entities["limit"] = 1 if "topping" in q or "top of the list" in q or "which country is topping" in q else 5 # Year filter extraction m2 = re.search(r"(\b19|20)\d{2}\b", q) # crude year extractor (e.g., 2016 or 2020) if m2: entities["filters"]["Year"] = int(m2.group(0)) # Country filter extraction (e.g., "for USA" or "for India") # Simple approach: if question contains "for <word>" or "in <word>" (not robust for multiword countries) m3 = re.search(r"(for|in)\s+([A-Za-z' ]{2,30})", q) if m3: candidate = m3.group(2).strip() # quick normalize common short country tokens (optional) entities["filters"]["country_hint"] = candidate return {"intent": intent, "entities": entities, "raw": question} # Build and run pandas query based on parsed intent def run_query(df: pd.DataFrame, parsed: Dict[str, Any]) -> Tuple[pd.DataFrame, str]: intent = parsed["intent"] ent = parsed["entities"] group_by = ent["group_by"] metric_col, metric_value = ent["metric"] where_mask = pd.Series([True]*len(df)) # Apply filters for k,v in 
ent["filters"].items(): if k == "country_hint": # try to match to Country (case-insensitive substring) where_mask &= df["Country"].astype(str).str.lower().str.contains(str(v).lower()) else: if k in df.columns: where_mask &= (df[k] == v) df_f = df[where_mask].copy() # Default SQL-like string (for display) sql_like = "-- Generated SQL-like query will appear here\n" # Ranking / comparison (group by + count) if intent in ("ranking", "comparison", "distribution"): if group_by is None: # default to Country if question mentions country or default group_by = "Country" if "country" in parsed["raw"].lower() else "Sport" if group_by not in ALLOWED_COLUMNS: raise ValueError(f"Cannot group by '{group_by}'. Allowed columns: {sorted(ALLOWED_COLUMNS)}") # If looking for a specific medal color if metric_value: df_metric = df_f[df_f[metric_col].astype(str).str.lower() == str(metric_value).lower()] else: df_metric = df_f result = df_metric.groupby(group_by, dropna=False)[metric_col].count().reset_index().rename(columns={metric_col: "metric_count"}) result = result.sort_values("metric_count", ascending=False).reset_index(drop=True) if ent.get("limit"): result = result.head(ent["limit"]) sql_like = f"SELECT {group_by}, COUNT({metric_col}) AS metric_count FROM dataset WHERE <filters> GROUP BY {group_by} ORDER BY metric_count DESC LIMIT {ent.get('limit')};" return result, sql_like # Trend elif intent == "trend": # group by Year if "Year" not in df.columns: raise ValueError("Dataset has no Year column for trend queries.") result = df_f.groupby("Year", dropna=False)["Medal"].count().reset_index().rename(columns={"Medal":"metric_count"}) result = result.sort_values("Year").reset_index(drop=True) sql_like = "SELECT Year, COUNT(Medal) AS metric_count FROM dataset WHERE <filters> GROUP BY Year ORDER BY Year ASC;" return result, sql_like # Summary (top-level aggregates) elif intent == "summary": # We'll build a small summary table: total medals, distinct countries, distinct sports, year range 
total_medals = len(df_f) distinct_countries = df_f["Country"].nunique() distinct_sports = df_f["Sport"].nunique() years = sorted(df_f["Year"].dropna().unique().tolist()) result = pd.DataFrame([ {"metric":"total_medals","value": total_medals}, {"metric":"distinct_countries","value": distinct_countries}, {"metric":"distinct_sports","value": distinct_sports}, {"metric":"year_range","value": f"{years[0] if years else 'N/A'} - {years[-1] if years else 'N/A'}"} ]) sql_like = "-- Summary aggregates computed from dataset" return result, sql_like else: # fallback: return empty frame return pd.DataFrame(), "-- no-op" # Generate text summary and 3 insights from the result table def generate_insights(parsed: Dict[str, Any], result_df: pd.DataFrame) -> Tuple[str, list]: intent = parsed["intent"] ent = parsed["entities"] raw = parsed["raw"] insights = [] summary = "" if result_df.empty: return "No results found for the given question.", ["No data to analyze."] if intent in ("ranking","comparison","distribution"): top = result_df.iloc[0] label_col = result_df.columns[0] value_col = result_df.columns[1] summary = f"{top[label_col]} is topping the list with {int(top[value_col])}." 
# Top 3 top3 = result_df.head(3) bullets = [] for i, row in top3.iterrows(): bullets.append(f"{row[label_col]}: {int(row[value_col])}") insights.append("Top performers: " + "; ".join(bullets)) # Dominance check if len(result_df) > 1: second_val = int(result_df.iloc[1][value_col]) if second_val == 0: insights.append("Second place has 0 count — check data quality or filters.") else: ratio = int(top[value_col]) / (second_val if second_val else 1) if ratio >= 1.5: insights.append(f"{top[label_col]} leads by a large margin (≥ 1.5× second place).") else: insights.append(f"{top[label_col]} leads but margin is modest ({ratio:.2f}× second place).") else: insights.append("Only one row in the result set.") # Add a helpful next action insights.append("Try: 'Show medal breakdown by sport for " + str(top[label_col]) + "'.") elif intent == "trend": # compute pct change from first to last first = result_df.iloc[0] last = result_df.iloc[-1] change = last[result_df.columns[1]] - first[result_df.columns[1]] pct = (change / first[result_df.columns[1]]) * 100 if first[result_df.columns[1]] else math.inf summary = f"Medal count changed from {int(first[result_df.columns[1]])} in {first['Year']} to {int(last[result_df.columns[1]])} in {last['Year']} ({pct:.1f}% change)." insights.append("Largest year-over-year changes can be highlighted by checking deltas between consecutive years.") insights.append("Consider splitting trend by Gender or Country for deeper insights.") insights.append("Try: 'Show medals by gender over years'.") elif intent == "summary": # short textual summary from table total_medals = int(result_df[result_df['metric']=="total_medals"]['value'].values[0]) distinct_countries = int(result_df[result_df['metric']=="distinct_countries"]['value'].values[0]) years = result_df[result_df['metric']=="year_range"]['value'].values[0] summary = f"The dataset contains {total_medals} medals across {distinct_countries} countries ({years})." 
insights.append("You can ask for top countries, top sports, or trends over years.") insights.append("Try: 'Which country won the most medals?' or 'Medal count over years'.") insights.append("Filter by Year to inspect specific editions.") else: summary = "Here's the top result." insights.append("Use clearer language like 'Top 5 athletes by gold medals in 2020' for better answers.") # Ensure we return up to 3–4 concise bullets insights = insights[:4] return summary, insights # Chart generator: returns PNG bytes (matplotlib) def plot_result(result_df: pd.DataFrame, parsed: Dict[str, Any]) -> bytes: intent = parsed["intent"] # For simple ranking/comparison/distribution: bar or pie buf = io.BytesIO() plt.clf() try: if intent in ("ranking","comparison","distribution"): if result_df.shape[1] >= 2: x = result_df.iloc[:,0].astype(str) y = result_df.iloc[:,1].astype(int) # If many categories, use horizontal bar if len(x) > 8: plt.figure(figsize=(8, max(4, 0.5*len(x)))) plt.barh(x[::-1], y[::-1]) plt.xlabel("Count") plt.title("Counts by " + result_df.columns[0]) else: plt.figure(figsize=(8,4)) plt.bar(x, y) plt.ylabel("Count") plt.title("Counts by " + result_df.columns[0]) plt.xticks(rotation=45, ha="right") elif intent == "trend": x = result_df["Year"] y = result_df.iloc[:,1].astype(int) plt.figure(figsize=(8,4)) plt.plot(x, y, marker="o") plt.xlabel("Year") plt.ylabel("Count") plt.title("Trend over Years") plt.grid(True) elif intent == "summary": # show small table as image fig, ax = plt.subplots(figsize=(6,1.5)) ax.axis('off') table = plt.table(cellText=result_df.values, colLabels=result_df.columns, loc='center') table.auto_set_font_size(False) table.set_fontsize(10) table.scale(1, 1.5) else: plt.text(0.1, 0.5, "No chart available for this query.", fontsize=12) plt.tight_layout() plt.savefig(buf, format="png") except Exception as e: plt.clf() plt.text(0.1, 0.5, f"Error plotting: {str(e)}") plt.savefig(buf, format="png") buf.seek(0) return buf.read() # Main function tying 
everything together def answer_question(question: str): # Load data (cached globally could be added) try: df = load_dataset(EXCEL_PATH, SHEET_NAME) except Exception as e: return f"Error loading dataset: {str(e)}", "", None, None, "" parsed = parse_question(question) try: result_df, sql_like = run_query(df, parsed) except Exception as e: return f"Error while running query: {str(e)}", "", None, None, "" summary, insights = generate_insights(parsed, result_df) # prepare table preview (convert small df to html for Gradio table display) table_preview_html = result_df.head(20).to_html(index=False) # chart bytes chart_bytes = plot_result(result_df, parsed) # Format insights as newline-separated text insights_text = "\n".join([f"- {s}" for s in insights]) # Return values for Gradio: summary, insights, chart image, table (html), sql_like return summary, insights_text, chart_bytes, table_preview_html, sql_like # ---------- Gradio app ---------- def build_ui(): title = "No-Code Conversational Data Tool — Tokyo Olympic Q&A" description = "Ask plain-language questions about the dataset (e.g., 'Which country is topping the list?')" with gr.Blocks() as demo: gr.Markdown(f"# {title}") gr.Markdown(description) with gr.Row(): question = gr.Textbox(lines=2, placeholder="e.g., Which country is topping the list?", label="Ask a question") ask_btn = gr.Button("Ask") with gr.Row(): summary_out = gr.Textbox(label="Summary", interactive=False) insights_out = gr.Textbox(label="Key Insights", interactive=False) with gr.Row(): chart_out = gr.Image(label="Chart", type="pil") with gr.Row(): table_out = gr.HTML(label="Result Table (first 20 rows)") with gr.Row(): sql_out = gr.Textbox(label="Generated SQL-like statement", interactive=False) def on_ask(q): s, ins, chart_bytes, table_html, sql_like = answer_question(q) # chart_bytes might be bytes -> convert to pil image via BytesIO inside gr.Image return s, ins, chart_bytes, table_html, sql_like ask_btn.click(fn=on_ask, inputs=[question], 
outputs=[summary_out, insights_out, chart_out, table_out, sql_out]) return demo if __name__ == "__main__": demo = build_ui() demo.launch(server_name="0.0.0.0", server_port=7860, share=False)

#3
# by NaturalRojo - opened

"""
Gradio Q&A app for your Tokyo-Olympic-Sample-Data.xlsx

Features:

  • Loads the Excel file (path configured below)
  • Parses plain-language questions into simple intents (comparison, ranking, distribution, trend)
  • Runs Pandas aggregation queries against the table
  • Produces summary, 3 insights, a chart, the result table, and a generated SQL-like string
  • Minimal dependencies: gradio, pandas, matplotlib, openpyxl
    """

import os
import re
import math
import pandas as pd
import matplotlib.pyplot as plt
import io
import gradio as gr
from typing import Tuple, Dict, Any

# ---------- CONFIG ----------
EXCEL_PATH = "/mnt/data/Tokyo-Olympic-Sample-Data.xlsx"  # change this if your file is in a different location
SHEET_NAME = None  # None loads first sheet
MAX_CATEGORIES_FOR_PIE = 6  # cap on slices if a pie chart is drawn
# ----------------------------

# Load dataset once
def load_dataset(path: str, sheet_name=None) -> pd.DataFrame:
    """Load the Excel workbook at *path* into a DataFrame.

    Raises FileNotFoundError when the file does not exist.  Column
    headers are stripped of surrounding whitespace so later lookups
    are stable.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"Excel file not found at: {path}")
    df = pd.read_excel(path, sheet_name=sheet_name)
    # Normalize column names; str() first because Excel headers are not
    # guaranteed to be strings (a numeric header would crash .strip()).
    df.columns = [str(c).strip() for c in df.columns]
    return df

# Very small whitelist to prevent arbitrary column injection
ALLOWED_COLUMNS = {"Athlete", "Country", "Sport", "Event", "Medal",
                   "Gender", "Age", "Height", "Weight", "Year"}

Simple NLU parser: classify intent & extract entities

def parse_question(question: str) -> Dict[str, Any]:
q = question.strip().lower()
intent = None
entities = {"group_by": None, "metric": None, "filters": {}, "limit": None}

# Intent detection: ranking/comparison/distribution/trend/summary
if any(w in q for w in ["top", "topping", "highest", "most", "lead", "leading"]):
    intent = "ranking"
elif any(w in q for w in ["over years", "over the years", "trend", "trend of", "year-wise", "by year"]):
    intent = "trend"
elif any(w in q for w in ["by gender", "gender", "distribution", "share"]):
    intent = "distribution"
elif any(w in q for w in ["summary", "overview", "insight"]):
    intent = "summary"
elif any(w in q for w in ["which country", "which country is topping", "which country is on top", "country topping", "country top"]):
    intent = "ranking"
else:
    # fallback heuristics
    if "country" in q:
        intent = "comparison"
    elif "medal" in q:
        intent = "comparison"
    else:
        intent = "summary"

# Extract group_by
if "country" in q:
    entities["group_by"] = "Country"
elif "sport" in q:
    entities["group_by"] = "Sport"
elif "athlete" in q:
    entities["group_by"] = "Athlete"
elif "gender" in q:
    entities["group_by"] = "Gender"
elif "year" in q:
    entities["group_by"] = "Year"

# Metric detection
if "gold" in q:
    entities["metric"] = ("Medal", "Gold")
elif "silver" in q:
    entities["metric"] = ("Medal", "Silver")
elif "bronze" in q:
    entities["metric"] = ("Medal", "Bronze")
else:
    # default metric = count of Medal (number of medals)
    entities["metric"] = ("Medal", None)

# Limit extraction (e.g., top 5)
m = re.search(r"top\s+(\d+)", q)
if m:
    entities["limit"] = int(m.group(1))
elif intent == "ranking":
    entities["limit"] = 1 if "topping" in q or "top of the list" in q or "which country is topping" in q else 5

# Year filter extraction
m2 = re.search(r"(\b19|20)\d{2}\b", q)  # crude year extractor (e.g., 2016 or 2020)
if m2:
    entities["filters"]["Year"] = int(m2.group(0))

# Country filter extraction (e.g., "for USA" or "for India")
# Simple approach: if question contains "for <word>" or "in <word>" (not robust for multiword countries)
m3 = re.search(r"(for|in)\s+([A-Za-z' ]{2,30})", q)
if m3:
    candidate = m3.group(2).strip()
    # quick normalize common short country tokens (optional)
    entities["filters"]["country_hint"] = candidate

return {"intent": intent, "entities": entities, "raw": question}

# Build and run pandas query based on parsed intent
def run_query(df: pd.DataFrame, parsed: Dict[str, Any]) -> Tuple[pd.DataFrame, str]:
    """Execute the aggregation implied by *parsed* against *df*.

    Returns (result DataFrame, SQL-like display string).

    Raises:
        ValueError: when grouping is requested on a non-whitelisted or
            missing column, or a trend is requested without a Year column.
    """
    intent = parsed["intent"]
    ent = parsed["entities"]
    group_by = ent["group_by"]
    metric_col, metric_value = ent["metric"]
    # Build the mask on df's own index so boolean filtering also works
    # when the index is not the default RangeIndex.
    where_mask = pd.Series(True, index=df.index)

    # Apply filters
    for key, value in ent["filters"].items():
        if key == "country_hint":
            # case-insensitive literal substring match against Country
            # (regex=False: the hint is user text, not a pattern)
            where_mask &= df["Country"].astype(str).str.lower().str.contains(
                str(value).lower(), regex=False)
        elif key in df.columns:
            where_mask &= (df[key] == value)

    df_f = df[where_mask].copy()

    # Ranking / comparison / distribution (group by + count)
    if intent in ("ranking", "comparison", "distribution"):
        if group_by is None:
            # default to Country if question mentions country, else Sport
            group_by = "Country" if "country" in parsed["raw"].lower() else "Sport"
        if group_by not in ALLOWED_COLUMNS:
            raise ValueError(f"Cannot group by '{group_by}'. Allowed columns: {sorted(ALLOWED_COLUMNS)}")
        if group_by not in df.columns:
            raise ValueError(f"Dataset has no '{group_by}' column.")

        # If looking for a specific medal colour, restrict rows first.
        if metric_value:
            df_metric = df_f[df_f[metric_col].astype(str).str.lower() == str(metric_value).lower()]
        else:
            df_metric = df_f

        result = (df_metric.groupby(group_by, dropna=False)[metric_col]
                  .count()
                  .reset_index()
                  .rename(columns={metric_col: "metric_count"}))
        result = result.sort_values("metric_count", ascending=False).reset_index(drop=True)
        if ent.get("limit"):
            result = result.head(ent["limit"])
        sql_like = (f"SELECT {group_by}, COUNT({metric_col}) AS metric_count FROM dataset "
                    f"WHERE <filters> GROUP BY {group_by} ORDER BY metric_count DESC "
                    f"LIMIT {ent.get('limit')};")
        return result, sql_like

    # Trend: medal counts per year
    elif intent == "trend":
        if "Year" not in df.columns:
            raise ValueError("Dataset has no Year column for trend queries.")
        result = (df_f.groupby("Year", dropna=False)["Medal"]
                  .count()
                  .reset_index()
                  .rename(columns={"Medal": "metric_count"}))
        result = result.sort_values("Year").reset_index(drop=True)
        sql_like = ("SELECT Year, COUNT(Medal) AS metric_count FROM dataset "
                    "WHERE <filters> GROUP BY Year ORDER BY Year ASC;")
        return result, sql_like

    # Summary: a few top-level aggregates (guard each optional column)
    elif intent == "summary":
        total_medals = len(df_f)
        distinct_countries = df_f["Country"].nunique() if "Country" in df_f.columns else 0
        distinct_sports = df_f["Sport"].nunique() if "Sport" in df_f.columns else 0
        years = sorted(df_f["Year"].dropna().unique().tolist()) if "Year" in df_f.columns else []
        result = pd.DataFrame([
            {"metric": "total_medals", "value": total_medals},
            {"metric": "distinct_countries", "value": distinct_countries},
            {"metric": "distinct_sports", "value": distinct_sports},
            {"metric": "year_range",
             "value": f"{years[0] if years else 'N/A'} - {years[-1] if years else 'N/A'}"},
        ])
        return result, "-- Summary aggregates computed from dataset"

    # fallback: return empty frame
    return pd.DataFrame(), "-- no-op"

# Generate text summary and 3 insights from the result table
def generate_insights(parsed: Dict[str, Any], result_df: pd.DataFrame) -> Tuple[str, list]:
    """Produce a (summary sentence, list of insight bullets) pair.

    The bullet list is capped at four entries; an empty *result_df*
    yields a "no results" message.
    """
    intent = parsed["intent"]
    insights = []
    summary = ""

    if result_df.empty:
        return "No results found for the given question.", ["No data to analyze."]

    if intent in ("ranking", "comparison", "distribution"):
        top = result_df.iloc[0]
        label_col = result_df.columns[0]
        value_col = result_df.columns[1]
        summary = f"{top[label_col]} is topping the list with {int(top[value_col])}."
        # Top three entries as a single bullet.
        bullets = [f"{row[label_col]}: {int(row[value_col])}"
                   for _, row in result_df.head(3).iterrows()]
        insights.append("Top performers: " + "; ".join(bullets))
        # Dominance check against second place.
        if len(result_df) > 1:
            second_val = int(result_df.iloc[1][value_col])
            if second_val == 0:
                insights.append("Second place has 0 count — check data quality or filters.")
            else:
                # second_val is non-zero here, so the division is safe.
                ratio = int(top[value_col]) / second_val
                if ratio >= 1.5:
                    insights.append(f"{top[label_col]} leads by a large margin (≥ 1.5× second place).")
                else:
                    insights.append(f"{top[label_col]} leads but margin is modest ({ratio:.2f}× second place).")
        else:
            insights.append("Only one row in the result set.")
        # Add a helpful next action.
        insights.append("Try: 'Show medal breakdown by sport for " + str(top[label_col]) + "'.")
    elif intent == "trend":
        # Percentage change from the first to the last year.
        value_col = result_df.columns[1]
        first = result_df.iloc[0]
        last = result_df.iloc[-1]
        change = last[value_col] - first[value_col]
        # Guard against a zero count in the first year.
        pct = (change / first[value_col]) * 100 if first[value_col] else math.inf
        summary = (f"Medal count changed from {int(first[value_col])} in {first['Year']} "
                   f"to {int(last[value_col])} in {last['Year']} ({pct:.1f}% change).")
        insights.append("Largest year-over-year changes can be highlighted by checking deltas between consecutive years.")
        insights.append("Consider splitting trend by Gender or Country for deeper insights.")
        insights.append("Try: 'Show medals by gender over years'.")
    elif intent == "summary":
        # Short textual summary read back out of the aggregate table.
        total_medals = int(result_df[result_df['metric'] == "total_medals"]['value'].values[0])
        distinct_countries = int(result_df[result_df['metric'] == "distinct_countries"]['value'].values[0])
        years = result_df[result_df['metric'] == "year_range"]['value'].values[0]
        summary = f"The dataset contains {total_medals} medals across {distinct_countries} countries ({years})."
        insights.append("You can ask for top countries, top sports, or trends over years.")
        insights.append("Try: 'Which country won the most medals?' or 'Medal count over years'.")
        insights.append("Filter by Year to inspect specific editions.")
    else:
        summary = "Here's the top result."
        insights.append("Use clearer language like 'Top 5 athletes by gold medals in 2020' for better answers.")

    # Ensure we return at most four concise bullets.
    return summary, insights[:4]

# Chart generator: returns PNG bytes (matplotlib)
def plot_result(result_df: pd.DataFrame, parsed: Dict[str, Any]) -> bytes:
    """Render *result_df* as a PNG and return the raw bytes.

    Chart type follows the intent: bar / horizontal bar for
    ranking-style results, a line chart for trends, and a rendered
    table for summaries.  Plot errors are drawn into the image rather
    than raised, so the caller always gets displayable bytes.
    """
    intent = parsed["intent"]
    buf = io.BytesIO()
    try:
        if intent in ("ranking", "comparison", "distribution"):
            if result_df.shape[1] >= 2:
                x = result_df.iloc[:, 0].astype(str)
                y = result_df.iloc[:, 1].astype(int)
                if len(x) > 8:
                    # Many categories read better as a horizontal bar chart.
                    plt.figure(figsize=(8, max(4, 0.5 * len(x))))
                    plt.barh(x[::-1], y[::-1])
                    plt.xlabel("Count")
                    plt.title("Counts by " + result_df.columns[0])
                else:
                    plt.figure(figsize=(8, 4))
                    plt.bar(x, y)
                    plt.ylabel("Count")
                    plt.title("Counts by " + result_df.columns[0])
                    plt.xticks(rotation=45, ha="right")
        elif intent == "trend":
            plt.figure(figsize=(8, 4))
            plt.plot(result_df["Year"], result_df.iloc[:, 1].astype(int), marker="o")
            plt.xlabel("Year")
            plt.ylabel("Count")
            plt.title("Trend over Years")
            plt.grid(True)
        elif intent == "summary":
            # Show the small summary table as an image.
            fig, ax = plt.subplots(figsize=(6, 1.5))
            ax.axis('off')
            table = plt.table(cellText=result_df.values, colLabels=result_df.columns, loc='center')
            table.auto_set_font_size(False)
            table.set_fontsize(10)
            table.scale(1, 1.5)
        else:
            plt.text(0.1, 0.5, "No chart available for this query.", fontsize=12)
        plt.tight_layout()
        plt.savefig(buf, format="png")
    except Exception as e:
        plt.clf()
        plt.text(0.1, 0.5, f"Error plotting: {str(e)}")
        plt.savefig(buf, format="png")
    finally:
        # Close all figures so repeated requests do not leak memory:
        # the original created a new figure per call and never closed it.
        plt.close("all")
    buf.seek(0)
    return buf.read()

# Main function tying everything together
def answer_question(question: str):
    """Answer *question* end-to-end.

    Returns a 5-tuple for the Gradio UI: (summary text, insights text,
    chart PNG bytes, result-table HTML, SQL-like string).  On failure
    the error message is returned in the summary slot and the remaining
    slots are empty/None.
    """
    # Load data on every call; a module-level cache could be added later.
    try:
        df = load_dataset(EXCEL_PATH, SHEET_NAME)
    except Exception as e:
        return f"Error loading dataset: {str(e)}", "", None, None, ""

    parsed = parse_question(question)
    try:
        result_df, sql_like = run_query(df, parsed)
    except Exception as e:
        return f"Error while running query: {str(e)}", "", None, None, ""

    summary, insights = generate_insights(parsed, result_df)

    # Table preview: first 20 rows rendered as HTML for gr.HTML.
    table_preview_html = result_df.head(20).to_html(index=False)

    # Chart as raw PNG bytes (decoded by the UI layer before display).
    chart_bytes = plot_result(result_df, parsed)

    # Format insights as newline-separated bullet text.
    insights_text = "\n".join(f"- {s}" for s in insights)

    return summary, insights_text, chart_bytes, table_preview_html, sql_like

---------- Gradio app ----------

def build_ui():
    """Build and return the Gradio Blocks UI for the Q&A tool."""
    title = "No-Code Conversational Data Tool — Tokyo Olympic Q&A"
    description = "Ask plain-language questions about the dataset (e.g., 'Which country is topping the list?')"

    with gr.Blocks() as demo:
        gr.Markdown(f"# {title}")
        gr.Markdown(description)
        with gr.Row():
            question = gr.Textbox(lines=2, placeholder="e.g., Which country is topping the list?", label="Ask a question")
            ask_btn = gr.Button("Ask")
        with gr.Row():
            summary_out = gr.Textbox(label="Summary", interactive=False)
            insights_out = gr.Textbox(label="Key Insights", interactive=False)
        with gr.Row():
            # BUGFIX: answer_question returns raw PNG bytes, which gr.Image
            # cannot display (it accepts PIL images, numpy arrays, or file
            # paths). on_ask persists the bytes to a temp file and returns
            # the path, so the component is declared as "filepath".
            chart_out = gr.Image(label="Chart", type="filepath")
        with gr.Row():
            table_out = gr.HTML(label="Result Table (first 20 rows)")
        with gr.Row():
            sql_out = gr.Textbox(label="Generated SQL-like statement", interactive=False)

        def on_ask(q):
            s, ins, chart_bytes, table_html, sql_like = answer_question(q)
            # Convert PNG bytes to a temp-file path for gr.Image; error
            # paths pass None through unchanged (Image renders empty).
            chart_path = None
            if chart_bytes:
                import tempfile
                with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
                    tmp.write(chart_bytes)
                    chart_path = tmp.name
            return s, ins, chart_path, table_html, sql_like

        ask_btn.click(fn=on_ask, inputs=[question],
                      outputs=[summary_out, insights_out, chart_out, table_out, sql_out])
    return demo

# Script entry point: build the UI and serve it on all interfaces, port 7860.
# BUGFIX: was `if name == "main":` — the dunder underscores were lost; that
# raises NameError at import time.
if __name__ == "__main__":
    demo = build_ui()
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)

"""
Gradio Q&A app for your Tokyo-Olympic-Sample-Data.xlsx

Features:
- Loads the Excel file (path configured below)
- Parses plain-language questions into simple intents (comparison, ranking, distribution, trend)
- Runs Pandas aggregation queries against the table
- Produces summary, 3 insights, a chart, the result table, and a generated SQL-like string
- Minimal dependencies: gradio, pandas, matplotlib, openpyxl
"""
import os
import re
import io
import math
import tempfile
from typing import Tuple, Dict, Any

import pandas as pd
import matplotlib.pyplot as plt
import gradio as gr

# ---------- CONFIG ----------
EXCEL_PATH = "/mnt/data/Tokyo-Olympic-Sample-Data.xlsx"  # change this if your file is in a different location
SHEET_NAME = None  # None loads first sheet
MAX_CATEGORIES_FOR_PIE = 6
# ----------------------------


def load_dataset(path: str, sheet_name=None) -> pd.DataFrame:
    """Read the Excel workbook into a DataFrame with whitespace-stripped column names.

    Raises FileNotFoundError when the configured path does not exist.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"Excel file not found at: {path}")
    df = pd.read_excel(path, sheet_name=sheet_name)
    # Normalize column names (strip)
    df.columns = [c.strip() for c in df.columns]
    return df


# Very small whitelist to prevent arbitrary column injection
ALLOWED_COLUMNS = {"Athlete", "Country", "Sport", "Event", "Medal", "Gender", "Age", "Height", "Weight", "Year"}


def parse_question(question: str) -> Dict[str, Any]:
    """Classify a plain-language question into an intent and extract entities.

    Returns {"intent": str, "entities": {group_by, metric, filters, limit}, "raw": question}.
    Keyword matching is intentionally crude — this is a demo NLU layer, not a parser.
    """
    q = question.strip().lower()
    intent = None
    entities = {"group_by": None, "metric": None, "filters": {}, "limit": None}

    # Intent detection: ranking/comparison/distribution/trend/summary
    if any(w in q for w in ["top", "topping", "highest", "most", "lead", "leading"]):
        intent = "ranking"
    elif any(w in q for w in ["over years", "over the years", "trend", "trend of", "year-wise", "by year"]):
        intent = "trend"
    elif any(w in q for w in ["by gender", "gender", "distribution", "share"]):
        intent = "distribution"
    elif any(w in q for w in ["summary", "overview", "insight"]):
        intent = "summary"
    elif any(w in q for w in ["which country", "which country is topping", "which country is on top", "country topping", "country top"]):
        intent = "ranking"
    else:
        # fallback heuristics
        if "country" in q:
            intent = "comparison"
        elif "medal" in q:
            intent = "comparison"
        else:
            intent = "summary"

    # Extract group_by (first keyword wins)
    if "country" in q:
        entities["group_by"] = "Country"
    elif "sport" in q:
        entities["group_by"] = "Sport"
    elif "athlete" in q:
        entities["group_by"] = "Athlete"
    elif "gender" in q:
        entities["group_by"] = "Gender"
    elif "year" in q:
        entities["group_by"] = "Year"

    # Metric detection
    if "gold" in q:
        entities["metric"] = ("Medal", "Gold")
    elif "silver" in q:
        entities["metric"] = ("Medal", "Silver")
    elif "bronze" in q:
        entities["metric"] = ("Medal", "Bronze")
    else:
        # default metric = count of Medal (number of medals)
        entities["metric"] = ("Medal", None)

    # Limit extraction (e.g., top 5)
    m = re.search(r"top\s+(\d+)", q)
    if m:
        entities["limit"] = int(m.group(1))
    elif intent == "ranking":
        entities["limit"] = 1 if "topping" in q or "top of the list" in q or "which country is topping" in q else 5

    # Year filter extraction
    m2 = re.search(r"(\b19|20)\d{2}\b", q)  # crude year extractor (e.g., 2016 or 2020)
    if m2:
        entities["filters"]["Year"] = int(m2.group(0))

    # Country filter extraction (e.g., "for USA" or "for India")
    # Simple approach: if question contains "for <word>" or "in <word>" (not robust for multiword countries)
    m3 = re.search(r"(for|in)\s+([A-Za-z' ]{2,30})", q)
    if m3:
        candidate = m3.group(2).strip()
        # quick normalize common short country tokens (optional)
        entities["filters"]["country_hint"] = candidate

    return {"intent": intent, "entities": entities, "raw": question}


def run_query(df: pd.DataFrame, parsed: Dict[str, Any]) -> Tuple[pd.DataFrame, str]:
    """Run the pandas aggregation matching the parsed intent.

    Returns (result DataFrame, SQL-like display string).
    Raises ValueError for a disallowed group-by column or a trend query
    against a dataset with no Year column.
    """
    intent = parsed["intent"]
    ent = parsed["entities"]
    group_by = ent["group_by"]
    metric_col, metric_value = ent["metric"]

    where_mask = pd.Series([True] * len(df))
    # Apply filters
    for k, v in ent["filters"].items():
        if k == "country_hint":
            # try to match to Country (case-insensitive substring)
            where_mask &= df["Country"].astype(str).str.lower().str.contains(str(v).lower())
        else:
            if k in df.columns:
                where_mask &= (df[k] == v)
    df_f = df[where_mask].copy()

    # Default SQL-like string (for display)
    sql_like = "-- Generated SQL-like query will appear here\n"

    # Ranking / comparison (group by + count)
    if intent in ("ranking", "comparison", "distribution"):
        if group_by is None:
            # default to Country if question mentions country or default
            group_by = "Country" if "country" in parsed["raw"].lower() else "Sport"
        if group_by not in ALLOWED_COLUMNS:
            raise ValueError(f"Cannot group by '{group_by}'. Allowed columns: {sorted(ALLOWED_COLUMNS)}")
        # If looking for a specific medal color
        if metric_value:
            df_metric = df_f[df_f[metric_col].astype(str).str.lower() == str(metric_value).lower()]
        else:
            df_metric = df_f
        result = df_metric.groupby(group_by, dropna=False)[metric_col].count().reset_index().rename(columns={metric_col: "metric_count"})
        result = result.sort_values("metric_count", ascending=False).reset_index(drop=True)
        if ent.get("limit"):
            result = result.head(ent["limit"])
        sql_like = f"SELECT {group_by}, COUNT({metric_col}) AS metric_count FROM dataset WHERE <filters> GROUP BY {group_by} ORDER BY metric_count DESC LIMIT {ent.get('limit')};"
        return result, sql_like

    # Trend: medal count grouped by Year
    elif intent == "trend":
        if "Year" not in df.columns:
            raise ValueError("Dataset has no Year column for trend queries.")
        result = df_f.groupby("Year", dropna=False)["Medal"].count().reset_index().rename(columns={"Medal": "metric_count"})
        result = result.sort_values("Year").reset_index(drop=True)
        sql_like = "SELECT Year, COUNT(Medal) AS metric_count FROM dataset WHERE <filters> GROUP BY Year ORDER BY Year ASC;"
        return result, sql_like

    # Summary (top-level aggregates)
    elif intent == "summary":
        # Small summary table: total medals, distinct countries, distinct sports, year range
        total_medals = len(df_f)
        distinct_countries = df_f["Country"].nunique()
        distinct_sports = df_f["Sport"].nunique()
        years = sorted(df_f["Year"].dropna().unique().tolist())
        result = pd.DataFrame([
            {"metric": "total_medals", "value": total_medals},
            {"metric": "distinct_countries", "value": distinct_countries},
            {"metric": "distinct_sports", "value": distinct_sports},
            {"metric": "year_range", "value": f"{years[0] if years else 'N/A'} - {years[-1] if years else 'N/A'}"},
        ])
        sql_like = "-- Summary aggregates computed from dataset"
        return result, sql_like

    else:
        # fallback: return empty frame
        return pd.DataFrame(), "-- no-op"


def generate_insights(parsed: Dict[str, Any], result_df: pd.DataFrame) -> Tuple[str, list]:
    """Produce a one-line summary plus up to four bullet insights from the result table."""
    intent = parsed["intent"]
    ent = parsed["entities"]
    raw = parsed["raw"]
    insights = []
    summary = ""

    if result_df.empty:
        return "No results found for the given question.", ["No data to analyze."]

    if intent in ("ranking", "comparison", "distribution"):
        top = result_df.iloc[0]
        label_col = result_df.columns[0]
        value_col = result_df.columns[1]
        summary = f"{top[label_col]} is topping the list with {int(top[value_col])}."
        # Top 3
        top3 = result_df.head(3)
        bullets = []
        for i, row in top3.iterrows():
            bullets.append(f"{row[label_col]}: {int(row[value_col])}")
        insights.append("Top performers: " + "; ".join(bullets))
        # Dominance check: how far ahead is first place?
        if len(result_df) > 1:
            second_val = int(result_df.iloc[1][value_col])
            if second_val == 0:
                insights.append("Second place has 0 count — check data quality or filters.")
            else:
                ratio = int(top[value_col]) / (second_val if second_val else 1)
                if ratio >= 1.5:
                    insights.append(f"{top[label_col]} leads by a large margin (≥ 1.5× second place).")
                else:
                    insights.append(f"{top[label_col]} leads but margin is modest ({ratio:.2f}× second place).")
        else:
            insights.append("Only one row in the result set.")
        # Add a helpful next action
        insights.append("Try: 'Show medal breakdown by sport for " + str(top[label_col]) + "'.")

    elif intent == "trend":
        # compute pct change from first to last
        first = result_df.iloc[0]
        last = result_df.iloc[-1]
        change = last[result_df.columns[1]] - first[result_df.columns[1]]
        pct = (change / first[result_df.columns[1]]) * 100 if first[result_df.columns[1]] else math.inf
        summary = f"Medal count changed from {int(first[result_df.columns[1]])} in {first['Year']} to {int(last[result_df.columns[1]])} in {last['Year']} ({pct:.1f}% change)."
        insights.append("Largest year-over-year changes can be highlighted by checking deltas between consecutive years.")
        insights.append("Consider splitting trend by Gender or Country for deeper insights.")
        insights.append("Try: 'Show medals by gender over years'.")

    elif intent == "summary":
        # short textual summary from table
        total_medals = int(result_df[result_df['metric'] == "total_medals"]['value'].values[0])
        distinct_countries = int(result_df[result_df['metric'] == "distinct_countries"]['value'].values[0])
        years = result_df[result_df['metric'] == "year_range"]['value'].values[0]
        summary = f"The dataset contains {total_medals} medals across {distinct_countries} countries ({years})."
        insights.append("You can ask for top countries, top sports, or trends over years.")
        insights.append("Try: 'Which country won the most medals?' or 'Medal count over years'.")
        insights.append("Filter by Year to inspect specific editions.")

    else:
        summary = "Here's the top result."
        insights.append("Use clearer language like 'Top 5 athletes by gold medals in 2020' for better answers.")

    # Ensure we return up to 3–4 concise bullets
    insights = insights[:4]
    return summary, insights


def plot_result(result_df: pd.DataFrame, parsed: Dict[str, Any]) -> bytes:
    """Render a matplotlib chart for the result and return it as PNG bytes.

    Plot type depends on intent: bar/horizontal-bar for rankings, a line
    for trends, a rendered table for summaries, and a text placeholder
    otherwise. Plotting errors are caught and rendered as a text image.
    """
    intent = parsed["intent"]
    buf = io.BytesIO()
    plt.clf()
    try:
        if intent in ("ranking", "comparison", "distribution"):
            if result_df.shape[1] >= 2:
                x = result_df.iloc[:, 0].astype(str)
                y = result_df.iloc[:, 1].astype(int)
                # If many categories, use horizontal bar
                if len(x) > 8:
                    plt.figure(figsize=(8, max(4, 0.5 * len(x))))
                    plt.barh(x[::-1], y[::-1])
                    plt.xlabel("Count")
                    plt.title("Counts by " + result_df.columns[0])
                else:
                    plt.figure(figsize=(8, 4))
                    plt.bar(x, y)
                    plt.ylabel("Count")
                    plt.title("Counts by " + result_df.columns[0])
                    plt.xticks(rotation=45, ha="right")
        elif intent == "trend":
            x = result_df["Year"]
            y = result_df.iloc[:, 1].astype(int)
            plt.figure(figsize=(8, 4))
            plt.plot(x, y, marker="o")
            plt.xlabel("Year")
            plt.ylabel("Count")
            plt.title("Trend over Years")
            plt.grid(True)
        elif intent == "summary":
            # show small table as image
            fig, ax = plt.subplots(figsize=(6, 1.5))
            ax.axis('off')
            table = plt.table(cellText=result_df.values, colLabels=result_df.columns, loc='center')
            table.auto_set_font_size(False)
            table.set_fontsize(10)
            table.scale(1, 1.5)
        else:
            plt.text(0.1, 0.5, "No chart available for this query.", fontsize=12)
        plt.tight_layout()
        plt.savefig(buf, format="png")
    except Exception as e:
        plt.clf()
        plt.text(0.1, 0.5, f"Error plotting: {str(e)}")
        plt.savefig(buf, format="png")
    buf.seek(0)
    return buf.read()


# Main function tying everything together
def answer_question(question: str):
    """Answer a plain-language question about the dataset.

    Returns a 5-tuple for the Gradio callback:
    (summary, insights text, chart PNG bytes or None,
     result-table HTML or None, SQL-like string).
    """
    # Load data on every call; a module-level cache could be added later.
    try:
        df = load_dataset(EXCEL_PATH, SHEET_NAME)
    except Exception as e:
        return f"Error loading dataset: {str(e)}", "", None, None, ""

    parsed = parse_question(question)
    try:
        result_df, sql_like = run_query(df, parsed)
    except Exception as e:
        return f"Error while running query: {str(e)}", "", None, None, ""

    summary, insights = generate_insights(parsed, result_df)

    # prepare table preview (convert small df to html for Gradio table display)
    table_preview_html = result_df.head(20).to_html(index=False)

    # chart bytes
    chart_bytes = plot_result(result_df, parsed)

    # Format insights as newline-separated text
    insights_text = "\n".join(f"- {s}" for s in insights)

    return summary, insights_text, chart_bytes, table_preview_html, sql_like


# ---------- Gradio app ----------
def build_ui():
    """Build and return the Gradio Blocks UI."""
    title = "No-Code Conversational Data Tool — Tokyo Olympic Q&A"
    description = "Ask plain-language questions about the dataset (e.g., 'Which country is topping the list?')"

    with gr.Blocks() as demo:
        gr.Markdown(f"# {title}")
        gr.Markdown(description)
        with gr.Row():
            question = gr.Textbox(lines=2, placeholder="e.g., Which country is topping the list?", label="Ask a question")
            ask_btn = gr.Button("Ask")
        with gr.Row():
            summary_out = gr.Textbox(label="Summary", interactive=False)
            insights_out = gr.Textbox(label="Key Insights", interactive=False)
        with gr.Row():
            # BUGFIX: answer_question returns raw PNG bytes, which gr.Image
            # cannot render; on_ask writes them to a temp file and passes the
            # path, so the component is declared as "filepath".
            chart_out = gr.Image(label="Chart", type="filepath")
        with gr.Row():
            table_out = gr.HTML(label="Result Table (first 20 rows)")
        with gr.Row():
            sql_out = gr.Textbox(label="Generated SQL-like statement", interactive=False)

        def on_ask(q):
            s, ins, chart_bytes, table_html, sql_like = answer_question(q)
            # Convert PNG bytes to a temp-file path for gr.Image; error paths
            # pass None through unchanged.
            chart_path = None
            if chart_bytes:
                with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
                    tmp.write(chart_bytes)
                    chart_path = tmp.name
            return s, ins, chart_path, table_html, sql_like

        ask_btn.click(fn=on_ask, inputs=[question],
                      outputs=[summary_out, insights_out, chart_out, table_out, sql_out])
    return demo


if __name__ == "__main__":
    demo = build_ui()
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
Ready to merge
This branch is ready to get merged automatically.

Sign up or log in to comment