""" Gradio Q&A app for your Tokyo-Olympic-Sample-Data.xlsx Features: - Loads the Excel file (path configured below) - Parses plain-language questions into simple intents (comparison, ranking, distribution, trend) - Runs Pandas aggregation queries against the table - Produces summary, 3 insights, a chart, the result table, and a generated SQL-like string - Minimal dependencies: gradio, pandas, matplotlib, openpyxl """ import os import re import math import pandas as pd import matplotlib.pyplot as plt import io import gradio as gr from typing import Tuple, Dict, Any # ---------- CONFIG ---------- EXCEL_PATH = "/mnt/data/Tokyo-Olympic-Sample-Data.xlsx" # change this if your file is in a different location SHEET_NAME = None # None loads first sheet MAX_CATEGORIES_FOR_PIE = 6 # ---------------------------- # Load dataset once def load_dataset(path: str, sheet_name=None) -> pd.DataFrame: if not os.path.exists(path): raise FileNotFoundError(f"Excel file not found at: {path}") df = pd.read_excel(path, sheet_name=sheet_name) # Normalize column names (strip) df.columns = [c.strip() for c in df.columns] return df # Very small whitelist to prevent arbitrary column injection ALLOWED_COLUMNS = {"Athlete","Country","Sport","Event","Medal","Gender","Age","Height","Weight","Year"} # Simple NLU parser: classify intent & extract entities def parse_question(question: str) -> Dict[str, Any]: q = question.strip().lower() intent = None entities = {"group_by": None, "metric": None, "filters": {}, "limit": None} # Intent detection: ranking/comparison/distribution/trend/summary if any(w in q for w in ["top", "topping", "highest", "most", "lead", "leading"]): intent = "ranking" elif any(w in q for w in ["over years", "over the years", "trend", "trend of", "year-wise", "by year"]): intent = "trend" elif any(w in q for w in ["by gender", "gender", "distribution", "share"]): intent = "distribution" elif any(w in q for w in ["summary", "overview", "insight"]): intent = "summary" elif any(w in q 
for w in ["which country", "which country is topping", "which country is on top", "country topping", "country top"]): intent = "ranking" else: # fallback heuristics if "country" in q: intent = "comparison" elif "medal" in q: intent = "comparison" else: intent = "summary" # Extract group_by if "country" in q: entities["group_by"] = "Country" elif "sport" in q: entities["group_by"] = "Sport" elif "athlete" in q: entities["group_by"] = "Athlete" elif "gender" in q: entities["group_by"] = "Gender" elif "year" in q: entities["group_by"] = "Year" # Metric detection if "gold" in q: entities["metric"] = ("Medal", "Gold") elif "silver" in q: entities["metric"] = ("Medal", "Silver") elif "bronze" in q: entities["metric"] = ("Medal", "Bronze") else: # default metric = count of Medal (number of medals) entities["metric"] = ("Medal", None) # Limit extraction (e.g., top 5) m = re.search(r"top\s+(\d+)", q) if m: entities["limit"] = int(m.group(1)) elif intent == "ranking": entities["limit"] = 1 if "topping" in q or "top of the list" in q or "which country is topping" in q else 5 # Year filter extraction m2 = re.search(r"(\b19|20)\d{2}\b", q) # crude year extractor (e.g., 2016 or 2020) if m2: entities["filters"]["Year"] = int(m2.group(0)) # Country filter extraction (e.g., "for USA" or "for India") # Simple approach: if question contains "for <word>" or "in <word>" (not robust for multiword countries) m3 = re.search(r"(for|in)\s+([A-Za-z' ]{2,30})", q) if m3: candidate = m3.group(2).strip() # quick normalize common short country tokens (optional) entities["filters"]["country_hint"] = candidate return {"intent": intent, "entities": entities, "raw": question} # Build and run pandas query based on parsed intent def run_query(df: pd.DataFrame, parsed: Dict[str, Any]) -> Tuple[pd.DataFrame, str]: intent = parsed["intent"] ent = parsed["entities"] group_by = ent["group_by"] metric_col, metric_value = ent["metric"] where_mask = pd.Series([True]*len(df)) # Apply filters for k,v in 
ent["filters"].items(): if k == "country_hint": # try to match to Country (case-insensitive substring) where_mask &= df["Country"].astype(str).str.lower().str.contains(str(v).lower()) else: if k in df.columns: where_mask &= (df[k] == v) df_f = df[where_mask].copy() # Default SQL-like string (for display) sql_like = "-- Generated SQL-like query will appear here\n" # Ranking / comparison (group by + count) if intent in ("ranking", "comparison", "distribution"): if group_by is None: # default to Country if question mentions country or default group_by = "Country" if "country" in parsed["raw"].lower() else "Sport" if group_by not in ALLOWED_COLUMNS: raise ValueError(f"Cannot group by '{group_by}'. Allowed columns: {sorted(ALLOWED_COLUMNS)}") # If looking for a specific medal color if metric_value: df_metric = df_f[df_f[metric_col].astype(str).str.lower() == str(metric_value).lower()] else: df_metric = df_f result = df_metric.groupby(group_by, dropna=False)[metric_col].count().reset_index().rename(columns={metric_col: "metric_count"}) result = result.sort_values("metric_count", ascending=False).reset_index(drop=True) if ent.get("limit"): result = result.head(ent["limit"]) sql_like = f"SELECT {group_by}, COUNT({metric_col}) AS metric_count FROM dataset WHERE <filters> GROUP BY {group_by} ORDER BY metric_count DESC LIMIT {ent.get('limit')};" return result, sql_like # Trend elif intent == "trend": # group by Year if "Year" not in df.columns: raise ValueError("Dataset has no Year column for trend queries.") result = df_f.groupby("Year", dropna=False)["Medal"].count().reset_index().rename(columns={"Medal":"metric_count"}) result = result.sort_values("Year").reset_index(drop=True) sql_like = "SELECT Year, COUNT(Medal) AS metric_count FROM dataset WHERE <filters> GROUP BY Year ORDER BY Year ASC;" return result, sql_like # Summary (top-level aggregates) elif intent == "summary": # We'll build a small summary table: total medals, distinct countries, distinct sports, year range 
total_medals = len(df_f) distinct_countries = df_f["Country"].nunique() distinct_sports = df_f["Sport"].nunique() years = sorted(df_f["Year"].dropna().unique().tolist()) result = pd.DataFrame([ {"metric":"total_medals","value": total_medals}, {"metric":"distinct_countries","value": distinct_countries}, {"metric":"distinct_sports","value": distinct_sports}, {"metric":"year_range","value": f"{years[0] if years else 'N/A'} - {years[-1] if years else 'N/A'}"} ]) sql_like = "-- Summary aggregates computed from dataset" return result, sql_like else: # fallback: return empty frame return pd.DataFrame(), "-- no-op" # Generate text summary and 3 insights from the result table def generate_insights(parsed: Dict[str, Any], result_df: pd.DataFrame) -> Tuple[str, list]: intent = parsed["intent"] ent = parsed["entities"] raw = parsed["raw"] insights = [] summary = "" if result_df.empty: return "No results found for the given question.", ["No data to analyze."] if intent in ("ranking","comparison","distribution"): top = result_df.iloc[0] label_col = result_df.columns[0] value_col = result_df.columns[1] summary = f"{top[label_col]} is topping the list with {int(top[value_col])}." 
# Top 3 top3 = result_df.head(3) bullets = [] for i, row in top3.iterrows(): bullets.append(f"{row[label_col]}: {int(row[value_col])}") insights.append("Top performers: " + "; ".join(bullets)) # Dominance check if len(result_df) > 1: second_val = int(result_df.iloc[1][value_col]) if second_val == 0: insights.append("Second place has 0 count — check data quality or filters.") else: ratio = int(top[value_col]) / (second_val if second_val else 1) if ratio >= 1.5: insights.append(f"{top[label_col]} leads by a large margin (≥ 1.5× second place).") else: insights.append(f"{top[label_col]} leads but margin is modest ({ratio:.2f}× second place).") else: insights.append("Only one row in the result set.") # Add a helpful next action insights.append("Try: 'Show medal breakdown by sport for " + str(top[label_col]) + "'.") elif intent == "trend": # compute pct change from first to last first = result_df.iloc[0] last = result_df.iloc[-1] change = last[result_df.columns[1]] - first[result_df.columns[1]] pct = (change / first[result_df.columns[1]]) * 100 if first[result_df.columns[1]] else math.inf summary = f"Medal count changed from {int(first[result_df.columns[1]])} in {first['Year']} to {int(last[result_df.columns[1]])} in {last['Year']} ({pct:.1f}% change)." insights.append("Largest year-over-year changes can be highlighted by checking deltas between consecutive years.") insights.append("Consider splitting trend by Gender or Country for deeper insights.") insights.append("Try: 'Show medals by gender over years'.") elif intent == "summary": # short textual summary from table total_medals = int(result_df[result_df['metric']=="total_medals"]['value'].values[0]) distinct_countries = int(result_df[result_df['metric']=="distinct_countries"]['value'].values[0]) years = result_df[result_df['metric']=="year_range"]['value'].values[0] summary = f"The dataset contains {total_medals} medals across {distinct_countries} countries ({years})." 
insights.append("You can ask for top countries, top sports, or trends over years.") insights.append("Try: 'Which country won the most medals?' or 'Medal count over years'.") insights.append("Filter by Year to inspect specific editions.") else: summary = "Here's the top result." insights.append("Use clearer language like 'Top 5 athletes by gold medals in 2020' for better answers.") # Ensure we return up to 3–4 concise bullets insights = insights[:4] return summary, insights # Chart generator: returns PNG bytes (matplotlib) def plot_result(result_df: pd.DataFrame, parsed: Dict[str, Any]) -> bytes: intent = parsed["intent"] # For simple ranking/comparison/distribution: bar or pie buf = io.BytesIO() plt.clf() try: if intent in ("ranking","comparison","distribution"): if result_df.shape[1] >= 2: x = result_df.iloc[:,0].astype(str) y = result_df.iloc[:,1].astype(int) # If many categories, use horizontal bar if len(x) > 8: plt.figure(figsize=(8, max(4, 0.5*len(x)))) plt.barh(x[::-1], y[::-1]) plt.xlabel("Count") plt.title("Counts by " + result_df.columns[0]) else: plt.figure(figsize=(8,4)) plt.bar(x, y) plt.ylabel("Count") plt.title("Counts by " + result_df.columns[0]) plt.xticks(rotation=45, ha="right") elif intent == "trend": x = result_df["Year"] y = result_df.iloc[:,1].astype(int) plt.figure(figsize=(8,4)) plt.plot(x, y, marker="o") plt.xlabel("Year") plt.ylabel("Count") plt.title("Trend over Years") plt.grid(True) elif intent == "summary": # show small table as image fig, ax = plt.subplots(figsize=(6,1.5)) ax.axis('off') table = plt.table(cellText=result_df.values, colLabels=result_df.columns, loc='center') table.auto_set_font_size(False) table.set_fontsize(10) table.scale(1, 1.5) else: plt.text(0.1, 0.5, "No chart available for this query.", fontsize=12) plt.tight_layout() plt.savefig(buf, format="png") except Exception as e: plt.clf() plt.text(0.1, 0.5, f"Error plotting: {str(e)}") plt.savefig(buf, format="png") buf.seek(0) return buf.read() # Main function tying 
everything together def answer_question(question: str): # Load data (cached globally could be added) try: df = load_dataset(EXCEL_PATH, SHEET_NAME) except Exception as e: return f"Error loading dataset: {str(e)}", "", None, None, "" parsed = parse_question(question) try: result_df, sql_like = run_query(df, parsed) except Exception as e: return f"Error while running query: {str(e)}", "", None, None, "" summary, insights = generate_insights(parsed, result_df) # prepare table preview (convert small df to html for Gradio table display) table_preview_html = result_df.head(20).to_html(index=False) # chart bytes chart_bytes = plot_result(result_df, parsed) # Format insights as newline-separated text insights_text = "\n".join([f"- {s}" for s in insights]) # Return values for Gradio: summary, insights, chart image, table (html), sql_like return summary, insights_text, chart_bytes, table_preview_html, sql_like # ---------- Gradio app ---------- def build_ui(): title = "No-Code Conversational Data Tool — Tokyo Olympic Q&A" description = "Ask plain-language questions about the dataset (e.g., 'Which country is topping the list?')" with gr.Blocks() as demo: gr.Markdown(f"# {title}") gr.Markdown(description) with gr.Row(): question = gr.Textbox(lines=2, placeholder="e.g., Which country is topping the list?", label="Ask a question") ask_btn = gr.Button("Ask") with gr.Row(): summary_out = gr.Textbox(label="Summary", interactive=False) insights_out = gr.Textbox(label="Key Insights", interactive=False) with gr.Row(): chart_out = gr.Image(label="Chart", type="pil") with gr.Row(): table_out = gr.HTML(label="Result Table (first 20 rows)") with gr.Row(): sql_out = gr.Textbox(label="Generated SQL-like statement", interactive=False) def on_ask(q): s, ins, chart_bytes, table_html, sql_like = answer_question(q) # chart_bytes might be bytes -> convert to pil image via BytesIO inside gr.Image return s, ins, chart_bytes, table_html, sql_like ask_btn.click(fn=on_ask, inputs=[question], 
outputs=[summary_out, insights_out, chart_out, table_out, sql_out]) return demo if __name__ == "__main__": demo = build_ui() demo.launch(server_name="0.0.0.0", server_port=7860, share=False)

#3
# by NaturalRojo - opened

"""
Gradio Q&A app for your Tokyo-Olympic-Sample-Data.xlsx

Features:

  • Loads the Excel file (path configured below)
  • Parses plain-language questions into simple intents (comparison, ranking, distribution, trend)
  • Runs Pandas aggregation queries against the table
  • Produces summary, 3 insights, a chart, the result table, and a generated SQL-like string
  • Minimal dependencies: gradio, pandas, matplotlib, openpyxl
    """

import os
import re
import math
import pandas as pd
import matplotlib.pyplot as plt
import io
import gradio as gr
from typing import Tuple, Dict, Any

# ---------- CONFIG ----------
EXCEL_PATH = "/mnt/data/Tokyo-Olympic-Sample-Data.xlsx"  # change this if your file is in a different location
SHEET_NAME = None  # None loads first sheet
MAX_CATEGORIES_FOR_PIE = 6  # cap on slices if a pie chart is drawn
# ----------------------------

# Load dataset once
def load_dataset(path: str, sheet_name=None) -> pd.DataFrame:
    """Load the Excel workbook at *path* into a DataFrame.

    Raises FileNotFoundError when the file does not exist.  Column
    headers are stripped of surrounding whitespace so later lookups
    are stable.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"Excel file not found at: {path}")
    df = pd.read_excel(path, sheet_name=sheet_name)
    # Normalize column names; str() first because Excel headers are not
    # guaranteed to be strings (a numeric header would crash .strip()).
    df.columns = [str(c).strip() for c in df.columns]
    return df

# Very small whitelist to prevent arbitrary column injection
ALLOWED_COLUMNS = {"Athlete", "Country", "Sport", "Event", "Medal",
                   "Gender", "Age", "Height", "Weight", "Year"}

Simple NLU parser: classify intent & extract entities

def parse_question(question: str) -> Dict[str, Any]:
q = question.strip().lower()
intent = None
entities = {"group_by": None, "metric": None, "filters": {}, "limit": None}

# Intent detection: ranking/comparison/distribution/trend/summary
if any(w in q for w in ["top", "topping", "highest", "most", "lead", "leading"]):
    intent = "ranking"
elif any(w in q for w in ["over years", "over the years", "trend", "trend of", "year-wise", "by year"]):
    intent = "trend"
elif any(w in q for w in ["by gender", "gender", "distribution", "share"]):
    intent = "distribution"
elif any(w in q for w in ["summary", "overview", "insight"]):
    intent = "summary"
elif any(w in q for w in ["which country", "which country is topping", "which country is on top", "country topping", "country top"]):
    intent = "ranking"
else:
    # fallback heuristics
    if "country" in q:
        intent = "comparison"
    elif "medal" in q:
        intent = "comparison"
    else:
        intent = "summary"

# Extract group_by
if "country" in q:
    entities["group_by"] = "Country"
elif "sport" in q:
    entities["group_by"] = "Sport"
elif "athlete" in q:
    entities["group_by"] = "Athlete"
elif "gender" in q:
    entities["group_by"] = "Gender"
elif "year" in q:
    entities["group_by"] = "Year"

# Metric detection
if "gold" in q:
    entities["metric"] = ("Medal", "Gold")
elif "silver" in q:
    entities["metric"] = ("Medal", "Silver")
elif "bronze" in q:
    entities["metric"] = ("Medal", "Bronze")
else:
    # default metric = count of Medal (number of medals)
    entities["metric"] = ("Medal", None)

# Limit extraction (e.g., top 5)
m = re.search(r"top\s+(\d+)", q)
if m:
    entities["limit"] = int(m.group(1))
elif intent == "ranking":
    entities["limit"] = 1 if "topping" in q or "top of the list" in q or "which country is topping" in q else 5

# Year filter extraction
m2 = re.search(r"(\b19|20)\d{2}\b", q)  # crude year extractor (e.g., 2016 or 2020)
if m2:
    entities["filters"]["Year"] = int(m2.group(0))

# Country filter extraction (e.g., "for USA" or "for India")
# Simple approach: if question contains "for <word>" or "in <word>" (not robust for multiword countries)
m3 = re.search(r"(for|in)\s+([A-Za-z' ]{2,30})", q)
if m3:
    candidate = m3.group(2).strip()
    # quick normalize common short country tokens (optional)
    entities["filters"]["country_hint"] = candidate

return {"intent": intent, "entities": entities, "raw": question}

# Build and run pandas query based on parsed intent
def run_query(df: pd.DataFrame, parsed: Dict[str, Any]) -> Tuple[pd.DataFrame, str]:
    """Execute the aggregation implied by *parsed* against *df*.

    Returns (result DataFrame, SQL-like display string).

    Raises:
        ValueError: when grouping is requested on a non-whitelisted or
            missing column, or a trend is requested without a Year column.
    """
    intent = parsed["intent"]
    ent = parsed["entities"]
    group_by = ent["group_by"]
    metric_col, metric_value = ent["metric"]
    # Build the mask on df's own index so boolean filtering also works
    # when the index is not the default RangeIndex.
    where_mask = pd.Series(True, index=df.index)

    # Apply filters
    for key, value in ent["filters"].items():
        if key == "country_hint":
            # case-insensitive literal substring match against Country
            # (regex=False: the hint is user text, not a pattern)
            where_mask &= df["Country"].astype(str).str.lower().str.contains(
                str(value).lower(), regex=False)
        elif key in df.columns:
            where_mask &= (df[key] == value)

    df_f = df[where_mask].copy()

    # Ranking / comparison / distribution (group by + count)
    if intent in ("ranking", "comparison", "distribution"):
        if group_by is None:
            # default to Country if question mentions country, else Sport
            group_by = "Country" if "country" in parsed["raw"].lower() else "Sport"
        if group_by not in ALLOWED_COLUMNS:
            raise ValueError(f"Cannot group by '{group_by}'. Allowed columns: {sorted(ALLOWED_COLUMNS)}")
        if group_by not in df.columns:
            raise ValueError(f"Dataset has no '{group_by}' column.")

        # If looking for a specific medal colour, restrict rows first.
        if metric_value:
            df_metric = df_f[df_f[metric_col].astype(str).str.lower() == str(metric_value).lower()]
        else:
            df_metric = df_f

        result = (df_metric.groupby(group_by, dropna=False)[metric_col]
                  .count()
                  .reset_index()
                  .rename(columns={metric_col: "metric_count"}))
        result = result.sort_values("metric_count", ascending=False).reset_index(drop=True)
        if ent.get("limit"):
            result = result.head(ent["limit"])
        sql_like = (f"SELECT {group_by}, COUNT({metric_col}) AS metric_count FROM dataset "
                    f"WHERE <filters> GROUP BY {group_by} ORDER BY metric_count DESC "
                    f"LIMIT {ent.get('limit')};")
        return result, sql_like

    # Trend: medal counts per year
    elif intent == "trend":
        if "Year" not in df.columns:
            raise ValueError("Dataset has no Year column for trend queries.")
        result = (df_f.groupby("Year", dropna=False)["Medal"]
                  .count()
                  .reset_index()
                  .rename(columns={"Medal": "metric_count"}))
        result = result.sort_values("Year").reset_index(drop=True)
        sql_like = ("SELECT Year, COUNT(Medal) AS metric_count FROM dataset "
                    "WHERE <filters> GROUP BY Year ORDER BY Year ASC;")
        return result, sql_like

    # Summary: a few top-level aggregates (guard each optional column)
    elif intent == "summary":
        total_medals = len(df_f)
        distinct_countries = df_f["Country"].nunique() if "Country" in df_f.columns else 0
        distinct_sports = df_f["Sport"].nunique() if "Sport" in df_f.columns else 0
        years = sorted(df_f["Year"].dropna().unique().tolist()) if "Year" in df_f.columns else []
        result = pd.DataFrame([
            {"metric": "total_medals", "value": total_medals},
            {"metric": "distinct_countries", "value": distinct_countries},
            {"metric": "distinct_sports", "value": distinct_sports},
            {"metric": "year_range",
             "value": f"{years[0] if years else 'N/A'} - {years[-1] if years else 'N/A'}"},
        ])
        return result, "-- Summary aggregates computed from dataset"

    # fallback: return empty frame
    return pd.DataFrame(), "-- no-op"

# Generate text summary and 3 insights from the result table
def generate_insights(parsed: Dict[str, Any], result_df: pd.DataFrame) -> Tuple[str, list]:
    """Produce a (summary sentence, list of insight bullets) pair.

    The bullet list is capped at four entries; an empty *result_df*
    yields a "no results" message.
    """
    intent = parsed["intent"]
    insights = []
    summary = ""

    if result_df.empty:
        return "No results found for the given question.", ["No data to analyze."]

    if intent in ("ranking", "comparison", "distribution"):
        top = result_df.iloc[0]
        label_col = result_df.columns[0]
        value_col = result_df.columns[1]
        summary = f"{top[label_col]} is topping the list with {int(top[value_col])}."
        # Top three entries as a single bullet.
        bullets = [f"{row[label_col]}: {int(row[value_col])}"
                   for _, row in result_df.head(3).iterrows()]
        insights.append("Top performers: " + "; ".join(bullets))
        # Dominance check against second place.
        if len(result_df) > 1:
            second_val = int(result_df.iloc[1][value_col])
            if second_val == 0:
                insights.append("Second place has 0 count — check data quality or filters.")
            else:
                # second_val is non-zero here, so the division is safe.
                ratio = int(top[value_col]) / second_val
                if ratio >= 1.5:
                    insights.append(f"{top[label_col]} leads by a large margin (≥ 1.5× second place).")
                else:
                    insights.append(f"{top[label_col]} leads but margin is modest ({ratio:.2f}× second place).")
        else:
            insights.append("Only one row in the result set.")
        # Add a helpful next action.
        insights.append("Try: 'Show medal breakdown by sport for " + str(top[label_col]) + "'.")
    elif intent == "trend":
        # Percentage change from the first to the last year.
        value_col = result_df.columns[1]
        first = result_df.iloc[0]
        last = result_df.iloc[-1]
        change = last[value_col] - first[value_col]
        # Guard against a zero count in the first year.
        pct = (change / first[value_col]) * 100 if first[value_col] else math.inf
        summary = (f"Medal count changed from {int(first[value_col])} in {first['Year']} "
                   f"to {int(last[value_col])} in {last['Year']} ({pct:.1f}% change).")
        insights.append("Largest year-over-year changes can be highlighted by checking deltas between consecutive years.")
        insights.append("Consider splitting trend by Gender or Country for deeper insights.")
        insights.append("Try: 'Show medals by gender over years'.")
    elif intent == "summary":
        # Short textual summary read back out of the aggregate table.
        total_medals = int(result_df[result_df['metric'] == "total_medals"]['value'].values[0])
        distinct_countries = int(result_df[result_df['metric'] == "distinct_countries"]['value'].values[0])
        years = result_df[result_df['metric'] == "year_range"]['value'].values[0]
        summary = f"The dataset contains {total_medals} medals across {distinct_countries} countries ({years})."
        insights.append("You can ask for top countries, top sports, or trends over years.")
        insights.append("Try: 'Which country won the most medals?' or 'Medal count over years'.")
        insights.append("Filter by Year to inspect specific editions.")
    else:
        summary = "Here's the top result."
        insights.append("Use clearer language like 'Top 5 athletes by gold medals in 2020' for better answers.")

    # Ensure we return at most four concise bullets.
    return summary, insights[:4]

# Chart generator: returns PNG bytes (matplotlib)
def plot_result(result_df: pd.DataFrame, parsed: Dict[str, Any]) -> bytes:
    """Render *result_df* as a PNG and return the raw bytes.

    Chart type follows the intent: bar / horizontal bar for
    ranking-style results, a line chart for trends, and a rendered
    table for summaries.  Plot errors are drawn into the image rather
    than raised, so the caller always gets displayable bytes.
    """
    intent = parsed["intent"]
    buf = io.BytesIO()
    try:
        if intent in ("ranking", "comparison", "distribution"):
            if result_df.shape[1] >= 2:
                x = result_df.iloc[:, 0].astype(str)
                y = result_df.iloc[:, 1].astype(int)
                if len(x) > 8:
                    # Many categories read better as a horizontal bar chart.
                    plt.figure(figsize=(8, max(4, 0.5 * len(x))))
                    plt.barh(x[::-1], y[::-1])
                    plt.xlabel("Count")
                    plt.title("Counts by " + result_df.columns[0])
                else:
                    plt.figure(figsize=(8, 4))
                    plt.bar(x, y)
                    plt.ylabel("Count")
                    plt.title("Counts by " + result_df.columns[0])
                    plt.xticks(rotation=45, ha="right")
        elif intent == "trend":
            plt.figure(figsize=(8, 4))
            plt.plot(result_df["Year"], result_df.iloc[:, 1].astype(int), marker="o")
            plt.xlabel("Year")
            plt.ylabel("Count")
            plt.title("Trend over Years")
            plt.grid(True)
        elif intent == "summary":
            # Show the small summary table as an image.
            fig, ax = plt.subplots(figsize=(6, 1.5))
            ax.axis('off')
            table = plt.table(cellText=result_df.values, colLabels=result_df.columns, loc='center')
            table.auto_set_font_size(False)
            table.set_fontsize(10)
            table.scale(1, 1.5)
        else:
            plt.text(0.1, 0.5, "No chart available for this query.", fontsize=12)
        plt.tight_layout()
        plt.savefig(buf, format="png")
    except Exception as e:
        plt.clf()
        plt.text(0.1, 0.5, f"Error plotting: {str(e)}")
        plt.savefig(buf, format="png")
    finally:
        # Close all figures so repeated requests do not leak memory:
        # the original created a new figure per call and never closed it.
        plt.close("all")
    buf.seek(0)
    return buf.read()

# Main function tying everything together
def answer_question(question: str):
    """Answer *question* end-to-end.

    Returns a 5-tuple for the Gradio UI: (summary text, insights text,
    chart PNG bytes, result-table HTML, SQL-like string).  On failure
    the error message is returned in the summary slot and the remaining
    slots are empty/None.
    """
    # Load data on every call; a module-level cache could be added later.
    try:
        df = load_dataset(EXCEL_PATH, SHEET_NAME)
    except Exception as e:
        return f"Error loading dataset: {str(e)}", "", None, None, ""

    parsed = parse_question(question)
    try:
        result_df, sql_like = run_query(df, parsed)
    except Exception as e:
        return f"Error while running query: {str(e)}", "", None, None, ""

    summary, insights = generate_insights(parsed, result_df)

    # Table preview: first 20 rows rendered as HTML for gr.HTML.
    table_preview_html = result_df.head(20).to_html(index=False)

    # Chart as raw PNG bytes (decoded by the UI layer before display).
    chart_bytes = plot_result(result_df, parsed)

    # Format insights as newline-separated bullet text.
    insights_text = "\n".join(f"- {s}" for s in insights)

    return summary, insights_text, chart_bytes, table_preview_html, sql_like

---------- Gradio app ----------

def build_ui():
    """Build and return the Gradio Blocks UI for the Q&A tool."""
    title = "No-Code Conversational Data Tool — Tokyo Olympic Q&A"
    description = "Ask plain-language questions about the dataset (e.g., 'Which country is topping the list?')"

    with gr.Blocks() as demo:
        gr.Markdown(f"# {title}")
        gr.Markdown(description)
        with gr.Row():
            question = gr.Textbox(lines=2, placeholder="e.g., Which country is topping the list?", label="Ask a question")
            ask_btn = gr.Button("Ask")
        with gr.Row():
            summary_out = gr.Textbox(label="Summary", interactive=False)
            insights_out = gr.Textbox(label="Key Insights", interactive=False)
        with gr.Row():
            # BUGFIX: answer_question returns raw PNG bytes, which gr.Image
            # cannot display (it accepts PIL images, numpy arrays, or file
            # paths). on_ask persists the bytes to a temp file and returns
            # the path, so the component is declared as "filepath".
            chart_out = gr.Image(label="Chart", type="filepath")
        with gr.Row():
            table_out = gr.HTML(label="Result Table (first 20 rows)")
        with gr.Row():
            sql_out = gr.Textbox(label="Generated SQL-like statement", interactive=False)

        def on_ask(q):
            s, ins, chart_bytes, table_html, sql_like = answer_question(q)
            # Convert PNG bytes to a temp-file path for gr.Image; error
            # paths pass None through unchanged (Image renders empty).
            chart_path = None
            if chart_bytes:
                import tempfile
                with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
                    tmp.write(chart_bytes)
                    chart_path = tmp.name
            return s, ins, chart_path, table_html, sql_like

        ask_btn.click(fn=on_ask, inputs=[question],
                      outputs=[summary_out, insights_out, chart_out, table_out, sql_out])
    return demo

# Script entry point: build the UI and serve it on all interfaces, port 7860.
# BUGFIX: was `if name == "main":` — the dunder underscores were lost; that
# raises NameError at import time.
if __name__ == "__main__":
    demo = build_ui()
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)

"""
Gradio Q&A app for your Tokyo-Olympic-Sample-Data.xlsx

Features:
- Loads the Excel file (path configured below)
- Parses plain-language questions into simple intents (comparison, ranking, distribution, trend)
- Runs Pandas aggregation queries against the table
- Produces summary, 3 insights, a chart, the result table, and a generated SQL-like string
- Minimal dependencies: gradio, pandas, matplotlib, openpyxl
"""
import os
import re
import io
import math
import tempfile
from typing import Tuple, Dict, Any

import pandas as pd
import matplotlib.pyplot as plt
import gradio as gr

# ---------- CONFIG ----------
EXCEL_PATH = "/mnt/data/Tokyo-Olympic-Sample-Data.xlsx"  # change this if your file is in a different location
SHEET_NAME = None  # None loads first sheet
MAX_CATEGORIES_FOR_PIE = 6
# ----------------------------


def load_dataset(path: str, sheet_name=None) -> pd.DataFrame:
    """Read the Excel workbook into a DataFrame with whitespace-stripped column names.

    Raises FileNotFoundError when the configured path does not exist.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"Excel file not found at: {path}")
    df = pd.read_excel(path, sheet_name=sheet_name)
    # Normalize column names (strip)
    df.columns = [c.strip() for c in df.columns]
    return df


# Very small whitelist to prevent arbitrary column injection
ALLOWED_COLUMNS = {"Athlete", "Country", "Sport", "Event", "Medal", "Gender", "Age", "Height", "Weight", "Year"}


def parse_question(question: str) -> Dict[str, Any]:
    """Classify a plain-language question into an intent and extract entities.

    Returns {"intent": str, "entities": {group_by, metric, filters, limit}, "raw": question}.
    Keyword matching is intentionally crude — this is a demo NLU layer, not a parser.
    """
    q = question.strip().lower()
    intent = None
    entities = {"group_by": None, "metric": None, "filters": {}, "limit": None}

    # Intent detection: ranking/comparison/distribution/trend/summary
    if any(w in q for w in ["top", "topping", "highest", "most", "lead", "leading"]):
        intent = "ranking"
    elif any(w in q for w in ["over years", "over the years", "trend", "trend of", "year-wise", "by year"]):
        intent = "trend"
    elif any(w in q for w in ["by gender", "gender", "distribution", "share"]):
        intent = "distribution"
    elif any(w in q for w in ["summary", "overview", "insight"]):
        intent = "summary"
    elif any(w in q for w in ["which country", "which country is topping", "which country is on top", "country topping", "country top"]):
        intent = "ranking"
    else:
        # fallback heuristics
        if "country" in q:
            intent = "comparison"
        elif "medal" in q:
            intent = "comparison"
        else:
            intent = "summary"

    # Extract group_by (first keyword wins)
    if "country" in q:
        entities["group_by"] = "Country"
    elif "sport" in q:
        entities["group_by"] = "Sport"
    elif "athlete" in q:
        entities["group_by"] = "Athlete"
    elif "gender" in q:
        entities["group_by"] = "Gender"
    elif "year" in q:
        entities["group_by"] = "Year"

    # Metric detection
    if "gold" in q:
        entities["metric"] = ("Medal", "Gold")
    elif "silver" in q:
        entities["metric"] = ("Medal", "Silver")
    elif "bronze" in q:
        entities["metric"] = ("Medal", "Bronze")
    else:
        # default metric = count of Medal (number of medals)
        entities["metric"] = ("Medal", None)

    # Limit extraction (e.g., top 5)
    m = re.search(r"top\s+(\d+)", q)
    if m:
        entities["limit"] = int(m.group(1))
    elif intent == "ranking":
        entities["limit"] = 1 if "topping" in q or "top of the list" in q or "which country is topping" in q else 5

    # Year filter extraction
    m2 = re.search(r"(\b19|20)\d{2}\b", q)  # crude year extractor (e.g., 2016 or 2020)
    if m2:
        entities["filters"]["Year"] = int(m2.group(0))

    # Country filter extraction (e.g., "for USA" or "for India")
    # Simple approach: if question contains "for <word>" or "in <word>" (not robust for multiword countries)
    m3 = re.search(r"(for|in)\s+([A-Za-z' ]{2,30})", q)
    if m3:
        candidate = m3.group(2).strip()
        # quick normalize common short country tokens (optional)
        entities["filters"]["country_hint"] = candidate

    return {"intent": intent, "entities": entities, "raw": question}


def run_query(df: pd.DataFrame, parsed: Dict[str, Any]) -> Tuple[pd.DataFrame, str]:
    """Run the pandas aggregation matching the parsed intent.

    Returns (result DataFrame, SQL-like display string).
    Raises ValueError for a disallowed group-by column or a trend query
    against a dataset with no Year column.
    """
    intent = parsed["intent"]
    ent = parsed["entities"]
    group_by = ent["group_by"]
    metric_col, metric_value = ent["metric"]

    where_mask = pd.Series([True] * len(df))
    # Apply filters
    for k, v in ent["filters"].items():
        if k == "country_hint":
            # try to match to Country (case-insensitive substring)
            where_mask &= df["Country"].astype(str).str.lower().str.contains(str(v).lower())
        else:
            if k in df.columns:
                where_mask &= (df[k] == v)
    df_f = df[where_mask].copy()

    # Default SQL-like string (for display)
    sql_like = "-- Generated SQL-like query will appear here\n"

    # Ranking / comparison (group by + count)
    if intent in ("ranking", "comparison", "distribution"):
        if group_by is None:
            # default to Country if question mentions country or default
            group_by = "Country" if "country" in parsed["raw"].lower() else "Sport"
        if group_by not in ALLOWED_COLUMNS:
            raise ValueError(f"Cannot group by '{group_by}'. Allowed columns: {sorted(ALLOWED_COLUMNS)}")
        # If looking for a specific medal color
        if metric_value:
            df_metric = df_f[df_f[metric_col].astype(str).str.lower() == str(metric_value).lower()]
        else:
            df_metric = df_f
        result = df_metric.groupby(group_by, dropna=False)[metric_col].count().reset_index().rename(columns={metric_col: "metric_count"})
        result = result.sort_values("metric_count", ascending=False).reset_index(drop=True)
        if ent.get("limit"):
            result = result.head(ent["limit"])
        sql_like = f"SELECT {group_by}, COUNT({metric_col}) AS metric_count FROM dataset WHERE <filters> GROUP BY {group_by} ORDER BY metric_count DESC LIMIT {ent.get('limit')};"
        return result, sql_like

    # Trend: medal count grouped by Year
    elif intent == "trend":
        if "Year" not in df.columns:
            raise ValueError("Dataset has no Year column for trend queries.")
        result = df_f.groupby("Year", dropna=False)["Medal"].count().reset_index().rename(columns={"Medal": "metric_count"})
        result = result.sort_values("Year").reset_index(drop=True)
        sql_like = "SELECT Year, COUNT(Medal) AS metric_count FROM dataset WHERE <filters> GROUP BY Year ORDER BY Year ASC;"
        return result, sql_like

    # Summary (top-level aggregates)
    elif intent == "summary":
        # Small summary table: total medals, distinct countries, distinct sports, year range
        total_medals = len(df_f)
        distinct_countries = df_f["Country"].nunique()
        distinct_sports = df_f["Sport"].nunique()
        years = sorted(df_f["Year"].dropna().unique().tolist())
        result = pd.DataFrame([
            {"metric": "total_medals", "value": total_medals},
            {"metric": "distinct_countries", "value": distinct_countries},
            {"metric": "distinct_sports", "value": distinct_sports},
            {"metric": "year_range", "value": f"{years[0] if years else 'N/A'} - {years[-1] if years else 'N/A'}"},
        ])
        sql_like = "-- Summary aggregates computed from dataset"
        return result, sql_like

    else:
        # fallback: return empty frame
        return pd.DataFrame(), "-- no-op"


def generate_insights(parsed: Dict[str, Any], result_df: pd.DataFrame) -> Tuple[str, list]:
    """Produce a one-line summary plus up to four bullet insights from the result table."""
    intent = parsed["intent"]
    ent = parsed["entities"]
    raw = parsed["raw"]
    insights = []
    summary = ""

    if result_df.empty:
        return "No results found for the given question.", ["No data to analyze."]

    if intent in ("ranking", "comparison", "distribution"):
        top = result_df.iloc[0]
        label_col = result_df.columns[0]
        value_col = result_df.columns[1]
        summary = f"{top[label_col]} is topping the list with {int(top[value_col])}."
        # Top 3
        top3 = result_df.head(3)
        bullets = []
        for i, row in top3.iterrows():
            bullets.append(f"{row[label_col]}: {int(row[value_col])}")
        insights.append("Top performers: " + "; ".join(bullets))
        # Dominance check: how far ahead is first place?
        if len(result_df) > 1:
            second_val = int(result_df.iloc[1][value_col])
            if second_val == 0:
                insights.append("Second place has 0 count — check data quality or filters.")
            else:
                ratio = int(top[value_col]) / (second_val if second_val else 1)
                if ratio >= 1.5:
                    insights.append(f"{top[label_col]} leads by a large margin (≥ 1.5× second place).")
                else:
                    insights.append(f"{top[label_col]} leads but margin is modest ({ratio:.2f}× second place).")
        else:
            insights.append("Only one row in the result set.")
        # Add a helpful next action
        insights.append("Try: 'Show medal breakdown by sport for " + str(top[label_col]) + "'.")

    elif intent == "trend":
        # compute pct change from first to last
        first = result_df.iloc[0]
        last = result_df.iloc[-1]
        change = last[result_df.columns[1]] - first[result_df.columns[1]]
        pct = (change / first[result_df.columns[1]]) * 100 if first[result_df.columns[1]] else math.inf
        summary = f"Medal count changed from {int(first[result_df.columns[1]])} in {first['Year']} to {int(last[result_df.columns[1]])} in {last['Year']} ({pct:.1f}% change)."
        insights.append("Largest year-over-year changes can be highlighted by checking deltas between consecutive years.")
        insights.append("Consider splitting trend by Gender or Country for deeper insights.")
        insights.append("Try: 'Show medals by gender over years'.")

    elif intent == "summary":
        # short textual summary from table
        total_medals = int(result_df[result_df['metric'] == "total_medals"]['value'].values[0])
        distinct_countries = int(result_df[result_df['metric'] == "distinct_countries"]['value'].values[0])
        years = result_df[result_df['metric'] == "year_range"]['value'].values[0]
        summary = f"The dataset contains {total_medals} medals across {distinct_countries} countries ({years})."
        insights.append("You can ask for top countries, top sports, or trends over years.")
        insights.append("Try: 'Which country won the most medals?' or 'Medal count over years'.")
        insights.append("Filter by Year to inspect specific editions.")

    else:
        summary = "Here's the top result."
        insights.append("Use clearer language like 'Top 5 athletes by gold medals in 2020' for better answers.")

    # Ensure we return up to 3–4 concise bullets
    insights = insights[:4]
    return summary, insights


def plot_result(result_df: pd.DataFrame, parsed: Dict[str, Any]) -> bytes:
    """Render a matplotlib chart for the result and return it as PNG bytes.

    Plot type depends on intent: bar/horizontal-bar for rankings, a line
    for trends, a rendered table for summaries, and a text placeholder
    otherwise. Plotting errors are caught and rendered as a text image.
    """
    intent = parsed["intent"]
    buf = io.BytesIO()
    plt.clf()
    try:
        if intent in ("ranking", "comparison", "distribution"):
            if result_df.shape[1] >= 2:
                x = result_df.iloc[:, 0].astype(str)
                y = result_df.iloc[:, 1].astype(int)
                # If many categories, use horizontal bar
                if len(x) > 8:
                    plt.figure(figsize=(8, max(4, 0.5 * len(x))))
                    plt.barh(x[::-1], y[::-1])
                    plt.xlabel("Count")
                    plt.title("Counts by " + result_df.columns[0])
                else:
                    plt.figure(figsize=(8, 4))
                    plt.bar(x, y)
                    plt.ylabel("Count")
                    plt.title("Counts by " + result_df.columns[0])
                    plt.xticks(rotation=45, ha="right")
        elif intent == "trend":
            x = result_df["Year"]
            y = result_df.iloc[:, 1].astype(int)
            plt.figure(figsize=(8, 4))
            plt.plot(x, y, marker="o")
            plt.xlabel("Year")
            plt.ylabel("Count")
            plt.title("Trend over Years")
            plt.grid(True)
        elif intent == "summary":
            # show small table as image
            fig, ax = plt.subplots(figsize=(6, 1.5))
            ax.axis('off')
            table = plt.table(cellText=result_df.values, colLabels=result_df.columns, loc='center')
            table.auto_set_font_size(False)
            table.set_fontsize(10)
            table.scale(1, 1.5)
        else:
            plt.text(0.1, 0.5, "No chart available for this query.", fontsize=12)
        plt.tight_layout()
        plt.savefig(buf, format="png")
    except Exception as e:
        plt.clf()
        plt.text(0.1, 0.5, f"Error plotting: {str(e)}")
        plt.savefig(buf, format="png")
    buf.seek(0)
    return buf.read()


# Main function tying everything together
def answer_question(question: str):
    """Answer a plain-language question about the dataset.

    Returns a 5-tuple for the Gradio callback:
    (summary, insights text, chart PNG bytes or None,
     result-table HTML or None, SQL-like string).
    """
    # Load data on every call; a module-level cache could be added later.
    try:
        df = load_dataset(EXCEL_PATH, SHEET_NAME)
    except Exception as e:
        return f"Error loading dataset: {str(e)}", "", None, None, ""

    parsed = parse_question(question)
    try:
        result_df, sql_like = run_query(df, parsed)
    except Exception as e:
        return f"Error while running query: {str(e)}", "", None, None, ""

    summary, insights = generate_insights(parsed, result_df)

    # prepare table preview (convert small df to html for Gradio table display)
    table_preview_html = result_df.head(20).to_html(index=False)

    # chart bytes
    chart_bytes = plot_result(result_df, parsed)

    # Format insights as newline-separated text
    insights_text = "\n".join(f"- {s}" for s in insights)

    return summary, insights_text, chart_bytes, table_preview_html, sql_like


# ---------- Gradio app ----------
def build_ui():
    """Build and return the Gradio Blocks UI."""
    title = "No-Code Conversational Data Tool — Tokyo Olympic Q&A"
    description = "Ask plain-language questions about the dataset (e.g., 'Which country is topping the list?')"

    with gr.Blocks() as demo:
        gr.Markdown(f"# {title}")
        gr.Markdown(description)
        with gr.Row():
            question = gr.Textbox(lines=2, placeholder="e.g., Which country is topping the list?", label="Ask a question")
            ask_btn = gr.Button("Ask")
        with gr.Row():
            summary_out = gr.Textbox(label="Summary", interactive=False)
            insights_out = gr.Textbox(label="Key Insights", interactive=False)
        with gr.Row():
            # BUGFIX: answer_question returns raw PNG bytes, which gr.Image
            # cannot render; on_ask writes them to a temp file and passes the
            # path, so the component is declared as "filepath".
            chart_out = gr.Image(label="Chart", type="filepath")
        with gr.Row():
            table_out = gr.HTML(label="Result Table (first 20 rows)")
        with gr.Row():
            sql_out = gr.Textbox(label="Generated SQL-like statement", interactive=False)

        def on_ask(q):
            s, ins, chart_bytes, table_html, sql_like = answer_question(q)
            # Convert PNG bytes to a temp-file path for gr.Image; error paths
            # pass None through unchanged.
            chart_path = None
            if chart_bytes:
                with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
                    tmp.write(chart_bytes)
                    chart_path = tmp.name
            return s, ins, chart_path, table_html, sql_like

        ask_btn.click(fn=on_ask, inputs=[question],
                      outputs=[summary_out, insights_out, chart_out, table_out, sql_out])
    return demo


if __name__ == "__main__":
    demo = build_ui()
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
Ready to merge
This branch is ready to get merged automatically.

Sign up or log in to comment