Rojo Joseph
NaturalRojo
AI & ML interests
None yet
Organizations
None yet
Upload Tokyo-Olympic-Sample-Data.xlsx
#1 opened 3 months ago
by
NaturalRojo
Update app.py
#4 opened 3 months ago
by
NaturalRojo
""" Gradio Q&A app for your Tokyo-Olympic-Sample-Data.xlsx Features: - Loads the Excel file (path configured below) - Parses plain-language questions into simple intents (comparison, ranking, distribution, trend) - Runs Pandas aggregation queries against the table - Produces summary, 3 insights, a chart, the result table, and a generated SQL-like string - Minimal dependencies: gradio, pandas, matplotlib, openpyxl """ import os import re import math import pandas as pd import matplotlib.pyplot as plt import io import gradio as gr from typing import Tuple, Dict, Any # ---------- CONFIG ---------- EXCEL_PATH = "/mnt/data/Tokyo-Olympic-Sample-Data.xlsx" # change this if your file is in a different location SHEET_NAME = None # None loads first sheet MAX_CATEGORIES_FOR_PIE = 6 # ---------------------------- # Load dataset once def load_dataset(path: str, sheet_name=None) -> pd.DataFrame: if not os.path.exists(path): raise FileNotFoundError(f"Excel file not found at: {path}") df = pd.read_excel(path, sheet_name=sheet_name) # Normalize column names (strip) df.columns = [c.strip() for c in df.columns] return df # Very small whitelist to prevent arbitrary column injection ALLOWED_COLUMNS = {"Athlete","Country","Sport","Event","Medal","Gender","Age","Height","Weight","Year"} # Simple NLU parser: classify intent & extract entities def parse_question(question: str) -> Dict[str, Any]: q = question.strip().lower() intent = None entities = {"group_by": None, "metric": None, "filters": {}, "limit": None} # Intent detection: ranking/comparison/distribution/trend/summary if any(w in q for w in ["top", "topping", "highest", "most", "lead", "leading"]): intent = "ranking" elif any(w in q for w in ["over years", "over the years", "trend", "trend of", "year-wise", "by year"]): intent = "trend" elif any(w in q for w in ["by gender", "gender", "distribution", "share"]): intent = "distribution" elif any(w in q for w in ["summary", "overview", "insight"]): intent = "summary" elif any(w in q for w in ["which country", "which country is topping", "which country is on top", "country topping", "country top"]): intent = "ranking" else: # fallback heuristics if "country" in q: intent = "comparison" elif "medal" in q: intent = "comparison" else: intent = "summary" # Extract group_by if "country" in q: entities["group_by"] = "Country" elif "sport" in q: entities["group_by"] = "Sport" elif "athlete" in q: entities["group_by"] = "Athlete" elif "gender" in q: entities["group_by"] = "Gender" elif "year" in q: entities["group_by"] = "Year" # Metric detection if "gold" in q: entities["metric"] = ("Medal", "Gold") elif "silver" in q: entities["metric"] = ("Medal", "Silver") elif "bronze" in q: entities["metric"] = ("Medal", "Bronze") else: # default metric = count of Medal (number of medals) entities["metric"] = ("Medal", None) # Limit extraction (e.g., top 5) m = re.search(r"top\s+(\d+)", q) if m: entities["limit"] = int(m.group(1)) elif intent == "ranking": entities["limit"] = 1 if "topping" in q or "top of the list" in q or "which country is topping" in q else 5 # Year filter extraction m2 = re.search(r"(\b19|20)\d{2}\b", q) # crude year extractor (e.g., 2016 or 2020) if m2: entities["filters"]["Year"] = int(m2.group(0)) # Country filter extraction (e.g., "for USA" or "for India") # Simple approach: if question contains "for <word>" or "in <word>" (not robust for multiword countries) m3 = re.search(r"(for|in)\s+([A-Za-z' ]{2,30})", q) if m3: candidate = m3.group(2).strip() # quick normalize common short country tokens (optional) entities["filters"]["country_hint"] = candidate return {"intent": intent, "entities": entities, "raw": question} # Build and run pandas query based on parsed intent def run_query(df: pd.DataFrame, parsed: Dict[str, Any]) -> Tuple[pd.DataFrame, str]: intent = parsed["intent"] ent = parsed["entities"] group_by = ent["group_by"] metric_col, metric_value = ent["metric"] where_mask = pd.Series([True]*len(df)) # Apply filters for k,v in ent["filters"].items(): if k == "country_hint": # try to match to Country (case-insensitive substring) where_mask &= df["Country"].astype(str).str.lower().str.contains(str(v).lower()) else: if k in df.columns: where_mask &= (df[k] == v) df_f = df[where_mask].copy() # Default SQL-like string (for display) sql_like = "-- Generated SQL-like query will appear here\n" # Ranking / comparison (group by + count) if intent in ("ranking", "comparison", "distribution"): if group_by is None: # default to Country if question mentions country or default group_by = "Country" if "country" in parsed["raw"].lower() else "Sport" if group_by not in ALLOWED_COLUMNS: raise ValueError(f"Cannot group by '{group_by}'. Allowed columns: {sorted(ALLOWED_COLUMNS)}") # If looking for a specific medal color if metric_value: df_metric = df_f[df_f[metric_col].astype(str).str.lower() == str(metric_value).lower()] else: df_metric = df_f result = df_metric.groupby(group_by, dropna=False)[metric_col].count().reset_index().rename(columns={metric_col: "metric_count"}) result = result.sort_values("metric_count", ascending=False).reset_index(drop=True) if ent.get("limit"): result = result.head(ent["limit"]) sql_like = f"SELECT {group_by}, COUNT({metric_col}) AS metric_count FROM dataset WHERE <filters> GROUP BY {group_by} ORDER BY metric_count DESC LIMIT {ent.get('limit')};" return result, sql_like # Trend elif intent == "trend": # group by Year if "Year" not in df.columns: raise ValueError("Dataset has no Year column for trend queries.") result = df_f.groupby("Year", dropna=False)["Medal"].count().reset_index().rename(columns={"Medal":"metric_count"}) result = result.sort_values("Year").reset_index(drop=True) sql_like = "SELECT Year, COUNT(Medal) AS metric_count FROM dataset WHERE <filters> GROUP BY Year ORDER BY Year ASC;" return result, sql_like # Summary (top-level aggregates) elif intent == "summary": # We'll build a small summary table: total medals, distinct countries, distinct sports, year range total_medals = len(df_f) distinct_countries = df_f["Country"].nunique() distinct_sports = df_f["Sport"].nunique() years = sorted(df_f["Year"].dropna().unique().tolist()) result = pd.DataFrame([ {"metric":"total_medals","value": total_medals}, {"metric":"distinct_countries","value": distinct_countries}, {"metric":"distinct_sports","value": distinct_sports}, {"metric":"year_range","value": f"{years[0] if years else 'N/A'} - {years[-1] if years else 'N/A'}"} ]) sql_like = "-- Summary aggregates computed from dataset" return result, sql_like else: # fallback: return empty frame return pd.DataFrame(), "-- no-op" # Generate text summary and 3 insights from the result table def generate_insights(parsed: Dict[str, Any], result_df: pd.DataFrame) -> Tuple[str, list]: intent = parsed["intent"] ent = parsed["entities"] raw = parsed["raw"] insights = [] summary = "" if result_df.empty: return "No results found for the given question.", ["No data to analyze."] if intent in ("ranking","comparison","distribution"): top = result_df.iloc[0] label_col = result_df.columns[0] value_col = result_df.columns[1] summary = f"{top[label_col]} is topping the list with {int(top[value_col])}." # Top 3 top3 = result_df.head(3) bullets = [] for i, row in top3.iterrows(): bullets.append(f"{row[label_col]}: {int(row[value_col])}") insights.append("Top performers: " + "; ".join(bullets)) # Dominance check if len(result_df) > 1: second_val = int(result_df.iloc[1][value_col]) if second_val == 0: insights.append("Second place has 0 count — check data quality or filters.") else: ratio = int(top[value_col]) / (second_val if second_val else 1) if ratio >= 1.5: insights.append(f"{top[label_col]} leads by a large margin (≥ 1.5× second place).") else: insights.append(f"{top[label_col]} leads but margin is modest ({ratio:.2f}× second place).") else: insights.append("Only one row in the result set.") # Add a helpful next action insights.append("Try: 'Show medal breakdown by sport for " + str(top[label_col]) + "'.") elif intent == "trend": # compute pct change from first to last first = result_df.iloc[0] last = result_df.iloc[-1] change = last[result_df.columns[1]] - first[result_df.columns[1]] pct = (change / first[result_df.columns[1]]) * 100 if first[result_df.columns[1]] else math.inf summary = f"Medal count changed from {int(first[result_df.columns[1]])} in {first['Year']} to {int(last[result_df.columns[1]])} in {last['Year']} ({pct:.1f}% change)." insights.append("Largest year-over-year changes can be highlighted by checking deltas between consecutive years.") insights.append("Consider splitting trend by Gender or Country for deeper insights.") insights.append("Try: 'Show medals by gender over years'.") elif intent == "summary": # short textual summary from table total_medals = int(result_df[result_df['metric']=="total_medals"]['value'].values[0]) distinct_countries = int(result_df[result_df['metric']=="distinct_countries"]['value'].values[0]) years = result_df[result_df['metric']=="year_range"]['value'].values[0] summary = f"The dataset contains {total_medals} medals across {distinct_countries} countries ({years})." insights.append("You can ask for top countries, top sports, or trends over years.") insights.append("Try: 'Which country won the most medals?' or 'Medal count over years'.") insights.append("Filter by Year to inspect specific editions.") else: summary = "Here's the top result." insights.append("Use clearer language like 'Top 5 athletes by gold medals in 2020' for better answers.") # Ensure we return up to 3–4 concise bullets insights = insights[:4] return summary, insights # Chart generator: returns PNG bytes (matplotlib) def plot_result(result_df: pd.DataFrame, parsed: Dict[str, Any]) -> bytes: intent = parsed["intent"] # For simple ranking/comparison/distribution: bar or pie buf = io.BytesIO() plt.clf() try: if intent in ("ranking","comparison","distribution"): if result_df.shape[1] >= 2: x = result_df.iloc[:,0].astype(str) y = result_df.iloc[:,1].astype(int) # If many categories, use horizontal bar if len(x) > 8: plt.figure(figsize=(8, max(4, 0.5*len(x)))) plt.barh(x[::-1], y[::-1]) plt.xlabel("Count") plt.title("Counts by " + result_df.columns[0]) else: plt.figure(figsize=(8,4)) plt.bar(x, y) plt.ylabel("Count") plt.title("Counts by " + result_df.columns[0]) plt.xticks(rotation=45, ha="right") elif intent == "trend": x = result_df["Year"] y = result_df.iloc[:,1].astype(int) plt.figure(figsize=(8,4)) plt.plot(x, y, marker="o") plt.xlabel("Year") plt.ylabel("Count") plt.title("Trend over Years") plt.grid(True) elif intent == "summary": # show small table as image fig, ax = plt.subplots(figsize=(6,1.5)) ax.axis('off') table = plt.table(cellText=result_df.values, colLabels=result_df.columns, loc='center') table.auto_set_font_size(False) table.set_fontsize(10) table.scale(1, 1.5) else: plt.text(0.1, 0.5, "No chart available for this query.", fontsize=12) plt.tight_layout() plt.savefig(buf, format="png") except Exception as e: plt.clf() plt.text(0.1, 0.5, f"Error plotting: {str(e)}") plt.savefig(buf, format="png") buf.seek(0) return buf.read() # Main function tying everything together def answer_question(question: str): # Load data (cached globally could be added) try: df = load_dataset(EXCEL_PATH, SHEET_NAME) except Exception as e: return f"Error loading dataset: {str(e)}", "", None, None, "" parsed = parse_question(question) try: result_df, sql_like = run_query(df, parsed) except Exception as e: return f"Error while running query: {str(e)}", "", None, None, "" summary, insights = generate_insights(parsed, result_df) # prepare table preview (convert small df to html for Gradio table display) table_preview_html = result_df.head(20).to_html(index=False) # chart bytes chart_bytes = plot_result(result_df, parsed) # Format insights as newline-separated text insights_text = "\n".join([f"- {s}" for s in insights]) # Return values for Gradio: summary, insights, chart image, table (html), sql_like return summary, insights_text, chart_bytes, table_preview_html, sql_like # ---------- Gradio app ---------- def build_ui(): title = "No-Code Conversational Data Tool — Tokyo Olympic Q&A" description = "Ask plain-language questions about the dataset (e.g., 'Which country is topping the list?')" with gr.Blocks() as demo: gr.Markdown(f"# {title}") gr.Markdown(description) with gr.Row(): question = gr.Textbox(lines=2, placeholder="e.g., Which country is topping the list?", label="Ask a question") ask_btn = gr.Button("Ask") with gr.Row(): summary_out = gr.Textbox(label="Summary", interactive=False) insights_out = gr.Textbox(label="Key Insights", interactive=False) with gr.Row(): chart_out = gr.Image(label="Chart", type="pil") with gr.Row(): table_out = gr.HTML(label="Result Table (first 20 rows)") with gr.Row(): sql_out = gr.Textbox(label="Generated SQL-like statement", interactive=False) def on_ask(q): s, ins, chart_bytes, table_html, sql_like = answer_question(q) # chart_bytes might be bytes -> convert to pil image via BytesIO inside gr.Image return s, ins, chart_bytes, table_html, sql_like ask_btn.click(fn=on_ask, inputs=[question], outputs=[summary_out, insights_out, chart_out, table_out, sql_out]) return demo if __name__ == "__main__": demo = build_ui() demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
#3 opened 3 months ago
by
NaturalRojo
Upload Tokyo-Olympic-Sample-Data.xlsx
#2 opened 3 months ago
by
NaturalRojo
Update app.py
1
#1 opened 3 months ago
by
NaturalRojo
Update app.py
1
#1 opened 3 months ago
by
NaturalRojo