Spaces:

PD03
/

talk_to_data

Sleeping

App Files Community

PD03 committed on Jun 25, 2025

Commit

519d64c

verified ·

1 Parent(s): e9af0b4

Update app.py

Browse files

Files changed (1) hide show

app.py +36 -43

app.py CHANGED Viewed

@@ -1,75 +1,68 @@
-# app.py
 import gradio as gr
 import pandas as pd
 import torch
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
-# 1) Load your synthetic profitability dataset
 df = pd.read_csv('synthetic_profit.csv')
-# 2) Ensure numeric types for Revenue, Profit, ProfitMargin
-for col in ["Revenue", "Profit", "ProfitMargin"]:
-    df[col] = pd.to_numeric(df[col], errors='coerce')
-# 3) Build the schema description
-schema_lines = [f"- {col}: {dtype.name}" for col, dtype in df.dtypes.items()]
-schema_text = "Table schema:\n" + "\n".join(schema_lines)
-# 4) Few-shot examples teaching SUM and AVERAGE
-few_shot = """
-Example 1
-Q: Total profit by region?
-A: Group “Profit” by “Region” and sum → EMEA: 30172183.37; APAC: 32301788.32; Latin America: 27585378.50; North America: 25473893.34
-Example 2
-Q: Average profit margin for Product B in Americas?
-A: Filter Product=B & Region=Americas, take mean of “ProfitMargin” → 0.18
-""".strip()
-# 5) Load TAPEX-WikiSQL for table-QA
 MODEL_ID = "microsoft/tapex-base-finetuned-wikisql"
 device   = 0 if torch.cuda.is_available() else -1
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 model     = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID)
-table_qa = pipeline(
-    "table-question-answering",
     model=model,
     tokenizer=tokenizer,
     framework="pt",
-    device=device
 )
-# 6) QA function using schema-aware prompting
 def answer_profitability(question: str) -> str:
-    # Cast all values to strings so TAPEX can ingest them
-    table = df.astype(str).to_dict(orient="records")
-    # Assemble prompt with schema + examples + user question
-    prompt = f"""{schema_text}
-{few_shot}
-Q: {question}
-A:"""
     try:
-        out = table_qa(table=table, query=prompt)
-        return out.get("answer", "No answer found.")
     except Exception as e:
-        return f"Error: {e}"
-# 7) Gradio interface
 iface = gr.Interface(
     fn=answer_profitability,
     inputs=gr.Textbox(lines=2, placeholder="Ask a question about profitability…"),
     outputs="text",
-    title="SAP Profitability Q&A (Schema-Aware TAPEX)",
     description=(
-        "Every query is prefixed with your table’s schema and two few-shot examples, "
-        "so the model learns to SUM, AVERAGE, FILTER, etc., without hard-coded fallbacks."
     )
 )

 import gradio as gr
 import pandas as pd
 import torch
+import duckdb
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
+# 1) Load data into DuckDB
 df = pd.read_csv('synthetic_profit.csv')
+con = duckdb.connect(':memory:')
+con.register('sap', df)
+# 2) Build a one-line schema for prompting
+schema = ", ".join(df.columns)  # e.g. "Region,Product,FiscalYear,...."
+# 3) Load TAPEX-WikiSQL as a text2text generator
 MODEL_ID = "microsoft/tapex-base-finetuned-wikisql"
 device   = 0 if torch.cuda.is_available() else -1
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 model     = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID)
+sql_gen = pipeline(
+    "text2text-generation",
     model=model,
     tokenizer=tokenizer,
     framework="pt",
+    device=device,
+    max_length=128,
 )
+# 4) Core QA fn: NL → SQL → execute → return result
 def answer_profitability(question: str) -> str:
+    # a) Prompt TAPEX to generate SQL
+    prompt = (
+        f"-- Translate to SQL over table `sap` with columns ({schema})\n"
+        f"Question: {question}\n"
+        "SQL:"
+    )
+    sql = sql_gen(prompt)[0]['generated_text'].strip()
+    # b) Execute the generated SQL
     try:
+        result_df = con.execute(sql).df()
     except Exception as e:
+        return f"❌ SQL Error: {e}\n\nGenerated SQL:\n{sql}"
+    # c) Format the output
+    if result_df.empty:
+        return f"No rows returned.\n\nGenerated SQL:\n{sql}"
+    # If it's a single cell result, just return that number
+    if result_df.shape == (1,1):
+        return str(result_df.iat[0,0])
+    # Otherwise pretty-print the DataFrame
+    return result_df.to_string(index=False)
+# 5) Gradio UI
 iface = gr.Interface(
     fn=answer_profitability,
     inputs=gr.Textbox(lines=2, placeholder="Ask a question about profitability…"),
     outputs="text",
+    title="SAP Profitability Q&A (SQL-Generation)",
     description=(
+        "TAPEX converts your natural-language query into SQL,\n"
+        "then runs it via DuckDB—no hard-coded fallbacks."
     )
 )