Spaces:

PD03
/

talk_to_data

Sleeping

App Files Community

PD03 committed on Jun 25, 2025

Commit

996ed5a

verified ·

1 Parent(s): 68264bd

Update app.py

Browse files

Files changed (1) hide show

app.py +33 -44

app.py CHANGED Viewed

@@ -3,77 +3,66 @@
 import gradio as gr
 import pandas as pd
 import torch
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
-# 1) Load your synthetic profitability dataset
 df = pd.read_csv('synthetic_profit.csv')
-# 2) Ensure numeric columns for true aggregation
-for col in ["Revenue", "Profit", "ProfitMargin"]:
-    df[col] = pd.to_numeric(df[col], errors='coerce')
-# 3) Build the schema description text
-#    ← replaced .iteritems() with .items() here
-schema_lines = [f"- {col}: {dtype.name}" for col, dtype in df.dtypes.items()]
-schema_text = "Table schema:\n" + "\n".join(schema_lines)
-# 4) Few-shot examples teaching SUM and AVERAGE patterns
-example_block = """
-Example 1
-Q: Total profit by region?
-A: Group “Profit” by “Region” and sum → EMEA: 30172183.37; APAC: 32301788.32; Latin America: 27585378.50; North America: 25473893.34
-Example 2
-Q: Average profit margin for Product B in Americas?
-A: Filter Product=B & Region=Americas, take mean of “ProfitMargin” → 0.18
-""".strip()
-# 5) Model & pipeline setup
 MODEL_ID = "microsoft/tapex-base-finetuned-wikisql"
 device   = 0 if torch.cuda.is_available() else -1
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 model     = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID)
-table_qa = pipeline(
-    "table-question-answering",
     model=model,
     tokenizer=tokenizer,
     framework="pt",
     device=device,
 )
-# 6) QA function with schema-aware prompting
 def answer_profitability(question: str) -> str:
-    # cast all cells to string for safety
-    table = df.astype(str).to_dict(orient="records")
-    # assemble the full prompt
-    prompt = f"""{schema_text}
-{example_block}
-Q: {question}
-A:"""
     try:
-        out = table_qa(table=table, query=prompt)
-        return out.get("answer", "No answer found.")
     except Exception as e:
-        return f"Error: {e}"
-# 7) Gradio interface
 iface = gr.Interface(
     fn=answer_profitability,
-    inputs=gr.Textbox(lines=2, placeholder="Ask a question about profitability…"),
-    outputs="text",
-    title="SAP Profitability Q&A (Schema-Aware TAPEX)",
     description=(
-        "Every query is prefixed with your table’s schema and two few-shot examples, "
-        "so the model learns to SUM, AVERAGE, FILTER, etc., without extra Python code."
     )
 )
-# 8) Launch the app
 if __name__ == "__main__":
     iface.launch(server_name="0.0.0.0", server_port=7860)

 import gradio as gr
 import pandas as pd
 import torch
+import duckdb
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
+# 1) Load data and register it in DuckDB
 df = pd.read_csv('synthetic_profit.csv')
+conn = duckdb.connect(database=':memory:')
+conn.register('sap', df)
+# 2) Build a one-line schema description
+schema = ", ".join(df.columns)  # e.g. "Region, Product, FiscalYear, ..."
+# 3) Load TAPEX (WikiSQL) for SQL generation
 MODEL_ID = "microsoft/tapex-base-finetuned-wikisql"
 device   = 0 if torch.cuda.is_available() else -1
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 model     = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID)
+sql_generator = pipeline(
+    "text2text-generation",
     model=model,
     tokenizer=tokenizer,
     framework="pt",
     device=device,
+    # limit length so it doesn’t try to output the entire table!
+    max_length=128,
 )
+# 4) Your new QA function
 def answer_profitability(question: str) -> str:
+    # 4a) Prompt the model to generate SQL
+    prompt = (
+        f"Translate to SQL for table `sap` with columns ({schema}):\n"
+        f"Question: {question}\n"
+        "SQL:"
+    )
+    sql = sql_generator(prompt)[0]['generated_text'].strip()
+    # 4b) Execute the generated SQL and return results
     try:
+        result_df = conn.execute(sql).df()
+        # pretty-print as text
+        if result_df.empty:
+            return f"No rows returned. Generated SQL was:\n{sql}"
+        return result_df.to_string(index=False)
     except Exception as e:
+        # if something goes wrong, show you the SQL so you can debug
+        return f"Error executing SQL: {e}\n\nGenerated SQL:\n{sql}"
+# 5) Gradio interface
 iface = gr.Interface(
     fn=answer_profitability,
+    inputs=gr.Textbox(lines=2, placeholder="Ask about your SAP data…"),
+    outputs="textbox",
+    title="SAP Profitability Q&A (SQL-Generation)",
     description=(
+        "Uses TAPEX to translate your natural-language question "
+        "into a SQL query over the `sap` table, then runs it via DuckDB."
     )
 )
 if __name__ == "__main__":
     iface.launch(server_name="0.0.0.0", server_port=7860)