| import gradio as gr |
| import pandas as pd |
| import duckdb |
| from datasets import load_dataset |
| import openai |
| import os |
| from typing import Dict, List, Any |
| import json |
|
|
| class SALTAnalytics: |
| def __init__(self): |
| """Initialize SALT Analytics""" |
| self.con = duckdb.connect(':memory:') |
| self.data_loaded = False |
| self.schema_info = "" |
| self.openai_client = None |
| |
| def setup_openai(self, api_key: str): |
| """Setup OpenAI client with API key""" |
| try: |
| self.openai_client = openai.OpenAI(api_key=api_key) |
| return True |
| except Exception as e: |
| return False |
| |
| def load_salt_dataset(self): |
| """Load SAP SALT dataset from Hugging Face into DuckDB""" |
| if self.data_loaded: |
| return "Dataset already loaded!" |
| |
| try: |
| |
| dataset = load_dataset("SAP/SALT", "joined_table", split="train", streaming=False) |
| df = dataset.to_pandas() |
| |
| |
| if len(df) > 100000: |
| df = df.sample(n=50000, random_state=42) |
| |
| |
| self.con.execute("CREATE TABLE salt_data AS SELECT * FROM df") |
| |
| |
| schema_result = self.con.execute("DESCRIBE salt_data").fetchall() |
| self.schema_info = "\n".join([f"{col[0]}: {col[1]}" for col in schema_result]) |
| |
| self.data_loaded = True |
| return f"β
Successfully loaded {len(df)} records into DuckDB" |
| |
| except Exception as e: |
| return f"β Error loading dataset: {str(e)}" |
| |
| def get_predefined_insights(self): |
| """Generate predefined analytical insights""" |
| if not self.data_loaded: |
| return "Please load the dataset first" |
| |
| try: |
| insights = {} |
| |
| |
| insights['Sales Office Performance'] = self.con.execute(""" |
| SELECT SALESOFFICE, |
| COUNT(*) as total_orders, |
| COUNT(DISTINCT CUSTOMERID) as unique_customers |
| FROM salt_data |
| GROUP BY SALESOFFICE |
| ORDER BY total_orders DESC |
| LIMIT 10 |
| """).fetchdf() |
| |
| |
| insights['Payment Terms Distribution'] = self.con.execute(""" |
| SELECT CUSTOMERPAYMENTTERMS, |
| COUNT(*) as frequency, |
| ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER(), 2) as percentage |
| FROM salt_data |
| GROUP BY CUSTOMERPAYMENTTERMS |
| ORDER BY frequency DESC |
| """).fetchdf() |
| |
| |
| insights['Shipping Conditions'] = self.con.execute(""" |
| SELECT SHIPPINGCONDITION, |
| COUNT(*) as order_count, |
| COUNT(DISTINCT PLANT) as plants_served |
| FROM salt_data |
| GROUP BY SHIPPINGCONDITION |
| ORDER BY order_count DESC |
| """).fetchdf() |
| |
| return insights |
| |
| except Exception as e: |
| return f"Error generating insights: {str(e)}" |
| |
| def natural_language_query(self, question: str, api_key: str): |
| """Convert natural language to SQL and execute""" |
| if not self.data_loaded: |
| return "Please load the dataset first" |
| |
| if not api_key: |
| return "Please provide OpenAI API key" |
| |
| try: |
| |
| client = openai.OpenAI(api_key=api_key) |
| |
| prompt = f""" |
| You are a SQL expert analyzing SAP SALT dataset. The database has a table called 'salt_data' with this schema: |
| |
| {self.schema_info} |
| |
| The SALT dataset contains SAP ERP sales order data. Key fields: |
| - SALESOFFICE, SALESGROUP: Sales organization |
| - CUSTOMERID: Customer identifier |
| - CUSTOMERPAYMENTTERMS: Payment terms (Net30, Net45, etc.) |
| - SHIPPINGCONDITION, SHIPPINGPOINT: Shipping logistics |
| - PLANT: Manufacturing location |
| - HEADERINCOTERMSCLASSIFICATION, ITEMINCOTERMSCLASSIFICATION: Trade terms |
| |
| Convert this question to a DuckDB SQL query: "{question}" |
| |
| Return ONLY the SQL query, no explanation. Limit results to 20 rows. |
| """ |
| |
| response = client.chat.completions.create( |
| model="gpt-4", |
| messages=[{"role": "user", "content": prompt}], |
| temperature=0.1 |
| ) |
| |
| sql_query = response.choices[0].message.content.strip() |
| |
| |
| if sql_query.startswith("``` |
| sql_query = sql_query[6:-3] |
| elif sql_query.startswith("```"): |
| sql_query = sql_query[3:-3] |
| |
| # Execute query |
| result_df = self.con.execute(sql_query).fetchdf() |
| |
| # Get explanation |
| explanation_prompt = f""" |
| Question: {question} |
| Results: {result_df.head(10).to_string()} |
| |
| Provide a clear business explanation of these SAP ERP results in 2-3 sentences. |
| """ |
| |
| explanation_response = client.chat.completions.create( |
| model="gpt-4", |
| messages=[{"role": "user", "content": explanation_prompt}], |
| temperature=0.3 |
| ) |
| |
| explanation = explanation_response.choices[0].message.content |
| |
| return f"**SQL Query:**\n``````\n\n**Results:**\n{result_df.to_string(index=False)}\n\n**Explanation:**\n{explanation}" |
| |
| except Exception as e: |
| return f"Error: {str(e)}" |
| |
| # Initialize analytics |
| analytics = SALTAnalytics() |
| |
| def load_dataset_interface(): |
| """Interface for loading dataset""" |
| result = analytics.load_salt_dataset() |
| return result |
| |
| def show_insights_interface(): |
| """Interface for showing insights""" |
| insights = analytics.get_predefined_insights() |
| |
| if isinstance(insights, str): |
| return insights |
| |
| output = "# π SAP SALT Dataset Insights\n\n" |
| |
| for title, df in insights.items(): |
| output += f"## {title}\n\n" |
| output += df.to_markdown(index=False) |
| output += "\n\n---\n\n" |
| |
| return output |
| |
| def qa_interface(question: str, api_key: str): |
| """Interface for Q&A functionality""" |
| if not question.strip(): |
| return "Please enter a question" |
| |
| result = analytics.natural_language_query(question, api_key) |
| return result |
| |
| # Sample questions for the interface |
| sample_questions = [ |
| "Which sales office has the most customers?", |
| "What are the most common payment terms?", |
| "Show me shipping conditions by plant", |
| "Which customers have the highest number of orders?", |
| "What's the distribution of sales groups?" |
| ] |
| |
| # Create Gradio interface |
| with gr.Blocks(title="SAP SALT Analytics Demo", theme=gr.themes.Soft()) as demo: |
| |
| gr.Markdown(""" |
| |
| |
| |
| This demo showcases how open source tools (DuckDB + OpenAI) can generate massive value for enterprises running SAP ERP systems. |
| """) |
| |
| with gr.Tab("π₯ Load Dataset"): |
| gr.Markdown("### Load SAP SALT Dataset from Hugging Face") |
| |
| load_btn = gr.Button("Load SALT Dataset", variant="primary") |
| load_output = gr.Textbox(label="Status", lines=3) |
| |
| load_btn.click( |
| fn=load_dataset_interface, |
| outputs=load_output |
| ) |
| |
| with gr.Tab("π Insights"): |
| gr.Markdown("### Pre-built Analytics Insights") |
| |
| insights_btn = gr.Button("Generate Insights", variant="primary") |
| insights_output = gr.Markdown() |
| |
| insights_btn.click( |
| fn=show_insights_interface, |
| outputs=insights_output |
| ) |
| |
| with gr.Tab("π€ AI Q&A"): |
| gr.Markdown("### Ask Questions in Natural Language") |
| |
| with gr.Row(): |
| with gr.Column(scale=3): |
| api_key_input = gr.Textbox( |
| label="OpenAI API Key", |
| type="password", |
| placeholder="Enter your OpenAI API key" |
| ) |
| |
| question_input = gr.Textbox( |
| label="Your Question", |
| placeholder="e.g., Which sales office handles the most customers?", |
| lines=2 |
| ) |
| |
| sample_dropdown = gr.Dropdown( |
| choices=sample_questions, |
| label="Or choose a sample question", |
| value=None |
| ) |
| |
| ask_btn = gr.Button("Get Answer", variant="primary") |
| |
| with gr.Column(scale=4): |
| qa_output = gr.Markdown() |
| |
| # Update question input when sample is selected |
| sample_dropdown.change( |
| fn=lambda x: x if x else "", |
| inputs=sample_dropdown, |
| outputs=question_input |
| ) |
| |
| ask_btn.click( |
| fn=qa_interface, |
| inputs=[question_input, api_key_input], |
| outputs=qa_output |
| ) |
| |
| with gr.Tab("βΉοΈ About"): |
| gr.Markdown(""" |
| |
| |
| **Dataset**: SAP SALT (Sales Autocompletion Linked Business Tables) |
| - Real SAP S/4HANA sales order data |
| - 4 linked tables: Sales Documents, Items, Customers, Addresses |
| - 8 classification targets for ML models |
| |
| **Technology Stack**: |
| - **DuckDB**: High-performance analytics database |
| - **OpenAI GPT-4**: Natural language to SQL conversion |
| - **Hugging Face**: Dataset hosting and deployment |
| - **Gradio**: Interactive web interface |
| |
| **Business Value**: |
| - Automate sales order completion (70-80% accuracy) |
| - Optimize customer-to-sales office assignments |
| - Predict shipping and payment preferences |
| - Generate actionable business insights |
| |
| **Open Source Benefits**: |
| - Zero licensing costs vs. proprietary SAP analytics |
| - Full customization and control |
| - Community-driven improvements |
| - Easy integration with existing systems |
| """) |
| |
| if __name__ == "__main__": |
| demo.launch() |
| |