PD03 committed on
Commit
73a7361
Β·
verified Β·
1 Parent(s): 1403d5d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +364 -436
app.py CHANGED
@@ -1,450 +1,378 @@
1
- import streamlit as st
2
- import pandas as pd
3
  import duckdb
 
 
4
  import plotly.express as px
5
- import plotly.graph_objects as go
6
- from plotly.subplots import make_subplots
7
- from datasets import load_dataset
8
- import numpy as np
9
- import openai
10
- import os
11
 
12
- # Configure page
 
 
13
  st.set_page_config(
14
- page_title="SAP SALT Analytics Dashboard",
15
- page_icon="πŸ“Š",
16
  layout="wide",
17
- initial_sidebar_state="expanded"
18
  )
19
 
20
- # Custom CSS
21
- st.markdown("""
22
- <style>
23
- .main-header {
24
- font-size: 2.5rem;
25
- color: #1f77b4;
26
- text-align: center;
27
- margin-bottom: 2rem;
28
- }
29
- .metric-card {
30
- background-color: #f0f2f6;
31
- padding: 1rem;
32
- border-radius: 0.5rem;
33
- border-left: 4px solid #1f77b4;
34
- }
35
- .insight-box {
36
- background-color: #e8f4f8;
37
- padding: 1rem;
38
- border-radius: 0.5rem;
39
- border-left: 4px solid #17a2b8;
40
- margin: 1rem 0;
41
- }
42
- </style>
43
- """, unsafe_allow_html=True)
44
-
45
- @st.cache_data
46
- def load_salt_data(hf_token):
47
- """Load SAP SALT dataset with authentication"""
48
- dataset = load_dataset("SAP/SALT", "joined_table", split="train", token=hf_token)
49
- return dataset.to_pandas()
50
-
51
- @st.cache_resource
52
- def init_duckdb(df):
53
- """Initialize DuckDB connection with data"""
54
- conn = duckdb.connect(':memory:')
55
- conn.register('sales_data', df)
56
- return conn
57
-
58
- def analyze_dataset_columns(df):
59
- """Analyze dataset columns and identify key fields"""
60
- columns = list(df.columns)
61
-
62
- # Show available columns in sidebar for reference
63
- with st.sidebar.expander("πŸ“‹ Dataset Columns", expanded=False):
64
- for i, col in enumerate(columns):
65
- st.write(f"{i+1}. {col}")
66
-
67
- # Detect column types based on content and names
68
- date_cols = [col for col in columns if any(word in col.lower() for word in
69
- ['date', 'time', 'created', 'modified', 'timestamp'])]
70
-
71
- value_cols = [col for col in columns if any(word in col.lower() for word in
72
- ['value', 'amount', 'price', 'cost', 'total', 'sum', 'revenue', 'net', 'gross'])]
73
-
74
- customer_cols = [col for col in columns if any(word in col.lower() for word in
75
- ['customer', 'client', 'buyer', 'account', 'partner'])]
76
-
77
- sales_cols = [col for col in columns if any(word in col.lower() for word in
78
- ['sales', 'office', 'group', 'region', 'territory', 'division'])]
79
-
80
- # Get numeric columns as backup for values
81
- numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
82
- if not value_cols and numeric_cols:
83
- value_cols = numeric_cols
84
-
85
- return {
86
- 'all_columns': columns,
87
- 'date_columns': date_cols,
88
- 'value_columns': value_cols,
89
- 'customer_columns': customer_cols,
90
- 'sales_columns': sales_cols,
91
- 'numeric_columns': numeric_cols
92
- }
93
-
94
- def generate_ai_insights(data_summary, openai_key=None):
95
- """Generate AI-powered business insights"""
96
- if not openai_key:
97
- return """
98
- πŸ€– **AI-Powered Insights** (Add OpenAI API key for detailed insights):
99
-
100
- β€’ **Revenue Optimization**: Analyze high-performing segments and scale successful strategies
101
- β€’ **Customer Intelligence**: Identify customer behavior patterns and retention opportunities
102
- β€’ **Operational Excellence**: Optimize processes based on performance data patterns
103
- β€’ **Strategic Growth**: Leverage data insights for market expansion and competitive advantage
104
- """
105
-
106
  try:
107
- openai.api_key = openai_key
108
-
109
- response = openai.ChatCompletion.create(
110
- model="gpt-3.5-turbo",
111
- messages=[{
112
- "role": "user",
113
- "content": f"""
114
- Analyze this SAP ERP sales data and provide strategic business insights:
115
-
116
- {data_summary}
117
-
118
- Generate 4 actionable recommendations for:
119
- 1. Revenue optimization strategies
120
- 2. Customer relationship management
121
- 3. Operational efficiency improvements
122
- 4. Business growth opportunities
123
-
124
- Format as specific, measurable recommendations.
125
- """
126
- }],
127
- max_tokens=600,
128
- temperature=0.7
129
- )
130
-
131
- return f"πŸ€– **AI-Generated Insights**:\n\n{response.choices[0].message.content}"
132
-
133
- except Exception as e:
134
- return f"πŸ€– **AI Service Error**: {str(e)}"
135
-
136
- def create_time_series_chart(conn, column_info):
137
- """Create time series analysis chart"""
138
- if not column_info['date_columns'] or not column_info['value_columns']:
139
- return go.Figure().add_annotation(text="Date and value columns required", showarrow=False)
140
-
141
- date_col = column_info['date_columns'][0]
142
- value_col = column_info['value_columns'][0]
143
-
144
- query = f"""
145
- SELECT
146
- DATE_TRUNC('month', "{date_col}") as Period,
147
- SUM("{value_col}") as TotalValue,
148
- COUNT(*) as RecordCount,
149
- AVG("{value_col}") as AvgValue
150
- FROM sales_data
151
- WHERE "{date_col}" IS NOT NULL AND "{value_col}" IS NOT NULL
152
- GROUP BY Period
153
- ORDER BY Period
154
  """
155
-
156
- df_time = conn.execute(query).df()
157
-
158
- if df_time.empty:
159
- return go.Figure().add_annotation(text="No time series data available", showarrow=False)
160
-
161
- fig = make_subplots(specs=[[{"secondary_y": True}]])
162
-
163
- fig.add_trace(
164
- go.Scatter(x=df_time['Period'], y=df_time['TotalValue'],
165
- mode='lines+markers', name='Total Value', line=dict(color='#1f77b4')),
166
- secondary_y=False,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
167
  )
168
-
169
- fig.add_trace(
170
- go.Bar(x=df_time['Period'], y=df_time['RecordCount'],
171
- name='Record Count', opacity=0.6, marker_color='#ff7f0e'),
172
- secondary_y=True,
 
173
  )
174
-
175
- fig.update_xaxes(title_text="Time Period")
176
- fig.update_yaxes(title_text="Total Value", secondary_y=False)
177
- fig.update_yaxes(title_text="Record Count", secondary_y=True)
178
- fig.update_layout(title_text=f"Time Series Analysis: {value_col} by {date_col}")
179
-
180
- return fig
181
-
182
- def create_category_performance_chart(conn, column_info):
183
- """Create category performance chart"""
184
- if not column_info['sales_columns'] or not column_info['value_columns']:
185
- return go.Figure().add_annotation(text="Sales category and value columns required", showarrow=False)
186
-
187
- category_col = column_info['sales_columns'][0]
188
- value_col = column_info['value_columns'][0]
189
-
190
- query = f"""
191
- SELECT
192
- "{category_col}" as Category,
193
- SUM("{value_col}") as TotalValue,
194
- COUNT(*) as RecordCount,
195
- AVG("{value_col}") as AvgValue
196
- FROM sales_data
197
- WHERE "{category_col}" IS NOT NULL AND "{value_col}" IS NOT NULL
198
- GROUP BY "{category_col}"
199
- ORDER BY TotalValue DESC
200
- LIMIT 20
201
- """
202
-
203
- df_category = conn.execute(query).df()
204
-
205
- if df_category.empty:
206
- return go.Figure().add_annotation(text="No category data available", showarrow=False)
207
-
208
- fig = px.bar(df_category, x='Category', y='TotalValue',
209
- title=f'Performance by {category_col}',
210
- color='AvgValue',
211
- color_continuous_scale='Viridis',
212
- hover_data=['RecordCount'])
213
-
214
- fig.update_layout(xaxis_title=category_col, yaxis_title="Total Value")
215
- fig.update_xaxes(tickangle=45)
216
-
217
- return fig
218
-
219
- def create_customer_analysis_chart(conn, column_info):
220
- """Create customer analysis chart"""
221
- if not column_info['customer_columns'] or not column_info['value_columns']:
222
- return go.Figure().add_annotation(text="Customer and value columns required", showarrow=False)
223
-
224
- customer_col = column_info['customer_columns'][0]
225
- value_col = column_info['value_columns'][0]
226
-
227
- query = f"""
228
- SELECT
229
- "{customer_col}" as Customer,
230
- SUM("{value_col}") as TotalValue,
231
- COUNT(*) as TransactionCount,
232
- AVG("{value_col}") as AvgTransactionValue
233
- FROM sales_data
234
- WHERE "{customer_col}" IS NOT NULL AND "{value_col}" IS NOT NULL
235
- GROUP BY "{customer_col}"
236
- ORDER BY TotalValue DESC
237
- LIMIT 50
238
- """
239
-
240
- df_customer = conn.execute(query).df()
241
-
242
- if df_customer.empty:
243
- return go.Figure().add_annotation(text="No customer data available", showarrow=False)
244
-
245
- fig = px.scatter(df_customer, x='TransactionCount', y='AvgTransactionValue',
246
- size='TotalValue', hover_name='Customer',
247
- title='Customer Analysis: Transaction Frequency vs Average Value',
248
- labels={'TransactionCount': 'Number of Transactions',
249
- 'AvgTransactionValue': 'Average Transaction Value'})
250
-
251
- return fig
252
-
253
- def create_value_distribution_chart(conn, column_info):
254
- """Create value distribution analysis"""
255
- if not column_info['value_columns']:
256
- return go.Figure().add_annotation(text="Value columns required", showarrow=False)
257
-
258
- value_col = column_info['value_columns'][0]
259
-
260
- query = f"""
261
- SELECT "{value_col}" as Value
262
- FROM sales_data
263
- WHERE "{value_col}" IS NOT NULL AND "{value_col}" > 0
264
- ORDER BY "{value_col}"
265
- """
266
-
267
- df_values = conn.execute(query).df()
268
-
269
- if df_values.empty:
270
- return go.Figure().add_annotation(text="No value data available", showarrow=False)
271
-
272
- fig = px.histogram(df_values, x='Value', nbins=50,
273
- title=f'Value Distribution: {value_col}',
274
- labels={'Value': value_col, 'count': 'Frequency'})
275
-
276
- return fig
277
-
278
- def create_summary_table(conn, column_info):
279
- """Create summary statistics table"""
280
- if not column_info['value_columns']:
281
- return pd.DataFrame()
282
-
283
- summaries = []
284
-
285
- for col in column_info['value_columns'][:5]: # Top 5 value columns
286
- query = f"""
287
- SELECT
288
- '{col}' as Column_Name,
289
- COUNT("{col}") as Count,
290
- SUM("{col}") as Total,
291
- AVG("{col}") as Average,
292
- MIN("{col}") as Minimum,
293
- MAX("{col}") as Maximum,
294
- STDDEV("{col}") as StdDev
295
- FROM sales_data
296
- WHERE "{col}" IS NOT NULL
297
- """
298
-
299
- result = conn.execute(query).df()
300
- if not result.empty:
301
- summaries.append(result)
302
-
303
- if summaries:
304
- return pd.concat(summaries, ignore_index=True)
305
- return pd.DataFrame()
306
-
307
- def main():
308
- # Header
309
- st.markdown('<h1 class="main-header">πŸ“Š SAP SALT Business Analytics Dashboard</h1>',
310
- unsafe_allow_html=True)
311
-
312
- # Sidebar
313
- st.sidebar.header("πŸŽ›οΈ Authentication & Controls")
314
-
315
- # Authentication
316
- hf_token = st.sidebar.text_input(
317
- "πŸ€— Hugging Face Token",
318
  type="password",
319
- help="Required to access SAP SALT dataset: https://huggingface.co/settings/tokens"
 
 
320
  )
321
-
322
- openai_key = st.sidebar.text_input("πŸ€– OpenAI API Key (Optional)", type="password",
323
- help="For AI-powered insights")
324
-
325
- if not hf_token:
326
- st.error("πŸ” **Authentication Required**")
327
- st.info("""
328
- **To access the SAP SALT dataset:**
329
- 1. Visit: https://huggingface.co/datasets/SAP/SALT
330
- 2. Accept the dataset terms
331
- 3. Get your token: https://huggingface.co/settings/tokens
332
- 4. Enter the token in the sidebar
333
- """)
334
- st.stop()
335
-
336
- # Load data
337
  try:
338
- with st.spinner("Loading SAP SALT dataset..."):
339
- df = load_salt_data(hf_token)
340
-
341
- st.success(f"βœ… Dataset loaded: {len(df):,} records Γ— {len(df.columns)} columns")
342
-
343
- except Exception as e:
344
- st.error(f"Failed to load dataset: {str(e)}")
345
- st.stop()
346
-
347
- # Analyze columns
348
- column_info = analyze_dataset_columns(df)
349
-
350
- # Initialize DuckDB
351
- conn = init_duckdb(df)
352
-
353
- # Dataset Overview
354
- with st.expander("πŸ“Š Dataset Overview", expanded=False):
355
- col1, col2, col3 = st.columns(3)
356
-
357
- with col1:
358
- st.metric("Total Records", f"{len(df):,}")
359
- st.metric("Total Columns", len(df.columns))
360
-
361
- with col2:
362
- st.metric("Date Columns", len(column_info['date_columns']))
363
- st.metric("Value Columns", len(column_info['value_columns']))
364
-
365
- with col3:
366
- st.metric("Customer Columns", len(column_info['customer_columns']))
367
- st.metric("Sales Columns", len(column_info['sales_columns']))
368
-
369
- # Key Metrics
370
- st.subheader("πŸ“ˆ Key Business Metrics")
371
-
372
- # Calculate business metrics
373
- if column_info['value_columns']:
374
- primary_value_col = column_info['value_columns'][0]
375
-
376
- total_value = df[primary_value_col].sum()
377
- avg_value = df[primary_value_col].mean()
378
- max_value = df[primary_value_col].max()
379
-
380
- col1, col2, col3, col4 = st.columns(4)
381
-
382
- with col1:
383
- st.metric("Total Value", f"€{total_value:,.0f}")
384
- with col2:
385
- st.metric("Average Value", f"€{avg_value:,.2f}")
386
- with col3:
387
- st.metric("Maximum Value", f"€{max_value:,.0f}")
388
- with col4:
389
- unique_customers = df[column_info['customer_columns'][0]].nunique() if column_info['customer_columns'] else 0
390
- st.metric("Unique Customers", f"{unique_customers:,}")
391
-
392
- # Analytics Charts
393
- st.subheader("πŸ“Š Business Analytics")
394
-
395
- col1, col2 = st.columns(2)
396
-
397
- with col1:
398
- time_chart = create_time_series_chart(conn, column_info)
399
- st.plotly_chart(time_chart, use_container_width=True)
400
-
401
- with col2:
402
- category_chart = create_category_performance_chart(conn, column_info)
403
- st.plotly_chart(category_chart, use_container_width=True)
404
-
405
- col3, col4 = st.columns(2)
406
-
407
- with col3:
408
- customer_chart = create_customer_analysis_chart(conn, column_info)
409
- st.plotly_chart(customer_chart, use_container_width=True)
410
-
411
- with col4:
412
- distribution_chart = create_value_distribution_chart(conn, column_info)
413
- st.plotly_chart(distribution_chart, use_container_width=True)
414
-
415
- # Summary Statistics
416
- st.subheader("πŸ“‹ Statistical Summary")
417
-
418
- summary_df = create_summary_table(conn, column_info)
419
- if not summary_df.empty:
420
- st.dataframe(summary_df, use_container_width=True)
421
-
422
- # Data Preview
423
- st.subheader("πŸ” Data Preview")
424
- st.dataframe(df.head(50), use_container_width=True)
425
-
426
- # AI Insights
427
- st.subheader("🧠 AI-Powered Business Insights")
428
-
429
- # Prepare comprehensive data summary
430
- data_summary = f"""
431
- SAP SALT Dataset Analysis:
432
- - Total Records: {len(df):,}
433
- - Total Columns: {len(df.columns)}
434
- - Primary Value Column: {column_info['value_columns'][0] if column_info['value_columns'] else 'None'}
435
- - Total Business Value: €{df[column_info['value_columns'][0]].sum():,.0f if column_info['value_columns'] else 0}
436
- - Average Transaction: €{df[column_info['value_columns'][0]].mean():,.2f if column_info['value_columns'] else 0}
437
- - Date Range Coverage: {len(column_info['date_columns'])} temporal columns
438
- - Customer Entities: {df[column_info['customer_columns'][0]].nunique() if column_info['customer_columns'] else 0}
439
- - Sales Categories: {len(column_info['sales_columns'])} organizational dimensions
440
- """
441
-
442
- insights = generate_ai_insights(data_summary, openai_key)
443
- st.markdown(f'<div class="insight-box">{insights}</div>', unsafe_allow_html=True)
444
-
445
- # Footer
446
- st.markdown("---")
447
- st.markdown("**Enterprise Analytics Dashboard** | SAP SALT Dataset | Built with Streamlit + DuckDB + OpenAI")
448
-
449
- if __name__ == "__main__":
450
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import textwrap
3
  import duckdb
4
+ import pandas as pd
5
+ import streamlit as st
6
  import plotly.express as px
7
+ from datetime import datetime
 
 
 
 
 
8
 
9
# ----------------------------
# Page config
# ----------------------------
# Must run before any other Streamlit call on the page.
st.set_page_config(
    layout="wide",
    page_title="SALT Analytics Dashboard",
    page_icon="πŸ“ˆ",
)

st.title("πŸ“ˆ SALT Analytics Dashboard")
st.caption("DuckDB + Streamlit on Hugging Face Spaces Β· Dataset: SAP/SALT")
20
+
21
+ # ----------------------------
22
+ # Helpers
23
+ # ----------------------------
24
@st.cache_resource(show_spinner=False)
def get_conn(db_path: str | None = None):
    """Create (and cache) a DuckDB connection, loading the httpfs extension.

    Args:
        db_path: Explicit database file path. When None, the database is
            placed in the Spaces persistent volume ``/data`` if it exists,
            otherwise in the current working directory.

    Returns:
        A DuckDB connection, shared across reruns via ``st.cache_resource``.
    """
    if db_path is None:
        # Prefer Spaces persistent storage if available
        root = "/data" if os.path.isdir("/data") else "."
        db_path = os.path.join(root, "salt.duckdb")
    con = duckdb.connect(db_path)
    # httpfs enables hf:// reads. Best effort: it may already be installed,
    # or the environment may be offline β€” local tables still work without it.
    try:
        con.execute("INSTALL httpfs; LOAD httpfs;")
    except Exception:
        pass
    return con
38
+
39
+
40
+ def _resolve_repo_id():
41
+ """Support either 'SAP/SALT' or 'sap-ai-research/SALT'."""
42
+ # Allow override via UI/env for forks
43
+ default_candidates = [
44
+ os.environ.get("SALT_DATASET_REPO", "SAP/SALT"),
45
+ "sap-ai-research/SALT",
46
+ ]
47
+ return default_candidates
48
+
49
+
50
@st.cache_data(show_spinner=False)
def list_columns(_con: duckdb.DuckDBPyConnection, table: str) -> list[str]:
    """Return the lower-cased column names of *table*, sorted alphabetically.

    The connection parameter is underscore-prefixed so st.cache_data skips
    it when hashing arguments: DuckDBPyConnection objects are unhashable
    and would otherwise raise UnhashableParamError on the first call.
    As a consequence the cache keys on *table* only.
    """
    q = """
    select lower(name) as name
    from pragma_table_info(?)
    order by name
    """
    return [r[0] for r in _con.execute(q, [table]).fetchall()]
58
+
59
+
60
def find_col(cols_lower: list[str], candidates: list[str]):
    """Return the first candidate present in *cols_lower* (compared lower-cased), or None."""
    lowered = (candidate.lower() for candidate in candidates)
    return next((name for name in lowered if name in cols_lower), None)
67
+
68
+
69
# ----------------------------
# Sidebar β€” Config
# ----------------------------
def _secret(key: str) -> str:
    """Best-effort lookup of *key* in Streamlit secrets.

    Accessing ``st.secrets`` raises when no secrets.toml is configured
    (common on local runs and fresh Spaces), which would crash the app at
    startup; fall back to "" instead.
    """
    try:
        return st.secrets.get(key, "")
    except Exception:
        return ""


with st.sidebar:
    st.header("βš™οΈ Configuration")

    # Dataset repo: env override first, then the research-org mirror.
    repo_candidates = _resolve_repo_id()
    repo_id = st.selectbox("Dataset repo", repo_candidates, index=0,
                           help="Both IDs are supported on the Hub; choose the one you have access to.")

    split = st.selectbox("Split", ["train", "test"], index=0)

    use_joined = st.toggle(
        "Use joined table (recommended)",
        value=True,
        help="If off, you can still analyze the item-level table."
    )

    # Credentials: prefer environment variables, then Streamlit secrets.
    hf_token = st.text_input(
        "HF token (for gated/private access)",
        type="password",
        placeholder="hf_xxx (optional if Space has access)",
        value=os.environ.get("HF_TOKEN", _secret("HF_TOKEN")),
    )

    openai_key = st.text_input(
        "OpenAI API key",
        type="password",
        placeholder="sk-...",
        value=os.environ.get("OPENAI_API_KEY", _secret("OPENAI_API_KEY")),
        help="Needed only for the Recommendations section.",
    )

    st.divider()
    # One-shot flag: set here, consumed (and cleared) by the rebuild section.
    if st.button("πŸ”„ Rebuild local DB", help="Drop & reload local DuckDB tables from Hugging Face"):
        st.session_state["rebuild"] = True
    else:
        st.session_state.setdefault("rebuild", False)
106
+
107
+
108
# ----------------------------
# Load data into DuckDB (one-time)
# ----------------------------
con = get_conn()

# Configure HF auth in DuckDB's Secrets Manager if a token was provided.
# NOTE: CREATE SECRET is DDL and does not accept prepared-statement
# parameters β€” the original `TOKEN ?` form always raised and was silently
# swallowed, so gated datasets stayed inaccessible. Inline the token with
# single quotes escaped instead.
if hf_token:
    try:
        escaped = hf_token.replace("'", "''")
        con.execute(
            f"CREATE OR REPLACE SECRET hf_token (TYPE huggingface, TOKEN '{escaped}')"
        )
    except Exception:
        pass  # best effort: public datasets still work without auth

joined_table_name = "salt_joined"
items_table_name = "salt_items"

# One-shot rebuild requested from the sidebar: drop both local tables so
# the lazy loaders below re-fetch from the Hub.
if st.session_state["rebuild"]:
    with st.status("Rebuilding DuckDB tables…", expanded=True):
        con.execute(f"DROP TABLE IF EXISTS {joined_table_name}")
        con.execute(f"DROP TABLE IF EXISTS {items_table_name}")
        st.write("Dropped existing tables.")
    st.session_state["rebuild"] = False


def _table_exists(name: str) -> bool:
    """True if *name* already exists in the local DuckDB catalog."""
    return bool(
        con.execute(
            "SELECT count(*) FROM information_schema.tables WHERE table_name = ?", [name]
        ).fetchone()[0]
    )


# Create tables lazily (first run, or after a rebuild dropped them).
if use_joined and not _table_exists(joined_table_name):
    with st.status("Loading joined table into DuckDB…", expanded=False):
        path = f"hf://datasets/{repo_id}/JoinedTables_{split}.parquet"
        con.execute(f"CREATE TABLE {joined_table_name} AS SELECT * FROM read_parquet(?)", [path])
        st.success("Joined table loaded.")

if (not use_joined) and not _table_exists(items_table_name):
    with st.status("Loading item-level table into DuckDB…", expanded=False):
        path = f"hf://datasets/{repo_id}/I_SalesDocumentItem_{split}.parquet"
        con.execute(f"CREATE TABLE {items_table_name} AS SELECT * FROM read_parquet(?)", [path])
        st.success("Items table loaded.")

active_table = joined_table_name if use_joined else items_table_name
cols_lower = list_columns(con, active_table)

# Heuristic column mapping: each logical role lists the spellings seen
# across SALT releases; find_col picks the first present (case-insensitive).
name_map = {
    "order_id": ["SalesDocument", "SALESORDER", "vbeln"],
    "order_item": ["SalesDocumentItem", "SALESORDERITEM", "posnr"],
    "customer": ["SoldToParty", "CUSTOMER", "kunnr", "SoldToParty_PartyNumber"],
    "country": ["Country", "COUNTRY", "land1", "ShipToCountry", "ShipToPartyCountry"],
    "date": ["CreationDate", "CREATIONDATE", "CreatedOn", "DocumentDate", "DOCUMENTDATE", "CreatedAt", "CREATEDON"],
    "plant": ["PLANT", "Plant", "werks"],
    "shipping_condition": ["SHIPPINGCONDITION", "ShippingCondition"],
    "shipping_point": ["SHIPPINGPOINT", "ShippingPoint"],
    "sales_office": ["SALESOFFICE", "SalesOffice"],
    "sales_group": ["SALESGROUP", "SalesGroup"],
    "header_incoterms": ["HEADERINCOTERMSCLASSIFICATION", "HeaderIncotermsClassification"],
    "item_incoterms": ["ITEMINCOTERMSCLASSIFICATION", "ItemIncotermsClassification"],
}

resolved = {k: find_col(cols_lower, v) for k, v in name_map.items()}
163
+
164
# ----------------------------
# Filters
# ----------------------------
with st.container():
    st.subheader("Filters")
    left, mid, right = st.columns([2, 2, 2])

    # Country filter
    country_col = resolved.get("country")
    if country_col:
        countries = [
            row[0]
            for row in con.execute(
                f"SELECT DISTINCT {country_col} FROM {active_table} WHERE {country_col} IS NOT NULL ORDER BY 1"
            ).fetchall()
        ]
        country_sel = left.multiselect("Country", countries, default=[])
    else:
        country_sel = []

    # Sales office filter
    sales_office_sel = []
    office_col = resolved.get("sales_office")
    if office_col:
        office_opts = [
            row[0]
            for row in con.execute(
                f"SELECT DISTINCT {office_col} FROM {active_table} WHERE {office_col} IS NOT NULL ORDER BY 1"
            ).fetchall()
        ]
        sales_office_sel = mid.multiselect("Sales office", office_opts)

    # Shipping condition filter
    shipping_cond_sel = []
    ship_col = resolved.get("shipping_condition")
    if ship_col:
        ship_opts = [
            row[0]
            for row in con.execute(
                f"SELECT DISTINCT {ship_col} FROM {active_table} WHERE {ship_col} IS NOT NULL ORDER BY 1"
            ).fetchall()
        ]
        shipping_cond_sel = right.multiselect("Shipping condition", ship_opts)

# Assemble a shared WHERE clause + parameter list reused by every query below.
where = []
params: list = []
for selection, role in (
    (country_sel, "country"),
    (sales_office_sel, "sales_office"),
    (shipping_cond_sel, "shipping_condition"),
):
    col = resolved.get(role)
    if selection and col:
        placeholders = ", ".join(["?"] * len(selection))
        where.append(f"{col} IN ({placeholders})")
        params.extend(selection)
where_sql = (" WHERE " + " AND ".join(where)) if where else ""
203
+
204
# ----------------------------
# KPIs
# ----------------------------
st.subheader("Key metrics")

k1, k2, k3, k4 = st.columns(4)

# Orders: distinct document count when an order-id column resolved,
# otherwise fall back to the raw row count.
if resolved.get("order_id"):
    n_orders = con.execute(
        f"SELECT COUNT(DISTINCT {resolved['order_id']}) FROM {active_table}{where_sql}", params
    ).fetchone()[0]
else:
    n_orders = con.execute(f"SELECT COUNT(*) FROM {active_table}{where_sql}", params).fetchone()[0]

# Customers: distinct sold-to parties; None when no customer column resolved.
if resolved.get("customer"):
    n_customers = con.execute(
        f"SELECT COUNT(DISTINCT {resolved['customer']}) FROM {active_table}{where_sql}", params
    ).fetchone()[0]
else:
    n_customers = None

# Average distinct items per order (requires both id columns).
if resolved.get("order_id") and resolved.get("order_item"):
    avg_items = con.execute(
        f"SELECT AVG(cnt) FROM (SELECT COUNT(DISTINCT {resolved['order_item']}) AS cnt FROM {active_table}{where_sql} GROUP BY {resolved['order_id']})",
        params,
    ).fetchone()[0]
else:
    avg_items = None

# Plant with the most rows under the current filters.
top_plant = None
if resolved.get("plant"):
    row = con.execute(
        f"SELECT {resolved['plant']}, COUNT(*) AS c FROM {active_table}{where_sql} GROUP BY 1 ORDER BY c DESC LIMIT 1",
        params,
    ).fetchone()
    if row:
        top_plant = f"{row[0]} ({row[1]})"

k1.metric("Orders", f"{n_orders:,}")
k2.metric("Customers", f"{n_customers:,}" if n_customers is not None else "β€”")
# Explicit None check: a legitimate 0.0 average should render as "0.00",
# not the missing-value dash (the original truthiness test hid zeros).
k3.metric("Avg items / order", f"{avg_items:.2f}" if avg_items is not None else "β€”")
k4.metric("Top plant by rows", top_plant or "β€”")
250
+
251
# ----------------------------
# Charts
# ----------------------------
with st.container():
    c1, c2 = st.columns(2)

    # Monthly row counts over time
    date_col = resolved.get("date")
    if date_col:
        monthly = con.execute(
            f"""
            SELECT date_trunc('month', cast({date_col} as timestamp)) AS month,
                   COUNT(*) as rows
            FROM {active_table}
            {where_sql}
            GROUP BY 1
            ORDER BY 1
            """,
            params,
        ).df()
        if not monthly.empty:
            line_chart = px.line(monthly, x="month", y="rows", markers=True, title="Rows over time (monthly)")
            c1.plotly_chart(line_chart, use_container_width=True)

    # Shipping condition distribution
    ship_col = resolved.get("shipping_condition")
    if ship_col:
        ship_counts = con.execute(
            f"SELECT {ship_col} as sc, COUNT(*) as rows FROM {active_table}{where_sql} GROUP BY 1 ORDER BY rows DESC LIMIT 15",
            params,
        ).df()
        if not ship_counts.empty:
            bar_chart = px.bar(ship_counts, x="sc", y="rows", title="Shipping condition distribution (Top 15)")
            c2.plotly_chart(bar_chart, use_container_width=True)

with st.container():
    c3, c4 = st.columns(2)

    # Country β†’ plant volume treemap
    if resolved.get("plant") and resolved.get("country"):
        plant_by_country = con.execute(
            f"""
            SELECT {resolved['country']} as country, {resolved['plant']} as plant, COUNT(*) as rows
            FROM {active_table}
            {where_sql}
            GROUP BY 1,2
            ORDER BY rows DESC
            LIMIT 250
            """,
            params,
        ).df()
        if not plant_by_country.empty:
            treemap = px.treemap(plant_by_country, path=["country", "plant"], values="rows", title="Volume by Country β†’ Plant")
            c3.plotly_chart(treemap, use_container_width=True)

    # Incoterms share (header-level preferred, item-level as fallback)
    incoterm_col = resolved.get("header_incoterms") or resolved.get("item_incoterms")
    if incoterm_col:
        incoterm_counts = con.execute(
            f"SELECT {incoterm_col} as incoterm, COUNT(*) as rows FROM {active_table}{where_sql} GROUP BY 1 ORDER BY rows DESC LIMIT 20",
            params,
        ).df()
        if not incoterm_counts.empty:
            pie_chart = px.pie(incoterm_counts, names="incoterm", values="rows", title="Incoterms share (Top 20)")
            c4.plotly_chart(pie_chart, use_container_width=True)
312
+
313
# ----------------------------
# Data Preview
# ----------------------------
st.subheader("Data preview")
preview = con.execute(f"SELECT * FROM {active_table}{where_sql} LIMIT 100", params).df()
st.dataframe(preview, use_container_width=True, hide_index=True)

# ----------------------------
# LLM Insights & Recommendations (OpenAI)
# ----------------------------
with st.expander("πŸ’‘ AI Recommendations (OpenAI)", expanded=True):
    st.write("Generate action-oriented suggestions based on the visible KPIs and distributions.")
    if not openai_key:
        st.info("Add your OpenAI API key in the sidebar to enable this.")
    else:
        try:
            from openai import OpenAI

            client = OpenAI(api_key=openai_key)

            # Concise textual context built from the KPIs computed above.
            kpi_bits = [f"Orders: {n_orders}"]
            if n_customers is not None:
                kpi_bits.append(f"Customers: {n_customers}")
            if avg_items is not None:
                kpi_bits.append(f"Avg items/order: {avg_items:.2f}")
            if top_plant:
                kpi_bits.append(f"Top plant: {top_plant}")
            context = "; ".join(kpi_bits)

            # A small distribution sample grounds the model in real data.
            if resolved.get("shipping_condition"):
                sample_ship = con.execute(
                    f"SELECT {resolved['shipping_condition']} as sc, COUNT(*) as rows FROM {active_table}{where_sql} GROUP BY 1 ORDER BY rows DESC LIMIT 8",
                    params,
                ).df().to_dict(orient="records")
            else:
                sample_ship = []

            prompt = textwrap.dedent(f"""
                You are a senior ops analyst. Based on the SALT dataset analytics summary below,
                write actionable recommendations. Focus on levers in sales operations, logistics (shipping
                conditions/points), and master data hygiene. Keep it business-practical and specific.

                Visible KPIs: {context}
                Shipping distribution (top sample): {sample_ship}

                Deliver:
                - 5 bulletpoint actions (each ≀ 20 words)
                - 3 watchouts/risks (each ≀ 15 words)
                - 2 quick experiments to A/B in the next sprint
            """)

            resp = client.responses.create(
                model="gpt-4o-mini",
                input=prompt,
            )
            # Prefer the convenience attribute; fall back to the raw output tree.
            recos = getattr(resp, "output_text", None) or (
                resp.output[0].content[0].text if getattr(resp, "output", None) else ""
            )
            st.markdown(recos)
        except Exception as e:
            st.warning(f"OpenAI call failed: {e}")

# ----------------------------
# Footer
# ----------------------------
st.caption(
    "SALT dataset Β© SAP AI Research β€” loaded via DuckDB hf:// and analyzed client-side."
)