Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,450 +1,378 @@
|
|
| 1 |
-
import
|
| 2 |
-
import
|
| 3 |
import duckdb
|
|
|
|
|
|
|
| 4 |
import plotly.express as px
|
| 5 |
-
|
| 6 |
-
from plotly.subplots import make_subplots
|
| 7 |
-
from datasets import load_dataset
|
| 8 |
-
import numpy as np
|
| 9 |
-
import openai
|
| 10 |
-
import os
|
| 11 |
|
| 12 |
-
#
|
|
|
|
|
|
|
| 13 |
st.set_page_config(
|
| 14 |
-
page_title="
|
| 15 |
-
page_icon="
|
| 16 |
layout="wide",
|
| 17 |
-
initial_sidebar_state="expanded"
|
| 18 |
)
|
| 19 |
|
| 20 |
-
|
| 21 |
-
st.
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
.insight-box {
|
| 36 |
-
background-color: #e8f4f8;
|
| 37 |
-
padding: 1rem;
|
| 38 |
-
border-radius: 0.5rem;
|
| 39 |
-
border-left: 4px solid #17a2b8;
|
| 40 |
-
margin: 1rem 0;
|
| 41 |
-
}
|
| 42 |
-
</style>
|
| 43 |
-
""", unsafe_allow_html=True)
|
| 44 |
-
|
| 45 |
-
@st.cache_data
def load_salt_data(hf_token):
    """Download the ``joined_table`` train split of SAP/SALT and convert it via ``to_pandas()``.

    ``hf_token`` is forwarded to ``load_dataset`` as its ``token`` argument.
    The result is memoised by ``st.cache_data``.
    """
    return load_dataset(
        "SAP/SALT",
        "joined_table",
        split="train",
        token=hf_token,
    ).to_pandas()
|
| 50 |
-
|
| 51 |
-
@st.cache_resource
def init_duckdb(df):
    """Open an in-memory DuckDB database and register ``df`` under the name ``sales_data``.

    Returns the open connection; ``st.cache_resource`` keeps it across reruns.
    """
    connection = duckdb.connect(':memory:')
    # Registration makes the frame queryable as a table called sales_data.
    connection.register('sales_data', df)
    return connection
|
| 57 |
-
|
| 58 |
-
def analyze_dataset_columns(df):
    """Classify the columns of ``df`` by name keywords and numeric dtype.

    As a side effect, the numbered list of column names is written into a
    collapsed sidebar expander.

    Returns:
        dict with the keys ``all_columns``, ``date_columns``,
        ``value_columns``, ``customer_columns``, ``sales_columns`` and
        ``numeric_columns``. ``value_columns`` falls back to every numeric
        column when no column name matches a value keyword.
    """
    columns = list(df.columns)

    # Show available columns in the sidebar for reference.
    # NOTE(review): the label icon was restored from a mis-encoded literal;
    # the exact original glyph could not be recovered - confirm.
    with st.sidebar.expander("📋 Dataset Columns", expanded=False):
        for position, col in enumerate(columns, start=1):
            st.write(f"{position}. {col}")

    def _matching(keywords):
        """Return the columns whose lower-cased name contains any keyword."""
        return [
            col for col in columns
            if any(word in col.lower() for word in keywords)
        ]

    date_cols = _matching(['date', 'time', 'created', 'modified', 'timestamp'])
    value_cols = _matching(
        ['value', 'amount', 'price', 'cost', 'total', 'sum', 'revenue', 'net', 'gross']
    )
    customer_cols = _matching(['customer', 'client', 'buyer', 'account', 'partner'])
    sales_cols = _matching(['sales', 'office', 'group', 'region', 'territory', 'division'])

    # Numeric columns act as a backup when no name looks like a value column.
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if not value_cols and numeric_cols:
        value_cols = numeric_cols

    return {
        'all_columns': columns,
        'date_columns': date_cols,
        'value_columns': value_cols,
        'customer_columns': customer_cols,
        'sales_columns': sales_cols,
        'numeric_columns': numeric_cols,
    }
|
| 93 |
-
|
| 94 |
-
def generate_ai_insights(data_summary, openai_key=None):
|
| 95 |
-
"""Generate AI-powered business insights"""
|
| 96 |
-
if not openai_key:
|
| 97 |
-
return """
|
| 98 |
-
π€ **AI-Powered Insights** (Add OpenAI API key for detailed insights):
|
| 99 |
-
|
| 100 |
-
β’ **Revenue Optimization**: Analyze high-performing segments and scale successful strategies
|
| 101 |
-
β’ **Customer Intelligence**: Identify customer behavior patterns and retention opportunities
|
| 102 |
-
β’ **Operational Excellence**: Optimize processes based on performance data patterns
|
| 103 |
-
β’ **Strategic Growth**: Leverage data insights for market expansion and competitive advantage
|
| 104 |
-
"""
|
| 105 |
-
|
| 106 |
try:
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
)
|
| 130 |
-
|
| 131 |
-
return f"π€ **AI-Generated Insights**:\n\n{response.choices[0].message.content}"
|
| 132 |
-
|
| 133 |
-
except Exception as e:
|
| 134 |
-
return f"π€ **AI Service Error**: {str(e)}"
|
| 135 |
-
|
| 136 |
-
def create_time_series_chart(conn, column_info):
|
| 137 |
-
"""Create time series analysis chart"""
|
| 138 |
-
if not column_info['date_columns'] or not column_info['value_columns']:
|
| 139 |
-
return go.Figure().add_annotation(text="Date and value columns required", showarrow=False)
|
| 140 |
-
|
| 141 |
-
date_col = column_info['date_columns'][0]
|
| 142 |
-
value_col = column_info['value_columns'][0]
|
| 143 |
-
|
| 144 |
-
query = f"""
|
| 145 |
-
SELECT
|
| 146 |
-
DATE_TRUNC('month', "{date_col}") as Period,
|
| 147 |
-
SUM("{value_col}") as TotalValue,
|
| 148 |
-
COUNT(*) as RecordCount,
|
| 149 |
-
AVG("{value_col}") as AvgValue
|
| 150 |
-
FROM sales_data
|
| 151 |
-
WHERE "{date_col}" IS NOT NULL AND "{value_col}" IS NOT NULL
|
| 152 |
-
GROUP BY Period
|
| 153 |
-
ORDER BY Period
|
| 154 |
"""
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
)
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
|
|
|
| 173 |
)
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
fig.update_yaxes(title_text="Record Count", secondary_y=True)
|
| 178 |
-
fig.update_layout(title_text=f"Time Series Analysis: {value_col} by {date_col}")
|
| 179 |
-
|
| 180 |
-
return fig
|
| 181 |
-
|
| 182 |
-
def create_category_performance_chart(conn, column_info):
    """Build a bar chart of total value for the 20 highest-value categories.

    The category is the first entry of ``column_info['sales_columns']`` and
    the measure is the first entry of ``column_info['value_columns']``; both
    are aggregated from the ``sales_data`` table through ``conn``.

    Returns a figure carrying only an explanatory annotation when either
    column list is empty or the query yields no rows.
    """
    if not (column_info['sales_columns'] and column_info['value_columns']):
        return go.Figure().add_annotation(text="Sales category and value columns required", showarrow=False)

    category_col = column_info['sales_columns'][0]
    value_col = column_info['value_columns'][0]

    sql = f"""
    SELECT
        "{category_col}" as Category,
        SUM("{value_col}") as TotalValue,
        COUNT(*) as RecordCount,
        AVG("{value_col}") as AvgValue
    FROM sales_data
    WHERE "{category_col}" IS NOT NULL AND "{value_col}" IS NOT NULL
    GROUP BY "{category_col}"
    ORDER BY TotalValue DESC
    LIMIT 20
    """
    ranked = conn.execute(sql).df()
    if ranked.empty:
        return go.Figure().add_annotation(text="No category data available", showarrow=False)

    # Bar height is the total; bar colour encodes the per-category average.
    chart = px.bar(
        ranked,
        x='Category',
        y='TotalValue',
        title=f'Performance by {category_col}',
        color='AvgValue',
        color_continuous_scale='Viridis',
        hover_data=['RecordCount'],
    )
    chart.update_layout(xaxis_title=category_col, yaxis_title="Total Value")
    chart.update_xaxes(tickangle=45)
    return chart
|
| 218 |
-
|
| 219 |
-
def create_customer_analysis_chart(conn, column_info):
    """Build a scatter plot of transaction count against average transaction value.

    Customers come from the first entry of ``column_info['customer_columns']``
    and values from the first entry of ``column_info['value_columns']``; the
    50 customers with the highest total value are plotted, with marker size
    taken from that total.

    Returns a figure carrying only an explanatory annotation when either
    column list is empty or the query yields no rows.
    """
    if not (column_info['customer_columns'] and column_info['value_columns']):
        return go.Figure().add_annotation(text="Customer and value columns required", showarrow=False)

    customer_col = column_info['customer_columns'][0]
    value_col = column_info['value_columns'][0]

    sql = f"""
    SELECT
        "{customer_col}" as Customer,
        SUM("{value_col}") as TotalValue,
        COUNT(*) as TransactionCount,
        AVG("{value_col}") as AvgTransactionValue
    FROM sales_data
    WHERE "{customer_col}" IS NOT NULL AND "{value_col}" IS NOT NULL
    GROUP BY "{customer_col}"
    ORDER BY TotalValue DESC
    LIMIT 50
    """
    per_customer = conn.execute(sql).df()
    if per_customer.empty:
        return go.Figure().add_annotation(text="No customer data available", showarrow=False)

    axis_labels = {
        'TransactionCount': 'Number of Transactions',
        'AvgTransactionValue': 'Average Transaction Value',
    }
    return px.scatter(
        per_customer,
        x='TransactionCount',
        y='AvgTransactionValue',
        size='TotalValue',
        hover_name='Customer',
        title='Customer Analysis: Transaction Frequency vs Average Value',
        labels=axis_labels,
    )
|
| 252 |
-
|
| 253 |
-
def create_value_distribution_chart(conn, column_info):
    """Build a 50-bin histogram of the positive, non-null values of the first value column.

    The column is the first entry of ``column_info['value_columns']`` and is
    read from the ``sales_data`` table through ``conn``.

    Returns a figure carrying only an explanatory annotation when there is
    no value column or the query yields no rows.
    """
    candidates = column_info['value_columns']
    if not candidates:
        return go.Figure().add_annotation(text="Value columns required", showarrow=False)

    value_col = candidates[0]
    sql = f"""
    SELECT "{value_col}" as Value
    FROM sales_data
    WHERE "{value_col}" IS NOT NULL AND "{value_col}" > 0
    ORDER BY "{value_col}"
    """
    values = conn.execute(sql).df()
    if values.empty:
        return go.Figure().add_annotation(text="No value data available", showarrow=False)

    return px.histogram(
        values,
        x='Value',
        nbins=50,
        title=f'Value Distribution: {value_col}',
        labels={'Value': value_col, 'count': 'Frequency'},
    )
|
| 277 |
-
|
| 278 |
-
def create_summary_table(conn, column_info):
    """Return summary statistics for up to five value columns.

    For each of the first five entries of ``column_info['value_columns']``
    one row is computed from ``sales_data`` with the columns
    ``Column_Name``, ``Count``, ``Total``, ``Average``, ``Minimum``,
    ``Maximum`` and ``StdDev`` (NULL values excluded).

    Returns:
        A DataFrame with one row per summarised column, or an empty
        DataFrame when there are no value columns.
    """
    if not column_info['value_columns']:
        return pd.DataFrame()

    summaries = []
    for col in column_info['value_columns'][:5]:  # Top 5 value columns
        # Column names come from the dataset, so escape them before splicing
        # them into SQL: doubled quotes are the SQL escape for both quoted
        # identifiers and string literals.
        ident = '"' + col.replace('"', '""') + '"'
        label = col.replace("'", "''")
        query = f"""
        SELECT
            '{label}' as Column_Name,
            COUNT({ident}) as Count,
            SUM({ident}) as Total,
            AVG({ident}) as Average,
            MIN({ident}) as Minimum,
            MAX({ident}) as Maximum,
            STDDEV({ident}) as StdDev
        FROM sales_data
        WHERE {ident} IS NOT NULL
        """
        result = conn.execute(query).df()
        if not result.empty:
            summaries.append(result)

    if summaries:
        return pd.concat(summaries, ignore_index=True)
    return pd.DataFrame()
|
| 306 |
-
|
| 307 |
-
def main():
|
| 308 |
-
# Header
|
| 309 |
-
st.markdown('<h1 class="main-header">π SAP SALT Business Analytics Dashboard</h1>',
|
| 310 |
-
unsafe_allow_html=True)
|
| 311 |
-
|
| 312 |
-
# Sidebar
|
| 313 |
-
st.sidebar.header("ποΈ Authentication & Controls")
|
| 314 |
-
|
| 315 |
-
# Authentication
|
| 316 |
-
hf_token = st.sidebar.text_input(
|
| 317 |
-
"π€ Hugging Face Token",
|
| 318 |
type="password",
|
| 319 |
-
|
|
|
|
|
|
|
| 320 |
)
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
st.
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
# Load data
|
| 337 |
try:
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
with st.
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
|
| 385 |
-
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
|
| 399 |
-
|
| 400 |
-
|
| 401 |
-
|
| 402 |
-
|
| 403 |
-
|
| 404 |
-
|
| 405 |
-
|
| 406 |
-
|
| 407 |
-
|
| 408 |
-
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
|
| 412 |
-
|
| 413 |
-
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
|
| 423 |
-
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
|
| 429 |
-
|
| 430 |
-
|
| 431 |
-
|
| 432 |
-
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
|
| 439 |
-
|
| 440 |
-
|
| 441 |
-
|
| 442 |
-
|
| 443 |
-
|
| 444 |
-
|
| 445 |
-
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
|
| 449 |
-
|
| 450 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import textwrap
|
| 3 |
import duckdb
|
| 4 |
+
import pandas as pd
|
| 5 |
+
import streamlit as st
|
| 6 |
import plotly.express as px
|
| 7 |
+
from datetime import datetime
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
+
# ----------------------------
# Page config
# ----------------------------
# NOTE(review): the chart icon below was restored from a mis-encoded literal;
# the exact original glyph could not be recovered - confirm.
st.set_page_config(
    page_title="SALT Analytics Dashboard",
    page_icon="📊",
    layout="wide",
)

st.title("📊 SALT Analytics Dashboard")
st.caption("DuckDB + Streamlit on Hugging Face Spaces · Dataset: SAP/SALT")
|
| 20 |
+
|
| 21 |
+
# ----------------------------
|
| 22 |
+
# Helpers
|
| 23 |
+
# ----------------------------
|
| 24 |
+
@st.cache_resource(show_spinner=False)
def get_conn(db_path: str = None):
    """Create (and cache) a DuckDB connection and try to load the httpfs extension.

    Args:
        db_path: Database file to open. When ``None``, ``salt.duckdb`` is
            used inside ``/data`` if that directory exists, otherwise inside
            the current directory.

    Returns:
        The open DuckDB connection. It is returned even when httpfs could
        not be installed or loaded; that failure is logged as a warning.
    """
    import logging

    if db_path is None:
        # Prefer Spaces persistent storage if available.
        root = "/data" if os.path.isdir("/data") else "."
        db_path = os.path.join(root, "salt.duckdb")
    con = duckdb.connect(db_path)

    # httpfs is needed for hf:// access. Loading stays best-effort so the app
    # still starts, but the failure is recorded instead of being discarded.
    try:
        con.execute("INSTALL httpfs; LOAD httpfs;")
    except Exception as exc:
        logging.getLogger(__name__).warning(
            "Could not install/load the DuckDB httpfs extension: %s", exc
        )
    return con
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def _resolve_repo_id():
    """Return the candidate dataset repo ids, preferred one first, without duplicates.

    The first candidate is the ``SALT_DATASET_REPO`` environment variable
    (useful for forks), falling back to ``'SAP/SALT'`` when the variable is
    unset or blank. ``'sap-ai-research/SALT'`` is always offered as well.
    """
    preferred = os.environ.get("SALT_DATASET_REPO", "").strip() or "SAP/SALT"
    # dict.fromkeys drops a repeated id while keeping the order, so the
    # selectbox never shows the same repo twice.
    return list(dict.fromkeys([preferred, "sap-ai-research/SALT"]))
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
# The connection argument is hashed by object identity:
# NOTE(review): st.cache_data has to hash every argument and a DuckDB
# connection is presumably not hashable by Streamlit's default hasher -
# confirm; the explicit hash function is harmless either way.
@st.cache_data(show_spinner=False, hash_funcs={duckdb.DuckDBPyConnection: id})
def list_columns(con: duckdb.DuckDBPyConnection, table: str) -> list[str]:
    """Return the lower-cased column names of ``table``, sorted by name.

    Args:
        con: Open DuckDB connection.
        table: Name of the table to inspect via ``pragma_table_info``.

    The result is cached per (connection, table name), so it is not
    refreshed when a table is dropped and recreated under the same name
    unless the cache is cleared.
    """
    q = """
        select lower(name) as name
        from pragma_table_info(?)
        order by name
    """
    return [r[0] for r in con.execute(q, [table]).fetchall()]
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def find_col(cols_lower: list[str], candidates: list[str]):
    """Return the first candidate that names an available column, or None.

    Matching is case-insensitive on both sides and candidates are tried in
    the order given.

    Args:
        cols_lower: Available column names (any casing is accepted).
        candidates: Column names to look for, in priority order.

    Returns:
        The lower-cased name of the first matching candidate, or ``None``
        when nothing matches or either list is empty.
    """
    # Normalise once and use a set so each candidate is a constant-time lookup.
    available = {name.lower() for name in cols_lower}
    for candidate in candidates:
        key = candidate.lower()
        if key in available:
            return key
    return None
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
# ----------------------------
# Sidebar - configuration
# ----------------------------
def _env_or_secret(name: str) -> str:
    """Return ``name`` from the environment, else from Streamlit secrets, else ''."""
    value = os.environ.get(name)
    if value is not None:
        return value
    # The secrets lookup is only attempted when the env var is absent.
    # NOTE(review): accessing st.secrets reportedly raises when no secrets
    # file is configured - confirm; treat that case as "no secret".
    try:
        return st.secrets.get(name, "")
    except Exception:
        return ""


with st.sidebar:
    st.header("⚙️ Configuration")
    repo_candidates = _resolve_repo_id()
    repo_id = st.selectbox(
        "Dataset repo",
        repo_candidates,
        index=0,
        help="Both IDs are supported on the Hub; choose the one you have access to.",
    )

    split = st.selectbox("Split", ["train", "test"], index=0)

    use_joined = st.toggle(
        "Use joined table (recommended)",
        value=True,
        help="If off, you can still analyze the item-level table.",
    )

    hf_token = st.text_input(
        "HF token (for gated/private access)",
        type="password",
        placeholder="hf_xxx (optional if Space has access)",
        value=_env_or_secret("HF_TOKEN"),
    )

    openai_key = st.text_input(
        "OpenAI API key",
        type="password",
        placeholder="sk-...",
        value=_env_or_secret("OPENAI_API_KEY"),
        help="Needed only for the Recommendations section.",
    )

    st.divider()
    # NOTE(review): button icon restored from a mis-encoded literal; the
    # exact original glyph could not be recovered - confirm.
    if st.button("🔄 Rebuild local DB", help="Drop & reload local DuckDB tables from Hugging Face"):
        st.session_state["rebuild"] = True
    else:
        # Make sure the flag exists without clearing a pending request.
        st.session_state.setdefault("rebuild", False)
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
# ----------------------------
# Load data into DuckDB (one-time)
# ----------------------------
con = get_conn()

# Configure HF auth in the DuckDB Secrets Manager if a token was provided.
if hf_token:
    try:
        con.execute("CREATE OR REPLACE SECRET hf_token (TYPE huggingface, TOKEN ?)", [hf_token])
    except Exception:
        # NOTE(review): bound parameters may not be accepted in CREATE SECRET
        # by every DuckDB version - confirm. Retry with an escaped literal
        # and tell the user if that fails too, instead of failing silently.
        try:
            escaped_token = hf_token.replace("'", "''")
            con.execute(
                f"CREATE OR REPLACE SECRET hf_token (TYPE huggingface, TOKEN '{escaped_token}')"
            )
        except Exception as exc:
            # Only the exception type is shown so the token cannot leak.
            st.warning(
                "Could not register the Hugging Face token with DuckDB "
                f"({type(exc).__name__}); gated or private datasets may fail to load."
            )

joined_table_name = "salt_joined"
items_table_name = "salt_items"


def _table_exists(name: str) -> bool:
    """Return True when a table called ``name`` is listed in information_schema."""
    found = con.execute(
        "SELECT count(*) FROM information_schema.tables WHERE table_name = ?", [name]
    ).fetchone()[0]
    return bool(found)


if st.session_state["rebuild"]:
    with st.status("Rebuilding DuckDB tables…", expanded=True):
        con.execute(f"DROP TABLE IF EXISTS {joined_table_name}")
        con.execute(f"DROP TABLE IF EXISTS {items_table_name}")
        st.write("Dropped existing tables.")
    # The cached column lists describe the dropped tables; discard them.
    list_columns.clear()
    st.session_state["rebuild"] = False

# Create tables lazily: only the table selected in the sidebar is loaded,
# and only when it does not exist yet.
if use_joined and not _table_exists(joined_table_name):
    with st.status("Loading joined table into DuckDB…", expanded=False):
        path = f"hf://datasets/{repo_id}/JoinedTables_{split}.parquet"
        con.execute(f"CREATE TABLE {joined_table_name} AS SELECT * FROM read_parquet(?)", [path])
        st.success("Joined table loaded.")

if (not use_joined) and not _table_exists(items_table_name):
    with st.status("Loading item-level table into DuckDB…", expanded=False):
        path = f"hf://datasets/{repo_id}/I_SalesDocumentItem_{split}.parquet"
        con.execute(f"CREATE TABLE {items_table_name} AS SELECT * FROM read_parquet(?)", [path])
        st.success("Items table loaded.")

active_table = joined_table_name if use_joined else items_table_name
cols_lower = list_columns(con, active_table)

# Heuristic column mapping: logical field -> candidate physical column names,
# tried in order by find_col.
name_map = {
    "order_id": ["SalesDocument", "SALESORDER", "vbeln"],
    "order_item": ["SalesDocumentItem", "SALESORDERITEM", "posnr"],
    "customer": ["SoldToParty", "CUSTOMER", "kunnr", "SoldToParty_PartyNumber"],
    "country": ["Country", "COUNTRY", "land1", "ShipToCountry", "ShipToPartyCountry"],
    "date": ["CreationDate", "CREATIONDATE", "CreatedOn", "DocumentDate", "DOCUMENTDATE", "CreatedAt", "CREATEDON"],
    "plant": ["PLANT", "Plant", "werks"],
    "shipping_condition": ["SHIPPINGCONDITION", "ShippingCondition"],
    "shipping_point": ["SHIPPINGPOINT", "ShippingPoint"],
    "sales_office": ["SALESOFFICE", "SalesOffice"],
    "sales_group": ["SALESGROUP", "SalesGroup"],
    "header_incoterms": ["HEADERINCOTERMSCLASSIFICATION", "HeaderIncotermsClassification"],
    "item_incoterms": ["ITEMINCOTERMSCLASSIFICATION", "ItemIncotermsClassification"],
}

resolved = {k: find_col(cols_lower, v) for k, v in name_map.items()}
|
| 163 |
+
|
| 164 |
+
# ----------------------------
# Filters
# ----------------------------
with st.container():
    st.subheader("Filters")
    left, mid, right = st.columns([2, 2, 2])

    def _distinct_options(column):
        """Return the sorted distinct non-null values of ``column`` in the active table."""
        sql = f"SELECT DISTINCT {column} FROM {active_table} WHERE {column} IS NOT NULL ORDER BY 1"
        return [record[0] for record in con.execute(sql).fetchall()]

    # Country filter (left column); skipped when no country column was resolved.
    country_col = resolved.get("country")
    country_sel = (
        left.multiselect("Country", _distinct_options(country_col), default=[])
        if country_col
        else []
    )

    # Sales office filter (middle column).
    sales_office_sel = []
    if resolved.get("sales_office"):
        sales_office_sel = mid.multiselect(
            "Sales office", _distinct_options(resolved["sales_office"])
        )

    # Shipping condition filter (right column).
    shipping_cond_sel = []
    if resolved.get("shipping_condition"):
        shipping_cond_sel = right.multiselect(
            "Shipping condition", _distinct_options(resolved["shipping_condition"])
        )

# Build the WHERE clause: one "col IN (?, ...)" term per active filter, with
# the selected values appended to params in the same order.
where = []
params: list = []
for field, chosen in (
    ("country", country_sel),
    ("sales_office", sales_office_sel),
    ("shipping_condition", shipping_cond_sel),
):
    filter_col = resolved.get(field)
    if chosen and filter_col:
        placeholders = ", ".join("?" for _ in chosen)
        where.append(f"{filter_col} IN ({placeholders})")
        params.extend(chosen)

if where:
    where_sql = " WHERE " + " AND ".join(where)
else:
    where_sql = ""
|
| 203 |
+
|
| 204 |
+
# ----------------------------
# KPIs
# ----------------------------
st.subheader("Key metrics")

k1, k2, k3, k4 = st.columns(4)


def _scalar(sql: str):
    """Run ``sql`` with the active filter parameters; return the first column of the first row."""
    return con.execute(sql, params).fetchone()[0]


# Orders: distinct order ids when that column is known, otherwise plain rows.
if resolved.get("order_id"):
    n_orders = _scalar(
        f"SELECT COUNT(DISTINCT {resolved['order_id']}) FROM {active_table}{where_sql}"
    )
else:
    n_orders = _scalar(f"SELECT COUNT(*) FROM {active_table}{where_sql}")

# Customers
if resolved.get("customer"):
    n_customers = _scalar(
        f"SELECT COUNT(DISTINCT {resolved['customer']}) FROM {active_table}{where_sql}"
    )
else:
    n_customers = None

# Items per order: average number of distinct items across orders.
if resolved.get("order_id") and resolved.get("order_item"):
    avg_items = _scalar(
        f"SELECT AVG(cnt) FROM (SELECT COUNT(DISTINCT {resolved['order_item']}) AS cnt "
        f"FROM {active_table}{where_sql} GROUP BY {resolved['order_id']})"
    )
else:
    avg_items = None

# Top plant by row count
top_plant = None
if resolved.get("plant"):
    row = con.execute(
        f"SELECT {resolved['plant']}, COUNT(*) AS c FROM {active_table}{where_sql} GROUP BY 1 ORDER BY c DESC LIMIT 1",
        params,
    ).fetchone()
    if row:
        top_plant = f"{row[0]} ({row[1]})"

k1.metric("Orders", f"{n_orders:,}")
k2.metric("Customers", f"{n_customers:,}" if n_customers is not None else "—")
# Test against None (not truthiness) so a genuine 0.00 average is displayed,
# matching how the AI recommendations section treats avg_items.
k3.metric("Avg items / order", f"{avg_items:.2f}" if avg_items is not None else "—")
k4.metric("Top plant by rows", top_plant or "—")
|
| 250 |
+
|
| 251 |
+
# ----------------------------
# Charts
# ----------------------------
# Every chart is skipped when its column was not resolved or its query
# returns no rows; all queries reuse the active filter (where_sql / params).
with st.container():
    c1, c2 = st.columns(2)

    # Rows over time, bucketed by month.
    date_col = resolved.get("date")
    if date_col:
        df_time = con.execute(
            f"""
            SELECT date_trunc('month', cast({date_col} as timestamp)) AS month,
                   COUNT(*) as rows
            FROM {active_table}
            {where_sql}
            GROUP BY 1
            ORDER BY 1
            """,
            params,
        ).df()
        if not df_time.empty:
            fig = px.line(df_time, x="month", y="rows", markers=True, title="Rows over time (monthly)")
            c1.plotly_chart(fig, use_container_width=True)

    # Shipping condition distribution
    if resolved.get("shipping_condition"):
        df_ship = con.execute(
            f"SELECT {resolved['shipping_condition']} as sc, COUNT(*) as rows FROM {active_table}{where_sql} GROUP BY 1 ORDER BY rows DESC LIMIT 15",
            params,
        ).df()
        if not df_ship.empty:
            fig = px.bar(df_ship, x="sc", y="rows", title="Shipping condition distribution (Top 15)")
            c2.plotly_chart(fig, use_container_width=True)

with st.container():
    c3, c4 = st.columns(2)

    # Plants by country (250 largest country/plant pairs)
    if resolved.get("plant") and resolved.get("country"):
        df_pc = con.execute(
            f"""
            SELECT {resolved['country']} as country, {resolved['plant']} as plant, COUNT(*) as rows
            FROM {active_table}
            {where_sql}
            GROUP BY 1,2
            ORDER BY rows DESC
            LIMIT 250
            """,
            params,
        ).df()
        if not df_pc.empty:
            fig = px.treemap(df_pc, path=["country", "plant"], values="rows", title="Volume by Country → Plant")
            c3.plotly_chart(fig, use_container_width=True)

    # Incoterms: header-level column preferred, item-level as fallback.
    incoterm_col = resolved.get("header_incoterms") or resolved.get("item_incoterms")
    if incoterm_col:
        df_inc = con.execute(
            f"SELECT {incoterm_col} as incoterm, COUNT(*) as rows FROM {active_table}{where_sql} GROUP BY 1 ORDER BY rows DESC LIMIT 20",
            params,
        ).df()
        if not df_inc.empty:
            fig = px.pie(df_inc, names="incoterm", values="rows", title="Incoterms share (Top 20)")
            c4.plotly_chart(fig, use_container_width=True)
|
| 312 |
+
|
| 313 |
+
# ----------------------------
# Data Preview
# ----------------------------
# Show at most 100 rows of the active table under the current filters.
st.subheader("Data preview")
preview_sql = f"SELECT * FROM {active_table}{where_sql} LIMIT 100"
preview = con.execute(preview_sql, params).df()
st.dataframe(
    preview,
    use_container_width=True,
    hide_index=True,
)
|
| 319 |
+
|
| 320 |
+
# ----------------------------
# LLM Insights & Recommendations (OpenAI)
# ----------------------------
with st.expander("💡 AI Recommendations (OpenAI)", expanded=True):
    st.write("Generate action-oriented suggestions based on the visible KPIs and distributions.")
    if not openai_key:
        st.info("Add your OpenAI API key in the sidebar to enable this.")
    else:
        try:
            from openai import OpenAI

            # Craft a concise context from metrics and top distributions.
            parts = [f"Orders: {n_orders}"]
            if n_customers is not None:
                parts.append(f"Customers: {n_customers}")
            if avg_items is not None:
                parts.append(f"Avg items/order: {avg_items:.2f}")
            if top_plant:
                parts.append(f"Top plant: {top_plant}")
            context = "; ".join(parts)

            # Small sample from the shipping chart to ground the model.
            sample_ship = con.execute(
                f"SELECT {resolved['shipping_condition']} as sc, COUNT(*) as rows FROM {active_table}{where_sql} GROUP BY 1 ORDER BY rows DESC LIMIT 8",
                params,
            ).df().to_dict(orient="records") if resolved.get("shipping_condition") else []

            prompt = textwrap.dedent(f"""
                You are a senior ops analyst. Based on the SALT dataset analytics summary below,
                write actionable recommendations. Focus on levers in sales operations, logistics (shipping
                conditions/points), and master data hygiene. Keep it business-practical and specific.

                Visible KPIs: {context}
                Shipping distribution (top sample): {sample_ship}

                Deliver:
                - 5 bulletpoint actions (each ≤ 20 words)
                - 3 watchouts/risks (each ≤ 15 words)
                - 2 quick experiments to A/B in the next sprint
            """)

            # Streamlit re-executes this script on every widget interaction.
            # Remember the last prompt/answer pair so an unchanged prompt does
            # not trigger another (billed) API request.
            cached = st.session_state.get("reco_cache")
            if cached and cached[0] == prompt:
                recos = cached[1]
            else:
                client = OpenAI(api_key=openai_key)
                resp = client.responses.create(
                    model="gpt-4o-mini",
                    input=prompt,
                )
                # Prefer the convenience attribute; otherwise read the first
                # content item of the first output entry.
                recos = getattr(resp, "output_text", None) or (
                    resp.output[0].content[0].text if getattr(resp, "output", None) else ""
                )
                st.session_state["reco_cache"] = (prompt, recos)
            st.markdown(recos)
        except Exception as e:
            # Best-effort feature: report the failure, keep the dashboard alive.
            st.warning(f"OpenAI call failed: {e}")
|
| 372 |
+
|
| 373 |
+
# ----------------------------
# Footer
# ----------------------------
# Copyright sign and dash restored from mis-encoded characters.
st.caption(
    "SALT dataset © SAP AI Research — loaded via DuckDB hf:// and analyzed client-side."
)
|