PD03 committed on
Commit
73a7361
Β·
verified Β·
1 Parent(s): 1403d5d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +364 -436
app.py CHANGED
@@ -1,450 +1,378 @@
1
- import streamlit as st
2
- import pandas as pd
3
  import duckdb
 
 
4
  import plotly.express as px
5
- import plotly.graph_objects as go
6
- from plotly.subplots import make_subplots
7
- from datasets import load_dataset
8
- import numpy as np
9
- import openai
10
- import os
11
 
12
- # Configure page
 
 
13
  st.set_page_config(
14
- page_title="SAP SALT Analytics Dashboard",
15
- page_icon="πŸ“Š",
16
  layout="wide",
17
- initial_sidebar_state="expanded"
18
  )
19
 
20
- # Custom CSS
21
- st.markdown("""
22
- <style>
23
- .main-header {
24
- font-size: 2.5rem;
25
- color: #1f77b4;
26
- text-align: center;
27
- margin-bottom: 2rem;
28
- }
29
- .metric-card {
30
- background-color: #f0f2f6;
31
- padding: 1rem;
32
- border-radius: 0.5rem;
33
- border-left: 4px solid #1f77b4;
34
- }
35
- .insight-box {
36
- background-color: #e8f4f8;
37
- padding: 1rem;
38
- border-radius: 0.5rem;
39
- border-left: 4px solid #17a2b8;
40
- margin: 1rem 0;
41
- }
42
- </style>
43
- """, unsafe_allow_html=True)
44
-
45
- @st.cache_data
46
- def load_salt_data(hf_token):
47
- """Load SAP SALT dataset with authentication"""
48
- dataset = load_dataset("SAP/SALT", "joined_table", split="train", token=hf_token)
49
- return dataset.to_pandas()
50
-
51
- @st.cache_resource
52
- def init_duckdb(df):
53
- """Initialize DuckDB connection with data"""
54
- conn = duckdb.connect(':memory:')
55
- conn.register('sales_data', df)
56
- return conn
57
-
58
- def analyze_dataset_columns(df):
59
- """Analyze dataset columns and identify key fields"""
60
- columns = list(df.columns)
61
-
62
- # Show available columns in sidebar for reference
63
- with st.sidebar.expander("πŸ“‹ Dataset Columns", expanded=False):
64
- for i, col in enumerate(columns):
65
- st.write(f"{i+1}. {col}")
66
-
67
- # Detect column types based on content and names
68
- date_cols = [col for col in columns if any(word in col.lower() for word in
69
- ['date', 'time', 'created', 'modified', 'timestamp'])]
70
-
71
- value_cols = [col for col in columns if any(word in col.lower() for word in
72
- ['value', 'amount', 'price', 'cost', 'total', 'sum', 'revenue', 'net', 'gross'])]
73
-
74
- customer_cols = [col for col in columns if any(word in col.lower() for word in
75
- ['customer', 'client', 'buyer', 'account', 'partner'])]
76
-
77
- sales_cols = [col for col in columns if any(word in col.lower() for word in
78
- ['sales', 'office', 'group', 'region', 'territory', 'division'])]
79
-
80
- # Get numeric columns as backup for values
81
- numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
82
- if not value_cols and numeric_cols:
83
- value_cols = numeric_cols
84
-
85
- return {
86
- 'all_columns': columns,
87
- 'date_columns': date_cols,
88
- 'value_columns': value_cols,
89
- 'customer_columns': customer_cols,
90
- 'sales_columns': sales_cols,
91
- 'numeric_columns': numeric_cols
92
- }
93
-
94
- def generate_ai_insights(data_summary, openai_key=None):
95
- """Generate AI-powered business insights"""
96
- if not openai_key:
97
- return """
98
- πŸ€– **AI-Powered Insights** (Add OpenAI API key for detailed insights):
99
-
100
- β€’ **Revenue Optimization**: Analyze high-performing segments and scale successful strategies
101
- β€’ **Customer Intelligence**: Identify customer behavior patterns and retention opportunities
102
- β€’ **Operational Excellence**: Optimize processes based on performance data patterns
103
- β€’ **Strategic Growth**: Leverage data insights for market expansion and competitive advantage
104
- """
105
-
106
  try:
107
- openai.api_key = openai_key
108
-
109
- response = openai.ChatCompletion.create(
110
- model="gpt-3.5-turbo",
111
- messages=[{
112
- "role": "user",
113
- "content": f"""
114
- Analyze this SAP ERP sales data and provide strategic business insights:
115
-
116
- {data_summary}
117
-
118
- Generate 4 actionable recommendations for:
119
- 1. Revenue optimization strategies
120
- 2. Customer relationship management
121
- 3. Operational efficiency improvements
122
- 4. Business growth opportunities
123
-
124
- Format as specific, measurable recommendations.
125
- """
126
- }],
127
- max_tokens=600,
128
- temperature=0.7
129
- )
130
-
131
- return f"πŸ€– **AI-Generated Insights**:\n\n{response.choices[0].message.content}"
132
-
133
- except Exception as e:
134
- return f"πŸ€– **AI Service Error**: {str(e)}"
135
-
136
- def create_time_series_chart(conn, column_info):
137
- """Create time series analysis chart"""
138
- if not column_info['date_columns'] or not column_info['value_columns']:
139
- return go.Figure().add_annotation(text="Date and value columns required", showarrow=False)
140
-
141
- date_col = column_info['date_columns'][0]
142
- value_col = column_info['value_columns'][0]
143
-
144
- query = f"""
145
- SELECT
146
- DATE_TRUNC('month', "{date_col}") as Period,
147
- SUM("{value_col}") as TotalValue,
148
- COUNT(*) as RecordCount,
149
- AVG("{value_col}") as AvgValue
150
- FROM sales_data
151
- WHERE "{date_col}" IS NOT NULL AND "{value_col}" IS NOT NULL
152
- GROUP BY Period
153
- ORDER BY Period
154
  """
155
-
156
- df_time = conn.execute(query).df()
157
-
158
- if df_time.empty:
159
- return go.Figure().add_annotation(text="No time series data available", showarrow=False)
160
-
161
- fig = make_subplots(specs=[[{"secondary_y": True}]])
162
-
163
- fig.add_trace(
164
- go.Scatter(x=df_time['Period'], y=df_time['TotalValue'],
165
- mode='lines+markers', name='Total Value', line=dict(color='#1f77b4')),
166
- secondary_y=False,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
167
  )
168
-
169
- fig.add_trace(
170
- go.Bar(x=df_time['Period'], y=df_time['RecordCount'],
171
- name='Record Count', opacity=0.6, marker_color='#ff7f0e'),
172
- secondary_y=True,
 
173
  )
174
-
175
- fig.update_xaxes(title_text="Time Period")
176
- fig.update_yaxes(title_text="Total Value", secondary_y=False)
177
- fig.update_yaxes(title_text="Record Count", secondary_y=True)
178
- fig.update_layout(title_text=f"Time Series Analysis: {value_col} by {date_col}")
179
-
180
- return fig
181
-
182
- def create_category_performance_chart(conn, column_info):
183
- """Create category performance chart"""
184
- if not column_info['sales_columns'] or not column_info['value_columns']:
185
- return go.Figure().add_annotation(text="Sales category and value columns required", showarrow=False)
186
-
187
- category_col = column_info['sales_columns'][0]
188
- value_col = column_info['value_columns'][0]
189
-
190
- query = f"""
191
- SELECT
192
- "{category_col}" as Category,
193
- SUM("{value_col}") as TotalValue,
194
- COUNT(*) as RecordCount,
195
- AVG("{value_col}") as AvgValue
196
- FROM sales_data
197
- WHERE "{category_col}" IS NOT NULL AND "{value_col}" IS NOT NULL
198
- GROUP BY "{category_col}"
199
- ORDER BY TotalValue DESC
200
- LIMIT 20
201
- """
202
-
203
- df_category = conn.execute(query).df()
204
-
205
- if df_category.empty:
206
- return go.Figure().add_annotation(text="No category data available", showarrow=False)
207
-
208
- fig = px.bar(df_category, x='Category', y='TotalValue',
209
- title=f'Performance by {category_col}',
210
- color='AvgValue',
211
- color_continuous_scale='Viridis',
212
- hover_data=['RecordCount'])
213
-
214
- fig.update_layout(xaxis_title=category_col, yaxis_title="Total Value")
215
- fig.update_xaxes(tickangle=45)
216
-
217
- return fig
218
-
219
- def create_customer_analysis_chart(conn, column_info):
220
- """Create customer analysis chart"""
221
- if not column_info['customer_columns'] or not column_info['value_columns']:
222
- return go.Figure().add_annotation(text="Customer and value columns required", showarrow=False)
223
-
224
- customer_col = column_info['customer_columns'][0]
225
- value_col = column_info['value_columns'][0]
226
-
227
- query = f"""
228
- SELECT
229
- "{customer_col}" as Customer,
230
- SUM("{value_col}") as TotalValue,
231
- COUNT(*) as TransactionCount,
232
- AVG("{value_col}") as AvgTransactionValue
233
- FROM sales_data
234
- WHERE "{customer_col}" IS NOT NULL AND "{value_col}" IS NOT NULL
235
- GROUP BY "{customer_col}"
236
- ORDER BY TotalValue DESC
237
- LIMIT 50
238
- """
239
-
240
- df_customer = conn.execute(query).df()
241
-
242
- if df_customer.empty:
243
- return go.Figure().add_annotation(text="No customer data available", showarrow=False)
244
-
245
- fig = px.scatter(df_customer, x='TransactionCount', y='AvgTransactionValue',
246
- size='TotalValue', hover_name='Customer',
247
- title='Customer Analysis: Transaction Frequency vs Average Value',
248
- labels={'TransactionCount': 'Number of Transactions',
249
- 'AvgTransactionValue': 'Average Transaction Value'})
250
-
251
- return fig
252
-
253
- def create_value_distribution_chart(conn, column_info):
254
- """Create value distribution analysis"""
255
- if not column_info['value_columns']:
256
- return go.Figure().add_annotation(text="Value columns required", showarrow=False)
257
-
258
- value_col = column_info['value_columns'][0]
259
-
260
- query = f"""
261
- SELECT "{value_col}" as Value
262
- FROM sales_data
263
- WHERE "{value_col}" IS NOT NULL AND "{value_col}" > 0
264
- ORDER BY "{value_col}"
265
- """
266
-
267
- df_values = conn.execute(query).df()
268
-
269
- if df_values.empty:
270
- return go.Figure().add_annotation(text="No value data available", showarrow=False)
271
-
272
- fig = px.histogram(df_values, x='Value', nbins=50,
273
- title=f'Value Distribution: {value_col}',
274
- labels={'Value': value_col, 'count': 'Frequency'})
275
-
276
- return fig
277
-
278
- def create_summary_table(conn, column_info):
279
- """Create summary statistics table"""
280
- if not column_info['value_columns']:
281
- return pd.DataFrame()
282
-
283
- summaries = []
284
-
285
- for col in column_info['value_columns'][:5]: # Top 5 value columns
286
- query = f"""
287
- SELECT
288
- '{col}' as Column_Name,
289
- COUNT("{col}") as Count,
290
- SUM("{col}") as Total,
291
- AVG("{col}") as Average,
292
- MIN("{col}") as Minimum,
293
- MAX("{col}") as Maximum,
294
- STDDEV("{col}") as StdDev
295
- FROM sales_data
296
- WHERE "{col}" IS NOT NULL
297
- """
298
-
299
- result = conn.execute(query).df()
300
- if not result.empty:
301
- summaries.append(result)
302
-
303
- if summaries:
304
- return pd.concat(summaries, ignore_index=True)
305
- return pd.DataFrame()
306
-
307
- def main():
308
- # Header
309
- st.markdown('<h1 class="main-header">πŸ“Š SAP SALT Business Analytics Dashboard</h1>',
310
- unsafe_allow_html=True)
311
-
312
- # Sidebar
313
- st.sidebar.header("πŸŽ›οΈ Authentication & Controls")
314
-
315
- # Authentication
316
- hf_token = st.sidebar.text_input(
317
- "πŸ€— Hugging Face Token",
318
  type="password",
319
- help="Required to access SAP SALT dataset: https://huggingface.co/settings/tokens"
 
 
320
  )
321
-
322
- openai_key = st.sidebar.text_input("πŸ€– OpenAI API Key (Optional)", type="password",
323
- help="For AI-powered insights")
324
-
325
- if not hf_token:
326
- st.error("πŸ” **Authentication Required**")
327
- st.info("""
328
- **To access the SAP SALT dataset:**
329
- 1. Visit: https://huggingface.co/datasets/SAP/SALT
330
- 2. Accept the dataset terms
331
- 3. Get your token: https://huggingface.co/settings/tokens
332
- 4. Enter the token in the sidebar
333
- """)
334
- st.stop()
335
-
336
- # Load data
337
  try:
338
- with st.spinner("Loading SAP SALT dataset..."):
339
- df = load_salt_data(hf_token)
340
-
341
- st.success(f"βœ… Dataset loaded: {len(df):,} records Γ— {len(df.columns)} columns")
342
-
343
- except Exception as e:
344
- st.error(f"Failed to load dataset: {str(e)}")
345
- st.stop()
346
-
347
- # Analyze columns
348
- column_info = analyze_dataset_columns(df)
349
-
350
- # Initialize DuckDB
351
- conn = init_duckdb(df)
352
-
353
- # Dataset Overview
354
- with st.expander("πŸ“Š Dataset Overview", expanded=False):
355
- col1, col2, col3 = st.columns(3)
356
-
357
- with col1:
358
- st.metric("Total Records", f"{len(df):,}")
359
- st.metric("Total Columns", len(df.columns))
360
-
361
- with col2:
362
- st.metric("Date Columns", len(column_info['date_columns']))
363
- st.metric("Value Columns", len(column_info['value_columns']))
364
-
365
- with col3:
366
- st.metric("Customer Columns", len(column_info['customer_columns']))
367
- st.metric("Sales Columns", len(column_info['sales_columns']))
368
-
369
- # Key Metrics
370
- st.subheader("πŸ“ˆ Key Business Metrics")
371
-
372
- # Calculate business metrics
373
- if column_info['value_columns']:
374
- primary_value_col = column_info['value_columns'][0]
375
-
376
- total_value = df[primary_value_col].sum()
377
- avg_value = df[primary_value_col].mean()
378
- max_value = df[primary_value_col].max()
379
-
380
- col1, col2, col3, col4 = st.columns(4)
381
-
382
- with col1:
383
- st.metric("Total Value", f"€{total_value:,.0f}")
384
- with col2:
385
- st.metric("Average Value", f"€{avg_value:,.2f}")
386
- with col3:
387
- st.metric("Maximum Value", f"€{max_value:,.0f}")
388
- with col4:
389
- unique_customers = df[column_info['customer_columns'][0]].nunique() if column_info['customer_columns'] else 0
390
- st.metric("Unique Customers", f"{unique_customers:,}")
391
-
392
- # Analytics Charts
393
- st.subheader("πŸ“Š Business Analytics")
394
-
395
- col1, col2 = st.columns(2)
396
-
397
- with col1:
398
- time_chart = create_time_series_chart(conn, column_info)
399
- st.plotly_chart(time_chart, use_container_width=True)
400
-
401
- with col2:
402
- category_chart = create_category_performance_chart(conn, column_info)
403
- st.plotly_chart(category_chart, use_container_width=True)
404
-
405
- col3, col4 = st.columns(2)
406
-
407
- with col3:
408
- customer_chart = create_customer_analysis_chart(conn, column_info)
409
- st.plotly_chart(customer_chart, use_container_width=True)
410
-
411
- with col4:
412
- distribution_chart = create_value_distribution_chart(conn, column_info)
413
- st.plotly_chart(distribution_chart, use_container_width=True)
414
-
415
- # Summary Statistics
416
- st.subheader("πŸ“‹ Statistical Summary")
417
-
418
- summary_df = create_summary_table(conn, column_info)
419
- if not summary_df.empty:
420
- st.dataframe(summary_df, use_container_width=True)
421
-
422
- # Data Preview
423
- st.subheader("πŸ” Data Preview")
424
- st.dataframe(df.head(50), use_container_width=True)
425
-
426
- # AI Insights
427
- st.subheader("🧠 AI-Powered Business Insights")
428
-
429
- # Prepare comprehensive data summary
430
- data_summary = f"""
431
- SAP SALT Dataset Analysis:
432
- - Total Records: {len(df):,}
433
- - Total Columns: {len(df.columns)}
434
- - Primary Value Column: {column_info['value_columns'][0] if column_info['value_columns'] else 'None'}
435
- - Total Business Value: €{df[column_info['value_columns'][0]].sum():,.0f if column_info['value_columns'] else 0}
436
- - Average Transaction: €{df[column_info['value_columns'][0]].mean():,.2f if column_info['value_columns'] else 0}
437
- - Date Range Coverage: {len(column_info['date_columns'])} temporal columns
438
- - Customer Entities: {df[column_info['customer_columns'][0]].nunique() if column_info['customer_columns'] else 0}
439
- - Sales Categories: {len(column_info['sales_columns'])} organizational dimensions
440
- """
441
-
442
- insights = generate_ai_insights(data_summary, openai_key)
443
- st.markdown(f'<div class="insight-box">{insights}</div>', unsafe_allow_html=True)
444
-
445
- # Footer
446
- st.markdown("---")
447
- st.markdown("**Enterprise Analytics Dashboard** | SAP SALT Dataset | Built with Streamlit + DuckDB + OpenAI")
448
-
449
- if __name__ == "__main__":
450
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import textwrap
3
  import duckdb
4
+ import pandas as pd
5
+ import streamlit as st
6
  import plotly.express as px
7
+ from datetime import datetime
 
 
 
 
 
8
 
9
# ----------------------------
# Page config
# ----------------------------
# Must run before any other Streamlit call on the page.
st.set_page_config(
    layout="wide",
    page_title="SALT Analytics Dashboard",
    page_icon="πŸ“ˆ",
)

st.title("πŸ“ˆ SALT Analytics Dashboard")
st.caption("DuckDB + Streamlit on Hugging Face Spaces Β· Dataset: SAP/SALT")
20
+
21
+ # ----------------------------
22
+ # Helpers
23
+ # ----------------------------
24
@st.cache_resource(show_spinner=False)
def get_conn(db_path: str | None = None):
    """Create (and cache) a DuckDB connection, loading the httpfs extension.

    Args:
        db_path: Explicit database file path. When None, the database is
            placed in the Spaces persistent volume ``/data`` if it exists,
            otherwise in the current working directory.

    Returns:
        A DuckDB connection, shared across reruns via ``st.cache_resource``.
    """
    if db_path is None:
        # Prefer Spaces persistent storage if available
        root = "/data" if os.path.isdir("/data") else "."
        db_path = os.path.join(root, "salt.duckdb")
    con = duckdb.connect(db_path)
    # httpfs enables hf:// reads. Best effort: it may already be installed,
    # or the environment may be offline β€” local tables still work without it.
    try:
        con.execute("INSTALL httpfs; LOAD httpfs;")
    except Exception:
        pass
    return con
38
+
39
+
40
+ def _resolve_repo_id():
41
+ """Support either 'SAP/SALT' or 'sap-ai-research/SALT'."""
42
+ # Allow override via UI/env for forks
43
+ default_candidates = [
44
+ os.environ.get("SALT_DATASET_REPO", "SAP/SALT"),
45
+ "sap-ai-research/SALT",
46
+ ]
47
+ return default_candidates
48
+
49
+
50
@st.cache_data(show_spinner=False)
def list_columns(_con: duckdb.DuckDBPyConnection, table: str) -> list[str]:
    """Return the lower-cased column names of *table*, sorted alphabetically.

    The connection parameter is underscore-prefixed so st.cache_data skips
    it when hashing arguments: DuckDBPyConnection objects are unhashable
    and would otherwise raise UnhashableParamError on the first call.
    As a consequence the cache keys on *table* only.
    """
    q = """
    select lower(name) as name
    from pragma_table_info(?)
    order by name
    """
    return [r[0] for r in _con.execute(q, [table]).fetchall()]
58
+
59
+
60
def find_col(cols_lower: list[str], candidates: list[str]):
    """Return the first candidate present in *cols_lower* (compared lower-cased), or None."""
    lowered = (candidate.lower() for candidate in candidates)
    return next((name for name in lowered if name in cols_lower), None)
67
+
68
+
69
# ----------------------------
# Sidebar β€” Config
# ----------------------------
def _secret(key: str) -> str:
    """Best-effort lookup of *key* in Streamlit secrets.

    Accessing ``st.secrets`` raises when no secrets.toml is configured
    (common on local runs and fresh Spaces), which would crash the app at
    startup; fall back to "" instead.
    """
    try:
        return st.secrets.get(key, "")
    except Exception:
        return ""


with st.sidebar:
    st.header("βš™οΈ Configuration")

    # Dataset repo: env override first, then the research-org mirror.
    repo_candidates = _resolve_repo_id()
    repo_id = st.selectbox("Dataset repo", repo_candidates, index=0,
                           help="Both IDs are supported on the Hub; choose the one you have access to.")

    split = st.selectbox("Split", ["train", "test"], index=0)

    use_joined = st.toggle(
        "Use joined table (recommended)",
        value=True,
        help="If off, you can still analyze the item-level table."
    )

    # Credentials: prefer environment variables, then Streamlit secrets.
    hf_token = st.text_input(
        "HF token (for gated/private access)",
        type="password",
        placeholder="hf_xxx (optional if Space has access)",
        value=os.environ.get("HF_TOKEN", _secret("HF_TOKEN")),
    )

    openai_key = st.text_input(
        "OpenAI API key",
        type="password",
        placeholder="sk-...",
        value=os.environ.get("OPENAI_API_KEY", _secret("OPENAI_API_KEY")),
        help="Needed only for the Recommendations section.",
    )

    st.divider()
    # One-shot flag: set here, consumed (and cleared) by the rebuild section.
    if st.button("πŸ”„ Rebuild local DB", help="Drop & reload local DuckDB tables from Hugging Face"):
        st.session_state["rebuild"] = True
    else:
        st.session_state.setdefault("rebuild", False)
106
+
107
+
108
# ----------------------------
# Load data into DuckDB (one-time)
# ----------------------------
con = get_conn()

# Configure HF auth in DuckDB's Secrets Manager if a token was provided.
# NOTE: CREATE SECRET is DDL and does not accept prepared-statement
# parameters β€” the original `TOKEN ?` form always raised and was silently
# swallowed, so gated datasets stayed inaccessible. Inline the token with
# single quotes escaped instead.
if hf_token:
    try:
        escaped = hf_token.replace("'", "''")
        con.execute(
            f"CREATE OR REPLACE SECRET hf_token (TYPE huggingface, TOKEN '{escaped}')"
        )
    except Exception:
        pass  # best effort: public datasets still work without auth

joined_table_name = "salt_joined"
items_table_name = "salt_items"

# One-shot rebuild requested from the sidebar: drop both local tables so
# the lazy loaders below re-fetch from the Hub.
if st.session_state["rebuild"]:
    with st.status("Rebuilding DuckDB tables…", expanded=True):
        con.execute(f"DROP TABLE IF EXISTS {joined_table_name}")
        con.execute(f"DROP TABLE IF EXISTS {items_table_name}")
        st.write("Dropped existing tables.")
    st.session_state["rebuild"] = False


def _table_exists(name: str) -> bool:
    """True if *name* already exists in the local DuckDB catalog."""
    return bool(
        con.execute(
            "SELECT count(*) FROM information_schema.tables WHERE table_name = ?", [name]
        ).fetchone()[0]
    )


# Create tables lazily (first run, or after a rebuild dropped them).
if use_joined and not _table_exists(joined_table_name):
    with st.status("Loading joined table into DuckDB…", expanded=False):
        path = f"hf://datasets/{repo_id}/JoinedTables_{split}.parquet"
        con.execute(f"CREATE TABLE {joined_table_name} AS SELECT * FROM read_parquet(?)", [path])
        st.success("Joined table loaded.")

if (not use_joined) and not _table_exists(items_table_name):
    with st.status("Loading item-level table into DuckDB…", expanded=False):
        path = f"hf://datasets/{repo_id}/I_SalesDocumentItem_{split}.parquet"
        con.execute(f"CREATE TABLE {items_table_name} AS SELECT * FROM read_parquet(?)", [path])
        st.success("Items table loaded.")

active_table = joined_table_name if use_joined else items_table_name
cols_lower = list_columns(con, active_table)

# Heuristic column mapping: each logical role lists the spellings seen
# across SALT releases; find_col picks the first present (case-insensitive).
name_map = {
    "order_id": ["SalesDocument", "SALESORDER", "vbeln"],
    "order_item": ["SalesDocumentItem", "SALESORDERITEM", "posnr"],
    "customer": ["SoldToParty", "CUSTOMER", "kunnr", "SoldToParty_PartyNumber"],
    "country": ["Country", "COUNTRY", "land1", "ShipToCountry", "ShipToPartyCountry"],
    "date": ["CreationDate", "CREATIONDATE", "CreatedOn", "DocumentDate", "DOCUMENTDATE", "CreatedAt", "CREATEDON"],
    "plant": ["PLANT", "Plant", "werks"],
    "shipping_condition": ["SHIPPINGCONDITION", "ShippingCondition"],
    "shipping_point": ["SHIPPINGPOINT", "ShippingPoint"],
    "sales_office": ["SALESOFFICE", "SalesOffice"],
    "sales_group": ["SALESGROUP", "SalesGroup"],
    "header_incoterms": ["HEADERINCOTERMSCLASSIFICATION", "HeaderIncotermsClassification"],
    "item_incoterms": ["ITEMINCOTERMSCLASSIFICATION", "ItemIncotermsClassification"],
}

resolved = {k: find_col(cols_lower, v) for k, v in name_map.items()}
163
+
164
# ----------------------------
# Filters
# ----------------------------
with st.container():
    st.subheader("Filters")
    left, mid, right = st.columns([2, 2, 2])

    # Country filter
    country_col = resolved.get("country")
    if country_col:
        countries = [
            row[0]
            for row in con.execute(
                f"SELECT DISTINCT {country_col} FROM {active_table} WHERE {country_col} IS NOT NULL ORDER BY 1"
            ).fetchall()
        ]
        country_sel = left.multiselect("Country", countries, default=[])
    else:
        country_sel = []

    # Sales office filter
    sales_office_sel = []
    office_col = resolved.get("sales_office")
    if office_col:
        office_opts = [
            row[0]
            for row in con.execute(
                f"SELECT DISTINCT {office_col} FROM {active_table} WHERE {office_col} IS NOT NULL ORDER BY 1"
            ).fetchall()
        ]
        sales_office_sel = mid.multiselect("Sales office", office_opts)

    # Shipping condition filter
    shipping_cond_sel = []
    ship_col = resolved.get("shipping_condition")
    if ship_col:
        ship_opts = [
            row[0]
            for row in con.execute(
                f"SELECT DISTINCT {ship_col} FROM {active_table} WHERE {ship_col} IS NOT NULL ORDER BY 1"
            ).fetchall()
        ]
        shipping_cond_sel = right.multiselect("Shipping condition", ship_opts)

# Assemble a shared WHERE clause + parameter list reused by every query below.
where = []
params: list = []
for selection, role in (
    (country_sel, "country"),
    (sales_office_sel, "sales_office"),
    (shipping_cond_sel, "shipping_condition"),
):
    col = resolved.get(role)
    if selection and col:
        placeholders = ", ".join(["?"] * len(selection))
        where.append(f"{col} IN ({placeholders})")
        params.extend(selection)
where_sql = (" WHERE " + " AND ".join(where)) if where else ""
203
+
204
# ----------------------------
# KPIs
# ----------------------------
st.subheader("Key metrics")

k1, k2, k3, k4 = st.columns(4)

# Orders: distinct document count when an order-id column resolved,
# otherwise fall back to the raw row count.
if resolved.get("order_id"):
    n_orders = con.execute(
        f"SELECT COUNT(DISTINCT {resolved['order_id']}) FROM {active_table}{where_sql}", params
    ).fetchone()[0]
else:
    n_orders = con.execute(f"SELECT COUNT(*) FROM {active_table}{where_sql}", params).fetchone()[0]

# Customers: distinct sold-to parties; None when no customer column resolved.
if resolved.get("customer"):
    n_customers = con.execute(
        f"SELECT COUNT(DISTINCT {resolved['customer']}) FROM {active_table}{where_sql}", params
    ).fetchone()[0]
else:
    n_customers = None

# Average distinct items per order (requires both id columns).
if resolved.get("order_id") and resolved.get("order_item"):
    avg_items = con.execute(
        f"SELECT AVG(cnt) FROM (SELECT COUNT(DISTINCT {resolved['order_item']}) AS cnt FROM {active_table}{where_sql} GROUP BY {resolved['order_id']})",
        params,
    ).fetchone()[0]
else:
    avg_items = None

# Plant with the most rows under the current filters.
top_plant = None
if resolved.get("plant"):
    row = con.execute(
        f"SELECT {resolved['plant']}, COUNT(*) AS c FROM {active_table}{where_sql} GROUP BY 1 ORDER BY c DESC LIMIT 1",
        params,
    ).fetchone()
    if row:
        top_plant = f"{row[0]} ({row[1]})"

k1.metric("Orders", f"{n_orders:,}")
k2.metric("Customers", f"{n_customers:,}" if n_customers is not None else "β€”")
# Explicit None check: a legitimate 0.0 average should render as "0.00",
# not the missing-value dash (the original truthiness test hid zeros).
k3.metric("Avg items / order", f"{avg_items:.2f}" if avg_items is not None else "β€”")
k4.metric("Top plant by rows", top_plant or "β€”")
250
+
251
# ----------------------------
# Charts
# ----------------------------
with st.container():
    c1, c2 = st.columns(2)

    # Monthly row counts over time
    date_col = resolved.get("date")
    if date_col:
        monthly = con.execute(
            f"""
            SELECT date_trunc('month', cast({date_col} as timestamp)) AS month,
                   COUNT(*) as rows
            FROM {active_table}
            {where_sql}
            GROUP BY 1
            ORDER BY 1
            """,
            params,
        ).df()
        if not monthly.empty:
            line_chart = px.line(monthly, x="month", y="rows", markers=True, title="Rows over time (monthly)")
            c1.plotly_chart(line_chart, use_container_width=True)

    # Shipping condition distribution
    ship_col = resolved.get("shipping_condition")
    if ship_col:
        ship_counts = con.execute(
            f"SELECT {ship_col} as sc, COUNT(*) as rows FROM {active_table}{where_sql} GROUP BY 1 ORDER BY rows DESC LIMIT 15",
            params,
        ).df()
        if not ship_counts.empty:
            bar_chart = px.bar(ship_counts, x="sc", y="rows", title="Shipping condition distribution (Top 15)")
            c2.plotly_chart(bar_chart, use_container_width=True)

with st.container():
    c3, c4 = st.columns(2)

    # Country β†’ plant volume treemap
    if resolved.get("plant") and resolved.get("country"):
        plant_by_country = con.execute(
            f"""
            SELECT {resolved['country']} as country, {resolved['plant']} as plant, COUNT(*) as rows
            FROM {active_table}
            {where_sql}
            GROUP BY 1,2
            ORDER BY rows DESC
            LIMIT 250
            """,
            params,
        ).df()
        if not plant_by_country.empty:
            treemap = px.treemap(plant_by_country, path=["country", "plant"], values="rows", title="Volume by Country β†’ Plant")
            c3.plotly_chart(treemap, use_container_width=True)

    # Incoterms share (header-level preferred, item-level as fallback)
    incoterm_col = resolved.get("header_incoterms") or resolved.get("item_incoterms")
    if incoterm_col:
        incoterm_counts = con.execute(
            f"SELECT {incoterm_col} as incoterm, COUNT(*) as rows FROM {active_table}{where_sql} GROUP BY 1 ORDER BY rows DESC LIMIT 20",
            params,
        ).df()
        if not incoterm_counts.empty:
            pie_chart = px.pie(incoterm_counts, names="incoterm", values="rows", title="Incoterms share (Top 20)")
            c4.plotly_chart(pie_chart, use_container_width=True)
312
+
313
# ----------------------------
# Data Preview
# ----------------------------
st.subheader("Data preview")
preview = con.execute(f"SELECT * FROM {active_table}{where_sql} LIMIT 100", params).df()
st.dataframe(preview, use_container_width=True, hide_index=True)

# ----------------------------
# LLM Insights & Recommendations (OpenAI)
# ----------------------------
with st.expander("πŸ’‘ AI Recommendations (OpenAI)", expanded=True):
    st.write("Generate action-oriented suggestions based on the visible KPIs and distributions.")
    if not openai_key:
        st.info("Add your OpenAI API key in the sidebar to enable this.")
    else:
        try:
            from openai import OpenAI

            client = OpenAI(api_key=openai_key)

            # Concise textual context built from the KPIs computed above.
            kpi_bits = [f"Orders: {n_orders}"]
            if n_customers is not None:
                kpi_bits.append(f"Customers: {n_customers}")
            if avg_items is not None:
                kpi_bits.append(f"Avg items/order: {avg_items:.2f}")
            if top_plant:
                kpi_bits.append(f"Top plant: {top_plant}")
            context = "; ".join(kpi_bits)

            # A small distribution sample grounds the model in real data.
            if resolved.get("shipping_condition"):
                sample_ship = con.execute(
                    f"SELECT {resolved['shipping_condition']} as sc, COUNT(*) as rows FROM {active_table}{where_sql} GROUP BY 1 ORDER BY rows DESC LIMIT 8",
                    params,
                ).df().to_dict(orient="records")
            else:
                sample_ship = []

            prompt = textwrap.dedent(f"""
                You are a senior ops analyst. Based on the SALT dataset analytics summary below,
                write actionable recommendations. Focus on levers in sales operations, logistics (shipping
                conditions/points), and master data hygiene. Keep it business-practical and specific.

                Visible KPIs: {context}
                Shipping distribution (top sample): {sample_ship}

                Deliver:
                - 5 bulletpoint actions (each ≀ 20 words)
                - 3 watchouts/risks (each ≀ 15 words)
                - 2 quick experiments to A/B in the next sprint
            """)

            resp = client.responses.create(
                model="gpt-4o-mini",
                input=prompt,
            )
            # Prefer the convenience attribute; fall back to the raw output tree.
            recos = getattr(resp, "output_text", None) or (
                resp.output[0].content[0].text if getattr(resp, "output", None) else ""
            )
            st.markdown(recos)
        except Exception as e:
            st.warning(f"OpenAI call failed: {e}")

# ----------------------------
# Footer
# ----------------------------
st.caption(
    "SALT dataset Β© SAP AI Research β€” loaded via DuckDB hf:// and analyzed client-side."
)