Spaces:

PD03
/

SALT_Dashboard

Sleeping

App Files Community

PD03 committed on Sep 6, 2025

Commit

ec9c243

verified ·

1 Parent(s): a930538

Update app.py

Browse files

Files changed (1) hide show

app.py +166 -187

app.py CHANGED Viewed

@@ -1,5 +1,4 @@
 # app.py
 import streamlit as st
 import pandas as pd
 import numpy as np
@@ -8,31 +7,28 @@ import os
 import warnings
 warnings.filterwarnings('ignore')
-# --- Page Configuration ---
-st.set_page_config(
-    page_title="SAP Sales KPI Dashboard",
-    page_icon="📊",
-    layout="wide"
-)
-# --- Custom CSS ---
 st.markdown("""
 <style>
-    .main-header { font-size: 2.8rem; font-weight: bold; color: #1f4e79; text-align: center; }
-    .kpi-card { background: #f8f9fa; padding: 1.5rem; border-radius: 10px; border-left: 5px solid #667eea; margin-bottom: 1rem; }
-    .kpi-value { font-size: 2.5rem; font-weight: bold; color: #1f4e79; }
-    .kpi-label { font-size: 1rem; color: #555; }
 </style>
 """, unsafe_allow_html=True)
-# --- Kaggle API Setup & Data Loading ---
 @st.cache_data(ttl=3600)
 def load_kaggle_sap_data():
     try:
-        # Check for secrets
         if 'KAGGLE_USERNAME' not in st.secrets or 'KAGGLE_KEY' not in st.secrets:
             return "Kaggle credentials not found in Streamlit secrets."
         os.environ['KAGGLE_USERNAME'] = st.secrets['KAGGLE_USERNAME']
         os.environ['KAGGLE_KEY'] = st.secrets['KAGGLE_KEY']
@@ -40,236 +36,219 @@ def load_kaggle_sap_data():
         dataset_name = "mustafakeser4/sap-dataset-bigquery-dataset"
         download_path = "./kaggle_data"
-        # Download only if files don't exist
         if not os.path.exists(os.path.join(download_path, 'vbak.csv')):
-            with st.spinner("Downloading dataset from Kaggle... This may take a moment."):
                 kaggle.api.authenticate()
                 kaggle.api.dataset_download_files(dataset_name, path=download_path, unzip=True)
-        # Load tables
         tables = {}
-        for name, filename in {'vbak': 'vbak.csv', 'vbap': 'vbap.csv', 'kna1': 'kna1.csv', 'makt': 'makt.csv'}.items():
-            file_path = os.path.join(download_path, filename)
-            if os.path.exists(file_path):
-                tables[name] = pd.read_csv(file_path, low_memory=False)
-            else:
-                return f"Expected file missing: {filename}"
         return tables
     except Exception as e:
         return f"Error during Kaggle data loading: {e}"
-# --- Data Processing & Analytics (FIXED) ---
 @st.cache_data
 def create_sales_analytics(_tables):
     try:
-        # copies
         vbak = _tables['vbak'].copy()
         vbap = _tables['vbap'].copy()
         kna1 = _tables['kna1'].copy()
         makt = _tables['makt'].copy()
-        # normalize col names
         for df in [vbak, vbap, kna1, makt]:
             df.columns = [c.upper().strip() for c in df.columns]
-        # keep keys as strings (prevents leading-zero loss)
-        def _as_str(df, cols):
             for c in cols:
                 if c in df.columns:
                     df[c] = df[c].astype(str).str.strip()
-        _as_str(vbak, ['VBELN', 'KUNNR', 'VKORG', 'VTWEG'])
-        _as_str(vbap, ['VBELN', 'MATNR'])
-        _as_str(kna1, ['KUNNR'])
-        _as_str(makt, ['MATNR'])
-        # language filter for product text (English)
         makt_en = makt[makt['SPRAS'].eq('E')] if 'SPRAS' in makt.columns else makt
-        # --- CRITICAL: use item-level NETWR from VBAP and parse robustly ---
-        if 'NETWR' not in vbap.columns:
-            return "Expected NETWR in VBAP but didn't find it."
-        # clean currency-like strings: remove anything not digit/.,-
-        netwr_raw = vbap['NETWR'].astype(str).str.replace(r'[^\d,.\-]', '', regex=True)
-        # handle European decimals like "1.234,56" → "1234.56" else remove commas as thousands
-        vbap['NETWR'] = np.where(
-            netwr_raw.str.contains(',') & netwr_raw.str.contains(r'\.'),
-            netwr_raw.str.replace('.', '', regex=False).str.replace(',', '.', regex=False),
-            netwr_raw.str.replace(',', '', regex=False)
-        )
-        vbap['NETWR'] = pd.to_numeric(vbap['NETWR'], errors='coerce').fillna(0.0)
-        # build narrow tables to avoid duplicate columns / accidental overwrites
-        vbak_small = vbak[['VBELN', 'KUNNR', 'VKORG', 'VTWEG', 'ERDAT']].drop_duplicates('VBELN')
-        vbap_small = vbap[['VBELN', 'MATNR', 'NETWR']]
-        kna1_small = (
-            kna1[['KUNNR', 'NAME1', 'LAND1']]
-            if {'KUNNR', 'NAME1', 'LAND1'}.issubset(kna1.columns)
-            else pd.DataFrame(columns=['KUNNR', 'NAME1', 'LAND1'])
-        )
-        makt_small = (
-            makt_en[['MATNR', 'MAKTX']].drop_duplicates('MATNR')
-            if {'MATNR', 'MAKTX'}.issubset(makt_en.columns)
-            else pd.DataFrame(columns=['MATNR', 'MAKTX'])
-        )
-        # enrich items with header fields, then customer & material text
-        sales = (
-            vbap_small
-            .merge(vbak_small, on='VBELN', how='inner')
-            .merge(kna1_small, on='KUNNR', how='left')
-            .merge(makt_small, on='MATNR', how='left')
-        )
-        # dates
         if 'ERDAT' in sales.columns:
             sales['ERDAT'] = pd.to_datetime(sales['ERDAT'], errors='coerce')
-        # ensure expected analysis columns exist
-        for col in ['NETWR', 'LAND1', 'VTWEG', 'NAME1', 'MAKTX', 'VBELN', 'KUNNR']:
             if col not in sales.columns:
-                sales[col] = np.nan if col != 'NETWR' else 0.0
         return sales
     except Exception as e:
         return f"Error processing sales data: {e}"
-# --- UI Components ---
-def create_kpi_card(title, value, format_type="currency"):
-    """Simple KPI card with direct formatting"""
-    if format_type == "currency":
-        formatted_value = f"${value:,.0f}" if isinstance(value, (int, float)) else "$0"
-    else:  # number
-        formatted_value = f"{value:,.0f}" if isinstance(value, (int, float)) else "0"
-    st.markdown(f"""
-    <div class="kpi-card">
-        <div class="kpi-value">{formatted_value}</div>
-        <div class="kpi-label">{title}</div>
-    </div>
-    """, unsafe_allow_html=True)
-# --- Main App Logic ---
-st.markdown('<h1 class="main-header">🎯 SAP Sales KPI Dashboard</h1>', unsafe_allow_html=True)
-# Cache clearing button
-if st.sidebar.button("🔄 Clear Cache & Rerun"):
-    st.cache_data.clear()
-    st.rerun()
-st.sidebar.title("Dashboard Controls")
-# Load data and handle errors
-raw_tables = load_kaggle_sap_data()
-if isinstance(raw_tables, str):
-    st.error(raw_tables)
     st.stop()
-sales_df = create_sales_analytics(raw_tables)
 if isinstance(sales_df, str):
     st.error(sales_df)
     st.stop()
-st.success(f"✅ Loaded and processed {len(sales_df):,} real SAP sales records!")
-# --- Sidebar Filters ---
-st.sidebar.header("Filters")
-top_n_countries = st.sidebar.slider("Top N Countries to Display", 5, 20, 10)
-unique_countries = sorted(sales_df['LAND1'].dropna().unique())
-selected_region = st.sidebar.multiselect(
-    "Select Region (Country)",
-    options=unique_countries,
-    default=unique_countries
-)
-filtered_df = sales_df[sales_df['LAND1'].isin(selected_region)].copy()
-# --- Main KPIs ---
-st.subheader("Sales KPIs from Real SAP Data")
-col1, col2, col3, col4 = st.columns(4)
-with col1:
-    create_kpi_card("Total Revenue", float(filtered_df['NETWR'].sum()))
-with col2:
-    create_kpi_card("Active Customers", int(filtered_df['KUNNR'].nunique()), format_type="number")
-with col3:
-    avg_order_value = float(filtered_df.loc[filtered_df['NETWR'] > 0, 'NETWR'].mean() or 0.0)
-    create_kpi_card("Avg Order Value", avg_order_value)
-with col4:
-    create_kpi_card("Sales Orders", int(filtered_df['VBELN'].nunique()), format_type="number")
-# --- Analytics Tabs ---
 tab1, tab2, tab3, tab4 = st.tabs(["👥 Top Customers", "🌍 Regional Analysis", "📈 Distribution Channels", "🛍️ Top Products"])
 with tab1:
     st.subheader("Top 10 Customers by Revenue")
-    customer_summary = (
-        filtered_df.dropna(subset=['NAME1'])
-        .groupby('NAME1', as_index=False)['NETWR'].sum()
-        .nlargest(10, 'NETWR')
-    )
-    if not customer_summary.empty:
-        fig = px.bar(
-            customer_summary, x='NETWR', y='NAME1', orientation='h',
-            labels={'NETWR': 'Revenue ($)', 'NAME1': 'Customer'},
-            color='NETWR', color_continuous_scale='Blues'
-        )
-        st.plotly_chart(fig.update_layout(yaxis={'categoryorder': 'total ascending'}), use_container_width=True)
     else:
-        st.info("No customer data to display for the selected filters.")
 with tab2:
     st.subheader("Revenue by Country")
-    regional_summary = (
-        filtered_df.dropna(subset=['LAND1'])
-        .groupby('LAND1', as_index=False)['NETWR'].sum()
-        .nlargest(top_n_countries, 'NETWR')
-    )
-    if not regional_summary.empty:
-        fig = px.pie(regional_summary, values='NETWR', names='LAND1', title=f"Top {top_n_countries} Countries by Revenue")
-        st.plotly_chart(fig, use_container_width=True)
     else:
-        st.info("No country data to display for the selected filters.")
 with tab3:
     st.subheader("Revenue by Distribution Channel")
-    channel_summary = (
-        filtered_df.dropna(subset=['VTWEG'])
-        .groupby('VTWEG', as_index=False)['NETWR'].sum()
-    )
     channel_summary['VTWEG'] = channel_summary['VTWEG'].astype(str)
-    if not channel_summary.empty:
-        fig = px.bar(
-            channel_summary, x='VTWEG', y='NETWR',
-            title="Total Revenue by Distribution Channel",
-            labels={'NETWR': 'Total Revenue ($)', 'VTWEG': 'Distribution Channel'},
-            color='NETWR', color_continuous_scale='Plasma'
-        )
-        st.plotly_chart(fig, use_container_width=True)
     else:
-        st.info("No distribution channel data to display for the selected filters.")
 with tab4:
     st.subheader("Top 10 Products by Revenue")
-    product_summary = (
-        filtered_df.dropna(subset=['MAKTX'])
-        .groupby('MAKTX', as_index=False)['NETWR'].sum()
-        .nlargest(10, 'NETWR')
-    )
-    if not product_summary.empty:
-        fig = px.bar(
-            product_summary, x='NETWR', y='MAKTX', orientation='h',
-            labels={'NETWR': 'Revenue ($)', 'MAKTX': 'Product'},
-            color='NETWR', color_continuous_scale='Greens'
-        )
-        st.plotly_chart(fig.update_layout(yaxis={'categoryorder': 'total ascending'}), use_container_width=True)
     else:
-        st.info("No product data to display for the selected filters.")
 st.markdown("---")
-st.markdown("<p style='text-align: center;'>Built with Streamlit • 100% Real SAP ERP Data from Kaggle</p>", unsafe_allow_html=True)

 # app.py
 import streamlit as st
 import pandas as pd
 import numpy as np
 import warnings
 warnings.filterwarnings('ignore')
+# ---------- Page & Styles ----------
+st.set_page_config(page_title="SAP Sales KPI Dashboard", page_icon="📊", layout="wide")
 st.markdown("""
 <style>
+  /* hide default sidebar entirely */
+  [data-testid="stSidebar"] { display: none; }
+  .main-header { font-size: 2.2rem; font-weight: 800; color: #1f4e79; text-align: left; margin-bottom: .25rem; }
+  .subtle { color:#6b7280; margin-bottom:1.25rem; }
+  .filter-card { background:#f8f9fa; padding: .9rem 1rem; border-radius:12px; border:1px solid #edf2f7; }
+  .kpi-card { background: #ffffff; padding: 1.25rem; border-radius: 14px; border:1px solid #e5e7eb; box-shadow: 0 1px 2px rgba(0,0,0,.03); }
+  .kpi-value { font-size: 2.1rem; font-weight: 800; color: #1f4e79; line-height:1; }
+  .kpi-label { font-size: .95rem; color: #6b7280; }
+  .block-container { padding-top: 1.2rem; }
 </style>
 """, unsafe_allow_html=True)
+# ---------- Kaggle load ----------
 @st.cache_data(ttl=3600)
 def load_kaggle_sap_data():
     try:
         if 'KAGGLE_USERNAME' not in st.secrets or 'KAGGLE_KEY' not in st.secrets:
             return "Kaggle credentials not found in Streamlit secrets."
         os.environ['KAGGLE_USERNAME'] = st.secrets['KAGGLE_USERNAME']
         os.environ['KAGGLE_KEY'] = st.secrets['KAGGLE_KEY']
         dataset_name = "mustafakeser4/sap-dataset-bigquery-dataset"
         download_path = "./kaggle_data"
         if not os.path.exists(os.path.join(download_path, 'vbak.csv')):
+            with st.spinner("Downloading dataset from Kaggle..."):
                 kaggle.api.authenticate()
                 kaggle.api.dataset_download_files(dataset_name, path=download_path, unzip=True)
+        needed = {'vbak': 'vbak.csv', 'vbap': 'vbap.csv', 'kna1': 'kna1.csv', 'makt': 'makt.csv'}
         tables = {}
+        for k, fn in needed.items():
+            fp = os.path.join(download_path, fn)
+            if not os.path.exists(fp):
+                return f"Expected file missing: {fn}"
+            tables[k] = pd.read_csv(fp, low_memory=False)
         return tables
     except Exception as e:
         return f"Error during Kaggle data loading: {e}"
+# ---------- Processing (robust revenue + safe merges) ----------
 @st.cache_data
 def create_sales_analytics(_tables):
     try:
         vbak = _tables['vbak'].copy()
         vbap = _tables['vbap'].copy()
         kna1 = _tables['kna1'].copy()
         makt = _tables['makt'].copy()
+        # normalize column names
         for df in [vbak, vbap, kna1, makt]:
             df.columns = [c.upper().strip() for c in df.columns]
+        # keep SAP keys as strings (avoid leading-zero loss)
+        def as_str(df, cols):
             for c in cols:
                 if c in df.columns:
                     df[c] = df[c].astype(str).str.strip()
+        as_str(vbak, ['VBELN','KUNNR','VKORG','VTWEG'])
+        as_str(vbap, ['VBELN','MATNR'])
+        as_str(kna1, ['KUNNR'])
+        as_str(makt, ['MATNR'])
+        # choose numeric helper
+        def pick_numeric(df, cols):
+            for c in cols:
+                if c in df.columns:
+                    s = pd.to_numeric(df[c], errors='coerce')
+                    if s.notna().sum() > 0 and s.abs().sum() > 0:
+                        return s
+            return pd.Series(0.0, index=df.index)
+        # Build item-level REVENUE
+        # primary: NETWR at item level (VBAP)
+        netwr_item = pick_numeric(vbap, ['NETWR'])
+        # fallback: price * qty using common SAP columns
+        price = pick_numeric(vbap, ['NETPR', 'KBETR', 'NETPR_I'])
+        qty   = pick_numeric(vbap, ['KWMENG', 'KTMNG', 'MENGE'])
+        fallback_rev = (price.fillna(0) * qty.fillna(0)).fillna(0)
+        vbap['REVENUE'] = np.where(netwr_item > 0, netwr_item, fallback_rev).astype(float)
+        # header fields (include currency if present)
+        keep_vbak = ['VBELN','KUNNR','VKORG','VTWEG','ERDAT'] + (['WAERK'] if 'WAERK' in vbak.columns else [])
+        vbak_small = vbak[keep_vbak].drop_duplicates('VBELN')
+        vbap_small = vbap[['VBELN','MATNR','REVENUE']]
+        kna1_small = kna1[['KUNNR','NAME1','LAND1']] if {'KUNNR','NAME1','LAND1'}.issubset(kna1.columns) else pd.DataFrame(columns=['KUNNR','NAME1','LAND1'])
+        # product text in English
         makt_en = makt[makt['SPRAS'].eq('E')] if 'SPRAS' in makt.columns else makt
+        makt_small = makt_en[['MATNR','MAKTX']].drop_duplicates('MATNR') if {'MATNR','MAKTX'}.issubset(makt_en.columns) else pd.DataFrame(columns=['MATNR','MAKTX'])
+        # final sales table
+        sales = (vbap_small
+                 .merge(vbak_small, on='VBELN', how='inner')
+                 .merge(kna1_small, on='KUNNR', how='left')
+                 .merge(makt_small, on='MATNR', how='left'))
         if 'ERDAT' in sales.columns:
             sales['ERDAT'] = pd.to_datetime(sales['ERDAT'], errors='coerce')
+        # ensure columns exist
+        for col in ['REVENUE','LAND1','VTWEG','NAME1','MAKTX','VBELN','KUNNR','VKORG']:
             if col not in sales.columns:
+                sales[col] = np.nan if col != 'REVENUE' else 0.0
+        if 'WAERK' not in sales.columns:
+            sales['WAERK'] = 'N/A'
+        # drop obvious junk rows
+        sales = sales.replace([np.inf, -np.inf], np.nan).dropna(subset=['REVENUE'])
         return sales
     except Exception as e:
         return f"Error processing sales data: {e}"
+# ---------- App ----------
+st.markdown('<div class="main-header">🎯 SAP Sales KPI Dashboard</div><div class="subtle">Real SAP ERP sample data (Kaggle)</div>', unsafe_allow_html=True)
+tables = load_kaggle_sap_data()
+if isinstance(tables, str):
+    st.error(tables)
     st.stop()
+sales_df = create_sales_analytics(tables)
 if isinstance(sales_df, str):
     st.error(sales_df)
     st.stop()
+# ---------- Filter Bar (no sidebar) ----------
+with st.container():
+    st.markdown('<div class="filter-card">', unsafe_allow_html=True)
+    c1, c2, c3, c4 = st.columns([1.2, 1.2, 3, 0.9])
+    # currency filter
+    currencies = [c for c in sales_df['WAERK'].dropna().unique().tolist() if c != 'N/A']
+    default_cur = sales_df['WAERK'].mode().iat[0] if len(sales_df) and sales_df['WAERK'].notna().any() else 'N/A'
+    with c1:
+        currency = st.selectbox("Currency", options=(['All'] + sorted(currencies)) if currencies else ['All'], index=0 if currencies else 0)
+    # Top N
+    with c2:
+        top_n_countries = st.slider("Top N Countries", 5, 20, 10)
+    # Region multiselect inside expander to keep tidy
+    with c3:
+        with st.expander("Region (Country) – click to choose", expanded=False):
+            all_countries = sorted(sales_df['LAND1'].dropna().unique().tolist())
+            # buttons to select/clear
+            b1, b2 = st.columns([1,1])
+            if 'selected_countries' not in st.session_state:
+                st.session_state.selected_countries = all_countries
+            with b1:
+                if st.button("Select All"):
+                    st.session_state.selected_countries = all_countries
+            with b2:
+                if st.button("Clear"):
+                    st.session_state.selected_countries = []
+            selected_region = st.multiselect("Countries", options=all_countries, default=st.session_state.selected_countries, key="countries_ms")
+    with c4:
+        if st.button("🔄 Clear Cache"):
+            st.cache_data.clear()
+            st.rerun()
+    st.markdown('</div>', unsafe_allow_html=True)
+# apply filters
+filtered_df = sales_df.copy()
+if currency and currency != 'All':
+    filtered_df = filtered_df[filtered_df['WAERK'] == currency]
+if 'countries_ms' in st.session_state:
+    filtered_df = filtered_df[filtered_df['LAND1'].isin(st.session_state.countries_ms)]
+st.success(f"✅ Loaded and processed {len(filtered_df):,} sales line-items after filters.")
+# ---------- KPIs ----------
+st.subheader("Sales KPIs")
+k1,k2,k3,k4 = st.columns(4)
+with k1: st.markdown(f'<div class="kpi-card"><div class="kpi-value">${float(filtered_df["REVENUE"].sum()):,.0f}</div><div class="kpi-label">Total Revenue</div></div>', unsafe_allow_html=True)
+with k2: st.markdown(f'<div class="kpi-card"><div class="kpi-value">{int(filtered_df["KUNNR"].nunique())}</div><div class="kpi-label">Active Customers</div></div>', unsafe_allow_html=True)
+with k3:
+    aov = float(filtered_df.loc[filtered_df['REVENUE']>0,'REVENUE'].mean() or 0.0)
+    st.markdown(f'<div class="kpi-card"><div class="kpi-value">${aov:,.0f}</div><div class="kpi-label">Avg Order Value (item)</div></div>', unsafe_allow_html=True)
+with k4: st.markdown(f'<div class="kpi-card"><div class="kpi-value">{int(filtered_df["VBELN"].nunique())}</div><div class="kpi-label">Sales Orders</div></div>', unsafe_allow_html=True)
+# ---------- Tabs ----------
 tab1, tab2, tab3, tab4 = st.tabs(["👥 Top Customers", "🌍 Regional Analysis", "📈 Distribution Channels", "🛍️ Top Products"])
 with tab1:
     st.subheader("Top 10 Customers by Revenue")
+    customer_summary = (filtered_df.dropna(subset=['NAME1'])
+                        .groupby('NAME1', as_index=False)['REVENUE'].sum()
+                        .nlargest(10, 'REVENUE'))
+    if customer_summary.empty:
+        st.info("No customer data to display.")
     else:
+        fig = px.bar(customer_summary, x='REVENUE', y='NAME1', orientation='h',
+                     labels={'REVENUE':'Revenue','NAME1':'Customer'}, color='REVENUE')
+        st.plotly_chart(fig.update_layout(yaxis={'categoryorder':'total ascending'}), use_container_width=True)
 with tab2:
     st.subheader("Revenue by Country")
+    regional_summary = (filtered_df.dropna(subset=['LAND1'])
+                        .groupby('LAND1', as_index=False)['REVENUE'].sum()
+                        .nlargest(top_n_countries, 'REVENUE'))
+    if regional_summary.empty:
+        st.info("No country data to display.")
     else:
+        fig = px.pie(regional_summary, values='REVENUE', names='LAND1',
+                     title=f"Top {top_n_countries} Countries by Revenue")
+        st.plotly_chart(fig, use_container_width=True)
 with tab3:
     st.subheader("Revenue by Distribution Channel")
+    channel_summary = (filtered_df.dropna(subset=['VTWEG'])
+                       .groupby('VTWEG', as_index=False)['REVENUE'].sum())
     channel_summary['VTWEG'] = channel_summary['VTWEG'].astype(str)
+    if channel_summary.empty:
+        st.info("No distribution channel data to display.")
     else:
+        fig = px.bar(channel_summary, x='VTWEG', y='REVENUE',
+                     title="Total Revenue by Distribution Channel",
+                     labels={'REVENUE':'Total Revenue','VTWEG':'Distribution Channel'},
+                     color='REVENUE')
+        st.plotly_chart(fig, use_container_width=True)
 with tab4:
     st.subheader("Top 10 Products by Revenue")
+    product_summary = (filtered_df.dropna(subset=['MAKTX'])
+                       .groupby('MAKTX', as_index=False)['REVENUE'].sum()
+                       .nlargest(10, 'REVENUE'))
+    if product_summary.empty:
+        st.info("No product data to display.")
     else:
+        fig = px.bar(product_summary, x='REVENUE', y='MAKTX', orientation='h',
+                     labels={'REVENUE':'Revenue','MAKTX':'Product'}, color='REVENUE')
+        st.plotly_chart(fig.update_layout(yaxis={'categoryorder':'total ascending'}), use_container_width=True)
 st.markdown("---")
+st.markdown("<p style='text-align:center;'>Built with Streamlit • Real SAP ERP sample data (Kaggle)</p>", unsafe_allow_html=True)