scotty369's picture
Update streamlit.py
118a828 verified
import streamlit as st
import polars as pl
import plotly.express as px
import plotly.graph_objects as go
import altair as alt
@st.cache_data
def load_data():
gtin = pl.read_parquet('data/cstore_master_ctin.parquet')
discounts = pl.read_parquet('data/cstore_discounts.parquet')
stores = pl.read_parquet('data/cstore_stores.parquet')
payments = pl.read_parquet('data/cstore_payments.parquet')
daily = pl.read_parquet('data/cstore_transactions_daily_agg.parquet')
shopper = pl.read_parquet('data/cstore_shopper.parquet')
sets = pl.read_parquet('data/cstore_transaction_sets.parquet')
status = pl.read_parquet('data/cstore_store_status.parquet')
items = pl.scan_parquet("data/transaction_items/*.parquet").collect()
daily = daily.with_columns([
pl.col("DATE").dt.week().alias("WEEK_NUM"),
pl.col("DATE").dt.year().alias("YEAR"),
pl.col("DATE").dt.month().alias("MONTH"),
pl.col("DATE").dt.strftime("%Y-%m").alias("YEAR_MONTH")
])
items = items.with_columns([
pl.col("DATE_TIME").dt.year().alias("YEAR"),
pl.col("DATE_TIME").dt.month().alias("MONTH"),
pl.col("DATE_TIME").dt.week().alias("WEEK_NUM"),
pl.col("DATE_TIME").dt.strftime("%Y-%m").alias("YEAR_MONTH")
])
return gtin, discounts, stores, payments, daily, shopper, sets, status, items
gtin, discounts, stores, payments, daily, shopper, sets, status, items = load_data()
st.title("C-Store Analytics Dashboard")
available_months = sorted(daily["MONTH"].unique().to_list())
selected_months = st.sidebar.multiselect("Filter by Month:", available_months, default=available_months)
available_years_daily = sorted(daily["YEAR"].unique().to_list())
selected_years_daily = st.sidebar.multiselect("Filter by Year (Tab 1 & 2):", available_years_daily, default=available_years_daily)
available_years_items = sorted(items["YEAR"].unique().to_list())
selected_years_items = st.sidebar.multiselect("Filter by Year (Tab 3):", available_years_items, default=available_years_items)
selected_categories = st.sidebar.multiselect("Filter by Category:", daily["CATEGORY"].unique().to_list(), default=daily["CATEGORY"].unique().to_list())
selected_brands = st.sidebar.multiselect("Filter by Brand:", daily["BRAND"].unique().to_list(), default=daily["BRAND"].unique().to_list())
interest_level_tab1 = st.sidebar.slider("Interest Level (Tab 1):", 0, 10, 5)
interest_level_tab2 = st.sidebar.slider("Interest Level (Tab 2):", 0, 10, 5)
interest_level_tab3 = st.sidebar.slider("Interest Level (Tab 3):", 0, 10, 5)
tabs = st.tabs(["Top 5 Products by Weekly Sales (Excl. Fuels)", "Brands to Drop in Packaged Beverages", "Cash vs Credit Customer Comparison"])
with tabs[0]:
filtered_daily = daily.filter(pl.col("MONTH").is_in(selected_months) & pl.col("YEAR").is_in(selected_years_daily) & pl.col("CATEGORY").is_in(selected_categories) & pl.col("BRAND").is_in(selected_brands))
dsmerged = filtered_daily.join(gtin.select(["GTIN", "CATEGORY", "SKUPOS_DESCRIPTION"]), on="GTIN", how="left")
filtered = dsmerged.filter(pl.col("CATEGORY").str.to_lowercase() != "fuel")
weekly_sales = (
filtered
.group_by(["SKUPOS_DESCRIPTION", "WEEK_NUM", "YEAR"])
.agg(pl.col("TOTAL_REVENUE_AMOUNT").sum().alias("WEEKLY_SALES"))
)
top_products = (
weekly_sales
.group_by("SKUPOS_DESCRIPTION")
.agg(pl.col("WEEKLY_SALES").sum().alias("TOTAL_SALES"))
.sort("TOTAL_SALES", descending=True)
.head(5)
)
top_product_names = top_products["SKUPOS_DESCRIPTION"].to_list()
top_weekly_sales = weekly_sales.filter(pl.col("SKUPOS_DESCRIPTION").is_in(top_product_names)).to_pandas()
table = top_weekly_sales.groupby("SKUPOS_DESCRIPTION")["WEEKLY_SALES"].sum().sort_values(ascending=False).reset_index()
kpi_1 = top_weekly_sales["WEEKLY_SALES"].sum()
kpi_2 = top_products["TOTAL_SALES"].sum()
kpi_3 = top_products["SKUPOS_DESCRIPTION"].n_unique()
kpi_4 = top_weekly_sales["WEEKLY_SALES"].mean()
kpi_5 = top_weekly_sales["WEEKLY_SALES"].std()
col1, col2, col3, col4, col5 = st.columns(5)
col1.metric("Total Sales for Top 5 Products", f"${kpi_1:,.2f}")
col2.metric("Total Sales Value (Top Products)", f"${kpi_2:,.2f}")
col3.metric("Unique Products in Top 5", kpi_3)
col4.metric("Avg Weekly Sales", f"${kpi_4:,.2f}")
col5.metric("Weekly Sales Std Dev", f"${kpi_5:,.2f}")
st.write(table)
fig_line = px.line(
top_weekly_sales,
x="WEEK_NUM",
y="WEEKLY_SALES",
color="SKUPOS_DESCRIPTION",
title="Weekly Sales Over Time - Top 5 Products (Excl. Fuel)",
labels={"WEEK_NUM": "Week Number", "WEEKLY_SALES": "Weekly Sales"},
)
st.plotly_chart(fig_line)
box_chart = alt.Chart(top_weekly_sales).mark_boxplot().encode(
x="SKUPOS_DESCRIPTION:N",
y="WEEKLY_SALES:Q"
).properties(
title="Distribution of Weekly Sales - Top 5 Products"
)
st.altair_chart(box_chart)
fig_overlay = px.histogram(
top_weekly_sales,
x="WEEKLY_SALES",
color="SKUPOS_DESCRIPTION",
nbins=20,
title="Distribution of Weekly Sales - Top 5 Products (Excl. Fuel)",
labels={"WEEKLY_SALES": "Weekly Sales", "count": "Frequency"},
opacity=0.7,
barmode="overlay"
)
st.plotly_chart(fig_overlay)
with tabs[1]:
filtered_daily = daily.filter(pl.col("MONTH").is_in(selected_months) & pl.col("YEAR").is_in(selected_years_daily) & pl.col("CATEGORY").is_in(selected_categories) & pl.col("BRAND").is_in(selected_brands))
dsmerged = filtered_daily.join(gtin.select(["GTIN", "CATEGORY", "SKUPOS_DESCRIPTION"]), on="GTIN", how="left")
beverages = dsmerged.filter(pl.col("CATEGORY").str.contains("Beverage"))
weekly_sales = (
beverages
.group_by(["SKUPOS_DESCRIPTION", "WEEK_NUM", "YEAR"])
.agg(pl.col("TOTAL_REVENUE_AMOUNT").sum().alias("WEEKLY_SALES"))
)
bottom_products = (
weekly_sales
.group_by("SKUPOS_DESCRIPTION")
.agg(pl.col("WEEKLY_SALES").sum().alias("TOTAL_SALES"))
.sort("TOTAL_SALES")
.head(5)
)
bottom_product_names = bottom_products["SKUPOS_DESCRIPTION"].to_list()
bottom_weekly_sales = weekly_sales.filter(pl.col("SKUPOS_DESCRIPTION").is_in(bottom_product_names)).to_pandas()
table = bottom_weekly_sales.groupby("SKUPOS_DESCRIPTION")["WEEKLY_SALES"].sum().sort_values().reset_index()
kpi_1 = bottom_weekly_sales["WEEKLY_SALES"].sum()
kpi_2 = bottom_products["TOTAL_SALES"].sum()
kpi_3 = bottom_products["SKUPOS_DESCRIPTION"].n_unique()
kpi_4 = bottom_weekly_sales["WEEKLY_SALES"].mean()
kpi_5 = bottom_weekly_sales["WEEKLY_SALES"].std()
col1, col2, col3, col4, col5 = st.columns(5)
col1.metric("Total Sales for Bottom Products", f"${kpi_1:,.2f}")
col2.metric("Total Sales Value (Bottom Products)", f"${kpi_2:,.2f}")
col3.metric("Unique Products in Bottom 5", kpi_3)
col4.metric("Avg Weekly Sales", f"${kpi_4:,.2f}")
col5.metric("Weekly Sales Std Dev", f"${kpi_5:,.2f}")
st.write("Least Popular Packaged Beverages:")
st.write(table)
fig = px.treemap(
table,
path=["SKUPOS_DESCRIPTION"],
values="WEEKLY_SALES",
title="Treemap - Total Sales for Least Popular Packaged Beverages"
)
st.plotly_chart(fig)
fig = px.density_contour(
bottom_weekly_sales,
x="WEEKLY_SALES",
y="SKUPOS_DESCRIPTION",
title="Density Contour - Sales Concentration by Product"
)
fig.update_traces(contours_coloring="fill", showscale=True)
st.plotly_chart(fig)
fig_violin = px.violin(
bottom_weekly_sales,
y="WEEKLY_SALES",
x="SKUPOS_DESCRIPTION",
box=True,
points="all",
title="Violin Plot - Weekly Sales Distribution (Bottom 5 Beverages)"
)
fig_violin.update_layout(
xaxis_title="Product",
yaxis_title="Weekly Sales"
)
st.plotly_chart(fig_violin)
with tabs[2]:
filtered_items = items.filter(pl.col("MONTH").is_in(selected_months) & pl.col("YEAR").is_in(selected_years_items))
merged = filtered_items.join(
sets.select(["TRANSACTION_SET_ID", "PAYMENT_TYPE", "GRAND_TOTAL_AMOUNT"]),
on="TRANSACTION_SET_ID",
how="left"
).join(
gtin.select(["GTIN", "SKUPOS_DESCRIPTION"]),
on="GTIN",
how="left"
)
grouped = merged.group_by(["PAYMENT_TYPE", "SKUPOS_DESCRIPTION"]).agg([
pl.col("UNIT_QUANTITY").sum().alias("TOTAL_ITEMS"),
pl.col("GRAND_TOTAL_AMOUNT").sum().alias("TOTAL_SPENT")
])
top_products_by_payment = (
grouped.sort("TOTAL_ITEMS", descending=True)
.group_by("PAYMENT_TYPE")
.head(5)
)
total_metrics = merged.group_by("PAYMENT_TYPE").agg([
pl.col("UNIT_QUANTITY").sum().alias("TOTAL_ITEMS"),
pl.col("GRAND_TOTAL_AMOUNT").sum().alias("TOTAL_SPENT")
])
kpi_1 = total_metrics["TOTAL_SPENT"].sum()
kpi_2 = total_metrics["TOTAL_ITEMS"].sum()
kpi_3 = total_metrics["PAYMENT_TYPE"].n_unique()
kpi_4 = total_metrics["TOTAL_ITEMS"].mean()
kpi_5 = total_metrics["TOTAL_SPENT"].std()
col1, col2, col3, col4, col5 = st.columns(5)
col1.metric("Total Spend", f"${kpi_1:,.2f}")
col2.metric("Total Items Sold", kpi_2)
col3.metric("Unique Payment Methods", kpi_3)
col4.metric("Avg Items Sold", kpi_4)
col5.metric("Spend Std Dev", f"${kpi_5:,.2f}")
st.write("Top Products for Each Payment Type:")
st.write(top_products_by_payment)
st.write("\nTotal Comparison - Cash vs Credit:")
st.write(total_metrics)
time_series = merged.group_by(["YEAR_MONTH", "PAYMENT_TYPE"]).agg(
pl.col("GRAND_TOTAL_AMOUNT").sum().alias("TOTAL_SPENT")
).sort("YEAR_MONTH").to_pandas()
fig = px.line(
time_series,
x="YEAR_MONTH",
y="TOTAL_SPENT",
color="PAYMENT_TYPE",
title="Monthly Spend by Payment Type"
)
st.plotly_chart(fig)
treemap_data = (
top_products_by_payment
.filter(
pl.col("PAYMENT_TYPE").is_not_null() &
pl.col("SKUPOS_DESCRIPTION").is_not_null() &
(pl.col("SKUPOS_DESCRIPTION") != "")
)
.to_pandas()
)
fig = px.treemap(
treemap_data,
path=["PAYMENT_TYPE", "SKUPOS_DESCRIPTION"],
values="TOTAL_SPENT",
title="Treemap: Product Spend by Payment Type"
)
st.plotly_chart(fig)
heatmap_data = top_products_by_payment.to_pandas().pivot(
index="SKUPOS_DESCRIPTION",
columns="PAYMENT_TYPE",
values="TOTAL_ITEMS"
)
fig = go.Figure(data=go.Heatmap(
z=heatmap_data.values,
x=heatmap_data.columns,
y=heatmap_data.index,
colorscale='YlGnBu',
colorbar=dict(title="Total Items"),
))
fig.update_layout(
title="Top Products by Total Items Purchased (Cash vs Credit)",
xaxis_title="Payment Type",
yaxis_title="Product",
xaxis=dict(tickmode='array', tickvals=list(range(len(heatmap_data.columns))), ticktext=heatmap_data.columns),
yaxis=dict(tickmode='array', tickvals=list(range(len(heatmap_data.index))), ticktext=heatmap_data.index),
height=600,
width=1000
)
st.plotly_chart(fig)