import streamlit as st import polars as pl import plotly.express as px import plotly.graph_objects as go import altair as alt @st.cache_data def load_data(): gtin = pl.read_parquet('data/cstore_master_ctin.parquet') discounts = pl.read_parquet('data/cstore_discounts.parquet') stores = pl.read_parquet('data/cstore_stores.parquet') payments = pl.read_parquet('data/cstore_payments.parquet') daily = pl.read_parquet('data/cstore_transactions_daily_agg.parquet') shopper = pl.read_parquet('data/cstore_shopper.parquet') sets = pl.read_parquet('data/cstore_transaction_sets.parquet') status = pl.read_parquet('data/cstore_store_status.parquet') items = pl.scan_parquet("data/transaction_items/*.parquet").collect() daily = daily.with_columns([ pl.col("DATE").dt.week().alias("WEEK_NUM"), pl.col("DATE").dt.year().alias("YEAR"), pl.col("DATE").dt.month().alias("MONTH"), pl.col("DATE").dt.strftime("%Y-%m").alias("YEAR_MONTH") ]) items = items.with_columns([ pl.col("DATE_TIME").dt.year().alias("YEAR"), pl.col("DATE_TIME").dt.month().alias("MONTH"), pl.col("DATE_TIME").dt.week().alias("WEEK_NUM"), pl.col("DATE_TIME").dt.strftime("%Y-%m").alias("YEAR_MONTH") ]) return gtin, discounts, stores, payments, daily, shopper, sets, status, items gtin, discounts, stores, payments, daily, shopper, sets, status, items = load_data() st.title("C-Store Analytics Dashboard") available_months = sorted(daily["MONTH"].unique().to_list()) selected_months = st.sidebar.multiselect("Filter by Month:", available_months, default=available_months) available_years_daily = sorted(daily["YEAR"].unique().to_list()) selected_years_daily = st.sidebar.multiselect("Filter by Year (Tab 1 & 2):", available_years_daily, default=available_years_daily) available_years_items = sorted(items["YEAR"].unique().to_list()) selected_years_items = st.sidebar.multiselect("Filter by Year (Tab 3):", available_years_items, default=available_years_items) selected_categories = st.sidebar.multiselect("Filter by Category:", daily["CATEGORY"].unique().to_list(), default=daily["CATEGORY"].unique().to_list()) selected_brands = st.sidebar.multiselect("Filter by Brand:", daily["BRAND"].unique().to_list(), default=daily["BRAND"].unique().to_list()) interest_level_tab1 = st.sidebar.slider("Interest Level (Tab 1):", 0, 10, 5) interest_level_tab2 = st.sidebar.slider("Interest Level (Tab 2):", 0, 10, 5) interest_level_tab3 = st.sidebar.slider("Interest Level (Tab 3):", 0, 10, 5) tabs = st.tabs(["Top 5 Products by Weekly Sales (Excl. Fuels)", "Brands to Drop in Packaged Beverages", "Cash vs Credit Customer Comparison"]) with tabs[0]: filtered_daily = daily.filter(pl.col("MONTH").is_in(selected_months) & pl.col("YEAR").is_in(selected_years_daily) & pl.col("CATEGORY").is_in(selected_categories) & pl.col("BRAND").is_in(selected_brands)) dsmerged = filtered_daily.join(gtin.select(["GTIN", "CATEGORY", "SKUPOS_DESCRIPTION"]), on="GTIN", how="left") filtered = dsmerged.filter(pl.col("CATEGORY").str.to_lowercase() != "fuel") weekly_sales = ( filtered .group_by(["SKUPOS_DESCRIPTION", "WEEK_NUM", "YEAR"]) .agg(pl.col("TOTAL_REVENUE_AMOUNT").sum().alias("WEEKLY_SALES")) ) top_products = ( weekly_sales .group_by("SKUPOS_DESCRIPTION") .agg(pl.col("WEEKLY_SALES").sum().alias("TOTAL_SALES")) .sort("TOTAL_SALES", descending=True) .head(5) ) top_product_names = top_products["SKUPOS_DESCRIPTION"].to_list() top_weekly_sales = weekly_sales.filter(pl.col("SKUPOS_DESCRIPTION").is_in(top_product_names)).to_pandas() table = top_weekly_sales.groupby("SKUPOS_DESCRIPTION")["WEEKLY_SALES"].sum().sort_values(ascending=False).reset_index() kpi_1 = top_weekly_sales["WEEKLY_SALES"].sum() kpi_2 = top_products["TOTAL_SALES"].sum() kpi_3 = top_products["SKUPOS_DESCRIPTION"].n_unique() kpi_4 = top_weekly_sales["WEEKLY_SALES"].mean() kpi_5 = top_weekly_sales["WEEKLY_SALES"].std() col1, col2, col3, col4, col5 = st.columns(5) col1.metric("Total Sales for Top 5 Products", f"${kpi_1:,.2f}") col2.metric("Total Sales Value (Top Products)", f"${kpi_2:,.2f}") col3.metric("Unique Products in Top 5", kpi_3) col4.metric("Avg Weekly Sales", f"${kpi_4:,.2f}") col5.metric("Weekly Sales Std Dev", f"${kpi_5:,.2f}") st.write(table) fig_line = px.line( top_weekly_sales, x="WEEK_NUM", y="WEEKLY_SALES", color="SKUPOS_DESCRIPTION", title="Weekly Sales Over Time - Top 5 Products (Excl. Fuel)", labels={"WEEK_NUM": "Week Number", "WEEKLY_SALES": "Weekly Sales"}, ) st.plotly_chart(fig_line) box_chart = alt.Chart(top_weekly_sales).mark_boxplot().encode( x="SKUPOS_DESCRIPTION:N", y="WEEKLY_SALES:Q" ).properties( title="Distribution of Weekly Sales - Top 5 Products" ) st.altair_chart(box_chart) fig_overlay = px.histogram( top_weekly_sales, x="WEEKLY_SALES", color="SKUPOS_DESCRIPTION", nbins=20, title="Distribution of Weekly Sales - Top 5 Products (Excl. Fuel)", labels={"WEEKLY_SALES": "Weekly Sales", "count": "Frequency"}, opacity=0.7, barmode="overlay" ) st.plotly_chart(fig_overlay) with tabs[1]: filtered_daily = daily.filter(pl.col("MONTH").is_in(selected_months) & pl.col("YEAR").is_in(selected_years_daily) & pl.col("CATEGORY").is_in(selected_categories) & pl.col("BRAND").is_in(selected_brands)) dsmerged = filtered_daily.join(gtin.select(["GTIN", "CATEGORY", "SKUPOS_DESCRIPTION"]), on="GTIN", how="left") beverages = dsmerged.filter(pl.col("CATEGORY").str.contains("Beverage")) weekly_sales = ( beverages .group_by(["SKUPOS_DESCRIPTION", "WEEK_NUM", "YEAR"]) .agg(pl.col("TOTAL_REVENUE_AMOUNT").sum().alias("WEEKLY_SALES")) ) bottom_products = ( weekly_sales .group_by("SKUPOS_DESCRIPTION") .agg(pl.col("WEEKLY_SALES").sum().alias("TOTAL_SALES")) .sort("TOTAL_SALES") .head(5) ) bottom_product_names = bottom_products["SKUPOS_DESCRIPTION"].to_list() bottom_weekly_sales = weekly_sales.filter(pl.col("SKUPOS_DESCRIPTION").is_in(bottom_product_names)).to_pandas() table = bottom_weekly_sales.groupby("SKUPOS_DESCRIPTION")["WEEKLY_SALES"].sum().sort_values().reset_index() kpi_1 = bottom_weekly_sales["WEEKLY_SALES"].sum() kpi_2 = bottom_products["TOTAL_SALES"].sum() kpi_3 = bottom_products["SKUPOS_DESCRIPTION"].n_unique() kpi_4 = bottom_weekly_sales["WEEKLY_SALES"].mean() kpi_5 = bottom_weekly_sales["WEEKLY_SALES"].std() col1, col2, col3, col4, col5 = st.columns(5) col1.metric("Total Sales for Bottom Products", f"${kpi_1:,.2f}") col2.metric("Total Sales Value (Bottom Products)", f"${kpi_2:,.2f}") col3.metric("Unique Products in Bottom 5", kpi_3) col4.metric("Avg Weekly Sales", f"${kpi_4:,.2f}") col5.metric("Weekly Sales Std Dev", f"${kpi_5:,.2f}") st.write("Least Popular Packaged Beverages:") st.write(table) fig = px.treemap( table, path=["SKUPOS_DESCRIPTION"], values="WEEKLY_SALES", title="Treemap - Total Sales for Least Popular Packaged Beverages" ) st.plotly_chart(fig) fig = px.density_contour( bottom_weekly_sales, x="WEEKLY_SALES", y="SKUPOS_DESCRIPTION", title="Density Contour - Sales Concentration by Product" ) fig.update_traces(contours_coloring="fill", showscale=True) st.plotly_chart(fig) fig_violin = px.violin( bottom_weekly_sales, y="WEEKLY_SALES", x="SKUPOS_DESCRIPTION", box=True, points="all", title="Violin Plot - Weekly Sales Distribution (Bottom 5 Beverages)" ) fig_violin.update_layout( xaxis_title="Product", yaxis_title="Weekly Sales" ) st.plotly_chart(fig_violin) with tabs[2]: filtered_items = items.filter(pl.col("MONTH").is_in(selected_months) & pl.col("YEAR").is_in(selected_years_items)) merged = filtered_items.join( sets.select(["TRANSACTION_SET_ID", "PAYMENT_TYPE", "GRAND_TOTAL_AMOUNT"]), on="TRANSACTION_SET_ID", how="left" ).join( gtin.select(["GTIN", "SKUPOS_DESCRIPTION"]), on="GTIN", how="left" ) grouped = merged.group_by(["PAYMENT_TYPE", "SKUPOS_DESCRIPTION"]).agg([ pl.col("UNIT_QUANTITY").sum().alias("TOTAL_ITEMS"), pl.col("GRAND_TOTAL_AMOUNT").sum().alias("TOTAL_SPENT") ]) top_products_by_payment = ( grouped.sort("TOTAL_ITEMS", descending=True) .group_by("PAYMENT_TYPE") .head(5) ) total_metrics = merged.group_by("PAYMENT_TYPE").agg([ pl.col("UNIT_QUANTITY").sum().alias("TOTAL_ITEMS"), pl.col("GRAND_TOTAL_AMOUNT").sum().alias("TOTAL_SPENT") ]) kpi_1 = total_metrics["TOTAL_SPENT"].sum() kpi_2 = total_metrics["TOTAL_ITEMS"].sum() kpi_3 = total_metrics["PAYMENT_TYPE"].n_unique() kpi_4 = total_metrics["TOTAL_ITEMS"].mean() kpi_5 = total_metrics["TOTAL_SPENT"].std() col1, col2, col3, col4, col5 = st.columns(5) col1.metric("Total Spend", f"${kpi_1:,.2f}") col2.metric("Total Items Sold", kpi_2) col3.metric("Unique Payment Methods", kpi_3) col4.metric("Avg Items Sold", kpi_4) col5.metric("Spend Std Dev", f"${kpi_5:,.2f}") st.write("Top Products for Each Payment Type:") st.write(top_products_by_payment) st.write("\nTotal Comparison - Cash vs Credit:") st.write(total_metrics) time_series = merged.group_by(["YEAR_MONTH", "PAYMENT_TYPE"]).agg( pl.col("GRAND_TOTAL_AMOUNT").sum().alias("TOTAL_SPENT") ).sort("YEAR_MONTH").to_pandas() fig = px.line( time_series, x="YEAR_MONTH", y="TOTAL_SPENT", color="PAYMENT_TYPE", title="Monthly Spend by Payment Type" ) st.plotly_chart(fig) treemap_data = ( top_products_by_payment .filter( pl.col("PAYMENT_TYPE").is_not_null() & pl.col("SKUPOS_DESCRIPTION").is_not_null() & (pl.col("SKUPOS_DESCRIPTION") != "") ) .to_pandas() ) fig = px.treemap( treemap_data, path=["PAYMENT_TYPE", "SKUPOS_DESCRIPTION"], values="TOTAL_SPENT", title="Treemap: Product Spend by Payment Type" ) st.plotly_chart(fig) heatmap_data = top_products_by_payment.to_pandas().pivot( index="SKUPOS_DESCRIPTION", columns="PAYMENT_TYPE", values="TOTAL_ITEMS" ) fig = go.Figure(data=go.Heatmap( z=heatmap_data.values, x=heatmap_data.columns, y=heatmap_data.index, colorscale='YlGnBu', colorbar=dict(title="Total Items"), )) fig.update_layout( title="Top Products by Total Items Purchased (Cash vs Credit)", xaxis_title="Payment Type", yaxis_title="Product", xaxis=dict(tickmode='array', tickvals=list(range(len(heatmap_data.columns))), ticktext=heatmap_data.columns), yaxis=dict(tickmode='array', tickvals=list(range(len(heatmap_data.index))), ticktext=heatmap_data.index), height=600, width=1000 ) st.plotly_chart(fig)