import os from pathlib import Path import random import numpy as np import pandas as pd import plotly.express as px import plotly.graph_objects as go import streamlit as st st.set_page_config(page_title="Freshie", page_icon="🐱", layout="wide", initial_sidebar_state="expanded") def inject_css(): st.markdown( """ """, unsafe_allow_html=True, ) def find_data_path() -> Path | None: candidates = [ Path("/app/perishable_goods_management.csv"), Path("perishable_goods_management.csv"), Path("/mnt/data/perishable_goods_management.csv"), ] for p in candidates: if p.exists(): return p return None @st.cache_data(show_spinner=False) def load_data() -> pd.DataFrame: path = find_data_path() if path is None: raise FileNotFoundError("perishable_goods_management.csv not found in /app or current folder.") df = pd.read_csv(path) # Standardize likely date field date_col = None for c in ["transaction_date", "date", "Date"]: if c in df.columns: date_col = c break if date_col is None: raise ValueError("No transaction date column found.") if date_col != "transaction_date": df = df.rename(columns={date_col: "transaction_date"}) df["transaction_date"] = pd.to_datetime(df["transaction_date"], errors="coerce") df = df.dropna(subset=["transaction_date"]).copy() # Expected columns with fallbacks if "discount_pct" not in df.columns: if "discount_percentage" in df.columns: df["discount_pct"] = df["discount_percentage"] else: df["discount_pct"] = 0.0 for col in ["units_sold", "initial_quantity", "daily_demand", "profit", "days_until_expiry"]: if col not in df.columns: raise ValueError(f"Missing required column: {col}") if "category" not in df.columns: df["category"] = "Unknown" if "region" not in df.columns: df["region"] = "Unknown" if "store_id" not in df.columns: df["store_id"] = "STORE_001" if "product_name" not in df.columns: df["product_name"] = df["category"].astype(str) if "selling_price" not in df.columns and "price" in df.columns: df["selling_price"] = df["price"] if "base_price" not in df.columns and "selling_price" in df.columns: df["base_price"] = df["selling_price"] / (1 - df["discount_pct"].replace(1, 0.999).clip(0, 0.99)) if "base_price" not in df.columns: df["base_price"] = 1.0 if "selling_price" not in df.columns: df["selling_price"] = 1.0 df["discount_pct"] = df["discount_pct"].fillna(0) if df["discount_pct"].max() > 1: df["discount_pct"] = df["discount_pct"] / 100 df["discount_pct"] = df["discount_pct"].clip(0, 1) if "waste_pct" not in df.columns: denom = df["initial_quantity"].replace(0, np.nan) if "units_wasted" in df.columns: df["waste_pct"] = (df["units_wasted"] / denom).fillna(0) else: df["waste_pct"] = 0.0 if df["waste_pct"].max() > 1: df["waste_pct"] = df["waste_pct"] / 100 df["waste_pct"] = df["waste_pct"].clip(0, 1) if "units_wasted" not in df.columns: df["units_wasted"] = (df["waste_pct"] * df["initial_quantity"]).round() df["leftover_units"] = (df["initial_quantity"] - df["units_sold"]).clip(lower=0) df["stockout_flag"] = (df["daily_demand"] > df["initial_quantity"]).astype(int) df["lost_sales_units"] = (df["daily_demand"] - df["units_sold"]).clip(lower=0) df["sell_through_pct"] = (df["units_sold"] / df["initial_quantity"].replace(0, np.nan)).fillna(0).clip(0, 1) df["month"] = df["transaction_date"].dt.strftime("%Y-%m") df["is_weekend"] = (df["transaction_date"].dt.dayofweek >= 5).astype(int) df["expiry_bucket"] = pd.cut( df["days_until_expiry"], bins=[-1, 1, 3, 7, 30, 10000], labels=["<=1d", "2-3d", "4-7d", "8-30d", ">30d"], ).astype(str) return df PCA_SHAP_LIBRARY = { "Bakery": { "avg_profit": 329.82, "avg_margin": 0.0573, "avg_waste": 0.1535, "avg_units_sold": 216.88, "avg_daily_demand": 114.32, "region_profit": [("Southwest", 314.96), ("Midwest", 319.24), ("Southeast", 324.43), ("Northeast", 332.06), ("West", 350.22)], "business": "Bakery is a high-turnover category with positive profit but thin margins. It behaves like a traffic driver: demand is strong, but a large part of profit quality depends on pricing discipline and waste control.", "pc1": 0.3441, "pc2": 0.2786, "pc1_theme": "Profitability efficiency axis", "pc1_loadings": [("avg_profit_margin", "+0.529"), ("avg_days_until_expiry", "+0.429"), ("avg_sell_through", "+0.412"), ("avg_waste_pct", "-0.412"), ("avg_discount_pct", "-0.409")], "pc2_theme": "Demand and replenishment intensity axis", "pc2_loadings": [("avg_daily_demand", "+0.543"), ("avg_waste_pct", "-0.406"), ("avg_sell_through", "+0.406"), ("avg_initial_qty", "+0.379"), ("avg_discount_pct", "+0.351")], "clusters": [ ("Cluster A · Core winners", "Demand 85.13 · Sell-through 98.09% · Waste 1.91% · Margin 30.99%", "Protect availability, keep discounting light, and avoid stockouts."), ("Cluster B · Traffic drivers but margin issue", "Demand 281.09 · Sell-through 94.90% · Waste 5.10% · Margin -9.91%", "Review price, cost, and promotion depth before cutting stock."), ("Cluster C · Balanced mid-tier", "Demand 89.10 · Sell-through 69.40% · Waste 30.60% · Margin 3.87%", "Use tighter replenishment and light tactical markdowns."), ("Cluster D · High-waste losers", "Demand 71.55 · Sell-through 49.53% · Waste 50.47% · Margin -95.47%", "Reduce initial stock, markdown earlier, bundle, and trim weak-SKU stores."), ], "r2": 0.809, "mae": 227.69, "drivers": ["discount_pct", "base_price", "initial_quantity", "cost_price", "markdown_applied", "daily_demand", "days_until_expiry", "shelf_life_days", "is_weekend", "spoilage_risk"], "conclusion": "Bakery profit is mainly shaped by discounting, pricing, stock depth, and demand strength rather than expiry alone.", "actions": [ "Review price and cost on high-demand but negative-margin SKUs.", "Start markdown earlier for high-waste items instead of waiting until the last moment.", "Use weekend-sensitive promotions in a more targeted rhythm.", ], }, "Meat": { "avg_profit": 777.27, "avg_margin": 0.1411, "avg_waste": 0.1820, "avg_units_sold": 213.96, "avg_daily_demand": 73.30, "region_profit": [("Northeast", 757.70), ("Southwest", 765.17), ("Southeast", 779.24), ("Midwest", 793.72), ("West", 799.80)], "business": "Meat is the healthiest of the three in profit contribution. Waste is meaningful, but margin quality is stronger than Bakery, so it is worth managing through precision replenishment and loss control.", "pc1": 0.2960, "pc2": 0.2060, "pc1_theme": "Operational efficiency axis", "pc1_loadings": [("avg_profit_margin", "+0.498"), ("avg_sell_through", "+0.488"), ("avg_waste_pct", "-0.488"), ("avg_days_until_expiry", "+0.356"), ("avg_discount_pct", "-0.339")], "pc2_theme": "Sales scale and stocking intensity axis", "pc2_loadings": [("avg_daily_demand", "+0.627"), ("avg_initial_qty", "+0.487"), ("avg_days_until_expiry", "-0.356"), ("avg_discount_pct", "+0.347")], "clusters": [ ("Cluster A · Premium winners", "Demand 58.81 · Sell-through 97.74% · Waste 2.26% · Margin 38.42%", "Protect availability for top sellers and avoid over-restricting stock."), ("Cluster B · Strong traffic winners", "Demand 184.22 · Sell-through 89.29% · Waste 10.71% · Margin 10.44%", "Keep these as stable replenishment anchors."), ("Cluster C · Manageable mid-tier", "Demand 58.11 · Sell-through 68.51% · Waste 31.49% · Margin 8.76%", "Fine-tune inventory and use local promotions only where needed."), ("Cluster D · High-waste loss cluster", "Demand 46.16 · Sell-through 42.50% · Waste 57.50% · Margin -86.58%", "Lower stock, tighten cold-chain rhythm, and optimize assortment by store."), ], "r2": 0.720, "mae": 464.22, "drivers": ["discount_pct", "initial_quantity", "base_price", "cost_price", "days_until_expiry", "daily_demand", "markdown_applied", "is_weekend", "spoilage_risk", "shelf_life_days"], "conclusion": "Meat profit is driven by discounting, stocking quantity, pricing, cost, and expiry pressure; compared with Bakery, it is more sensitive to initial quantity and days until expiry.", "actions": [ "Cut initial quantity first for high-waste clusters.", "Use earlier and lighter markdowns on SKUs with real demand but rising expiry pressure.", "Protect availability on the high-profit core group instead of applying blanket stock cuts.", ], }, "Ready_to_Eat": { "avg_profit": 182.98, "avg_margin": -0.1151, "avg_waste": 0.1456, "avg_units_sold": 220.33, "avg_daily_demand": 186.41, "region_profit": [("Northeast", 162.08), ("Southwest", 169.03), ("West", 183.12), ("Southeast", 189.53), ("Midwest", 224.99)], "business": "Ready-to-Eat shows the strongest contradiction: very high demand and sales volume, but negative average margin. The category sells, but the current commercial logic does not monetize that demand well.", "pc1": 0.3428, "pc2": 0.2842, "pc1_theme": "Promotion and expiry pressure axis", "pc1_loadings": [("avg_discount_pct", "+0.536"), ("avg_days_until_expiry", "-0.520"), ("avg_profit_margin", "-0.486"), ("avg_daily_demand", "+0.299"), ("avg_spoilage_risk", "+0.291")], "pc2_theme": "Waste versus sell-through axis", "pc2_loadings": [("avg_waste_pct", "+0.616"), ("avg_sell_through", "-0.616"), ("avg_daily_demand", "-0.344")], "clusters": [ ("Cluster A · Profitable fast movers", "Demand 137.37 · Sell-through 98.45% · Waste 1.55% · Margin 10.76%", "Protect supply and avoid stockouts."), ("Cluster B · Traffic drivers but deeply margin-negative", "Demand 564.13 · Sell-through 95.40% · Waste 4.60% · Margin -33.18%", "Fix price and promotion depth first."), ("Cluster C · High waste and deepest loss", "Demand 149.11 · Sell-through 58.54% · Waste 41.46% · Margin -111.87%", "Cut stock immediately, markdown earlier, bundle, and prune poor store fits."), ("Cluster D · Neutral mid-tier", "Demand 123.52 · Sell-through 69.63% · Waste 30.37% · Margin 0.45%", "Use light promotion and micro-replenishment changes to move toward Cluster A."), ], "r2": 0.823, "mae": 278.78, "drivers": ["discount_pct", "base_price", "cost_price", "initial_quantity", "markdown_applied", "days_until_expiry", "daily_demand", "spoilage_risk", "shelf_life_days", "is_weekend"], "conclusion": "Discount intensity is the dominant profit driver in Ready-to-Eat, followed by pricing, cost, and stock depth. The main managerial lever is promotion redesign before inventory expansion.", "actions": [ "Review discount_pct first; move from blanket discounts to targeted markdowns.", "Raise or reset pricing on high-demand hero items before adding more stock.", "Shorten replenishment cycles and reduce single-drop quantity on the high-waste group.", ], }, } def _sync_region_from_store(df: pd.DataFrame): stores = st.session_state.get("store_filter", []) if stores: inferred = sorted(df.loc[df["store_id"].isin(stores), "region"].dropna().unique().tolist()) st.session_state["region_filter"] = inferred def _sync_store_from_region(df: pd.DataFrame): regions = st.session_state.get("region_filter", []) current_stores = st.session_state.get("store_filter", []) allowed = sorted(df.loc[df["region"].isin(regions), "store_id"].dropna().unique().tolist()) if regions else sorted(df["store_id"].dropna().unique().tolist()) st.session_state["store_filter"] = [s for s in current_stores if s in allowed] def build_supply_balancing_table(df: pd.DataFrame) -> pd.DataFrame: work = df.copy() grouped = work.groupby(["region", "store_id", "category"], dropna=False).agg( remaining_inventory=("leftover_units", "mean"), demand=("daily_demand", "mean"), unmet_demand=("lost_sales_units", "mean"), avg_days_until_expiry=("days_until_expiry", "mean"), waste_pct=("waste_pct", "mean"), avg_profit=("profit", "mean"), ).reset_index() grouped["surplus_qty"] = (grouped["remaining_inventory"] - grouped["demand"]).clip(lower=0) return grouped def filter_scope(df: pd.DataFrame, selected_region="All", selected_store="All", selected_category="All") -> pd.DataFrame: scoped = df.copy() if selected_store != "All": scoped = scoped[scoped["store_id"] == selected_store] elif selected_region != "All": scoped = scoped[scoped["region"] == selected_region] if selected_category != "All": scoped = scoped[scoped["category"] == selected_category] return scoped def apply_filters(df: pd.DataFrame): st.sidebar.header("Filters") if "region_filter" not in st.session_state: st.session_state["region_filter"] = [] if "store_filter" not in st.session_state: st.session_state["store_filter"] = [] regions = sorted(df["region"].dropna().unique().tolist()) selected_regions = st.sidebar.multiselect( "Region", regions, key="region_filter", on_change=_sync_store_from_region, args=(df,), ) available_stores = sorted( df.loc[df["region"].isin(selected_regions), "store_id"].dropna().unique().tolist() ) if selected_regions else sorted(df["store_id"].dropna().unique().tolist()) # trim stale selections before widget render st.session_state["store_filter"] = [s for s in st.session_state.get("store_filter", []) if s in available_stores] selected_stores = st.sidebar.multiselect( "Store", available_stores, key="store_filter", on_change=_sync_region_from_store, args=(df,), ) if selected_stores: inferred_regions = sorted(df.loc[df["store_id"].isin(selected_stores), "region"].dropna().unique().tolist()) selected_regions = inferred_regions st.sidebar.caption("Auto region: " + ", ".join(inferred_regions)) category_options = ["All"] + sorted(df["category"].dropna().unique().tolist()) chosen_category = st.sidebar.selectbox("Category", category_options) day_type = st.sidebar.selectbox("Day type", ["All", "Weekday", "Weekend"]) use_expiry = st.sidebar.checkbox("Limit to inventory below 60 days until expiry", value=False) expiry_range = (0, 60) if use_expiry: expiry_range = st.sidebar.slider("Days until expiry", 0, 60, (0, 60)) exec_scope = df.copy() if selected_regions: exec_scope = exec_scope[exec_scope["region"].isin(selected_regions)] if selected_stores: exec_scope = exec_scope[exec_scope["store_id"].isin(selected_stores)] if day_type == "Weekday": exec_scope = exec_scope[exec_scope["is_weekend"] == 0] elif day_type == "Weekend": exec_scope = exec_scope[exec_scope["is_weekend"] == 1] if use_expiry: exec_scope = exec_scope[(exec_scope["days_until_expiry"] >= expiry_range[0]) & (exec_scope["days_until_expiry"] <= expiry_range[1])] filtered = exec_scope.copy() if chosen_category != "All": filtered = filtered[filtered["category"] == chosen_category] scope_info = { "stores": selected_stores, "regions": selected_regions, "category": chosen_category, "day_type": day_type, "use_expiry": use_expiry, "expiry_range": expiry_range, } return filtered, exec_scope, scope_info def build_category_recommendations(row: pd.Series, cat_summary: pd.DataFrame) -> list[str]: waste_mean = float(cat_summary["waste_pct"].mean()) if len(cat_summary) else 0 waste_p75 = float(cat_summary["waste_pct"].quantile(0.75)) if len(cat_summary) else waste_mean demand_mean = float(cat_summary["avg_demand"].mean()) if len(cat_summary) else 0 demand_p75 = float(cat_summary["avg_demand"].quantile(0.75)) if len(cat_summary) else demand_mean profit_mean = float(cat_summary["avg_profit"].mean()) if len(cat_summary) else 0 stockout_mean = float(cat_summary["stockout_rate"].mean()) if len(cat_summary) else 0 stockout_p75 = float(cat_summary["stockout_rate"].quantile(0.75)) if len(cat_summary) else stockout_mean sell_mean = float(cat_summary["sell_through"].mean()) if len(cat_summary) else 0 advice: list[str] = [] if row["waste_pct"] >= max(0.30, waste_p75): advice.append("start markdown earlier") if row["avg_demand"] >= demand_p75 and row["stockout_rate"] >= max(0.10, stockout_mean, stockout_p75 * 0.8): advice.append("increase replenishment") if row["avg_profit"] < 0 and row["avg_demand"] < demand_mean: advice.append("reduce replenishment") if row["waste_pct"] > waste_mean or row["avg_profit"] < profit_mean: advice.append("review mix and margin") if row["sell_through"] < max(0.45, sell_mean * 0.8): advice.append("prune weak SKUs / stores") if not advice: advice.append("maintain current playbook") # preserve order and uniqueness return list(dict.fromkeys(advice)) def _safe_profit_margin(df: pd.DataFrame) -> pd.Series: revenue = (df["selling_price"] * df["units_sold"]).replace(0, np.nan) return (df["profit"] / revenue).replace([np.inf, -np.inf], np.nan).fillna(0) def _assign_cluster_names(cluster_summary: pd.DataFrame) -> pd.DataFrame: ranked = cluster_summary.copy() ranked["score"] = ( ranked["margin"] * 0.45 + ranked["sell_through"] * 0.35 - ranked["waste"] * 0.20 ) ranked = ranked.sort_values(["score", "avg_profit"], ascending=[False, False]).reset_index(drop=True) names: list[str] = [] used: dict[str, int] = {} for i, row in ranked.iterrows(): if i == 0 and row["margin"] >= 0: base = "Core winners" elif row["margin"] < 0 and row["sell_through"] >= ranked["sell_through"].median(): base = "Traffic but unprofitable" elif row["waste"] >= ranked["waste"].median(): base = "High-waste cluster" else: base = "Mid-tier cluster" used[base] = used.get(base, 0) + 1 suffix = "" if used[base] == 1 else f" {used[base]}" names.append(f"{base}{suffix}") ranked["cluster_name"] = names return ranked.drop(columns=["score"]) def _cluster_color_map(cluster_names: list[str]) -> dict[str, str]: palette = { "Core winners": "#1f77b4", "Traffic but unprofitable": "#ff7f0e", "High-waste cluster": "#d62728", "Mid-tier cluster": "#7f7f7f", } mapping: dict[str, str] = {} for name in cluster_names: for base, color in palette.items(): if name.startswith(base): mapping[name] = color break else: mapping[name] = "#17becf" return mapping def _classify_product_segment(row: pd.Series, sell_thr: float, waste_thr: float) -> str: if row["avg_sell_through"] >= sell_thr and row["avg_profit"] >= 0: return "Scale ⭐" if row["avg_sell_through"] >= sell_thr and row["avg_profit"] < 0: return "Fix Pricing ⚠️" if row["avg_sell_through"] < sell_thr and row["avg_profit"] >= 0: return "Optimize 🛠" if row["avg_waste_pct"] >= waste_thr: return "Reduce ❌" return "Optimize 🛠" def _generate_sku_strategy(row: pd.Series, sell_thr: float, waste_thr: float, discount_thr: float, demand_thr: float) -> tuple[str, str]: actions: list[str] = [] if row["segment"] == "Scale ⭐": actions.extend(["protect availability", "keep discounts selective"]) if row["avg_discount_pct"] > discount_thr: actions.append("test a lighter markdown") headline = "Scale distribution" elif row["segment"] == "Fix Pricing ⚠️": actions.extend(["raise effective price or trim discount", "hold replenishment steady"]) if row["avg_discount_pct"] > discount_thr: actions.append("tighten markdown depth") if row["avg_daily_demand"] >= demand_thr: actions.append("treat as hero SKU, but monetize demand better") headline = "Fix pricing before adding stock" elif row["segment"] == "Reduce ❌": actions.extend(["cut initial quantity", "markdown earlier", "review store fit / assortment"]) headline = "Reduce exposure" else: actions.extend(["fine-tune stock depth", "use targeted promotion only where needed"]) if row["avg_waste_pct"] > waste_thr: actions.append("bring markdown trigger forward") headline = "Optimize locally" if row["avg_waste_pct"] >= waste_thr and "markdown earlier" not in actions: actions.append("reduce waste with earlier markdown") if row["avg_profit_margin"] < 0 and "review price / cost" not in actions and row["segment"] != "Reduce ❌": actions.append("review price / cost") if row["avg_sell_through"] < sell_thr * 0.85: actions.append("check demand realism before restocking") action_text = "; ".join(dict.fromkeys(actions)) return headline, action_text def _simulate_discount_profit(product_scope: pd.DataFrame, new_discount_pct: float) -> dict: sim = product_scope.copy() if sim.empty: return {} new_discount_pct = float(np.clip(new_discount_pct, 0, 0.8)) current_discount = float(sim["discount_pct"].mean()) base_price = sim["base_price"].replace(0, np.nan).fillna(sim["selling_price"]).astype(float) cost_price = sim["cost_price"].astype(float) if "cost_price" in sim.columns else sim["selling_price"].astype(float) * 0.7 initial_qty = sim["initial_quantity"].astype(float).clip(lower=0) daily_demand = sim["daily_demand"].astype(float).clip(lower=0) discount_delta = new_discount_pct - current_discount demand_multiplier = 1 + discount_delta * 1.35 demand_multiplier = np.clip(demand_multiplier, 0.65, 1.45) sim_price = base_price * (1 - new_discount_pct) sim_units = np.minimum(initial_qty, daily_demand * demand_multiplier).clip(lower=0) sim_leftover = (initial_qty - sim_units).clip(lower=0) sim_profit = ((sim_price - cost_price) * sim_units - cost_price * 0.35 * sim_leftover).mean() current_profit = float(sim["profit"].mean()) current_units = float(sim["units_sold"].mean()) sim_waste_pct = float((sim_leftover / initial_qty.replace(0, np.nan)).fillna(0).mean()) return { "current_discount": current_discount, "new_discount": new_discount_pct, "current_profit": current_profit, "sim_profit": float(sim_profit), "profit_delta": float(sim_profit - current_profit), "current_units": current_units, "sim_units": float(sim_units.mean()), "units_delta": float(sim_units.mean() - current_units), "sim_waste_pct": sim_waste_pct, } def _run_category_deep_dive(cat_df: pd.DataFrame) -> dict | None: if cat_df.empty: return None working = cat_df.copy() working["profit_margin"] = _safe_profit_margin(working) if "spoilage_risk" not in working.columns: working["spoilage_risk"] = working["waste_pct"] if "markdown_applied" not in working.columns: working["markdown_applied"] = (working["discount_pct"] > 0).astype(int) if "shelf_life_days" not in working.columns: working["shelf_life_days"] = working["days_until_expiry"] entity = working.groupby(["store_id", "product_name"], dropna=False).agg( region=("region", "first"), avg_profit=("profit", "mean"), avg_profit_margin=("profit_margin", "mean"), avg_waste_pct=("waste_pct", "mean"), avg_sell_through=("sell_through_pct", "mean"), avg_discount_pct=("discount_pct", "mean"), avg_daily_demand=("daily_demand", "mean"), avg_initial_qty=("initial_quantity", "mean"), avg_days_until_expiry=("days_until_expiry", "mean"), avg_units_sold=("units_sold", "mean"), avg_cost_price=("cost_price", "mean") if "cost_price" in working.columns else ("selling_price", "mean"), avg_base_price=("base_price", "mean"), avg_markdown_applied=("markdown_applied", "mean"), avg_is_weekend=("is_weekend", "mean"), avg_spoilage_risk=("spoilage_risk", "mean"), avg_shelf_life_days=("shelf_life_days", "mean"), ).reset_index() if len(entity) < 8: return None from sklearn.cluster import KMeans from sklearn.decomposition import PCA from sklearn.ensemble import RandomForestRegressor from sklearn.metrics import mean_absolute_error, r2_score from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler pca_features = [ "avg_profit_margin", "avg_days_until_expiry", "avg_sell_through", "avg_waste_pct", "avg_discount_pct", "avg_daily_demand", "avg_initial_qty" ] X_pca = entity[pca_features].fillna(entity[pca_features].median()) scaler = StandardScaler() X_scaled = scaler.fit_transform(X_pca) pca = PCA(n_components=2, random_state=42) pca_scores = pca.fit_transform(X_scaled) loadings = pd.DataFrame(pca.components_.T, index=pca_features, columns=["PC1", "PC2"]) pc1_loadings = loadings["PC1"].sort_values(key=np.abs, ascending=False).head(5) pc2_loadings = loadings["PC2"].sort_values(key=np.abs, ascending=False).head(5) cluster_count = 4 if len(entity) >= 20 else max(2, min(4, len(entity) // 4)) km = KMeans(n_clusters=cluster_count, n_init=20, random_state=42) entity["cluster_id"] = km.fit_predict(X_scaled) entity["pc1"] = pca_scores[:, 0] entity["pc2"] = pca_scores[:, 1] cluster_summary = entity.groupby("cluster_id").agg( demand=("avg_daily_demand", "mean"), sell_through=("avg_sell_through", "mean"), waste=("avg_waste_pct", "mean"), margin=("avg_profit_margin", "mean"), avg_profit=("avg_profit", "mean"), sku_store_pairs=("product_name", "count"), ).reset_index() cluster_summary = _assign_cluster_names(cluster_summary) entity = entity.merge(cluster_summary[["cluster_id", "cluster_name"]], on="cluster_id", how="left") cluster_centers = entity.groupby("cluster_name", dropna=False).agg( pc1=("pc1", "mean"), pc2=("pc2", "mean"), avg_profit=("avg_profit", "mean"), ).reset_index() product_summary = entity.groupby("product_name", dropna=False).agg( avg_profit=("avg_profit", "mean"), avg_profit_margin=("avg_profit_margin", "mean"), avg_waste_pct=("avg_waste_pct", "mean"), avg_sell_through=("avg_sell_through", "mean"), avg_discount_pct=("avg_discount_pct", "mean"), avg_daily_demand=("avg_daily_demand", "mean"), avg_initial_qty=("avg_initial_qty", "mean"), avg_days_until_expiry=("avg_days_until_expiry", "mean"), avg_units_sold=("avg_units_sold", "mean"), avg_cost_price=("avg_cost_price", "mean"), avg_base_price=("avg_base_price", "mean"), store_count=("store_id", "nunique"), region_count=("region", "nunique"), observations=("store_id", "count"), ).reset_index() product_summary["health_score"] = ( product_summary["avg_sell_through"] * 0.40 + product_summary["avg_profit_margin"].clip(lower=-1, upper=1) * 0.30 - product_summary["avg_waste_pct"] * 0.20 + (product_summary["avg_profit"] > 0).astype(int) * 0.10 ) product_summary = product_summary.sort_values(["health_score", "avg_profit"], ascending=[False, False]) sell_thr = float(product_summary["avg_sell_through"].median()) waste_thr = float(product_summary["avg_waste_pct"].quantile(0.75)) discount_thr = float(product_summary["avg_discount_pct"].median()) demand_thr = float(product_summary["avg_daily_demand"].median()) product_summary["segment"] = product_summary.apply(lambda r: _classify_product_segment(r, sell_thr, waste_thr), axis=1) strategy_pack = product_summary.apply( lambda r: _generate_sku_strategy(r, sell_thr, waste_thr, discount_thr, demand_thr), axis=1, result_type="expand", ) product_summary[["strategy_headline", "action"]] = strategy_pack good_products = product_summary[product_summary["segment"] == "Scale ⭐"].copy().sort_values( ["avg_profit", "health_score"], ascending=[False, False] ).head(8) risky_products = product_summary[product_summary["segment"].isin(["Fix Pricing ⚠️", "Reduce ❌"])].copy().sort_values( ["avg_profit", "avg_waste_pct"], ascending=[True, False] ).head(8) product_focus = product_summary[[ "product_name", "segment", "strategy_headline", "avg_profit", "avg_profit_margin", "avg_waste_pct", "avg_sell_through", "avg_daily_demand", "avg_discount_pct", "action" ]].copy().sort_values(["segment", "avg_profit"], ascending=[True, False]) model_features = [ "discount_pct", "initial_quantity", "base_price", "cost_price" if "cost_price" in working.columns else "selling_price", "days_until_expiry", "daily_demand", "markdown_applied", "is_weekend", "spoilage_risk", "shelf_life_days" ] model_df = working[model_features + ["profit"]].copy().dropna() if len(model_df) < 20: model_df = working[model_features + ["profit"]].fillna(0) X = model_df[model_features] y = model_df["profit"] test_size = 0.25 if len(model_df) >= 40 else 0.2 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42) rf = RandomForestRegressor(n_estimators=200, max_depth=8, min_samples_leaf=5, random_state=42) rf.fit(X_train, y_train) pred = rf.predict(X_test) importances = pd.Series(rf.feature_importances_, index=model_features).sort_values(ascending=False) category = str(cat_df["category"].dropna().iloc[0]) region_profit = cat_df.groupby("region")["profit"].mean().sort_values().reset_index() good_exists = not good_products.empty pricing_issue_count = int((product_summary["segment"] == "Fix Pricing ⚠️").sum()) reduce_count = int((product_summary["segment"] == "Reduce ❌").sum()) if good_exists: best_names = ", ".join(good_products["product_name"].head(3).tolist()) product_readout = ( f"Within {category}, true scale pockets exist, led by {best_names}. " f"At the same time, {pricing_issue_count} SKU groups are selling well but not monetizing demand, " f"and {reduce_count} look like reduce-or-reassort candidates." ) else: product_readout = ( f"No true scale group is visible in the current {category} scope. " f"Most products sit in pricing-fix or reduce buckets, which points to a category-wide reset rather than isolated fixes." ) return { "category": category, "avg_profit": float(cat_df["profit"].mean()), "avg_margin": float(_safe_profit_margin(cat_df).mean()), "avg_waste": float(cat_df["waste_pct"].mean()), "avg_units_sold": float(cat_df["units_sold"].mean()), "avg_daily_demand": float(cat_df["daily_demand"].mean()), "region_profit": region_profit, "entity": entity, "product_summary": product_summary, "good_products": good_products, "risky_products": risky_products, "product_focus": product_focus, "good_product_count": int(len(good_products)), "good_product_share": float(len(good_products) / max(len(product_summary), 1)), "product_segment_counts": product_summary["segment"].value_counts().to_dict(), "sell_thr": sell_thr, "waste_thr": waste_thr, "discount_thr": discount_thr, "product_readout": product_readout, "pc1": float(pca.explained_variance_ratio_[0]), "pc2": float(pca.explained_variance_ratio_[1]), "pc1_theme": "Efficiency / margin quality axis", "pc2_theme": "Demand / replenishment intensity axis", "pc1_loadings": [(idx, f"{val:+.3f}") for idx, val in pc1_loadings.items()], "pc2_loadings": [(idx, f"{val:+.3f}") for idx, val in pc2_loadings.items()], "cluster_summary": cluster_summary, "cluster_centers": cluster_centers, "cluster_color_map": _cluster_color_map(cluster_summary["cluster_name"].tolist()), "r2": float(r2_score(y_test, pred)) if len(y_test) > 1 else 0.0, "mae": float(mean_absolute_error(y_test, pred)), "drivers": importances, "library": PCA_SHAP_LIBRARY.get(category), } def build_pca_shap_dashboard(scope_df: pd.DataFrame): st.markdown("### PCA / SHAP Deep Dive") st.caption("PCA identifies internal operating structures inside the current scope, while SHAP-style explainability summarizes the main profit drivers for actionable retail levers.") categories = sorted(scope_df["category"].dropna().unique().tolist()) if not categories: st.info("No category remains in the current filter.") return if len(categories) != 1: summary = scope_df.groupby("category").agg( avg_profit=("profit", "mean"), avg_waste=("waste_pct", "mean"), avg_demand=("daily_demand", "mean"), avg_sell_through=("sell_through_pct", "mean"), avg_discount=("discount_pct", "mean"), avg_stock=("initial_quantity", "mean"), ).reset_index() summary["efficiency_index"] = summary["avg_sell_through"] - summary["avg_waste"] - summary["avg_discount"] * 0.25 summary["recommended_focus"] = np.where( summary["avg_profit"] < 0, "Fix pricing / promotion logic first", np.where(summary["avg_waste"] > summary["avg_waste"].median(), "Reduce stock and markdown earlier", "Protect availability and avoid unnecessary discounting"), ) c1, c2, c3, c4 = st.columns(4) c1.metric("Categories in scope", len(summary)) c2.metric("Avg scope profit", f"EUR {scope_df['profit'].mean():.2f}") c3.metric("Avg scope waste", f"{scope_df['waste_pct'].mean():.1%}") c4.metric("Highest-profit category", summary.sort_values("avg_profit", ascending=False).iloc[0]["category"]) left, right = st.columns([1.1, 1]) with left: fig = px.scatter( summary, x="avg_demand", y="avg_waste", size=summary["avg_profit"].abs() + 1, color="category", hover_data=["avg_sell_through", "avg_discount", "avg_stock"], title="Category operating map: demand vs waste", ) st.plotly_chart(fig, use_container_width=True) with right: fig = px.bar( summary.sort_values("avg_profit", ascending=False), x="category", y=["avg_profit", "avg_discount"], barmode="group", title="Profit and discount pressure by category", ) st.plotly_chart(fig, use_container_width=True) st.dataframe(summary[["category", "avg_profit", "avg_waste", "avg_demand", "recommended_focus"]], use_container_width=True, hide_index=True) st.info("For a fuller PCA / SHAP deep dive, narrow the sidebar filter to a single category.") return category = categories[0] cat_df = scope_df[scope_df["category"] == category].copy() deep = _run_category_deep_dive(cat_df) if deep is None: st.info("Not enough observations in the current scope to build a reliable PCA / SHAP deep dive. Try a broader filter.") return snap = deep.get("library") m1, m2, m3, m4, m5 = st.columns(5) m1.metric("Avg profit", f"EUR {deep['avg_profit']:.2f}") m2.metric("Avg margin", f"{deep['avg_margin']:.2%}") m3.metric("Avg waste", f"{deep['avg_waste']:.2%}") m4.metric("Avg units sold", f"{deep['avg_units_sold']:.2f}") m5.metric("Avg daily demand", f"{deep['avg_daily_demand']:.2f}") rp = deep["region_profit"] if not rp.empty: st.markdown("**Regional profit ranking**") st.write(" · ".join([f"{r.region}: EUR {r.profit:.2f}" for r in rp.itertuples(index=False)])) if snap: st.markdown(f"**Business readout:** {snap['business']}") else: readout = f"{category} currently shows average profit of EUR {deep['avg_profit']:.2f}, waste of {deep['avg_waste']:.1%}, and average demand of {deep['avg_daily_demand']:.1f}. The deep dive below highlights which SKU-store mixes behave like winners versus loss makers." st.markdown(f"**Business readout:** {readout}") left, right = st.columns(2) with left: st.markdown("#### PCA insight") st.markdown(f"**PC1 explained variance:** {deep['pc1']:.2%} ") st.markdown(f"**PC2 explained variance:** {deep['pc2']:.2%}") st.caption(f"PC1 theme: {deep['pc1_theme']} · PC2 theme: {deep['pc2_theme']}") pc1_df = pd.DataFrame(deep['pc1_loadings'], columns=['feature', 'value']) pc1_df['value'] = pc1_df['value'].astype(float) fig_pc1 = px.bar(pc1_df.sort_values('value'), x='value', y='feature', orientation='h', title='PC1 loading chart') st.plotly_chart(fig_pc1, use_container_width=True) pc2_df = pd.DataFrame(deep['pc2_loadings'], columns=['feature', 'value']) pc2_df['value'] = pc2_df['value'].astype(float) fig_pc2 = px.bar(pc2_df.sort_values('value'), x='value', y='feature', orientation='h', title='PC2 loading chart') st.plotly_chart(fig_pc2, use_container_width=True) with right: st.markdown("#### SHAP-style profit drivers") st.markdown(f"**Model R²:** {deep['r2']:.3f} ") st.markdown(f"**Model MAE:** {deep['mae']:.2f}") top3 = ", ".join(deep['drivers'].head(3).index.tolist()) st.markdown(f"**Top variables:** {top3}") if snap: st.markdown(f"**Core conclusion:** {snap['conclusion']}") else: st.markdown(f"**Core conclusion:** In the current scope, {category} profit is most strongly shaped by {top3}.") top_driver = deep['drivers'].index[0] if top_driver == 'discount_pct': st.error("⚠️ Core issue: profit is being driven by discounting, so the business is buying demand instead of monetizing it.") else: st.info(f"Main profit driver in this scope: {top_driver}") chart_left, chart_right = st.columns([1.1, 1]) with chart_left: color_map = deep.get('cluster_color_map', {}) ordered_clusters = deep['cluster_summary']['cluster_name'].tolist() fig = px.scatter( deep['entity'], x='pc1', y='pc2', color='cluster_name', category_orders={'cluster_name': ordered_clusters}, color_discrete_map=color_map, hover_data=['cluster_name', 'store_id', 'product_name', 'avg_profit', 'avg_waste_pct', 'avg_sell_through'], title=f"PCA category map for {category}", ) centers = deep.get('cluster_centers', pd.DataFrame()) if not centers.empty: fig.add_trace( go.Scatter( x=centers['pc1'], y=centers['pc2'], mode='markers+text', text=centers['cluster_name'], textposition='top center', marker=dict(size=16, color='black', symbol='x'), name='Cluster center', showlegend=False, ) ) st.plotly_chart(fig, use_container_width=True) with chart_right: driver_df = deep['drivers'].head(10).sort_values(ascending=True).reset_index() driver_df.columns = ['variable', 'importance'] fig = px.bar(driver_df, x='importance', y='variable', orientation='h', color='importance', title='Profit drivers (SHAP-style)') st.plotly_chart(fig, use_container_width=True) cluster_summary = deep['cluster_summary'].copy() cluster_summary['sell_through'] = cluster_summary['sell_through'].map(lambda x: f"{x:.2%}") cluster_summary['waste'] = cluster_summary['waste'].map(lambda x: f"{x:.2%}") cluster_summary['margin'] = cluster_summary['margin'].map(lambda x: f"{x:.2%}") cluster_summary['avg_profit'] = cluster_summary['avg_profit'].map(lambda x: f"EUR {x:.2f}") st.markdown("#### PCA cluster action map") st.dataframe(cluster_summary[["cluster_name", "demand", "sell_through", "waste", "margin", "avg_profit", "sku_store_pairs"]], use_container_width=True, hide_index=True) if snap: actions = pd.DataFrame({"recommended action": snap['actions']}) else: top_cluster = deep['cluster_summary'].sort_values(['avg_profit', 'waste'], ascending=[False, True]).iloc[0]['cluster_name'] worst_cluster = deep['cluster_summary'].sort_values(['avg_profit', 'waste'], ascending=[True, False]).iloc[0]['cluster_name'] actions = pd.DataFrame({"recommended action": [ f"Protect availability and shelf space for {top_cluster.lower()}.", f"Reduce stock depth and markdown earlier in {worst_cluster.lower()}.", "Review price-discount logic on any high-demand item that still carries weak margin.", ]}) st.markdown("#### Recommended actions") st.dataframe(actions, use_container_width=True, hide_index=True) st.markdown("#### Decision map: SKU actions") sku_map = deep["product_summary"][["product_name", "segment", "avg_profit", "avg_sell_through", "avg_waste_pct", "avg_discount_pct"]].copy() sku_map.rename(columns={ "avg_sell_through": "sell_through", "avg_waste_pct": "waste", "avg_discount_pct": "discount_pct", }, inplace=True) if not sku_map.empty: fig_decision = px.scatter( sku_map, x="sell_through", y="avg_profit", color="segment", size=sku_map["waste"].clip(lower=0.001) * 100, hover_data=["product_name", "waste", "discount_pct"], title="SKU decision map: sell-through vs profit", ) fig_decision.add_vline(x=deep.get("sell_thr", float(sku_map["sell_through"].median())), line_dash="dot") fig_decision.add_hline(y=0, line_dash="dot") st.plotly_chart(fig_decision, use_container_width=True) st.markdown("#### Product-level strategy engine") st.info(deep["product_readout"]) seg_counts = deep.get("product_segment_counts", {}) p1, p2, p3, p4 = st.columns(4) p1.metric("Products analysed", len(deep["product_summary"])) p2.metric("Scale ⭐", seg_counts.get("Scale ⭐", 0)) p3.metric("Fix Pricing ⚠️", seg_counts.get("Fix Pricing ⚠️", 0)) p4.metric("Reduce ❌", seg_counts.get("Reduce ❌", 0)) prod_left, prod_right = st.columns(2) with prod_left: winners = deep["good_products"].copy() if winners.empty: st.warning("No product currently qualifies as a true scale winner in this scope.") else: winners_display = winners[["product_name", "segment", "avg_profit", "avg_profit_margin", "avg_sell_through", "avg_waste_pct", "strategy_headline"]].copy() winners_display.columns = ["product_name", "segment", "avg_profit", "margin", "sell_through", "waste", "strategy"] winners_display["avg_profit"] = winners_display["avg_profit"].map(lambda x: f"EUR {x:.2f}") winners_display["margin"] = winners_display["margin"].map(lambda x: f"{x:.1%}") winners_display["sell_through"] = winners_display["sell_through"].map(lambda x: f"{x:.1%}") winners_display["waste"] = winners_display["waste"].map(lambda x: f"{x:.1%}") st.markdown("**Scale (high demand + profitable)**") st.dataframe(winners_display, use_container_width=True, hide_index=True) with prod_right: risk = deep["risky_products"].copy() if risk.empty: st.success("No pricing-fix or reduce candidates are visible in this scope.") else: risk_display = risk[["product_name", "segment", "avg_profit", "avg_profit_margin", "avg_sell_through", "avg_waste_pct", "strategy_headline"]].copy() risk_display.columns = ["product_name", "segment", "avg_profit", "margin", "sell_through", "waste", "strategy"] risk_display["avg_profit"] = risk_display["avg_profit"].map(lambda x: f"EUR {x:.2f}") risk_display["margin"] = risk_display["margin"].map(lambda x: f"{x:.1%}") risk_display["sell_through"] = risk_display["sell_through"].map(lambda x: f"{x:.1%}") risk_display["waste"] = risk_display["waste"].map(lambda x: f"{x:.1%}") st.markdown("**Fix / Reduce (high risk products)**") st.dataframe(risk_display, use_container_width=True, hide_index=True) if not deep["product_focus"].empty: focus = deep["product_focus"].copy() focus["avg_profit"] = focus["avg_profit"].map(lambda x: f"EUR {x:.2f}") focus["avg_profit_margin"] = focus["avg_profit_margin"].map(lambda x: f"{x:.1%}") focus["avg_waste_pct"] = focus["avg_waste_pct"].map(lambda x: f"{x:.1%}") focus["avg_sell_through"] = focus["avg_sell_through"].map(lambda x: f"{x:.1%}") focus["avg_discount_pct"] = focus["avg_discount_pct"].map(lambda x: f"{x:.1%}") focus.rename(columns={ "avg_profit_margin": "margin", "avg_waste_pct": "waste", "avg_sell_through": "sell_through", "avg_daily_demand": "demand", "avg_discount_pct": "discount", "strategy_headline": "strategy", }, inplace=True) st.markdown("#### SKU auto strategy table") st.dataframe(focus[["product_name", "segment", "strategy", "avg_profit", "margin", "sell_through", "waste", "discount", "demand", "action"]], use_container_width=True, hide_index=True) def forecast_filtered_demand(scope_df: pd.DataFrame, label: str = "Filtered selection", model_bundle=None) -> pd.DataFrame: d = scope_df.copy() if d.empty: return pd.DataFrame() hist = ( d.groupby("transaction_date", as_index=False) .agg(daily_demand=("daily_demand", "mean")) .sort_values("transaction_date") ) if len(hist) < 14: return pd.DataFrame() recent = d[d["transaction_date"] >= d["transaction_date"].max() - pd.Timedelta(days=56)].copy() if recent.empty: recent = d.copy() recent_hybrid = add_hybrid_forecast(recent, model_bundle=model_bundle) recent_hybrid["dow"] = recent_hybrid["transaction_date"].dt.dayofweek weekday_actual = recent.groupby(recent["transaction_date"].dt.dayofweek)["daily_demand"].mean().to_dict() weekday_forecast = recent_hybrid.groupby("dow")["forecast_units"].mean().to_dict() fallback_actual = float(hist["daily_demand"].tail(14).mean()) last_date = hist["transaction_date"].max() future_dates = pd.date_range(last_date + pd.Timedelta(days=1), periods=14, freq="D") future_vals = [] for dt in future_dates: dow = int(dt.dayofweek) base = float(weekday_actual.get(dow, fallback_actual)) hybrid = float(weekday_forecast.get(dow, base)) blended = 0.30 * hybrid + 0.70 * base val = float(np.clip(blended, base * 0.95, base * 1.05)) future_vals.append(val) future = pd.DataFrame({ "transaction_date": future_dates, "daily_demand": future_vals, "series": "Forecast", "scope": label, }) hist = hist.tail(60).copy() hist["series"] = "Actual" hist["scope"] = label return pd.concat([hist, future], ignore_index=True) def executive_overview(df: pd.DataFrame, exec_scope: pd.DataFrame, scope_info: dict): st.subheader("FRESHIE · Executive Overview") revenue = float((exec_scope["selling_price"] * exec_scope["units_sold"]).sum()) profit = float(exec_scope["profit"].sum()) units_sold = float(exec_scope["units_sold"].sum()) units_wasted = float(exec_scope["units_wasted"].sum()) c1, c2, c3, c4 = st.columns(4) c1.metric("Revenue", f"EUR {revenue:,.0f}") c2.metric("Profit", f"EUR {profit:,.0f}") c3.metric("Units sold", f"{units_sold:,.0f}") c4.metric("Units wasted", f"{units_wasted:,.0f}") st.markdown( '