"""SegmentX Streamlit dashboard.

Renders three pages (Overview, Segment Profiles, Customer Lookup) from the
customer-segment CSV, the scaled RFM data and the fitted KMeans model that
are read from the paths defined below.
"""

import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.decomposition import PCA
import joblib
import os
from datetime import datetime

# Import paths
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
SCALED_DATA_PATH = os.path.join(BASE_DIR, "data", "processed", "scaled_rfm_data.pkl")
KMEANS_MODEL_PATH = os.path.join(BASE_DIR, "models", "kmeans_model.pkl")
CUSTOMER_SEGMENTS_PATH = os.path.join(BASE_DIR, "outputs", "customer_segments.csv")
SEGMENT_PRODUCTS_PATH = os.path.join(BASE_DIR, "outputs", "segment_products.csv")

# Customers whose Recency exceeds this many days are counted as churn-risk.
CHURN_RECENCY_DAYS = 90

# Set Page Config
st.set_page_config(
    page_title="SegmentX | Customer Intelligence Portal",
    page_icon="💎",
    layout="wide",
    initial_sidebar_state="expanded"
)

# --- Industry-Grade UI Refinement ---
# NOTE(review): the strings passed with unsafe_allow_html=True throughout this
# file contain no HTML/CSS in this copy — presumably the markup was stripped;
# confirm against the original before relying on the rendered layout.
st.markdown(""" """, unsafe_allow_html=True)


@st.cache_data
def load_data():
    """Load the customer-segment table and, if present, the cleaned raw data.

    Returns:
        A ``(df, df_raw)`` tuple. ``df`` is the segments CSV indexed by
        'Customer ID'; ``df_raw`` is the cleaned retail CSV with
        'InvoiceDate' parsed as dates, or ``None`` when that file is missing.
        ``(None, None)`` is returned when the segments CSV does not exist.
    """
    if not os.path.exists(CUSTOMER_SEGMENTS_PATH):
        return None, None

    df = pd.read_csv(CUSTOMER_SEGMENTS_PATH, index_col='Customer ID')

    # Load raw cleaned data for time-series analysis
    RAW_CLEANED_PATH = os.path.join(BASE_DIR, "data", "processed", "cleaned_retail_data.csv")
    if os.path.exists(RAW_CLEANED_PATH):
        df_raw = pd.read_csv(RAW_CLEANED_PATH, parse_dates=['InvoiceDate'])
    else:
        df_raw = None

    return df, df_raw


@st.cache_resource
def load_model():
    """Load the persisted model and the scaled-data object from disk.

    Returns:
        ``(model, data_dict)`` as loaded by joblib, or ``(None, None)`` when
        either file is missing.
    """
    if not os.path.exists(KMEANS_MODEL_PATH) or not os.path.exists(SCALED_DATA_PATH):
        return None, None

    # SECURITY: joblib.load unpickles the file and can execute arbitrary code.
    # Only load artifacts produced by this project's own pipeline.
    model = joblib.load(KMEANS_MODEL_PATH)
    data_dict = joblib.load(SCALED_DATA_PATH)
    return model, data_dict


@st.cache_data
def get_pca_data(scaled_data, labels):
    """Project *scaled_data* onto two principal components.

    Args:
        scaled_data: Feature table; its ``.index`` is reused for the result.
        labels: Values stored in the result's 'Segment' column.

    Returns:
        DataFrame with columns 'PCA1', 'PCA2' and 'Segment'.
    """
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(scaled_data)
    pca_df = pd.DataFrame(X_pca, columns=['PCA1', 'PCA2'], index=scaled_data.index)
    pca_df['Segment'] = labels
    return pca_df


def _churn_risk_pct(frame, threshold_days=CHURN_RECENCY_DAYS):
    """Return the percentage of rows with Recency above *threshold_days* (0.0 if empty)."""
    if frame.empty:
        # Avoids the ZeroDivisionError the inline len()/len() computation hit
        # when the user deselects every segment.
        return 0.0
    return float((frame['Recency'] > threshold_days).mean() * 100)


def _render_overview(df, df_filtered, df_raw, data_dict):
    """Render the Overview page: KPIs, distribution/PCA charts, revenue, risk and export."""
    has_customers = not df_filtered.empty
    risk_pct = _churn_risk_pct(df_filtered)
    # mean() of an empty selection is NaN; show 0 instead of "£nan".
    avg_value = df_filtered['Monetary'].mean() if has_customers else 0.0

    # Interactive Overview
    c1, c2, c3, c4 = st.columns(4)
    c1.metric("Revenue Impact", f"£{df_filtered['Monetary'].sum():,.0f}")
    c2.metric("Customer Scale", f"{len(df_filtered):,}")
    c3.metric("Retention Risk", f"{risk_pct:.1f}%")
    c4.metric("Avg. Order Value", f"£{avg_value:,.1f}")

    st.markdown("\n", unsafe_allow_html=True)

    # Interactive Row 1
    r1_c1, r1_c2 = st.columns([1, 1.2])

    with r1_c1:
        st.markdown("### Segment Distribution")
        # Uses the unfiltered table, so the chart ignores the sidebar filter.
        counts = df['Segment'].value_counts()
        fig = px.pie(
            values=counts.values,
            names=counts.index,
            hole=0.5,
            color_discrete_sequence=px.colors.sequential.ice_r
        )
        fig.update_layout(
            paper_bgcolor='rgba(0,0,0,0)',
            plot_bgcolor='rgba(0,0,0,0)',
            font_color="#f8fafc",
            margin=dict(t=0, b=0, l=0, r=0),
            legend=dict(orientation="h", yanchor="bottom", y=-0.1, xanchor="center", x=0.5)
        )
        st.plotly_chart(fig, use_container_width=True)

    with r1_c2:
        st.markdown("### 2D Projection Topology")
        pca_df = get_pca_data(data_dict['rfm_scaled'], df['Segment'])
        fig = px.scatter(
            pca_df,
            x='PCA1',
            y='PCA2',
            color='Segment',
            opacity=0.6,
            color_discrete_sequence=px.colors.sequential.ice_r,
            hover_data=[pca_df.index]
        )
        fig.update_layout(
            paper_bgcolor='rgba(0,0,0,0)',
            plot_bgcolor='rgba(15, 23, 42, 0.5)',
            font_color="#f8fafc",
            margin=dict(t=10, b=10, l=10, r=10),
            xaxis=dict(showgrid=False),
            yaxis=dict(showgrid=False)
        )
        st.plotly_chart(fig, use_container_width=True)

    st.markdown("---")
    st.markdown("### 📈 Revenue Benchmarking")
    if df_raw is not None:
        # Derive the helper columns on a new frame rather than mutating the
        # object handed out by the cached loader.
        monthly_rev = (
            df_raw.assign(
                Month=df_raw['InvoiceDate'].dt.to_period('M').astype(str),
                Revenue=df_raw['Quantity'] * df_raw['Price'],
            )
            .groupby('Month')['Revenue']
            .sum()
            .reset_index()
        )
        fig = px.line(
            monthly_rev,
            x='Month',
            y='Revenue',
            color_discrete_sequence=['#3b82f6'],
            render_mode='svg'
        )
        fig.update_layout(
            paper_bgcolor='rgba(0,0,0,0)',
            plot_bgcolor='rgba(15, 23, 42, 0.5)',
            font_color="#f8fafc",
            xaxis_title=None,
            yaxis_title="Total Revenue (GBP)",
            margin=dict(t=20, b=20, l=20, r=20)
        )
        fig.update_traces(line_width=3, fill='tozeroy', fillcolor='rgba(59, 130, 246, 0.1)')
        st.plotly_chart(fig, use_container_width=True)

    st.markdown("\n", unsafe_allow_html=True)
    st.markdown("### 📊 Revenue Concentration (Pareto)")
    seg_rev = (
        df_filtered.groupby('Segment')['Monetary']
        .sum()
        .sort_values(ascending=False)
        .reset_index()
    )
    fig_bar = px.bar(
        seg_rev,
        x='Segment',
        y='Monetary',
        color='Monetary',
        color_continuous_scale='ice'
    )
    fig_bar.update_layout(
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(15, 23, 42, 0.5)',
        font_color="#f8fafc"
    )
    st.plotly_chart(fig_bar, use_container_width=True)

    st.markdown("---")
    st.subheader("🚨 Risk Analytics")
    if not has_customers:
        # An empty selection has no meaningful risk level; say so instead of
        # reporting a misleading "healthy" 0.0%.
        st.info("No customers match the selected segments.")
    elif risk_pct > 30:
        st.warning(
            f"**Critical Warning**: {risk_pct:.1f}% of selected customers are "
            f"churn-risk ({CHURN_RECENCY_DAYS}+ days inactive)."
        )
    else:
        st.success(f"**Healthy Signal**: Retention is stable with only {risk_pct:.1f}% churn-risk.")

    st.markdown("\n", unsafe_allow_html=True)
    st.subheader("📥 Data Export & Actions")
    csv = df_filtered.to_csv().encode('utf-8')
    st.download_button(
        "Export Intelligence Report (CSV)",
        data=csv,
        file_name='segmentx_report.csv',
        mime='text/csv'
    )


def _render_segment_profiles(df, selected_segments):
    """Render the Segment Profiles page: RFM heatmap, mean table and top products."""
    st.subheader("Cluster Behavioral Heatmap")
    profile_stats = df.groupby('Segment')[['Recency', 'Frequency', 'Monetary']].mean()

    # Min-max normalise each metric across segments. A metric that is
    # identical for every segment has a zero range; divide by 1 there so the
    # column becomes 0 instead of NaN.
    value_range = profile_stats.max() - profile_stats.min()
    value_range = value_range.where(value_range != 0, 1)
    profile_norm = (profile_stats - profile_stats.min()) / value_range

    fig = px.imshow(
        profile_norm.T,
        labels=dict(x="Segment", y="Metric", color="Score"),
        color_continuous_scale='Blues'
    )
    fig.update_layout(paper_bgcolor='rgba(0,0,0,0)', font_color="#f8fafc")
    st.plotly_chart(fig, use_container_width=True)

    st.markdown("#### Mean Values Matrix")
    # Format per column: only Monetary is a currency amount. The previous
    # value-based rule put a "£" on any Recency/Frequency mean above 100.
    st.table(profile_stats.style.format({
        'Recency': '{:,.2f}',
        'Frequency': '{:,.2f}',
        'Monetary': '£{:,.2f}',
    }))

    st.markdown("---")
    st.subheader("🛍️ Segment Affinity: Top 10 Products")
    if not os.path.exists(SEGMENT_PRODUCTS_PATH):
        st.info("Run product pipeline to see affinities.")
        return

    all_top_prods = pd.read_csv(SEGMENT_PRODUCTS_PATH)
    # Only the first three selected segments are shown side by side.
    display_segs = selected_segments[:3]
    cols = st.columns(len(display_segs)) if display_segs else [st.container()]
    for col, seg in zip(cols, display_segs):
        with col:
            st.markdown(f"**{seg}**")
            seg_prods = all_top_prods[all_top_prods['Segment'] == seg].head(10)
            if seg_prods.empty:
                st.info("No data.")
                continue
            # assign() builds a new frame, avoiding chained assignment on a
            # filtered slice (SettingWithCopyWarning).
            seg_prods = seg_prods.assign(
                Description=seg_prods['Description'].str.slice(0, 30) + '...'
            )
            st.table(seg_prods[['Description', 'Quantity']].set_index('Description'))


def _render_customer_lookup(df):
    """Render the Customer Lookup page: ID picker, KPIs, churn/CLV estimates and strategy."""
    st.subheader("🔍 Intelligent Query")

    if st.button("🎲 Randomized ID Picker"):
        random_id = np.random.choice(df.index)
        st.session_state.customer_lookup_id = int(random_id)

    all_ids = sorted(df.index.unique().tolist())
    if 'customer_lookup_id' not in st.session_state:
        st.session_state.customer_lookup_id = all_ids[0]

    remembered_id = st.session_state.customer_lookup_id
    customer_id = st.selectbox(
        "Target Customer ID",
        options=all_ids,
        index=all_ids.index(remembered_id) if remembered_id in all_ids else 0
    )
    st.session_state.customer_lookup_id = customer_id

    cust_data = df.loc[customer_id]
    if isinstance(cust_data, pd.DataFrame):
        # A duplicated Customer ID makes .loc return several rows; use the first.
        cust_data = cust_data.iloc[0]

    l1, l2, l3 = st.columns(3)
    l1.metric("Segment Identity", cust_data['Segment'])
    l2.metric("Orders", f"{cust_data['Frequency']:.0f}")
    l3.metric("LTV GBP", f"£{cust_data['Monetary']:,.2f}")

    st.markdown("---")
    st.markdown("### 🛡️ Strategic Intelligence")
    ci1, ci2 = st.columns(2)

    # Churn Probability Logic: exponential curve scaled by 1.5x the mean Recency.
    avg_rec = df['Recency'].mean()
    if avg_rec > 0:
        churn_prob = 1 - np.exp(-cust_data['Recency'] / (avg_rec * 1.5))
    else:
        # Zero (or undefined) mean Recency would divide by zero.
        churn_prob = 0.0
    churn_pct = min(max(churn_prob * 100, 0), 100)
    ci1.metric("Churn Risk Score", f"{churn_pct:.1f}%")

    # Predicted CLV: current spend plus one more period at the same order
    # count and average order value. Guard against a zero order count.
    frequency = cust_data['Frequency']
    avg_order = cust_data['Monetary'] / frequency if frequency else 0.0
    projected_clv = cust_data['Monetary'] + (avg_order * frequency)
    ci2.metric("Projected 1Y-LTV", f"£{projected_clv:,.2f}")

    st.markdown("\n", unsafe_allow_html=True)
    recommendations = {
        "Champions": "High Value, Low Churn. Goal: Retention. Strategy: Early Access, Loyalty Rewards.",
        "Loyal Customers": "Consistent Value. Goal: Growth. Strategy: Cross-sell related categories.",
        "At-Risk": "Recent Inactivity. Goal: Re-activation. Strategy: Limited-time win-back discounts.",
        "Lost/Hibernating": "Historical only. Goal: Win-back or Pause. Strategy: Reactivate only high LTV types."
    }
    st.info(f"**Execution Strategy**: {recommendations.get(cust_data['Segment'], 'Maintain baseline engagement.')}")


def main():
    """Load data and model, draw the sidebar, and dispatch to the selected page."""
    df, df_raw = load_data()
    model, data_dict = load_model()

    if df is None or model is None:
        st.error("Project data or models not found. \nPlease run the pipeline scripts first.")
        return

    # Modern Sidebar
    st.sidebar.markdown("\n\nSegmentX\n\n", unsafe_allow_html=True)
    st.sidebar.markdown("---")
    page = st.sidebar.radio("Console Navigation", ["Overview", "Segment Profiles", "Customer Lookup"])

    segments_list = df['Segment'].unique().tolist()
    selected_segments = st.sidebar.multiselect("Global Segment Filter", segments_list, default=segments_list)
    df_filtered = df[df['Segment'].isin(selected_segments)]

    st.markdown("\nIntelligence Console\n\nBehavioral Portal v2.0\n\n", unsafe_allow_html=True)

    if page == "Overview":
        _render_overview(df, df_filtered, df_raw, data_dict)
    elif page == "Segment Profiles":
        _render_segment_profiles(df, selected_segments)
    elif page == "Customer Lookup":
        _render_customer_lookup(df)


if __name__ == "__main__":
    main()