import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.decomposition import PCA
import joblib
import os
from datetime import datetime
# Artifact locations. Everything is resolved relative to the directory that
# contains this script, so the app does not depend on the working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
_PROCESSED_DIR = os.path.join(BASE_DIR, "data", "processed")
_MODELS_DIR = os.path.join(BASE_DIR, "models")
_OUTPUTS_DIR = os.path.join(BASE_DIR, "outputs")

SCALED_DATA_PATH = os.path.join(_PROCESSED_DIR, "scaled_rfm_data.pkl")
KMEANS_MODEL_PATH = os.path.join(_MODELS_DIR, "kmeans_model.pkl")
CUSTOMER_SEGMENTS_PATH = os.path.join(_OUTPUTS_DIR, "customer_segments.csv")
SEGMENT_PRODUCTS_PATH = os.path.join(_OUTPUTS_DIR, "segment_products.csv")
# Page-level settings. This is the first Streamlit call issued by the script.
st.set_page_config(
    layout="wide",
    initial_sidebar_state="expanded",
    page_title="SegmentX | Customer Intelligence Portal",
    page_icon="💎",
)

# --- Industry-Grade UI Refinement ---
# NOTE(review): the injected markup is only a newline; presumably a <style>
# block used to live here and was stripped. Confirm and restore if so.
st.markdown("\n", unsafe_allow_html=True)
@st.cache_data
def load_data():
    """Read the customer segment table and, if present, the cleaned transactions.

    Returns:
        A ``(segments, transactions)`` pair.

        * Both entries are ``None`` when ``CUSTOMER_SEGMENTS_PATH`` does not
          exist.
        * ``segments`` is the segments CSV indexed by its ``Customer ID``
          column.
        * ``transactions`` is the cleaned retail CSV with ``InvoiceDate``
          parsed as dates, or ``None`` when that file is missing.
    """
    segments_missing = not os.path.exists(CUSTOMER_SEGMENTS_PATH)
    if segments_missing:
        return None, None

    segments = pd.read_csv(CUSTOMER_SEGMENTS_PATH, index_col='Customer ID')

    # The transaction-level file is optional; it only feeds the time-series chart.
    transactions_path = os.path.join(
        BASE_DIR, "data", "processed", "cleaned_retail_data.csv"
    )
    transactions = (
        pd.read_csv(transactions_path, parse_dates=['InvoiceDate'])
        if os.path.exists(transactions_path)
        else None
    )
    return segments, transactions
@st.cache_resource
def load_model():
    """Load the persisted model and the scaled-data bundle from disk.

    Returns:
        ``(model, data_dict)`` as deserialised by ``joblib.load`` from
        ``KMEANS_MODEL_PATH`` and ``SCALED_DATA_PATH`` respectively, or
        ``(None, None)`` if either file is missing.
    """
    required_files = (KMEANS_MODEL_PATH, SCALED_DATA_PATH)
    if not all(os.path.exists(path) for path in required_files):
        return None, None

    # NOTE: joblib.load unpickles its input, so these artifacts must come from
    # a trusted pipeline run.
    return joblib.load(KMEANS_MODEL_PATH), joblib.load(SCALED_DATA_PATH)
@st.cache_data
def get_pca_data(scaled_data, labels):
    """Project ``scaled_data`` onto two principal components and attach labels.

    Args:
        scaled_data: Table to decompose; its ``.index`` is reused as the index
            of the result.
        labels: Values stored in the ``Segment`` column of the result.

    Returns:
        A DataFrame with columns ``PCA1``, ``PCA2`` and ``Segment``.
    """
    components = PCA(n_components=2).fit_transform(scaled_data)
    projected = pd.DataFrame(
        components,
        index=scaled_data.index,
        columns=['PCA1', 'PCA2'],
    )
    return projected.assign(Segment=labels)
# Customers whose Recency exceeds this many days are counted as churn-risk.
_CHURN_RECENCY_DAYS = 90

# Longest product description shown in the affinity tables before truncation.
_DESCRIPTION_MAX_CHARS = 30

# Strategy text shown on the Customer Lookup page, keyed by segment label.
_SEGMENT_RECOMMENDATIONS = {
    "Champions": "High Value, Low Churn. Goal: Retention. Strategy: Early Access, Loyalty Rewards.",
    "Loyal Customers": "Consistent Value. Goal: Growth. Strategy: Cross-sell related categories.",
    "At-Risk": "Recent Inactivity. Goal: Re-activation. Strategy: Limited-time win-back discounts.",
    "Lost/Hibernating": "Historical only. Goal: Win-back or Pause. Strategy: Reactivate only high LTV types.",
}


def _churn_risk_pct(recency, avg_recency):
    """Return ``100 * (1 - exp(-recency / (1.5 * avg_recency)))`` clamped to 0-100.

    Returns 0.0 when ``avg_recency`` is not a positive number, because the
    ratio is undefined in that case.
    """
    if not avg_recency > 0:
        return 0.0
    probability = 1 - np.exp(-recency / (avg_recency * 1.5))
    return float(np.clip(probability * 100, 0, 100))


def _projected_clv(monetary, frequency):
    """Return spend to date plus ``frequency`` further orders at the average order value.

    Falls back to ``monetary`` alone when ``frequency`` is not positive, since
    no average order value can be derived.
    """
    if not frequency > 0:
        return monetary
    avg_order = monetary / frequency
    return monetary + avg_order * frequency


def _render_overview(df, df_filtered, df_raw, data_dict):
    """Render the Overview page: KPIs, segment charts, revenue trend, risk and export.

    Args:
        df: Full customer segment table.
        df_filtered: ``df`` restricted to the segments chosen in the sidebar.
        df_raw: Transaction-level table, or ``None`` when unavailable.
        data_dict: Scaled-data bundle; its ``'rfm_scaled'`` entry feeds the PCA plot.
    """
    if df_filtered.empty:
        # With every segment deselected the ratios below would divide by zero.
        st.warning("No customers match the current segment filter. Select at least one segment.")
        return

    # Computed once and reused by the KPI tile and the risk banner.
    high_risk = int((df_filtered['Recency'] > _CHURN_RECENCY_DAYS).sum())
    risk_pct = high_risk / len(df_filtered) * 100

    c1, c2, c3, c4 = st.columns(4)
    c1.metric("Revenue Impact", f"£{df_filtered['Monetary'].sum():,.0f}")
    c2.metric("Customer Scale", f"{len(df_filtered):,}")
    c3.metric("Retention Risk", f"{risk_pct:.1f}%")
    # NOTE(review): this is the mean Monetary per customer, not per order;
    # confirm the label matches the intended metric.
    c4.metric("Avg. Order Value", f"£{df_filtered['Monetary'].mean():,.1f}")
    st.markdown("\n", unsafe_allow_html=True)

    # NOTE(review): both charts in this row use the unfiltered table, unlike
    # the KPIs above. Confirm whether the sidebar filter should apply here too.
    r1_c1, r1_c2 = st.columns([1, 1.2])
    with r1_c1:
        st.markdown("### Segment Distribution")
        counts = df['Segment'].value_counts()
        fig = px.pie(
            values=counts.values,
            names=counts.index,
            hole=0.5,
            color_discrete_sequence=px.colors.sequential.ice_r,
        )
        fig.update_layout(
            paper_bgcolor='rgba(0,0,0,0)',
            plot_bgcolor='rgba(0,0,0,0)',
            font_color="#f8fafc",
            margin=dict(t=0, b=0, l=0, r=0),
            legend=dict(orientation="h", yanchor="bottom", y=-0.1, xanchor="center", x=0.5),
        )
        st.plotly_chart(fig, use_container_width=True)
    with r1_c2:
        st.markdown("### 2D Projection Topology")
        pca_df = get_pca_data(data_dict['rfm_scaled'], df['Segment'])
        fig = px.scatter(
            pca_df, x='PCA1', y='PCA2', color='Segment',
            opacity=0.6,
            color_discrete_sequence=px.colors.sequential.ice_r,
            hover_data=[pca_df.index],
        )
        fig.update_layout(
            paper_bgcolor='rgba(0,0,0,0)',
            plot_bgcolor='rgba(15, 23, 42, 0.5)',
            font_color="#f8fafc",
            margin=dict(t=10, b=10, l=10, r=10),
            xaxis=dict(showgrid=False),
            yaxis=dict(showgrid=False),
        )
        st.plotly_chart(fig, use_container_width=True)

    st.markdown("---")
    st.markdown("### 📈 Revenue Benchmarking")
    if df_raw is not None:
        # Build a two-column frame instead of adding columns to df_raw, so the
        # loaded transactions are left untouched.
        monthly_rev = (
            pd.DataFrame({
                'Month': df_raw['InvoiceDate'].dt.to_period('M').astype(str),
                'Revenue': df_raw['Quantity'] * df_raw['Price'],
            })
            .groupby('Month')['Revenue']
            .sum()
            .reset_index()
        )
        fig = px.line(
            monthly_rev, x='Month', y='Revenue',
            color_discrete_sequence=['#3b82f6'],
            render_mode='svg',
        )
        fig.update_layout(
            paper_bgcolor='rgba(0,0,0,0)',
            plot_bgcolor='rgba(15, 23, 42, 0.5)',
            font_color="#f8fafc",
            xaxis_title=None,
            yaxis_title="Total Revenue (GBP)",
            margin=dict(t=20, b=20, l=20, r=20),
        )
        fig.update_traces(line_width=3, fill='tozeroy', fillcolor='rgba(59, 130, 246, 0.1)')
        st.plotly_chart(fig, use_container_width=True)
    else:
        st.info("Cleaned transaction data not found; revenue trend unavailable.")
    st.markdown("\n", unsafe_allow_html=True)

    st.markdown("### 📊 Revenue Concentration (Pareto)")
    seg_rev = (
        df_filtered.groupby('Segment')['Monetary']
        .sum()
        .sort_values(ascending=False)
        .reset_index()
    )
    fig_bar = px.bar(
        seg_rev, x='Segment', y='Monetary',
        color='Monetary',
        color_continuous_scale='ice',
    )
    fig_bar.update_layout(
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(15, 23, 42, 0.5)',
        font_color="#f8fafc",
    )
    st.plotly_chart(fig_bar, use_container_width=True)

    st.markdown("---")
    st.subheader("🚨 Risk Analytics")
    if risk_pct > 30:
        st.warning(
            f"**Critical Warning**: {risk_pct:.1f}% of selected customers are churn-risk "
            f"({_CHURN_RECENCY_DAYS}+ days inactive)."
        )
    else:
        st.success(f"**Healthy Signal**: Retention is stable with only {risk_pct:.1f}% churn-risk.")
    st.markdown("\n", unsafe_allow_html=True)

    st.subheader("📥 Data Export & Actions")
    csv = df_filtered.to_csv().encode('utf-8')
    st.download_button(
        "Export Intelligence Report (CSV)",
        data=csv,
        file_name='segmentx_report.csv',
        mime='text/csv',
    )


def _render_segment_profiles(df, selected_segments):
    """Render the Segment Profiles page: RFM heatmap, mean table and top products.

    Args:
        df: Full customer segment table.
        selected_segments: Segment labels chosen in the sidebar; the first
            three get a top-products column each.
    """
    st.subheader("Cluster Behavioral Heatmap")
    profile_stats = df.groupby('Segment')[['Recency', 'Frequency', 'Monetary']].mean()

    # Min-max scale each metric across segments. A metric with zero spread
    # (for example when only one segment exists) would divide by zero and fill
    # the heatmap with NaN, so its range is treated as 1.
    spread = (profile_stats.max() - profile_stats.min()).replace(0, 1)
    profile_norm = (profile_stats - profile_stats.min()) / spread
    fig = px.imshow(
        profile_norm.T,
        labels=dict(x="Segment", y="Metric", color="Score"),
        color_continuous_scale='Blues',
    )
    fig.update_layout(paper_bgcolor='rgba(0,0,0,0)', font_color="#f8fafc")
    st.plotly_chart(fig, use_container_width=True)

    st.markdown("#### Mean Values Matrix")
    # Format per column: only Monetary is a currency amount. A value-based
    # rule would also print large Recency/Frequency means with a pound sign.
    st.table(profile_stats.style.format({
        'Recency': '{:.2f}',
        'Frequency': '{:.2f}',
        'Monetary': '£{:,.2f}',
    }))

    st.markdown("---")
    st.subheader("🛍️ Segment Affinity: Top 10 Products")
    if not os.path.exists(SEGMENT_PRODUCTS_PATH):
        st.info("Run product pipeline to see affinities.")
        return

    all_top_prods = pd.read_csv(SEGMENT_PRODUCTS_PATH)
    display_segs = selected_segments[:3]
    if not display_segs:
        st.info("No segments selected.")
        return

    for col, seg in zip(st.columns(len(display_segs)), display_segs):
        with col:
            st.markdown(f"**{seg}**")
            # Copy so the column assignment below does not write into a view
            # of all_top_prods.
            seg_prods = all_top_prods[all_top_prods['Segment'] == seg].head(10).copy()
            if seg_prods.empty:
                st.info("No data.")
                continue
            descriptions = seg_prods['Description']
            truncated = descriptions.str.slice(0, _DESCRIPTION_MAX_CHARS) + '...'
            # Only descriptions that were actually cut get the ellipsis.
            seg_prods['Description'] = descriptions.where(
                descriptions.str.len() <= _DESCRIPTION_MAX_CHARS, truncated
            )
            st.table(seg_prods[['Description', 'Quantity']].set_index('Description'))


def _render_customer_lookup(df):
    """Render the Customer Lookup page: ID picker, customer KPIs and strategy text.

    Args:
        df: Full customer segment table indexed by customer ID.
    """
    st.subheader("🔍 Intelligent Query")
    all_ids = sorted(df.index.unique().tolist())
    if not all_ids:
        st.info("No customers available.")
        return

    if st.button("🎲 Randomized ID Picker"):
        # Pick straight from the option list so the stored value always
        # matches an option, whatever type the customer IDs are.
        st.session_state.customer_lookup_id = all_ids[int(np.random.randint(len(all_ids)))]
    if 'customer_lookup_id' not in st.session_state:
        st.session_state.customer_lookup_id = all_ids[0]

    try:
        current_pos = all_ids.index(st.session_state.customer_lookup_id)
    except ValueError:
        # The stored ID is no longer in the table; fall back to the first one.
        current_pos = 0
    customer_id = st.selectbox("Target Customer ID", options=all_ids, index=current_pos)
    st.session_state.customer_lookup_id = customer_id

    cust_data = df.loc[customer_id]
    if isinstance(cust_data, pd.DataFrame):
        # A duplicated ID in the index makes .loc return several rows; use the first.
        cust_data = cust_data.iloc[0]

    l1, l2, l3 = st.columns(3)
    l1.metric("Segment Identity", cust_data['Segment'])
    l2.metric("Orders", f"{cust_data['Frequency']:.0f}")
    l3.metric("LTV GBP", f"£{cust_data['Monetary']:,.2f}")

    st.markdown("---")
    st.markdown("### 🛡️ Strategic Intelligence")
    ci1, ci2 = st.columns(2)

    churn_pct = _churn_risk_pct(cust_data['Recency'], df['Recency'].mean())
    ci1.metric("Churn Risk Score", f"{churn_pct:.1f}%")

    projected_clv = _projected_clv(cust_data['Monetary'], cust_data['Frequency'])
    ci2.metric("Projected 1Y-LTV", f"£{projected_clv:,.2f}")
    st.markdown("\n", unsafe_allow_html=True)

    strategy = _SEGMENT_RECOMMENDATIONS.get(cust_data['Segment'], 'Maintain baseline engagement.')
    st.info(f"**Execution Strategy**: {strategy}")


def main():
    """Build the sidebar and dispatch to the page chosen in the navigation radio.

    Shows an error and returns early when the segment table or the model
    artifacts could not be loaded.
    """
    df, df_raw = load_data()
    model, data_dict = load_model()
    if df is None or model is None:
        st.error("Project data or models not found. Please run the pipeline scripts first.")
        return

    # Modern Sidebar
    # NOTE(review): the unsafe_allow_html literals in this file contain only
    # plain text or a newline; the HTML markup appears to have been stripped.
    # Restore the intended markup if it is available.
    st.sidebar.markdown("\nSegmentX\n", unsafe_allow_html=True)
    st.sidebar.markdown("---")
    page = st.sidebar.radio("Console Navigation", ["Overview", "Segment Profiles", "Customer Lookup"])
    segments_list = df['Segment'].unique().tolist()
    selected_segments = st.sidebar.multiselect(
        "Global Segment Filter", segments_list, default=segments_list
    )
    df_filtered = df[df['Segment'].isin(selected_segments)]

    st.markdown("""
Intelligence Console
Behavioral Portal v2.0
""", unsafe_allow_html=True)

    if page == "Overview":
        _render_overview(df, df_filtered, df_raw, data_dict)
    elif page == "Segment Profiles":
        _render_segment_profiles(df, selected_segments)
    elif page == "Customer Lookup":
        _render_customer_lookup(df)
# Launch the dashboard only when this file is executed as a script.
if __name__ == "__main__":
    main()