entropy25 committed on
Commit
61d745b
·
verified ·
1 Parent(s): 3dbb5ae

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +566 -767
app.py CHANGED
@@ -4,26 +4,23 @@ import pandas as pd
4
  import numpy as np
5
  import matplotlib.pyplot as plt
6
  import seaborn as sns
7
- from sklearn.model_selection import train_test_split
8
  from sklearn.ensemble import RandomForestClassifier
9
- from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
10
  import plotly.express as px
11
  import plotly.graph_objects as go
12
- from plotly.subplots import make_subplots
13
- import plotly.io as pio
14
  from datetime import datetime, timedelta
15
  import io
16
  import base64
17
  import warnings
18
  warnings.filterwarnings('ignore')
19
 
20
- # Try importing xgboost and reportlab, use fallbacks if not available
21
  try:
22
  import xgboost as xgb
23
  XGBOOST_AVAILABLE = True
24
  except ImportError:
25
  XGBOOST_AVAILABLE = False
26
- print("XGBoost not available, using RandomForest only")
27
 
28
  try:
29
  from reportlab.lib.pagesizes import letter, A4
@@ -31,863 +28,665 @@ try:
31
  from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
32
  from reportlab.lib.units import inch
33
  from reportlab.lib import colors
 
 
 
 
34
  REPORTLAB_AVAILABLE = True
35
  except ImportError:
36
  REPORTLAB_AVAILABLE = False
37
- print("ReportLab not available, PDF generation disabled")
38
 
39
- # Modern color palette
 
 
 
 
 
 
 
40
  COLORS = {
41
  'primary': '#6366f1',
42
- 'success': '#10b981',
43
  'warning': '#f59e0b',
44
  'danger': '#ef4444',
45
- 'purple': '#8b5cf6',
46
- 'pink': '#ec4899',
47
- 'blue': '#3b82f6',
48
- 'indigo': '#6366f1'
49
  }
50
 
51
- # Global analytics instance
52
- analytics = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
- class B2BCustomerAnalytics:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  def __init__(self):
56
- self.df = None
57
- self.processed_df = None
58
  self.model = None
59
  self.feature_importance = None
60
- self.predictions = None
61
 
62
- def load_and_process_data(self, file):
63
- """Load and process the uploaded CSV file"""
64
- try:
65
- if file is None:
66
- return "Please upload a CSV file", None, None
67
-
68
- # Load raw data
69
- self.df = pd.read_csv(file.name)
70
-
71
- # Check for required columns - be flexible with column names
72
- required_columns = ['customer_id', 'order_date', 'amount']
73
- df_columns_lower = [col.lower().strip() for col in self.df.columns]
74
-
75
- # Map common variations
76
- column_mapping = {}
77
- for req_col in required_columns:
78
- found = False
79
- for df_col in self.df.columns:
80
- if req_col in df_col.lower() or df_col.lower().strip() in req_col:
81
- column_mapping[req_col] = df_col
82
- found = True
83
- break
84
- # Check for common variations
85
- variations = {
86
- 'customer_id': ['customer', 'cust_id', 'id', 'customerid', 'client_id'],
87
- 'order_date': ['date', 'order_date', 'orderdate', 'purchase_date', 'transaction_date'],
88
- 'amount': ['revenue', 'value', 'price', 'total', 'sales', 'order_value']
89
- }
90
- if req_col in variations:
91
- for var in variations[req_col]:
92
- if var in df_col.lower():
93
- column_mapping[req_col] = df_col
94
- found = True
95
- break
96
- if not found:
97
- return f"Missing required column: {req_col}. Available columns: {list(self.df.columns)}", None, None
98
-
99
- # Rename columns to standard names
100
- self.df = self.df.rename(columns=column_mapping)
101
-
102
- # Clean and convert data types
103
- self.df['customer_id'] = self.df['customer_id'].astype(str)
104
- self.df['order_date'] = pd.to_datetime(self.df['order_date'], errors='coerce')
105
- self.df['amount'] = pd.to_numeric(self.df['amount'], errors='coerce')
106
-
107
- # Remove rows with invalid data
108
- initial_rows = len(self.df)
109
- self.df = self.df.dropna(subset=['customer_id', 'order_date', 'amount'])
110
- final_rows = len(self.df)
111
-
112
- if final_rows == 0:
113
- return "No valid data rows found after cleaning", None, None
114
-
115
- # Calculate RFM metrics
116
- self.processed_df = self.calculate_rfm_metrics(self.df.copy())
117
-
118
- # Perform customer segmentation
119
- self.processed_df = self.perform_customer_segmentation(self.processed_df)
120
-
121
- # Generate summary
122
- summary_html = self.generate_summary_dashboard()
123
-
124
- status_msg = f"✅ Data loaded successfully! Processed {final_rows} records from {self.df['customer_id'].nunique()} customers."
125
- if initial_rows != final_rows:
126
- status_msg += f" ({initial_rows - final_rows} invalid rows removed)"
127
-
128
- return status_msg, summary_html, self.processed_df.head(20)
129
-
130
- except Exception as e:
131
- return f"❌ Error loading data: {str(e)}", None, None
132
 
133
- def calculate_rfm_metrics(self, df):
134
- """Calculate RFM metrics from transaction data"""
135
- try:
136
- current_date = df['order_date'].max() + timedelta(days=1)
137
-
138
- # Calculate customer-level metrics
139
- customer_metrics = df.groupby('customer_id').agg({
140
- 'order_date': ['max', 'count'],
141
- 'amount': ['sum', 'mean']
142
- }).round(2)
143
-
144
- # Flatten column names
145
- customer_metrics.columns = ['last_order_date', 'frequency', 'monetary', 'avg_order_value']
146
- customer_metrics['recency_days'] = (current_date - customer_metrics['last_order_date']).dt.days
147
-
148
- # Merge back with original data
149
- df_with_rfm = df.merge(
150
- customer_metrics[['recency_days', 'frequency', 'monetary']],
151
- left_on='customer_id',
152
- right_index=True,
153
- how='left'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
  )
155
-
156
- return df_with_rfm
157
-
158
- except Exception as e:
159
- print(f"Error in calculate_rfm_metrics: {e}")
160
- # Add default RFM values
161
- df['recency_days'] = 30
162
- df['frequency'] = 1
163
- df['monetary'] = df['amount']
164
- return df
 
 
 
165
 
166
- def perform_customer_segmentation(self, df):
167
- """Perform customer segmentation based on RFM analysis"""
168
- try:
169
- # Get unique customer data
170
- customer_df = df.groupby('customer_id').agg({
171
- 'recency_days': 'first',
172
- 'frequency': 'first',
173
- 'monetary': 'first'
174
- }).reset_index()
175
-
176
- # Calculate RFM scores using quantiles with duplicates handling
177
- if len(customer_df) >= 5:
178
- try:
179
- customer_df['R_Score'] = pd.qcut(customer_df['recency_days'], 5, labels=[5,4,3,2,1], duplicates='drop')
180
- customer_df['F_Score'] = pd.qcut(customer_df['frequency'], 5, labels=[1,2,3,4,5], duplicates='drop')
181
- customer_df['M_Score'] = pd.qcut(customer_df['monetary'], 5, labels=[1,2,3,4,5], duplicates='drop')
182
- except (ValueError, TypeError):
183
- # Fallback to percentile-based scoring
184
- customer_df['R_Score'] = pd.cut(customer_df['recency_days'], bins=5, labels=[5,4,3,2,1], include_lowest=True)
185
- customer_df['F_Score'] = pd.cut(customer_df['frequency'], bins=5, labels=[1,2,3,4,5], include_lowest=True)
186
- customer_df['M_Score'] = pd.cut(customer_df['monetary'], bins=5, labels=[1,2,3,4,5], include_lowest=True)
187
- else:
188
- # Simple scoring for small datasets
189
- customer_df['R_Score'] = 3
190
- customer_df['F_Score'] = 3
191
- customer_df['M_Score'] = 3
192
-
193
- # Convert to int, handle NaN values
194
- customer_df['R_Score'] = pd.to_numeric(customer_df['R_Score'], errors='coerce').fillna(3).astype(int)
195
- customer_df['F_Score'] = pd.to_numeric(customer_df['F_Score'], errors='coerce').fillna(3).astype(int)
196
- customer_df['M_Score'] = pd.to_numeric(customer_df['M_Score'], errors='coerce').fillna(3).astype(int)
197
-
198
- def segment_customers(row):
199
- if row['R_Score'] >= 4 and row['F_Score'] >= 4 and row['M_Score'] >= 4:
200
- return 'Champions'
201
- elif row['R_Score'] >= 3 and row['F_Score'] >= 3 and row['M_Score'] >= 3:
202
- return 'Loyal Customers'
203
- elif row['R_Score'] >= 3 and row['F_Score'] >= 2:
204
- return 'Potential Loyalists'
205
- elif row['R_Score'] >= 4 and row['F_Score'] <= 2:
206
- return 'New Customers'
207
- elif row['R_Score'] <= 2 and row['F_Score'] >= 3:
208
- return 'At Risk'
209
- elif row['R_Score'] <= 2 and row['F_Score'] <= 2 and row['M_Score'] >= 3:
210
- return 'Cannot Lose Them'
211
- elif row['R_Score'] <= 2 and row['F_Score'] <= 2 and row['M_Score'] <= 2:
212
- return 'Lost Customers'
213
- else:
214
- return 'Others'
215
-
216
- customer_df['Segment'] = customer_df.apply(segment_customers, axis=1)
217
-
218
- customer_df['Churn_Risk'] = customer_df.apply(lambda x:
219
- 'High' if x['Segment'] in ['Lost Customers', 'At Risk'] else
220
- 'Medium' if x['Segment'] in ['Others', 'Cannot Lose Them'] else 'Low', axis=1)
221
-
222
- # Merge segmentation data back
223
- segment_data = customer_df[['customer_id', 'Segment', 'Churn_Risk', 'R_Score', 'F_Score', 'M_Score']]
224
- df_with_segments = df.merge(segment_data, on='customer_id', how='left')
225
-
226
- return df_with_segments
227
-
228
- except Exception as e:
229
- print(f"Error in perform_customer_segmentation: {e}")
230
- # Return original df with dummy segments if segmentation fails
231
- df['Segment'] = 'Others'
232
- df['Churn_Risk'] = 'Medium'
233
- df['R_Score'] = 3
234
- df['F_Score'] = 3
235
- df['M_Score'] = 3
236
- return df
237
 
238
- def generate_summary_dashboard(self):
239
- """Generate modern dashboard summary with KPI cards"""
240
- if self.processed_df is None:
241
- return "No data loaded"
 
 
 
242
 
243
- try:
244
- total_customers = self.processed_df['customer_id'].nunique()
245
- total_orders = len(self.processed_df)
246
- total_revenue = self.processed_df['amount'].sum()
247
- avg_order_value = self.processed_df['amount'].mean()
248
-
249
- # Get segment and risk distributions
250
- segment_dist = self.processed_df.groupby('customer_id')['Segment'].first().value_counts()
251
- risk_dist = self.processed_df.groupby('customer_id')['Churn_Risk'].first().value_counts()
252
-
253
- # Create modern dashboard
254
- summary_html = f"""
255
- <div style="display: flex; flex-wrap: wrap; gap: 1rem; margin-bottom: 2rem;">
256
- <div style="flex: 1; min-width: 200px; background: linear-gradient(135deg, #3b82f6, #1d4ed8); padding: 1.5rem; border-radius: 12px; color: white; text-align: center;">
257
- <h3 style="margin: 0 0 0.5rem 0; font-size: 0.9rem; opacity: 0.9;">Total Customers</h3>
258
- <div style="font-size: 2.5rem; font-weight: bold;">{total_customers:,}</div>
259
- </div>
260
- <div style="flex: 1; min-width: 200px; background: linear-gradient(135deg, #10b981, #047857); padding: 1.5rem; border-radius: 12px; color: white; text-align: center;">
261
- <h3 style="margin: 0 0 0.5rem 0; font-size: 0.9rem; opacity: 0.9;">Total Revenue</h3>
262
- <div style="font-size: 2.5rem; font-weight: bold;">${total_revenue/1000000:.1f}M</div>
263
- </div>
264
- <div style="flex: 1; min-width: 200px; background: linear-gradient(135deg, #8b5cf6, #6d28d9); padding: 1.5rem; border-radius: 12px; color: white; text-align: center;">
265
- <h3 style="margin: 0 0 0.5rem 0; font-size: 0.9rem; opacity: 0.9;">Avg Order Value</h3>
266
- <div style="font-size: 2.5rem; font-weight: bold;">${avg_order_value:.0f}</div>
267
- </div>
268
- <div style="flex: 1; min-width: 200px; background: linear-gradient(135deg, #ef4444, #dc2626); padding: 1.5rem; border-radius: 12px; color: white; text-align: center;">
269
- <h3 style="margin: 0 0 0.5rem 0; font-size: 0.9rem; opacity: 0.9;">High Risk Customers</h3>
270
- <div style="font-size: 2.5rem; font-weight: bold;">{risk_dist.get('High', 0)}</div>
271
- </div>
272
  </div>
273
- <div style="background: #f8fafc; padding: 1.5rem; border-radius: 12px; border-left: 4px solid #6366f1;">
274
- <h4 style="margin: 0 0 1rem 0; color: #374151;">Customer Segments Overview</h4>
275
- <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); gap: 1rem;">
276
- {' '.join([f'<div><strong>{segment}:</strong> {count}</div>' for segment, count in segment_dist.items()])}
277
- </div>
 
 
 
 
 
 
 
 
278
  </div>
279
  """
280
-
281
- return summary_html
282
-
283
- except Exception as e:
284
- return f"Error generating dashboard: {str(e)}"
285
 
286
- def train_churn_model(self):
287
- """Train churn prediction model"""
288
- if self.processed_df is None:
289
- return "❌ No data available. Please upload and process a CSV file first.", None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
290
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
291
  try:
292
- # Prepare customer-level features
293
- customer_features = self.processed_df.groupby('customer_id').agg({
294
- 'recency_days': 'first',
295
- 'frequency': 'first',
296
- 'monetary': 'first',
297
- 'amount': ['mean', 'std', 'min', 'max'],
298
- 'order_date': ['min', 'max']
299
- }).reset_index()
300
-
301
- # Flatten column names
302
- customer_features.columns = ['customer_id', 'recency_days', 'frequency', 'monetary',
303
- 'avg_amount', 'std_amount', 'min_amount', 'max_amount',
304
- 'first_order', 'last_order']
305
-
306
- # Handle missing values
307
- customer_features['std_amount'].fillna(0, inplace=True)
308
-
309
- # Calculate additional features
310
- customer_features['customer_lifetime'] = (customer_features['last_order'] - customer_features['first_order']).dt.days
311
- customer_features['customer_lifetime'].fillna(0, inplace=True)
312
-
313
- # Create churn labels based on recency (customers who haven't ordered in 90 days are churned)
314
- customer_features['churn_label'] = (customer_features['recency_days'] > 90).astype(int)
315
-
316
- # Check if we have enough data for training
317
- if len(customer_features) < 10:
318
- return "❌ Not enough data for model training (minimum 10 customers required).", None
319
-
320
- # Check if we have both classes
321
- if customer_features['churn_label'].nunique() < 2:
322
- return "❌ Cannot train model: all customers have the same churn status.", None
323
-
324
- # Select features for modeling
325
- feature_cols = ['recency_days', 'frequency', 'monetary', 'avg_amount', 'std_amount',
326
- 'min_amount', 'max_amount', 'customer_lifetime']
327
-
328
- X = customer_features[feature_cols]
329
- y = customer_features['churn_label']
330
-
331
- # Train-test split
332
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
333
-
334
- # Train model
335
- if XGBOOST_AVAILABLE:
336
- try:
337
- self.model = xgb.XGBClassifier(random_state=42, eval_metric='logloss')
338
- self.model.fit(X_train, y_train)
339
- model_name = "XGBoost Classifier"
340
- except:
341
- self.model = RandomForestClassifier(random_state=42, n_estimators=100)
342
- self.model.fit(X_train, y_train)
343
- model_name = "Random Forest Classifier"
344
- else:
345
- self.model = RandomForestClassifier(random_state=42, n_estimators=100)
346
- self.model.fit(X_train, y_train)
347
- model_name = "Random Forest Classifier"
348
-
349
- # Make predictions
350
- y_pred = self.model.predict(X_test)
351
- accuracy = accuracy_score(y_test, y_pred)
352
-
353
- # Feature importance
354
- self.feature_importance = pd.DataFrame({
355
- 'feature': feature_cols,
356
- 'importance': self.model.feature_importances_
357
- }).sort_values('importance', ascending=False)
358
 
359
- # Predict for all customers
360
- all_predictions = self.model.predict_proba(X)[:, 1]
361
- customer_features['churn_probability'] = all_predictions
362
- self.predictions = customer_features
363
 
364
- results_html = f"""
365
- <div style="background: white; padding: 2rem; border-radius: 1rem; box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); margin-bottom: 2rem;">
366
- <div style="text-align: center; margin-bottom: 2rem;">
367
- <h3 style="color: #1f2937; font-size: 1.5rem; font-weight: bold; margin-bottom: 0.5rem;">
368
- ✅ Model Training Completed
369
- </h3>
370
- <p style="color: #6b7280;">{model_name} with Advanced Feature Engineering</p>
371
- </div>
372
-
373
- <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); gap: 1rem; margin-bottom: 2rem;">
374
- <div style="background: linear-gradient(135deg, #6366f1, #4f46e5); padding: 1rem; border-radius: 8px; text-align: center; color: white;">
375
- <div style="font-size: 2rem; font-weight: bold;">{accuracy:.1%}</div>
376
- <div style="font-size: 0.9rem;">Model Accuracy</div>
377
- </div>
378
- <div style="background: linear-gradient(135deg, #10b981, #059669); padding: 1rem; border-radius: 8px; text-align: center; color: white;">
379
- <div style="font-size: 2rem; font-weight: bold;">{len(feature_cols)}</div>
380
- <div style="font-size: 0.9rem;">Features Used</div>
381
- </div>
382
- <div style="background: linear-gradient(135deg, #f59e0b, #d97706); padding: 1rem; border-radius: 8px; text-align: center; color: white;">
383
- <div style="font-size: 2rem; font-weight: bold;">{len(X_train)}</div>
384
- <div style="font-size: 0.9rem;">Training Samples</div>
385
- </div>
386
- </div>
387
-
388
- <div style="background: #f8fafc; padding: 1rem; border-radius: 8px;">
389
- <h4 style="color: #374151; margin-bottom: 1rem;">Top Feature Importance</h4>
390
- {''.join([f'<div style="display: flex; justify-content: space-between; padding: 0.5rem 0; border-bottom: 1px solid #e5e7eb;"><span>{row["feature"].replace("_", " ").title()}</span><span style="font-weight: bold;">{row["importance"]:.3f}</span></div>' for _, row in self.feature_importance.head(5).iterrows()])}
391
- </div>
392
- </div>
393
- """
394
 
395
- return results_html, self.create_feature_importance_chart()
 
396
 
397
  except Exception as e:
398
- return f"Error training model: {str(e)}", None
399
 
400
- def create_feature_importance_chart(self):
401
- """Create feature importance visualization"""
402
- if self.feature_importance is None:
403
- return None
404
-
405
  try:
406
- fig = px.bar(
407
- self.feature_importance.head(8),
408
- x='importance',
409
- y='feature',
410
- orientation='h',
411
- title='Feature Importance Analysis',
412
- labels={'importance': 'Importance Score', 'feature': 'Features'},
413
- color='importance',
414
- color_continuous_scale='viridis'
415
- )
416
 
417
- fig.update_layout(
418
- height=500,
419
- showlegend=False,
420
- plot_bgcolor='white',
421
- paper_bgcolor='white',
422
- title={'x': 0.5, 'xanchor': 'center'},
423
- yaxis={'categoryorder': 'total ascending'}
424
  )
425
 
426
- return fig
427
 
428
  except Exception as e:
429
- print(f"Error creating feature importance chart: {e}")
430
- return None
431
 
432
  def create_visualizations(self):
433
- """Create comprehensive visualizations"""
434
- if self.processed_df is None or len(self.processed_df) == 0:
435
- print("No processed data available for visualization")
436
- return None, None, None, None
437
 
438
  try:
439
- print(f"Creating visualizations with {len(self.processed_df)} rows")
440
-
441
- # 1. Customer Segment Distribution
442
- segment_data = self.processed_df.groupby('customer_id')['Segment'].first().value_counts().reset_index()
443
- segment_data.columns = ['Segment', 'Count']
444
- print(f"Segment data: {segment_data}")
445
-
446
- if len(segment_data) == 0:
447
- print("No segment data found")
448
- fig1 = None
449
- else:
450
- fig1 = px.pie(
451
- segment_data,
452
- values='Count',
453
- names='Segment',
454
- title='Customer Segment Distribution',
455
- hole=0.4,
456
- color_discrete_sequence=['#6366f1', '#10b981', '#f59e0b', '#ef4444', '#8b5cf6', '#ec4899']
457
- )
458
- fig1.update_traces(textposition='inside', textinfo='percent+label')
459
- fig1.update_layout(height=400, title={'x': 0.5, 'xanchor': 'center'})
460
-
461
- # 2. RFM Analysis
462
- customer_rfm = self.processed_df.groupby('customer_id').agg({
463
- 'recency_days': 'first',
464
- 'frequency': 'first',
465
- 'monetary': 'first',
466
- 'Segment': 'first'
467
- }).reset_index()
468
- print(f"RFM data shape: {customer_rfm.shape}")
469
 
470
- if len(customer_rfm) == 0:
471
- print("No RFM data found")
472
- fig2 = None
473
- else:
474
- fig2 = px.scatter(
475
- customer_rfm,
476
- x='recency_days',
477
- y='frequency',
478
- size='monetary',
479
- color='Segment',
480
- title='RFM Customer Behavior Matrix',
481
- labels={
482
- 'recency_days': 'Days Since Last Purchase',
483
- 'frequency': 'Purchase Frequency',
484
- 'monetary': 'Total Revenue'
485
- }
486
- )
487
- fig2.update_layout(height=400, title={'x': 0.5, 'xanchor': 'center'})
488
-
489
- # 3. Churn Risk Distribution
490
- if self.predictions is not None and len(self.predictions) > 0:
491
- print(f"Using predictions data with {len(self.predictions)} rows")
492
- fig3 = px.histogram(
493
- self.predictions,
494
- x='churn_probability',
495
- nbins=20,
496
- title='Churn Probability Distribution',
497
- labels={'churn_probability': 'Churn Probability', 'count': 'Number of Customers'},
498
- color_discrete_sequence=['#6366f1']
499
- )
500
- fig3.add_vline(x=0.5, line_dash="dash", line_color="red",
501
- annotation_text="High Risk Threshold")
502
- else:
503
- risk_data = self.processed_df.groupby('customer_id')['Churn_Risk'].first().value_counts().reset_index()
504
- risk_data.columns = ['Risk_Level', 'Count']
505
- print(f"Risk data: {risk_data}")
506
-
507
- if len(risk_data) == 0:
508
- print("No risk data found")
509
- fig3 = None
510
- else:
511
- colors_map = {'High': '#ef4444', 'Medium': '#f59e0b', 'Low': '#10b981'}
512
- fig3 = px.bar(
513
- risk_data,
514
- x='Risk_Level',
515
- y='Count',
516
- title='Customer Churn Risk Distribution',
517
- color='Risk_Level',
518
- color_discrete_map=colors_map
519
- )
520
- fig3.update_layout(height=400, title={'x': 0.5, 'xanchor': 'center'}, showlegend=False)
521
-
522
- # 4. Revenue Trends
523
- try:
524
- self.processed_df['order_month'] = self.processed_df['order_date'].dt.to_period('M')
525
- monthly_revenue = self.processed_df.groupby('order_month')['amount'].sum().reset_index()
526
- monthly_revenue['order_month'] = monthly_revenue['order_month'].astype(str)
527
- print(f"Monthly revenue data: {monthly_revenue.head()}")
528
-
529
- if len(monthly_revenue) == 0:
530
- fig4 = None
531
- else:
532
- fig4 = px.line(
533
- monthly_revenue,
534
- x='order_month',
535
- y='amount',
536
- title='Monthly Revenue Trends',
537
- labels={'amount': 'Revenue ($)', 'order_month': 'Month'}
538
- )
539
- fig4.update_traces(line_color='#6366f1', line_width=3)
540
- fig4.update_layout(height=400, title={'x': 0.5, 'xanchor': 'center'})
541
- except Exception as e:
542
- print(f"Error creating revenue chart: {e}")
543
- fig4 = None
544
 
545
- return fig1, fig2, fig3, fig4
546
 
547
  except Exception as e:
548
- print(f"Error creating visualizations: {e}")
549
- import traceback
550
- traceback.print_exc()
551
- return None, None, None, None
552
 
553
- def create_customer_table(self):
554
- """Create customer segmentation table"""
555
- if self.processed_df is None:
556
  return None
557
-
558
  try:
559
- customer_summary = self.processed_df.groupby('customer_id').agg({
560
- 'Segment': 'first',
561
- 'Churn_Risk': 'first',
562
- 'recency_days': 'first',
563
- 'frequency': 'first',
564
- 'monetary': 'first',
565
- 'amount': 'mean'
566
- }).reset_index()
567
-
568
- if self.predictions is not None:
569
- customer_summary = customer_summary.merge(
570
- self.predictions[['customer_id', 'churn_probability']],
571
- on='customer_id',
572
- how='left'
573
  )
574
- customer_summary['churn_probability'] = customer_summary['churn_probability'].fillna(0)
575
  else:
576
- customer_summary['churn_probability'] = 0.5
577
 
578
- customer_summary['monetary'] = customer_summary['monetary'].round(2)
579
- customer_summary['amount'] = customer_summary['amount'].round(2)
580
- customer_summary['churn_probability'] = (customer_summary['churn_probability'] * 100).round(1)
 
 
581
 
582
- customer_summary.columns = [
583
- 'Customer ID', 'Segment', 'Risk Level', 'Recency (Days)',
584
- 'Frequency', 'Total Spent ($)', 'Avg Order ($)', 'Churn Probability (%)'
585
  ]
586
 
587
- return customer_summary.head(50)
588
 
589
  except Exception as e:
590
- print(f"Error creating customer table: {e}")
591
  return None
592
 
593
- def get_customer_insights(self, customer_id):
594
- """Get detailed insights for a specific customer"""
595
- if self.processed_df is None:
596
- return "❌ No data available"
597
-
598
- if not customer_id:
599
- return "Please enter a customer ID"
600
-
601
- try:
602
- customer_data = self.processed_df[self.processed_df['customer_id'] == customer_id]
603
- if customer_data.empty:
604
- return f"❌ Customer {customer_id} not found"
605
-
606
- total_orders = len(customer_data)
607
- total_spent = customer_data['amount'].sum()
608
- avg_order_value = customer_data['amount'].mean()
609
- segment = customer_data['Segment'].iloc[0]
610
- risk_level = customer_data['Churn_Risk'].iloc[0]
611
- recency = customer_data['recency_days'].iloc[0]
612
-
613
- churn_prob = 0.5
614
- if self.predictions is not None:
615
- pred_data = self.predictions[self.predictions['customer_id'] == customer_id]
616
- if not pred_data.empty:
617
- churn_prob = pred_data['churn_probability'].iloc[0]
618
-
619
- insights_html = f"""
620
- <div style="background: white; padding: 2rem; border-radius: 1rem; box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1); margin-bottom: 1rem;">
621
- <h3 style="text-align: center; color: #1f2937; margin-bottom: 1.5rem;">Customer Profile: {customer_id}</h3>
622
-
623
- <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 1rem; margin-bottom: 2rem;">
624
- <div style="background: linear-gradient(135deg, #6366f1, #4f46e5); padding: 1rem; border-radius: 8px; color: white; text-align: center;">
625
- <h4 style="margin: 0 0 0.5rem 0; font-size: 0.9rem; opacity: 0.9;">Segment</h4>
626
- <div style="font-size: 1.2rem; font-weight: bold;">{segment}</div>
627
- </div>
628
- <div style="background: linear-gradient(135deg, #ef4444, #dc2626); padding: 1rem; border-radius: 8px; color: white; text-align: center;">
629
- <h4 style="margin: 0 0 0.5rem 0; font-size: 0.9rem; opacity: 0.9;">Churn Risk</h4>
630
- <div style="font-size: 1.2rem; font-weight: bold;">{risk_level}</div>
631
- </div>
632
- <div style="background: linear-gradient(135deg, #8b5cf6, #6d28d9); padding: 1rem; border-radius: 8px; color: white; text-align: center;">
633
- <h4 style="margin: 0 0 0.5rem 0; font-size: 0.9rem; opacity: 0.9;">Churn Probability</h4>
634
- <div style="font-size: 1.2rem; font-weight: bold;">{churn_prob:.1%}</div>
635
- </div>
636
- </div>
637
-
638
- <div style="background: #f8fafc; padding: 1.5rem; border-radius: 8px; margin-bottom: 1rem;">
639
- <h4 style="color: #374151; margin-bottom: 1rem;">Transaction Analytics</h4>
640
- <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(150px, 1fr)); gap: 1rem;">
641
- <div>
642
- <div style="font-size: 0.8rem; color: #6b7280; margin-bottom: 0.2rem;">Total Orders</div>
643
- <div style="font-size: 1.5rem; font-weight: bold; color: #1f2937;">{total_orders}</div>
644
- </div>
645
- <div>
646
- <div style="font-size: 0.8rem; color: #6b7280; margin-bottom: 0.2rem;">Total Spent</div>
647
- <div style="font-size: 1.5rem; font-weight: bold; color: #1f2937;">${total_spent:,.0f}</div>
648
- </div>
649
- <div>
650
- <div style="font-size: 0.8rem; color: #6b7280; margin-bottom: 0.2rem;">Avg Order Value</div>
651
- <div style="font-size: 1.5rem; font-weight: bold; color: #1f2937;">${avg_order_value:.0f}</div>
652
- </div>
653
- <div>
654
- <div style="font-size: 0.8rem; color: #6b7280; margin-bottom: 0.2rem;">Days Since Last Order</div>
655
- <div style="font-size: 1.5rem; font-weight: bold; color: #1f2937;">{recency}</div>
656
- </div>
657
- </div>
658
- </div>
659
-
660
- <div style="background: linear-gradient(135deg, #f0f9ff, #e0f2fe); border-left: 4px solid #3b82f6; padding: 1rem; border-radius: 4px;">
661
- <h4 style="color: #1e40af; margin-bottom: 0.5rem;">Recommendations</h4>
662
- <p style="color: #1f2937; margin: 0;">{self._get_customer_recommendations(segment, risk_level, churn_prob, recency)}</p>
663
- </div>
664
- </div>
665
- """
666
-
667
- return insights_html
668
-
669
- except Exception as e:
670
- return f"Error getting customer insights: {str(e)}"
671
-
672
- def _get_customer_recommendations(self, segment, risk_level, churn_prob, recency):
673
- """Generate personalized recommendations based on customer profile"""
674
- recommendations = []
675
-
676
- if risk_level == 'High' or churn_prob > 0.7:
677
- recommendations.append("URGENT: Personal outreach required within 24 hours")
678
- recommendations.append("Offer retention incentive or loyalty program")
679
- elif risk_level == 'Medium':
680
- recommendations.append("Send personalized re-engagement campaign")
681
-
682
- if segment == 'Champions':
683
- recommendations.append("Invite to VIP program or advisory board")
684
- elif segment == 'At Risk':
685
- recommendations.append("Proactive customer success intervention needed")
686
- elif segment == 'New Customers':
687
- recommendations.append("Deploy onboarding campaign sequence")
688
-
689
- if recency > 60:
690
- recommendations.append("Win-back campaign with special offer")
691
-
692
- return " • ".join(recommendations) if recommendations else "Continue monitoring customer engagement patterns."
693
-
694
  def generate_pdf_report(self):
695
- """Generate PDF report (simplified version if ReportLab not available)"""
696
- if not REPORTLAB_AVAILABLE:
697
- return "PDF generation requires ReportLab library. Please install: pip install reportlab"
698
-
699
- if self.processed_df is None:
700
- return "No data available for report generation"
701
-
702
  try:
703
- buffer = io.BytesIO()
704
- doc = SimpleDocTemplate(buffer, pagesize=A4)
705
- styles = getSampleStyleSheet()
706
- story = []
707
-
708
- # Title
709
- title_style = ParagraphStyle('Title', parent=styles['Title'], fontSize=24, spaceAfter=30)
710
- story.append(Paragraph("B2B Customer Analytics Report", title_style))
711
-
712
- # Summary stats
713
- total_customers = self.processed_df['customer_id'].nunique()
714
- total_revenue = self.processed_df['amount'].sum()
715
-
716
- story.append(Paragraph("Executive Summary", styles['Heading2']))
717
- summary_text = f"""
718
- This analysis covers {total_customers} customers with total revenue of ${total_revenue:,.2f}.
719
- The data has been processed for customer segmentation and churn risk assessment.
720
- """
721
- story.append(Paragraph(summary_text, styles['Normal']))
722
 
723
- # Build PDF
724
- doc.build(story)
725
- pdf_bytes = buffer.getvalue()
726
- buffer.close()
727
- return pdf_bytes
728
 
 
 
 
 
 
 
729
  except Exception as e:
730
- return f"Error generating PDF: {str(e)}"
731
-
732
 
733
def create_gradio_interface():
    """Create the Gradio interface

    Builds a ``gr.Blocks`` app with six tabs (upload/dashboard, segmentation,
    churn prediction, revenue analytics, customer insights, reports) and wires
    the buttons to a single ``B2BCustomerAnalytics`` instance created here.

    Returns:
        The constructed ``gr.Blocks`` object (not yet launched).
    """

    # Initialize analytics instance
    analytics = B2BCustomerAnalytics()

    # Define interface functions
    # Thin adapters that forward UI calls to the analytics instance.
    def load_data(file):
        return analytics.load_and_process_data(file)

    def train_model():
        return analytics.train_churn_model()

    def create_charts():
        charts = analytics.create_visualizations()
        # A leading None is treated as "no charts"; normalise to four empty slots.
        return charts if charts[0] is not None else [None, None, None, None]

    def get_customer_table():
        return analytics.create_customer_table()

    def get_insights(customer_id):
        return analytics.get_customer_insights(customer_id)

    def generate_report():
        # NOTE(review): the result is sent straight to the gr.File output below;
        # confirm generate_pdf_report() returns something gr.File accepts.
        return analytics.generate_pdf_report()

    # Custom CSS
    custom_css = """
    .gradio-container {
        font-family: 'Inter', system-ui, sans-serif !important;
        max-width: 1200px !important;
    }
    """

    # Create interface
    with gr.Blocks(theme=gr.themes.Soft(), title="B2B Customer Analytics", css=custom_css) as demo:

        # Page banner.
        gr.HTML("""
        <div style="background: linear-gradient(135deg, #6366f1 0%, #8b5cf6 100%); padding: 2rem; border-radius: 1rem; color: white; text-align: center; margin-bottom: 2rem;">
            <h1 style="font-size: 2.5rem; font-weight: bold; margin-bottom: 0.5rem;">B2B Customer Analytics Platform</h1>
            <p style="font-size: 1.1rem; opacity: 0.9;">Advanced Customer Segmentation & Churn Prediction</p>
        </div>
        """)

        with gr.Tabs():

            with gr.Tab("Data Upload & Dashboard"):
                with gr.Row():
                    with gr.Column():
                        file_input = gr.File(label="Upload Customer Data CSV", file_types=[".csv"])
                        load_btn = gr.Button("Load & Process Data", variant="primary", size="lg")
                        load_status = gr.HTML()

                summary_display = gr.HTML()
                data_preview = gr.DataFrame(label="Data Preview")

            with gr.Tab("Customer Segmentation"):
                with gr.Row():
                    with gr.Column():
                        segment_chart = gr.Plot(label="Customer Segments")
                    with gr.Column():
                        rfm_chart = gr.Plot(label="RFM Analysis")

                customer_table = gr.DataFrame(label="Customer Details")

            with gr.Tab("Churn Prediction"):
                train_btn = gr.Button("Train Churn Model", variant="primary", size="lg")
                model_results = gr.HTML()

                with gr.Row():
                    with gr.Column():
                        performance_chart = gr.Plot(label="Feature Importance")
                    with gr.Column():
                        churn_chart = gr.Plot(label="Churn Risk")

            with gr.Tab("Revenue Analytics"):
                revenue_chart = gr.Plot(label="Monthly Revenue Trends")

            with gr.Tab("Customer Insights"):
                with gr.Row():
                    customer_id_input = gr.Textbox(label="Customer ID", placeholder="Enter customer ID")
                    insights_btn = gr.Button("Get Profile", variant="primary")

                customer_insights = gr.HTML()

            with gr.Tab("Reports"):
                report_btn = gr.Button("Generate PDF Report", variant="primary", size="lg")
                report_file = gr.File(label="Download Report")

        # Event handlers with better error handling
        # Each wrapper converts an exception into a return value whose arity
        # matches the outputs of the corresponding click handler.
        def safe_load_data(file):
            try:
                return load_data(file)
            except Exception as e:
                return f"Error: {str(e)}", None, None

        def safe_create_charts():
            try:
                return create_charts()
            except Exception as e:
                return None, None, None, None

        def safe_train_model():
            try:
                return train_model()
            except Exception as e:
                return f"Error: {str(e)}", None

        def safe_get_table():
            try:
                return get_customer_table()
            except Exception as e:
                return None

        def safe_get_insights(customer_id):
            try:
                return get_insights(customer_id)
            except Exception as e:
                return f"Error: {str(e)}"

        # Connect events - fix the chart loading issue
        def load_and_update_all(file):
            # Load data first
            status, summary, preview = safe_load_data(file)

            # Then create charts if data loaded successfully
            # Success is detected by the word "successfully" in the status text.
            if "successfully" in str(status):
                charts = safe_create_charts()
                table = safe_get_table()
                return status, summary, preview, charts[0], charts[1], charts[2], charts[3], table
            else:
                return status, summary, preview, None, None, None, None, None

        load_btn.click(
            fn=load_and_update_all,
            inputs=[file_input],
            outputs=[load_status, summary_display, data_preview, segment_chart, rfm_chart, churn_chart, revenue_chart, customer_table]
        )

        train_btn.click(
            fn=safe_train_model,
            outputs=[model_results, performance_chart]
        )

        insights_btn.click(
            fn=safe_get_insights,
            inputs=[customer_id_input],
            outputs=[customer_insights]
        )

        # Unlike the other handlers, this one is not wrapped in a safe_* function.
        report_btn.click(
            fn=generate_report,
            outputs=[report_file]
        )

    return demo
889
-
890
-
891
# Script entry point: build the interface and start serving it.
if __name__ == "__main__":
    demo = create_gradio_interface()
    # Launch with sharing enabled, bound to 0.0.0.0 on port 7860.
    demo.launch(share=True, server_name="0.0.0.0", server_port=7860)
 
4
  import numpy as np
5
  import matplotlib.pyplot as plt
6
  import seaborn as sns
7
+ from sklearn.model_selection import train_test_split, cross_val_score
8
  from sklearn.ensemble import RandomForestClassifier
9
+ from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
10
  import plotly.express as px
11
  import plotly.graph_objects as go
 
 
12
  from datetime import datetime, timedelta
13
  import io
14
  import base64
15
  import warnings
16
  warnings.filterwarnings('ignore')
17
 
18
+ # Optional imports with fallbacks
19
  try:
20
  import xgboost as xgb
21
  XGBOOST_AVAILABLE = True
22
  except ImportError:
23
  XGBOOST_AVAILABLE = False
 
24
 
25
  try:
26
  from reportlab.lib.pagesizes import letter, A4
 
28
  from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
29
  from reportlab.lib.units import inch
30
  from reportlab.lib import colors
31
+ from reportlab.graphics.shapes import Drawing
32
+ from reportlab.graphics.charts.piecharts import Pie
33
+ from reportlab.graphics.charts.barcharts import VerticalBarChart
34
+ from reportlab.graphics import renderPDF
35
  REPORTLAB_AVAILABLE = True
36
  except ImportError:
37
  REPORTLAB_AVAILABLE = False
 
38
 
39
# Configuration
# Thresholds shared by the segmentation and churn-model code in this file.
CONFIG = {
    # Recency (days since last order) above which a customer is labelled churned.
    'churn_threshold_days': 90,
    # Position of the "High Risk Threshold" marker on the churn-probability histogram.
    'high_risk_probability': 0.7,
    # Number of quantile bins used for each RFM score.
    'rfm_quantiles': 5,
    # Minimum number of customers required before a churn model is trained.
    'min_customers_for_training': 10
}

# Hex colours used by the HTML dashboard cards and the churn-risk bar chart.
COLORS = {
    'primary': '#6366f1',
    'success': '#10b981',
    'warning': '#f59e0b',
    'danger': '#ef4444',
    'purple': '#8b5cf6'
}
54
 
55
class DataProcessor:
    """Handles data loading, cleaning, and validation"""

    @staticmethod
    def load_and_validate(file_path):
        """Load a transactions CSV and return it cleaned, with canonical column names.

        Args:
            file_path: Anything ``pandas.read_csv`` accepts (path or buffer).

        Returns:
            DataFrame with ``customer_id`` (str), ``order_date`` (datetime) and
            ``amount`` (strictly positive number); invalid rows are removed.

        Raises:
            ValueError: If a required column cannot be identified.
        """
        df = pd.read_csv(file_path)
        df = df.rename(columns=DataProcessor._map_columns(df.columns))
        return DataProcessor._clean_data(df)

    @staticmethod
    def _map_columns(columns):
        """Map various column name formats to standard names.

        Matching is done case-insensitively in three passes of decreasing
        strictness: exact canonical name, exact known variant, then substring.
        The strict passes run first so that, for example, a real
        ``customer_id`` column wins over ``order_id`` (which merely contains
        ``id``). Each source column is assigned to at most one canonical name,
        so the subsequent rename can never create duplicate columns.

        Args:
            columns: Iterable of the column labels found in the file.

        Returns:
            Dict of ``{original_label: canonical_name}`` for every canonical
            name that could be matched.
        """
        variations = {
            'customer_id': ['customer', 'cust_id', 'id', 'customerid', 'client_id'],
            'order_date': ['date', 'orderdate', 'purchase_date', 'transaction_date'],
            'amount': ['revenue', 'value', 'price', 'total', 'sales', 'order_value']
        }
        normalized = [(col, str(col).lower().strip()) for col in columns]

        mapping = {}
        for standard_name, variants in variations.items():
            matchers = (
                lambda low: low == standard_name,
                lambda low: low in variants,
                lambda low: standard_name in low or any(v in low for v in variants),
            )
            for matches in matchers:
                found = next(
                    (col for col, low in normalized
                     if col not in mapping and matches(low)),
                    None,
                )
                if found is not None:
                    mapping[found] = standard_name
                    break

        return mapping

    @staticmethod
    def _clean_data(df):
        """Clean and convert data types.

        Works on a copy, so the caller's frame is left untouched.

        Args:
            df: Frame that must already contain ``customer_id``,
                ``order_date`` and ``amount``.

        Returns:
            New DataFrame with typed columns; rows with an unparseable date or
            amount, or a non-positive amount, are dropped.

        Raises:
            ValueError: If any required column is missing.
        """
        required_cols = ['customer_id', 'order_date', 'amount']

        missing_cols = [col for col in required_cols if col not in df.columns]
        if missing_cols:
            raise ValueError(f"Missing columns: {missing_cols}")

        df = df.copy()

        # Unparseable dates/amounts become NaT/NaN and are dropped below.
        df['customer_id'] = df['customer_id'].astype(str)
        df['order_date'] = pd.to_datetime(df['order_date'], errors='coerce')
        df['amount'] = pd.to_numeric(df['amount'], errors='coerce')

        df = df.dropna(subset=required_cols)
        df = df[df['amount'] > 0]  # Remove negative/zero amounts

        return df
113
 
114
class FeatureEngineering:
    """Derives per-customer RFM and behavioural metrics from order rows"""

    @staticmethod
    def calculate_rfm_features(df):
        """Aggregate transactions into one feature row per customer.

        Args:
            df: Transactions with ``customer_id``, ``order_date`` (datetime)
                and ``amount`` columns.

        Returns:
            DataFrame with ``customer_id`` as a regular column, followed by the
            order-date and amount aggregates and the derived recency,
            lifetime and behavioural ratios.
        """
        # Recency is measured from the day after the latest order in the data.
        snapshot_date = df['order_date'].max() + timedelta(days=1)

        per_customer = df.groupby('customer_id').agg(
            first_order=('order_date', 'min'),
            last_order=('order_date', 'max'),
            frequency=('order_date', 'count'),
            monetary=('amount', 'sum'),
            avg_amount=('amount', 'mean'),
            std_amount=('amount', 'std'),
            min_amount=('amount', 'min'),
            max_amount=('amount', 'max'),
        )

        since_last = snapshot_date - per_customer['last_order']
        active_span = per_customer['last_order'] - per_customer['first_order']
        per_customer['recency_days'] = since_last.dt.days
        per_customer['customer_lifetime_days'] = active_span.dt.days
        # A single order has no sample standard deviation; treat it as zero spread.
        per_customer['std_amount'] = per_customer['std_amount'].fillna(0)

        # +1 keeps the rate finite for customers whose orders fall on one day.
        days_observed = per_customer['customer_lifetime_days'] + 1
        per_customer['order_frequency'] = per_customer['frequency'] / days_observed
        per_customer['amount_trend'] = per_customer['max_amount'] / per_customer['min_amount']
        relative_spread = per_customer['std_amount'] / per_customer['avg_amount']
        per_customer['amount_consistency'] = 1 - relative_spread.fillna(0)

        return per_customer.reset_index()
145
+
146
class CustomerSegmenter:
    """Customer segmentation using RFM analysis"""

    @staticmethod
    def perform_segmentation(customer_features, n_quantiles=None):
        """Segment customers based on RFM scores.

        Args:
            customer_features: Per-customer frame with ``recency_days``,
                ``frequency`` and ``monetary`` columns.
            n_quantiles: Number of score levels per dimension. Defaults to
                ``CONFIG['rfm_quantiles']``. The segment rules in
                ``_assign_segment`` are written for a 1-5 scale.

        Returns:
            Copy of the input with integer ``r_score``/``f_score``/``m_score``
            columns plus ``segment`` and ``churn_risk`` labels. Frames with
            fewer rows than ``n_quantiles`` are scored on a 1-3 scale.
        """
        if n_quantiles is None:
            n_quantiles = CONFIG['rfm_quantiles']

        df = customer_features.copy()
        score_cols = ['r_score', 'f_score', 'm_score']

        # Nothing to bin: return the expected columns instead of failing in cut/apply.
        if df.empty:
            for col in score_cols:
                df[col] = pd.Series(dtype=int)
            df['segment'] = pd.Series(dtype=object)
            df['churn_risk'] = pd.Series(dtype=object)
            return df

        if len(df) >= n_quantiles:
            ascending = list(range(1, n_quantiles + 1))
            # Low recency is good, so its labels run in the opposite direction.
            df['r_score'] = CustomerSegmenter._quantile_scores(
                df['recency_days'], n_quantiles, ascending[::-1])
            df['f_score'] = CustomerSegmenter._quantile_scores(
                df['frequency'], n_quantiles, ascending)
            df['m_score'] = CustomerSegmenter._quantile_scores(
                df['monetary'], n_quantiles, ascending)
        else:
            # Simple scoring for small datasets
            df['r_score'] = pd.cut(df['recency_days'], bins=3, labels=[3, 2, 1])
            df['f_score'] = pd.cut(df['frequency'], bins=3, labels=[1, 2, 3])
            df['m_score'] = pd.cut(df['monetary'], bins=3, labels=[1, 2, 3])

        # Categorical labels -> plain ints; unscored (NaN) rows get 3.
        for col in score_cols:
            df[col] = df[col].astype(float).fillna(3).astype(int)

        df['segment'] = df.apply(CustomerSegmenter._assign_segment, axis=1)
        df['churn_risk'] = df['segment'].map(CustomerSegmenter._get_risk_mapping())

        return df

    @staticmethod
    def _quantile_scores(values, n_bins, labels):
        """Bin *values* into ``n_bins`` labelled quantiles, tolerating heavy ties.

        ``pd.qcut`` cannot honour a fixed label list when tied values collapse
        bin edges (typical for order counts, where most customers share the
        same value). In that case the values are ranked first, with ties
        broken by row order, so that every bin is populated.
        """
        try:
            return pd.qcut(values, n_bins, labels=labels)
        except ValueError:
            return pd.qcut(values.rank(method='first'), n_bins, labels=labels)

    @staticmethod
    def _assign_segment(row):
        """Assign customer segment based on RFM scores.

        Rules are evaluated top to bottom and the first match wins.
        """
        r, f, m = row['r_score'], row['f_score'], row['m_score']

        if r >= 4 and f >= 4 and m >= 4:
            return 'Champions'
        elif r >= 3 and f >= 3 and m >= 3:
            return 'Loyal Customers'
        elif r >= 3 and f >= 2:
            return 'Potential Loyalists'
        elif r >= 4 and f <= 2:
            return 'New Customers'
        elif r <= 2 and f >= 3:
            return 'At Risk'
        elif r <= 2 and f <= 2 and m >= 3:
            return 'Cannot Lose'
        elif r <= 2 and f <= 2 and m <= 2:
            return 'Lost'
        else:
            return 'Others'

    @staticmethod
    def _get_risk_mapping():
        """Map segments to risk levels"""
        return {
            'Champions': 'Low',
            'Loyal Customers': 'Low',
            'Potential Loyalists': 'Medium',
            'New Customers': 'Low',
            'At Risk': 'High',
            'Cannot Lose': 'High',
            'Lost': 'High',
            'Others': 'Medium'
        }
213
+
214
class ChurnPredictor:
    """Machine learning model for churn prediction"""

    def __init__(self):
        # Fitted estimator and its feature-importance table; both are None until train() runs.
        self.model = None
        self.feature_importance = None

    def train(self, customer_features):
        """Train churn prediction model.

        A customer is labelled churned when ``recency_days`` exceeds
        ``CONFIG['churn_threshold_days']``.

        Args:
            customer_features: Per-customer frame holding ``recency_days`` and
                the behavioural feature columns listed below.

        Returns:
            Dict with ``model_type``, ``cv_auc_mean``, ``cv_auc_std``,
            ``feature_importance`` (DataFrame sorted descending) and
            ``predictions`` (input copy plus ``churn_label`` and in-sample
            ``churn_probability``). The AUC values are NaN when the smaller
            class has fewer than two members.

        Raises:
            ValueError: If there are too few customers or only one class.
        """
        df = customer_features.copy()

        df['churn_label'] = (df['recency_days'] > CONFIG['churn_threshold_days']).astype(int)

        if len(df) < CONFIG['min_customers_for_training']:
            raise ValueError(f"Insufficient data: need at least {CONFIG['min_customers_for_training']} customers")

        if df['churn_label'].nunique() < 2:
            raise ValueError("All customers have same churn status - cannot train model")

        # recency_days is deliberately excluded: the label is a threshold on
        # it, so keeping it would let the model reproduce the label directly
        # and report a meaningless, near-perfect AUC.
        feature_cols = [
            'frequency', 'monetary', 'avg_amount', 'std_amount',
            'customer_lifetime_days', 'order_frequency', 'amount_trend', 'amount_consistency'
        ]

        # Ratio features can be infinite when a denominator is zero; treat those like missing.
        X = df[feature_cols].replace([np.inf, -np.inf], np.nan).fillna(0)
        y = df['churn_label']

        self.model = self._get_best_model()
        self.model.fit(X, y)

        self.feature_importance = pd.DataFrame({
            'feature': feature_cols,
            'importance': self.model.feature_importances_
        }).sort_values('importance', ascending=False)

        # Stratified folds need at least one member of each class per fold,
        # so cap the fold count by the size of the smaller class.
        n_splits = min(5, int(y.value_counts().min()))
        if n_splits >= 2:
            cv_scores = cross_val_score(self.model, X, y, cv=n_splits, scoring='roc_auc')
            cv_auc_mean, cv_auc_std = cv_scores.mean(), cv_scores.std()
        else:
            cv_auc_mean = cv_auc_std = float('nan')

        # Probabilities are in-sample: the model scores the rows it was fitted on.
        df['churn_probability'] = self.model.predict_proba(X)[:, 1]

        return {
            'model_type': type(self.model).__name__,
            'cv_auc_mean': cv_auc_mean,
            'cv_auc_std': cv_auc_std,
            'feature_importance': self.feature_importance,
            'predictions': df
        }

    def _get_best_model(self):
        """Select best available model.

        Prefers XGBoost when it was importable and can be constructed,
        otherwise falls back to a random forest.
        """
        if XGBOOST_AVAILABLE:
            try:
                return xgb.XGBClassifier(random_state=42, eval_metric='logloss')
            except Exception:
                # Construction failed; fall through to the random forest below.
                pass
        return RandomForestClassifier(random_state=42, n_estimators=100)
276
+
277
class Visualizer:
    """Builds the interactive Plotly figures shown in the UI"""

    @staticmethod
    def create_segment_chart(df):
        """Return a donut chart of how many customers fall in each segment."""
        counts = df['segment'].value_counts()

        donut = px.pie(
            values=counts.values,
            names=counts.index,
            title='Customer Segment Distribution',
            hole=0.4,
            color_discrete_sequence=px.colors.qualitative.Set3
        )
        donut.update_layout(height=400, title_x=0.5)
        return donut

    @staticmethod
    def create_rfm_scatter(df):
        """Return a recency-vs-frequency scatter, sized by spend and coloured by segment."""
        axis_labels = {'recency_days': 'Days Since Last Order', 'frequency': 'Order Count'}
        scatter = px.scatter(
            df,
            x='recency_days',
            y='frequency',
            size='monetary',
            color='segment',
            title='Customer Behavior Matrix (RFM)',
            labels=axis_labels
        )
        scatter.update_layout(height=400, title_x=0.5)
        return scatter

    @staticmethod
    def create_churn_distribution(df):
        """Return the churn chart for *df*.

        Without a ``churn_probability`` column the rule-based ``churn_risk``
        levels are shown as a bar chart; with it, a histogram of the
        probabilities with the high-risk threshold marked.
        """
        if 'churn_probability' not in df.columns:
            risk_counts = df['churn_risk'].value_counts()
            risk_palette = {
                'High': COLORS['danger'],
                'Medium': COLORS['warning'],
                'Low': COLORS['success']
            }
            chart = px.bar(
                x=risk_counts.index,
                y=risk_counts.values,
                title='Churn Risk Distribution',
                color=risk_counts.index,
                color_discrete_map=risk_palette
            )
        else:
            chart = px.histogram(
                df,
                x='churn_probability',
                nbins=20,
                title='Churn Probability Distribution',
                labels={'churn_probability': 'Churn Probability'}
            )
            chart.add_vline(
                x=CONFIG['high_risk_probability'],
                line_dash="dash",
                line_color="red",
                annotation_text="High Risk Threshold"
            )

        chart.update_layout(height=400, title_x=0.5)
        return chart

    @staticmethod
    def create_feature_importance_chart(feature_importance):
        """Return a horizontal bar chart of the first eight rows of *feature_importance*."""
        top_features = feature_importance.head(8)
        bars = px.bar(
            top_features,
            x='importance',
            y='feature',
            orientation='h',
            title='Feature Importance Analysis',
            color='importance',
            color_continuous_scale='viridis'
        )
        bars.update_layout(
            height=500,
            title_x=0.5,
            yaxis={'categoryorder': 'total ascending'}
        )
        return bars
339
+
340
class ReportGenerator:
    """Generate dashboards and PDF reports"""

    @staticmethod
    def create_dashboard(df, model_results=None):
        """Generate HTML dashboard

        Args:
            df: Per-customer frame; reads ``monetary``, ``avg_amount`` and
                ``churn_risk``.
            model_results: Optional dict; when truthy, its ``model_type``,
                ``cv_auc_mean`` and ``cv_auc_std`` are shown in an extra panel.

        Returns:
            HTML string with four KPI cards and, optionally, a model panel.
        """
        total_customers = len(df)
        total_revenue = df['monetary'].sum()
        # Mean of the per-customer average order amounts.
        avg_order_value = df['avg_amount'].mean()
        high_risk_count = len(df[df['churn_risk'] == 'High'])

        # Revenue is displayed in thousands, rounded to a whole number.
        dashboard_html = f"""
        <div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 1rem; margin-bottom: 2rem;">
            <div style="background: linear-gradient(135deg, {COLORS['primary']}, #4f46e5); padding: 1.5rem; border-radius: 12px; color: white; text-align: center;">
                <h3 style="margin: 0 0 0.5rem 0; font-size: 0.9rem; opacity: 0.9;">Total Customers</h3>
                <div style="font-size: 2.5rem; font-weight: bold;">{total_customers:,}</div>
            </div>
            <div style="background: linear-gradient(135deg, {COLORS['success']}, #047857); padding: 1.5rem; border-radius: 12px; color: white; text-align: center;">
                <h3 style="margin: 0 0 0.5rem 0; font-size: 0.9rem; opacity: 0.9;">Total Revenue</h3>
                <div style="font-size: 2.5rem; font-weight: bold;">${total_revenue/1000:.0f}K</div>
            </div>
            <div style="background: linear-gradient(135deg, {COLORS['purple']}, #6d28d9); padding: 1.5rem; border-radius: 12px; color: white; text-align: center;">
                <h3 style="margin: 0 0 0.5rem 0; font-size: 0.9rem; opacity: 0.9;">Avg Order Value</h3>
                <div style="font-size: 2.5rem; font-weight: bold;">${avg_order_value:.0f}</div>
            </div>
            <div style="background: linear-gradient(135deg, {COLORS['danger']}, #dc2626); padding: 1.5rem; border-radius: 12px; color: white; text-align: center;">
                <h3 style="margin: 0 0 0.5rem 0; font-size: 0.9rem; opacity: 0.9;">High Risk</h3>
                <div style="font-size: 2.5rem; font-weight: bold;">{high_risk_count}</div>
            </div>
        </div>
        """

        if model_results:
            dashboard_html += f"""
            <div style="background: #f8fafc; padding: 1.5rem; border-radius: 12px; border-left: 4px solid {COLORS['primary']}; margin-top: 1rem;">
                <h4 style="margin: 0 0 1rem 0; color: #374151;">Model Performance</h4>
                <p><strong>Model:</strong> {model_results['model_type']}</p>
                <p><strong>Cross-validation AUC:</strong> {model_results['cv_auc_mean']:.3f} ± {model_results['cv_auc_std']:.3f}</p>
            </div>
            """

        return dashboard_html

    @staticmethod
    def generate_pdf_report(df, model_results=None):
        """Generate comprehensive PDF report

        Args:
            df: Per-customer frame; reads ``monetary`` and ``segment``.
            model_results: Optional dict; when truthy, a model section is
                added from ``model_type``, ``cv_auc_mean``, ``cv_auc_std`` and
                the ``feature_importance`` DataFrame.

        Returns:
            The rendered PDF as ``bytes``.

        Raises:
            ImportError: If ReportLab could not be imported.
        """
        if not REPORTLAB_AVAILABLE:
            raise ImportError("ReportLab is required for PDF generation")

        # The document is rendered into memory, not to disk.
        buffer = io.BytesIO()
        doc = SimpleDocTemplate(buffer, pagesize=A4, rightMargin=72, leftMargin=72,
                                topMargin=72, bottomMargin=18)

        styles = getSampleStyleSheet()
        story = []

        # Title
        title_style = ParagraphStyle('CustomTitle', parent=styles['Title'],
                                     fontSize=24, spaceAfter=30, alignment=1)
        story.append(Paragraph("B2B Customer Analytics Report", title_style))
        story.append(Spacer(1, 12))

        # Executive Summary
        story.append(Paragraph("Executive Summary", styles['Heading2']))

        total_customers = len(df)
        total_revenue = df['monetary'].sum()
        # Mean total spend per customer, reported below as "lifetime value".
        avg_revenue = df['monetary'].mean()

        # NOTE(review): this markup places two <para> elements in a single
        # Paragraph - confirm ReportLab's paragraph parser accepts that.
        summary_text = f"""
        <para>This comprehensive analysis covers <b>{total_customers:,}</b> customers with
        total revenue of <b>${total_revenue:,.0f}</b>. The average customer lifetime value
        is <b>${avg_revenue:.0f}</b>.</para>
        <para>Customers have been segmented using advanced RFM analysis, and machine learning
        models have been applied for churn prediction.</para>
        """
        story.append(Paragraph(summary_text, styles['Normal']))
        story.append(Spacer(1, 12))

        # Customer Segments
        story.append(Paragraph("Customer Segmentation", styles['Heading2']))

        # One table row per segment: name, customer count, share of all customers.
        segment_data = df['segment'].value_counts()
        segment_table_data = [['Segment', 'Count', 'Percentage']]
        for segment, count in segment_data.items():
            percentage = f"{count/len(df)*100:.1f}%"
            segment_table_data.append([segment, str(count), percentage])

        segment_table = Table(segment_table_data)
        segment_table.setStyle(TableStyle([
            ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
            ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
            ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
            ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
            ('FONTSIZE', (0, 0), (-1, 0), 14),
            ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
            ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
            ('GRID', (0, 0), (-1, -1), 1, colors.black)
        ]))
        story.append(segment_table)
        story.append(Spacer(1, 12))

        # Model Performance
        if model_results:
            story.append(Paragraph("Churn Prediction Model", styles['Heading2']))
            model_text = f"""
            <para><b>Model Type:</b> {model_results['model_type']}</para>
            <para><b>Cross-validation AUC:</b> {model_results['cv_auc_mean']:.3f} ± {model_results['cv_auc_std']:.3f}</para>
            <para>The model uses advanced feature engineering including behavioral patterns
            and customer lifecycle metrics for accurate churn prediction.</para>
            """
            story.append(Paragraph(model_text, styles['Normal']))
            story.append(Spacer(1, 12))

            # Top features
            if not model_results['feature_importance'].empty:
                story.append(Paragraph("Key Predictive Features", styles['Heading3']))
                feature_table_data = [['Feature', 'Importance']]
                # First five rows, with snake_case names turned into title case.
                for _, row in model_results['feature_importance'].head(5).iterrows():
                    feature_table_data.append([row['feature'].replace('_', ' ').title(), f"{row['importance']:.3f}"])

                feature_table = Table(feature_table_data)
                feature_table.setStyle(TableStyle([
                    ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
                    ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
                    ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
                    ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
                    ('GRID', (0, 0), (-1, -1), 1, colors.black)
                ]))
                story.append(feature_table)

        # Build PDF
        doc.build(story)
        # Read the bytes out before closing the buffer.
        pdf_bytes = buffer.getvalue()
        buffer.close()

        return pdf_bytes
477
+
478
class B2BAnalyticsApp:
    """Main application orchestrator"""

    def __init__(self):
        # Pipeline state, populated by load_data() and train_churn_model().
        self.raw_data = None
        self.customer_features = None
        self.segmented_data = None
        self.model_results = None
        self.predictor = ChurnPredictor()

    def load_data(self, file):
        """Load and process uploaded file.

        Args:
            file: Upload from the UI: either an object exposing the temp-file
                path as ``.name`` or the path string itself. ``None`` means
                nothing was uploaded.

        Returns:
            ``(status_message, dashboard_html, preview_frame)``; the last two
            are ``None`` on failure.
        """
        try:
            if file is None:
                return "Please upload a CSV file", None, None

            # Accept both upload styles instead of assuming a ``.name`` attribute.
            file_path = getattr(file, 'name', file)

            raw_data = DataProcessor.load_and_validate(file_path)
            customer_features = FeatureEngineering.calculate_rfm_features(raw_data)
            segmented_data = CustomerSegmenter.perform_segmentation(customer_features)
            dashboard = ReportGenerator.create_dashboard(segmented_data)

            # Commit only after the whole pipeline succeeded, so a failed
            # upload leaves the previously loaded dataset intact.
            self.raw_data = raw_data
            self.customer_features = customer_features
            self.segmented_data = segmented_data
            # Predictions from an earlier dataset must not leak into charts
            # and tables built for the new one.
            self.model_results = None

            preview = segmented_data.head(20)
            status = f"Successfully processed {len(segmented_data)} customers from {len(raw_data)} transactions"
            return status, dashboard, preview

        except Exception as e:
            return f"Error: {str(e)}", None, None

    def train_churn_model(self):
        """Train churn prediction model.

        Returns:
            ``(dashboard_html, feature_importance_figure)`` on success,
            otherwise ``(message, None)``.
        """
        try:
            if self.segmented_data is None:
                return "Please load data first", None

            self.model_results = self.predictor.train(self.segmented_data)

            # Re-render the dashboard so it includes the model panel.
            dashboard = ReportGenerator.create_dashboard(self.segmented_data, self.model_results)
            importance_chart = Visualizer.create_feature_importance_chart(
                self.model_results['feature_importance']
            )

            return dashboard, importance_chart

        except Exception as e:
            return f"Error: {str(e)}", None

    def create_visualizations(self):
        """Generate all visualization charts.

        Returns:
            ``(segment_chart, rfm_chart, churn_chart)``; three ``None`` values
            when no data is loaded or a chart fails to build.
        """
        if self.segmented_data is None:
            return None, None, None

        try:
            # Use predictions if available, otherwise use segmented data
            data_for_viz = (self.model_results['predictions'] if self.model_results
                            else self.segmented_data)

            segment_chart = Visualizer.create_segment_chart(data_for_viz)
            rfm_chart = Visualizer.create_rfm_scatter(data_for_viz)
            churn_chart = Visualizer.create_churn_distribution(data_for_viz)

            return segment_chart, rfm_chart, churn_chart

        except Exception as e:
            print(f"Visualization error: {e}")
            return None, None, None

    def get_customer_summary_table(self):
        """Generate customer summary table.

        Returns:
            Up to 100 rows with display-friendly column names, or ``None``
            when no data is loaded or the table cannot be built. Before a
            model is trained the churn column holds the placeholder 50.0.
        """
        if self.segmented_data is None:
            return None

        try:
            display_data = self.segmented_data.copy()

            if self.model_results:
                pred_data = self.model_results['predictions']
                display_data = display_data.merge(
                    pred_data[['customer_id', 'churn_probability']],
                    on='customer_id', how='left'
                )
                # Shown as a percentage with one decimal.
                display_data['churn_probability'] = (display_data['churn_probability'] * 100).round(1)
            else:
                display_data['churn_probability'] = 50.0

            summary_table = display_data[[
                'customer_id', 'segment', 'churn_risk', 'recency_days',
                'frequency', 'monetary', 'avg_amount', 'churn_probability'
            ]].round(2)

            summary_table.columns = [
                'Customer ID', 'Segment', 'Risk Level', 'Recency (Days)',
                'Orders', 'Total Revenue ($)', 'Avg Order ($)', 'Churn Risk (%)'
            ]

            return summary_table.head(100)

        except Exception as e:
            print(f"Table generation error: {e}")
            return None

    def generate_pdf_report(self):
        """Generate and return PDF report.

        Returns:
            Path of a temporary ``.pdf`` file (left on disk for download), or
            ``None`` when no data is loaded or generation fails.
        """
        try:
            if self.segmented_data is None:
                return None

            pdf_bytes = ReportGenerator.generate_pdf_report(
                self.segmented_data, self.model_results
            )

            # Save to temporary file for download
            import tempfile
            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp_file:
                tmp_file.write(pdf_bytes)
                return tmp_file.name

        except Exception as e:
            print(f"PDF generation error: {e}")
            return None
605
 
606
+ def create_interface():
607
+ """Create Gradio interface"""
 
 
 
 
 
 
 
608
 
609
+ app = B2BAnalyticsApp()
 
610
 
611
+ with gr.Blocks(theme=gr.themes.Soft(), title="B2B Customer Analytics") as demo:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
612
 
613
  gr.HTML("""
614
+ <div style="background: linear-gradient(135deg, #6366f1 0%, #8b5cf6 100%);
615
+ padding: 2rem; border-radius: 1rem; color: white; text-align: center; margin-bottom: 2rem;">
616
+ <h1 style="font-size: 2.5rem; font-weight: bold; margin-bottom: 0.5rem;">
617
+ B2B Customer Analytics Platform
618
+ </h1>
619
+ <p style="font-size: 1.1rem; opacity: 0.9;">
620
+ Advanced Customer Segmentation & Churn Prediction
621
+ </p>
622
  </div>
623
  """)
624
 
625
  with gr.Tabs():
626
+ # Data Upload Tab
627
  with gr.Tab("Data Upload & Dashboard"):
628
  with gr.Row():
629
+ file_input = gr.File(label="Upload Customer Data CSV", file_types=[".csv"])
630
+ load_btn = gr.Button("Load & Process Data", variant="primary", size="lg")
 
 
631
 
632
+ load_status = gr.Textbox(label="Status", interactive=False)
633
+ dashboard_display = gr.HTML()
634
  data_preview = gr.DataFrame(label="Data Preview")
635
 
636
+ # Segmentation Tab
637
  with gr.Tab("Customer Segmentation"):
638
  with gr.Row():
639
+ segment_chart = gr.Plot(label="Customer Segments")
640
+ rfm_chart = gr.Plot(label="RFM Analysis")
 
 
641
 
642
+ customer_table = gr.DataFrame(label="Customer Summary")
643
 
644
+ # Churn Prediction Tab
645
  with gr.Tab("Churn Prediction"):
646
  train_btn = gr.Button("Train Churn Model", variant="primary", size="lg")
647
+ model_dashboard = gr.HTML()
648
 
649
  with gr.Row():
650
+ importance_chart = gr.Plot(label="Feature Importance")
651
+ churn_dist_chart = gr.Plot(label="Churn Risk Distribution")
 
 
 
 
 
 
 
 
 
 
 
 
652
 
653
+ # Reports Tab
654
  with gr.Tab("Reports"):
655
  report_btn = gr.Button("Generate PDF Report", variant="primary", size="lg")
656
+ report_status = gr.Textbox(label="Status", interactive=False)
657
  report_file = gr.File(label="Download Report")
658
 
659
+ # Event handlers
660
+ def load_and_visualize(file):
661
+ status, dashboard, preview = app.load_data(file)
662
+ if "Successfully" in status:
663
+ charts = app.create_visualizations()
664
+ table = app.get_customer_summary_table()
665
+ return status, dashboard, preview, charts[0], charts[1], table
666
+ return status, dashboard, preview, None, None, None
 
 
 
 
 
 
 
 
 
 
667
 
668
+ def train_and_update():
669
+ dashboard, importance = app.train_churn_model()
670
+ if "Error" not in dashboard:
671
+ charts = app.create_visualizations()
672
+ return dashboard, importance, charts[2]
673
+ return dashboard, importance, None
674
 
675
+ def generate_report():
676
+ report_path = app.generate_pdf_report()
677
+ if report_path:
678
+ return "PDF report generated successfully", report_path
679
+ return "Error generating PDF report", None
 
 
 
 
 
 
 
 
 
 
 
 
 
680
 
681
+ # Connect events
682
  load_btn.click(
683
+ fn=load_and_visualize,
684
  inputs=[file_input],
685
+ outputs=[load_status, dashboard_display, data_preview,
686
+ segment_chart, rfm_chart, customer_table]
687
  )
688
 
689
  train_btn.click(
690
+ fn=train_and_update,
691
+ outputs=[model_dashboard, importance_chart, churn_dist_chart]
692
+ )