Hamza4100 committed on
Commit
8d6e50c
·
verified ·
1 Parent(s): faaab91

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +764 -747
app.py CHANGED
@@ -1,747 +1,764 @@
1
- """
2
- Enhanced Telecom Customer Segmentation Backend API
3
- =================================================
4
- FastAPI backend with:
5
- - Enhanced cluster analysis with ALL data fields
6
- - Time-based analysis (morning/evening/night)
7
- - SMS insights
8
- - Upload/Download breakdown
9
- - Dynamic visualization generation
10
- - On-demand clustering
11
- - Gemini LLM integration
12
- - HuggingFace embeddings for semantic search
13
- """
14
-
15
- import os
16
- import json
17
- import sqlite3
18
- import pickle
19
- import io
20
- import base64
21
- from typing import Optional, List, Dict, Any
22
- from contextlib import asynccontextmanager
23
- from datetime import datetime
24
-
25
- import pandas as pd
26
- import numpy as np
27
- from fastapi import FastAPI, HTTPException, Query
28
- from fastapi.middleware.cors import CORSMiddleware
29
- from fastapi.responses import JSONResponse, Response
30
- from pydantic import BaseModel
31
- import google.generativeai as genai
32
- from sentence_transformers import SentenceTransformer
33
- import faiss
34
-
35
- # ML imports
36
- from sklearn.cluster import MiniBatchKMeans, DBSCAN
37
- from sklearn.preprocessing import StandardScaler
38
- from sklearn.metrics import silhouette_score
39
- from sklearn.decomposition import PCA
40
-
41
- # Visualization imports
42
- import matplotlib
43
- matplotlib.use('Agg') # Non-interactive backend
44
- import matplotlib.pyplot as plt
45
- import plotly.graph_objects as go
46
- import plotly.express as px
47
-
48
- # ============================================
49
- # CONFIGURATION
50
- # ============================================
51
-
52
- GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
53
- if GEMINI_API_KEY:
54
- genai.configure(api_key=GEMINI_API_KEY)
55
-
56
- # Data paths
57
- MERGED_DATA_PATH = "merged_subscriber_data.csv"
58
- INTL_DATA_PATH = "international_calls.csv"
59
- CLUSTERED_DATA_PATH = "golden_table_clustered.csv"
60
- DB_PATH = "data/database.db"
61
- FAISS_INDEX_PATH = "data/faiss_index.bin"
62
- EMBEDDINGS_PATH = "data/embeddings.pkl"
63
-
64
- # Global variables
65
- df = None
66
- df_full = None # Full data with all fields
67
- conn = None
68
- embedding_model = None
69
- faiss_index = None
70
- gemini_model = None
71
-
72
-
73
- # ============================================
74
- # STARTUP / SHUTDOWN
75
- # ============================================
76
-
77
@asynccontextmanager
async def lifespan(app: FastAPI):
    """FastAPI lifespan hook: load data, models and indexes on startup and
    close the SQLite connection on shutdown.

    Populates the module-level globals (df, df_full, conn, embedding_model,
    faiss_index, gemini_model) consumed by every endpoint.
    """
    global df, df_full, conn, embedding_model, faiss_index, gemini_model

    print("🚀 Starting Enhanced Telecom API...")

    # Load full data with all fields
    if os.path.exists(MERGED_DATA_PATH):
        df_merged = pd.read_csv(MERGED_DATA_PATH)
        if os.path.exists(INTL_DATA_PATH):
            # Left join keeps every subscriber even without international calls
            df_intl = pd.read_csv(INTL_DATA_PATH)
            df_full = pd.merge(df_merged, df_intl, on='subscriberid', how='left')
        else:
            df_full = df_merged

        # Fill NaN values (zeros for join gaps and missing source fields)
        df_full = df_full.fillna(0)
        print(f"✓ Loaded {len(df_full):,} customers with enhanced data")

        # Load clustered results if available
        if os.path.exists(CLUSTERED_DATA_PATH):
            df_clustered = pd.read_csv(CLUSTERED_DATA_PATH)
            # Merge cluster labels into full data
            df_full = pd.merge(
                df_full,
                df_clustered[['subscriberid', 'kmeans_cluster', 'dbscan_cluster']],
                on='subscriberid',
                how='left'
            )

        df = df_full.copy()
    else:
        # Fall back to a deterministic synthetic dataset so the API still runs
        print("⚠ Data files not found")
        df = df_full = create_sample_data()

    # Initialize database
    init_database()

    # Load models (best-effort: search/LLM endpoints degrade when missing)
    try:
        embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
        print("✓ Loaded embedding model")
    except Exception as e:
        print(f"⚠ Embedding model error: {e}")

    if GEMINI_API_KEY:
        try:
            gemini_model = genai.GenerativeModel('gemini-2.5-flash')
            print("✓ Initialized Gemini")
        except Exception as e:
            print(f"⚠ Gemini error: {e}")

    init_faiss_index()
    print(" API ready!")

    yield  # application serves requests here

    if conn:
        conn.close()
    print("👋 Shutdown complete")
138
-
139
-
140
- # ============================================
141
- # INITIALIZE APP
142
- # ============================================
143
-
144
app = FastAPI(
    title="Enhanced Telecom Segmentation API",
    description="Advanced telecom customer analytics with time-based insights",
    version="2.0.0",
    lifespan=lifespan
)

# NOTE(review): wildcard origins together with allow_credentials=True is
# rejected by browsers for credentialed requests and is unsafe for
# production — confirm the intended deployment before shipping.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
158
-
159
-
160
- # ============================================
161
- # PYDANTIC MODELS
162
- # ============================================
163
-
164
class QueryRequest(BaseModel):
    """Request body for POST /api/query."""
    # Natural-language question forwarded into the Gemini prompt
    question: str
166
-
167
class QueryResponse(BaseModel):
    """Response body for POST /api/query."""
    # LLM-generated answer text
    answer: str
    # Optional structured payload; the /api/query handler always sends None
    data: Optional[Dict[str, Any]] = None
170
-
171
class EnhancedCustomerInfo(BaseModel):
    """Full per-subscriber profile schema.

    NOTE(review): not referenced by any endpoint visible in this file —
    confirm whether it is used as a response_model elsewhere.
    """
    subscriberid: int

    # Voice communication (floats because values come from pandas aggregations)
    voice_total_duration_mins: float
    voice_total_calls: float
    voice_morning_calls: float
    voice_evening_calls: float
    voice_night_calls: float

    # SMS
    sms_total_messages: float

    # Data
    data_total_mb: float
    data_downlink_mb: float
    data_uplink_mb: float

    # International (Optional where the left join can leave gaps)
    intl_total_calls: float
    intl_total_duration_mins: float
    intl_countries_called: float
    intl_top_country: Optional[str]

    # User types (0/1 flags)
    call_lover: int
    download_lover: int
    upload_lover: int
    data_lover: int

    # Clustering (None when the clustered CSV was absent at startup)
    kmeans_cluster: Optional[int]
    dbscan_cluster: Optional[int]
204
-
205
class ClusterRequest(BaseModel):
    """Request body for POST /api/cluster/run."""
    # Number of clusters for k-means (ignored by DBSCAN)
    n_clusters: int = 6
    algorithm: str = "kmeans"  # kmeans or dbscan
208
-
209
-
210
- # ============================================
211
- # HELPER FUNCTIONS
212
- # ============================================
213
-
214
def create_sample_data(n: int = 1000, seed: int = 42) -> pd.DataFrame:
    """Create a synthetic customer dataset for when real data files are missing.

    Generalized: the previously hard-coded row count (1000) and RNG seed (42)
    are now parameters. Defaults preserve the original output exactly — the
    column construction order is unchanged, so the legacy global-RNG call
    sequence produces identical values.

    Args:
        n: number of synthetic customers to generate.
        seed: seed for numpy's global RNG, making output reproducible.

    Returns:
        DataFrame with one row per customer: voice/SMS/data usage, minimal
        international columns, 0/1 "lover" flags and fake cluster labels.
    """
    np.random.seed(seed)
    return pd.DataFrame({
        'subscriberid': range(1, n + 1),
        'voice_total_duration_mins': np.random.exponential(10, n),
        'voice_total_calls': np.random.poisson(10, n),
        'voice_morning_calls': np.random.poisson(3, n),
        'voice_evening_calls': np.random.poisson(4, n),
        'voice_night_calls': np.random.poisson(3, n),
        'sms_total_messages': np.random.poisson(5, n),
        'data_total_mb': np.random.exponential(400, n),
        'data_downlink_mb': np.random.exponential(300, n),
        'data_uplink_mb': np.random.exponential(100, n),
        'intl_total_calls': np.random.poisson(0.5, n),
        'intl_total_duration_mins': np.random.exponential(0.5, n),
        'intl_countries_called': np.random.poisson(0.3, n),
        'call_lover': np.random.choice([0, 1], n, p=[0.75, 0.25]),
        'data_lover': np.random.choice([0, 1], n, p=[0.75, 0.25]),
        'kmeans_cluster': np.random.choice(range(6), n),
        'dbscan_cluster': np.random.choice(range(12), n),
    })
237
-
238
-
239
def init_database():
    """Create the SQLite mirror of the in-memory customer table.

    Rewrites the `customers` table from `df_full` on every startup and
    indexes it by subscriberid for point lookups.
    """
    global conn, df_full
    os.makedirs("data", exist_ok=True)
    # check_same_thread=False: the connection is shared across FastAPI threads
    conn = sqlite3.connect(DB_PATH, check_same_thread=False)
    df_full.to_sql('customers', conn, if_exists='replace', index=False)
    conn.execute("CREATE INDEX IF NOT EXISTS idx_subscriberid ON customers(subscriberid)")
    print("✓ Database initialized")
247
-
248
-
249
def init_faiss_index():
    """Build (or load from disk) the FAISS index for semantic search.

    One short text description is embedded per customer; the index uses inner
    product over L2-normalized vectors, i.e. cosine similarity.

    Fix: the index-load fallback used a bare ``except:``, which also swallows
    SystemExit/KeyboardInterrupt and hides the failure reason. Narrowed to
    ``except Exception`` and the error is now reported before rebuilding.
    """
    global faiss_index, embedding_model, df

    # Semantic search is optional — skip entirely if the model failed to load
    if embedding_model is None:
        return

    # Reuse a previously persisted index when possible
    if os.path.exists(FAISS_INDEX_PATH):
        try:
            faiss_index = faiss.read_index(FAISS_INDEX_PATH)
            print("✓ Loaded FAISS index")
            return
        except Exception as e:
            print(f"⚠ Could not load FAISS index, rebuilding: {e}")

    # Build one description per customer to embed
    print("Building FAISS index...")
    descriptions = []
    for _, row in df.iterrows():
        desc = f"Customer {row['subscriberid']}: "
        desc += f"{row.get('voice_total_calls', 0):.0f} voice calls, "
        desc += f"{row.get('data_total_mb', 0):.0f} MB data, "
        desc += f"{row.get('sms_total_messages', 0):.0f} SMS, "
        if row.get('intl_total_calls', 0) > 0:
            desc += f"{row.get('intl_total_calls', 0):.0f} international calls"
        descriptions.append(desc)

    embeddings = embedding_model.encode(descriptions, show_progress_bar=True, batch_size=32)

    # Inner-product index over normalized vectors == cosine similarity
    dimension = embeddings.shape[1]
    faiss_index = faiss.IndexFlatIP(dimension)
    faiss.normalize_L2(embeddings)
    faiss_index.add(embeddings)

    faiss.write_index(faiss_index, FAISS_INDEX_PATH)
    print("✓ Built FAISS index")
285
-
286
-
287
def get_cluster_label(row, df_ref=None):
    """Return a human-readable segment label for a single customer row.

    Bug fix: the original compared scalars against ``row[col].median()`` /
    ``row[col].quantile(...)`` — but ``row[col]`` on a single row is a plain
    float with no .median()/.quantile(), so the function raised
    AttributeError on every call. Thresholds are now computed from a
    population DataFrame.

    Args:
        row: one customer record (pd.Series) with the usage columns below.
        df_ref: DataFrame used for population thresholds; defaults to the
            module-level ``df`` (backward compatible for existing callers).

    Returns:
        One of "International Data Users", "International Callers",
        "Heavy Voice Users", "Heavy Data Users", "SMS Enthusiasts",
        "Light Users".
    """
    ref = df_ref if df_ref is not None else df
    if row['intl_total_calls'] > 0:
        # Split international users by data usage relative to the median
        if row['data_total_mb'] > ref['data_total_mb'].median():
            return "International Data Users"
        return "International Callers"
    if row['voice_total_calls'] > ref['voice_total_calls'].quantile(0.75):
        return "Heavy Voice Users"
    if row['data_total_mb'] > ref['data_total_mb'].quantile(0.75):
        return "Heavy Data Users"
    if row['sms_total_messages'] > ref['sms_total_messages'].quantile(0.75):
        return "SMS Enthusiasts"
    return "Light Users"
302
-
303
-
304
- # ============================================
305
- # ENDPOINTS
306
- # ============================================
307
-
308
- @app.get("/")
309
- def health_check():
310
- """Health check"""
311
- return {
312
- "status": "healthy",
313
- "version": "2.0",
314
- "customers": len(df) if df is not None else 0,
315
- "columns": list(df.columns) if df is not None else [],
316
- "features": [
317
- "time_analysis",
318
- "sms_insights",
319
- "upload_download_split",
320
- "international_details",
321
- "dynamic_clustering",
322
- "dynamic_visualizations"
323
- ]
324
- }
325
-
326
-
327
- @app.get("/api/stats")
328
- def get_stats():
329
- """Get overall statistics with enhanced metrics"""
330
- if df is None:
331
- raise HTTPException(status_code=500, detail="Data not loaded")
332
-
333
- return {
334
- "total_customers": int(len(df)),
335
- "international_users": int(df[df['intl_total_calls'] > 0]['subscriberid'].nunique()),
336
- "international_percentage": float((df['intl_total_calls'] > 0).sum() / len(df) * 100),
337
-
338
- # Voice stats
339
- "avg_voice_mins": float(df['voice_total_duration_mins'].mean()),
340
- "avg_voice_calls": float(df['voice_total_calls'].mean()),
341
- "total_voice_mins": float(df['voice_total_duration_mins'].sum()),
342
-
343
- # Time breakdown
344
- "morning_calls": int(df['voice_morning_calls'].sum()),
345
- "evening_calls": int(df['voice_evening_calls'].sum()),
346
- "night_calls": int(df['voice_night_calls'].sum()),
347
-
348
- # SMS stats
349
- "total_sms": int(df['sms_total_messages'].sum()),
350
- "avg_sms_per_user": float(df['sms_total_messages'].mean()),
351
- "sms_users": int((df['sms_total_messages'] > 0).sum()),
352
-
353
- # Data stats
354
- "avg_data_mb": float(df['data_total_mb'].mean()),
355
- "avg_download_mb": float(df['data_downlink_mb'].mean()),
356
- "avg_upload_mb": float(df['data_uplink_mb'].mean()),
357
- "total_data_gb": float(df['data_total_mb'].sum() / 1024),
358
-
359
- # User types
360
- "call_lovers": int(df['call_lover'].sum()),
361
- "data_lovers": int(df['data_lover'].sum()),
362
- "download_lovers": int(df.get('download_lover', pd.Series([0])).sum()),
363
- "upload_lovers": int(df.get('upload_lover', pd.Series([0])).sum()),
364
- }
365
-
366
-
367
- @app.get("/api/customers/{customer_id}")
368
- def get_customer(customer_id: int):
369
- """Get detailed customer information"""
370
- if df is None:
371
- raise HTTPException(status_code=500, detail="Data not loaded")
372
-
373
- customer = df[df['subscriberid'] == customer_id]
374
-
375
- if customer.empty:
376
- raise HTTPException(status_code=404, detail=f"Customer {customer_id} not found")
377
-
378
- row = customer.iloc[0]
379
-
380
- # Calculate time distribution
381
- total_calls_by_time = (
382
- row.get('voice_morning_calls', 0) +
383
- row.get('voice_evening_calls', 0) +
384
- row.get('voice_night_calls', 0)
385
- )
386
-
387
- return {
388
- "subscriberid": int(row['subscriberid']),
389
-
390
- # Communication
391
- "communication": {
392
- "voice_total_duration_mins": float(row['voice_total_duration_mins']),
393
- "voice_total_calls": float(row['voice_total_calls']),
394
- "voice_avg_duration_mins": float(row.get('voice_avg_duration_mins', 0)),
395
- "time_distribution": {
396
- "morning_calls": int(row.get('voice_morning_calls', 0)),
397
- "evening_calls": int(row.get('voice_evening_calls', 0)),
398
- "night_calls": int(row.get('voice_night_calls', 0)),
399
- "morning_pct": float(row.get('voice_morning_calls', 0) / total_calls_by_time * 100 if total_calls_by_time > 0 else 0),
400
- "evening_pct": float(row.get('voice_evening_calls', 0) / total_calls_by_time * 100 if total_calls_by_time > 0 else 0),
401
- "night_pct": float(row.get('voice_night_calls', 0) / total_calls_by_time * 100 if total_calls_by_time > 0 else 0),
402
- }
403
- },
404
-
405
- # International
406
- "international": {
407
- "total_calls": float(row.get('intl_total_calls', 0)),
408
- "total_duration_mins": float(row.get('intl_total_duration_mins', 0)),
409
- "countries_called": int(row.get('intl_countries_called', 0)),
410
- "top_country": str(row.get('intl_top_country', 'N/A')) if pd.notna(row.get('intl_top_country')) else 'N/A',
411
- "all_countries": str(row.get('intl_all_countries', 'N/A')) if pd.notna(row.get('intl_all_countries')) else 'N/A',
412
- "is_international_user": bool(row.get('intl_total_calls', 0) > 0)
413
- },
414
-
415
- # Internet
416
- "internet": {
417
- "total_mb": float(row['data_total_mb']),
418
- "download_mb": float(row.get('data_downlink_mb', 0)),
419
- "upload_mb": float(row.get('data_uplink_mb', 0)),
420
- "download_pct": float(row.get('data_downlink_mb', 0) / row['data_total_mb'] * 100 if row['data_total_mb'] > 0 else 0),
421
- "upload_pct": float(row.get('data_uplink_mb', 0) / row['data_total_mb'] * 100 if row['data_total_mb'] > 0 else 0),
422
- },
423
-
424
- # SMS
425
- "sms": {
426
- "total_messages": int(row.get('sms_total_messages', 0)),
427
- "frequency": "High" if row.get('sms_total_messages', 0) > df['sms_total_messages'].quantile(0.75) else
428
- "Medium" if row.get('sms_total_messages', 0) > df['sms_total_messages'].quantile(0.25) else "Low"
429
- },
430
-
431
- # User profile
432
- "profile": {
433
- "call_lover": bool(row.get('call_lover', 0)),
434
- "data_lover": bool(row.get('data_lover', 0)),
435
- "download_lover": bool(row.get('download_lover', 0)),
436
- "upload_lover": bool(row.get('upload_lover', 0)),
437
- "kmeans_cluster": int(row.get('kmeans_cluster', -1)) if pd.notna(row.get('kmeans_cluster')) else None,
438
- "dbscan_cluster": int(row.get('dbscan_cluster', -1)) if pd.notna(row.get('dbscan_cluster')) else None,
439
- }
440
- }
441
-
442
-
443
- @app.get("/api/time-analysis")
444
- def get_time_analysis():
445
- """Get time-based analysis of voice calls"""
446
- if df is None:
447
- raise HTTPException(status_code=500, detail="Data not loaded")
448
-
449
- total_morning = df['voice_morning_calls'].sum()
450
- total_evening = df['voice_evening_calls'].sum()
451
- total_night = df['voice_night_calls'].sum()
452
- total_all = total_morning + total_evening + total_night
453
-
454
- return {
455
- "overall": {
456
- "morning_calls": int(total_morning),
457
- "evening_calls": int(total_evening),
458
- "night_calls": int(total_night),
459
- "morning_pct": float(total_morning / total_all * 100 if total_all > 0 else 0),
460
- "evening_pct": float(total_evening / total_all * 100 if total_all > 0 else 0),
461
- "night_pct": float(total_night / total_all * 100 if total_all > 0 else 0),
462
- },
463
- "peak_time": "Morning" if total_morning == max(total_morning, total_evening, total_night) else
464
- "Evening" if total_evening == max(total_morning, total_evening, total_night) else "Night",
465
- "by_user_type": {
466
- "call_lovers": {
467
- "morning": int(df[df['call_lover'] == 1]['voice_morning_calls'].sum()),
468
- "evening": int(df[df['call_lover'] == 1]['voice_evening_calls'].sum()),
469
- "night": int(df[df['call_lover'] == 1]['voice_night_calls'].sum()),
470
- },
471
- "others": {
472
- "morning": int(df[df['call_lover'] == 0]['voice_morning_calls'].sum()),
473
- "evening": int(df[df['call_lover'] == 0]['voice_evening_calls'].sum()),
474
- "night": int(df[df['call_lover'] == 0]['voice_night_calls'].sum()),
475
- }
476
- }
477
- }
478
-
479
-
480
- @app.get("/api/visualizations/time-distribution")
481
- def viz_time_distribution():
482
- """Generate time distribution chart"""
483
- if df is None:
484
- raise HTTPException(status_code=500, detail="Data not loaded")
485
-
486
- time_data = {
487
- 'Time Period': ['Morning', 'Evening', 'Night'],
488
- 'Total Calls': [
489
- df['voice_morning_calls'].sum(),
490
- df['voice_evening_calls'].sum(),
491
- df['voice_night_calls'].sum()
492
- ]
493
- }
494
-
495
- fig = px.bar(
496
- time_data,
497
- x='Time Period',
498
- y='Total Calls',
499
- title='Call Distribution by Time of Day',
500
- color='Time Period',
501
- color_discrete_map={'Morning': '#FDB462', 'Evening': '#80B1D3', 'Night': '#8DD3C7'}
502
- )
503
-
504
- return JSONResponse(content={"chart": fig.to_json()})
505
-
506
-
507
- @app.get("/api/visualizations/data-breakdown")
508
- def viz_data_breakdown():
509
- """Generate upload/download breakdown chart"""
510
- if df is None:
511
- raise HTTPException(status_code=500, detail="Data not loaded")
512
-
513
- data_summary = {
514
- 'Type': ['Download', 'Upload'],
515
- 'Total (GB)': [
516
- df['data_downlink_mb'].sum() / 1024,
517
- df['data_uplink_mb'].sum() / 1024
518
- ]
519
- }
520
-
521
- fig = px.pie(
522
- data_summary,
523
- values='Total (GB)',
524
- names='Type',
525
- title='Data Usage: Download vs Upload',
526
- color_discrete_sequence=['#66C2A5', '#FC8D62']
527
- )
528
-
529
- return JSONResponse(content={"chart": fig.to_json()})
530
-
531
-
532
- @app.get("/api/visualizations/customer-segments")
533
- def viz_customer_segments():
534
- """Generate customer segments visualization"""
535
- if df is None or 'kmeans_cluster' not in df.columns:
536
- raise HTTPException(status_code=500, detail="Clustering data not available")
537
-
538
- # Get cluster statistics
539
- cluster_stats = df.groupby('kmeans_cluster').agg({
540
- 'subscriberid': 'count',
541
- 'voice_total_calls': 'mean',
542
- 'data_total_mb': 'mean',
543
- 'sms_total_messages': 'mean'
544
- }).reset_index()
545
-
546
- cluster_stats.columns = ['Cluster', 'Customers', 'Avg Calls', 'Avg Data (MB)', 'Avg SMS']
547
-
548
- fig = px.bar(
549
- cluster_stats,
550
- x='Cluster',
551
- y='Customers',
552
- title='Customer Distribution Across Segments',
553
- color='Customers',
554
- color_continuous_scale='viridis'
555
- )
556
-
557
- return JSONResponse(content={"chart": fig.to_json()})
558
-
559
-
560
- @app.post("/api/cluster/run")
561
- def run_clustering(request: ClusterRequest):
562
- """Run clustering on-demand"""
563
- if df is None:
564
- raise HTTPException(status_code=500, detail="Data not loaded")
565
-
566
- # Select features
567
- feature_cols = [
568
- 'voice_total_duration_mins', 'voice_total_calls',
569
- 'data_total_mb', 'sms_total_messages'
570
- ]
571
-
572
- # Add international if exists
573
- if 'intl_total_calls' in df.columns:
574
- feature_cols.append('intl_total_calls')
575
-
576
- X = df[feature_cols].fillna(0)
577
-
578
- # Scale
579
- scaler = StandardScaler()
580
- X_scaled = scaler.fit_transform(X)
581
-
582
- # Cluster
583
- if request.algorithm == "kmeans":
584
- model = MiniBatchKMeans(n_clusters=request.n_clusters, random_state=42, batch_size=1000)
585
- labels = model.fit_predict(X_scaled)
586
-
587
- # Calculate silhouette score
588
- if len(df) > 10000:
589
- sample_idx = np.random.choice(len(df), 10000, replace=False)
590
- score = silhouette_score(X_scaled[sample_idx], labels[sample_idx])
591
- else:
592
- score = silhouette_score(X_scaled, labels)
593
-
594
- elif request.algorithm == "dbscan":
595
- model = DBSCAN(eps=0.3, min_samples=10)
596
- labels = model.fit_predict(X_scaled)
597
- score = None
598
- else:
599
- raise HTTPException(status_code=400, detail="Invalid algorithm")
600
-
601
- # Get cluster stats
602
- df_temp = df.copy()
603
- df_temp['cluster'] = labels
604
-
605
- cluster_info = []
606
- for cluster_id in sorted(df_temp['cluster'].unique()):
607
- cluster_data = df_temp[df_temp['cluster'] == cluster_id]
608
- cluster_info.append({
609
- "cluster_id": int(cluster_id),
610
- "size": int(len(cluster_data)),
611
- "percentage": float(len(cluster_data) / len(df) * 100),
612
- "avg_voice_calls": float(cluster_data['voice_total_calls'].mean()),
613
- "avg_data_mb": float(cluster_data['data_total_mb'].mean()),
614
- "avg_sms": float(cluster_data.get('sms_total_messages', pd.Series([0])).mean()),
615
- })
616
-
617
- return {
618
- "algorithm": request.algorithm,
619
- "n_clusters": int(labels.max() + 1),
620
- "silhouette_score": float(score) if score else None,
621
- "clusters": cluster_info
622
- }
623
-
624
-
625
- @app.post("/api/query")
626
- def query_with_llm(request: QueryRequest):
627
- """Query data using Gemini LLM"""
628
- if gemini_model is None:
629
- raise HTTPException(status_code=503, detail="Gemini API not configured")
630
-
631
- # Build context with safe column access
632
- def safe_col_sum(col_name, default=0):
633
- """Safely get column sum or return default"""
634
- return df[col_name].sum() if col_name in df.columns else default
635
-
636
- def safe_col_mean(col_name, default=0):
637
- """Safely get column mean or return default"""
638
- return df[col_name].mean() if col_name in df.columns else default
639
-
640
- def safe_col_count(col_name, condition_value=0):
641
- """Safely count rows where column > condition_value"""
642
- if col_name in df.columns:
643
- return (df[col_name] > condition_value).sum()
644
- return 0
645
-
646
- context = f"""
647
- You are analyzing telecom customer data. Here are the key statistics:
648
-
649
- Total Customers: {len(df):,}
650
- International Users: {int(safe_col_count('intl_total_calls', 0)):,}
651
-
652
- Voice Communication:
653
- - Total Calls: {safe_col_sum('voice_total_calls'):,.0f}
654
- - Total Duration: {safe_col_sum('voice_total_duration_mins'):,.0f} mins
655
- - Average per User: {safe_col_mean('voice_total_calls'):.1f} calls
656
-
657
- {'Time Distribution:' if 'voice_morning_calls' in df.columns else ''}
658
- {f"- Morning Calls: {safe_col_sum('voice_morning_calls'):,.0f}" if 'voice_morning_calls' in df.columns else ''}
659
- {f"- Evening Calls: {safe_col_sum('voice_evening_calls'):,.0f}" if 'voice_evening_calls' in df.columns else ''}
660
- {f"- Night Calls: {safe_col_sum('voice_night_calls'):,.0f}" if 'voice_night_calls' in df.columns else ''}
661
-
662
- {'SMS:' if 'sms_total_messages' in df.columns else ''}
663
- {f"- Total Messages: {safe_col_sum('sms_total_messages'):,.0f}" if 'sms_total_messages' in df.columns else ''}
664
- {f"- Average per User: {safe_col_mean('sms_total_messages'):.1f}" if 'sms_total_messages' in df.columns else ''}
665
-
666
- Data Usage:
667
- - Total Data (MB): {safe_col_sum('data_total_mb'):,.0f}
668
- - Average per User (MB): {safe_col_mean('data_total_mb'):.1f}
669
- {f"- Total Download (GB): {safe_col_sum('data_downlink_mb') / 1024:.1f}" if 'data_downlink_mb' in df.columns else ''}
670
- {f"- Total Upload (GB): {safe_col_sum('data_uplink_mb') / 1024:.1f}" if 'data_uplink_mb' in df.columns else ''}
671
-
672
- User Question: {request.question}
673
-
674
- Provide a clear, concise answer based on the statistics above.
675
- """
676
-
677
- try:
678
- response = gemini_model.generate_content(context)
679
- return QueryResponse(answer=response.text, data=None)
680
- except Exception as e:
681
- import traceback
682
- error_details = traceback.format_exc()
683
- print(f"❌ Query error: {error_details}")
684
- raise HTTPException(status_code=500, detail=f"LLM error: {str(e)}")
685
-
686
-
687
- @app.get("/api/search")
688
- def semantic_search(query: str = Query(..., description="Search query"), limit: int = 10):
689
- """Semantic search for customers"""
690
- if embedding_model is None or faiss_index is None:
691
- raise HTTPException(status_code=503, detail="Search not available")
692
-
693
- # Embed query
694
- query_embedding = embedding_model.encode([query])
695
- faiss.normalize_L2(query_embedding)
696
-
697
- # Search
698
- scores, indices = faiss_index.search(query_embedding, limit)
699
-
700
- results = []
701
- for score, idx in zip(scores[0], indices[0]):
702
- if idx < len(df):
703
- customer = df.iloc[idx]
704
- results.append({
705
- "customer_id": int(customer['subscriberid']),
706
- "similarity_score": float(score),
707
- "voice_calls": float(customer['voice_total_calls']),
708
- "data_mb": float(customer['data_total_mb']),
709
- "sms": int(customer.get('sms_total_messages', 0)),
710
- "is_international": bool(customer.get('intl_total_calls', 0) > 0)
711
- })
712
-
713
- return {"results": results}
714
-
715
-
716
- @app.get("/api/clusters")
717
- def get_clusters(cluster_type: str = "kmeans"):
718
- """Get cluster information"""
719
- if df is None:
720
- raise HTTPException(status_code=500, detail="Data not loaded")
721
-
722
- cluster_col = f"{cluster_type}_cluster"
723
- if cluster_col not in df.columns:
724
- raise HTTPException(status_code=404, detail=f"{cluster_type} clusters not found")
725
-
726
- cluster_info = []
727
- for cluster_id in sorted(df[cluster_col].unique()):
728
- if pd.isna(cluster_id):
729
- continue
730
-
731
- cluster_data = df[df[cluster_col] == cluster_id]
732
- cluster_info.append({
733
- "cluster_id": int(cluster_id),
734
- "size": int(len(cluster_data)),
735
- "percentage": float(len(cluster_data) / len(df) * 100),
736
- "avg_voice_mins": float(cluster_data['voice_total_duration_mins'].mean()),
737
- "avg_data_mb": float(cluster_data['data_total_mb'].mean()),
738
- "avg_sms": float(cluster_data.get('sms_total_messages', pd.Series([0])).mean()),
739
- "avg_intl_calls": float(cluster_data.get('intl_total_calls', pd.Series([0])).mean()),
740
- })
741
-
742
- return {"cluster_type": cluster_type, "clusters": cluster_info}
743
-
744
-
745
if __name__ == "__main__":
    # Run the API directly; port 7860 matches the HuggingFace Spaces convention
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Enhanced Telecom Customer Segmentation Backend API
3
+ =================================================
4
+ FastAPI backend with:
5
+ - Enhanced cluster analysis with ALL data fields
6
+ - Time-based analysis (morning/evening/night)
7
+ - SMS insights
8
+ - Upload/Download breakdown
9
+ - Dynamic visualization generation
10
+ - On-demand clustering
11
+ - Gemini LLM integration
12
+ - HuggingFace embeddings for semantic search
13
+ """
14
+
15
+ import os
16
+ import json
17
+ import sqlite3
18
+ import pickle
19
+ import io
20
+ import base64
21
+ from typing import Optional, List, Dict, Any
22
+ from contextlib import asynccontextmanager
23
+ from datetime import datetime
24
+
25
+ import pandas as pd
26
+ import numpy as np
27
+ from fastapi import FastAPI, HTTPException, Query
28
+ from fastapi.middleware.cors import CORSMiddleware
29
+ from fastapi.responses import JSONResponse, Response
30
+ from pydantic import BaseModel
31
+ import google.generativeai as genai
32
+ from sentence_transformers import SentenceTransformer
33
+ import faiss
34
+
35
+ # ML imports
36
+ from sklearn.cluster import MiniBatchKMeans, DBSCAN
37
+ from sklearn.preprocessing import StandardScaler
38
+ from sklearn.metrics import silhouette_score
39
+ from sklearn.decomposition import PCA
40
+
41
+ # Global flag for FAISS initialization
42
+ faiss_building = False
43
+
44
+ # Visualization imports
45
+ import matplotlib
46
+ matplotlib.use('Agg') # Non-interactive backend
47
+ import matplotlib.pyplot as plt
48
+ import plotly.graph_objects as go
49
+ import plotly.express as px
50
+
51
+ # ============================================
52
+ # CONFIGURATION
53
+ # ============================================
54
+
55
+ GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
56
+ if GEMINI_API_KEY:
57
+ genai.configure(api_key=GEMINI_API_KEY)
58
+
59
+ # Data paths
60
+ MERGED_DATA_PATH = "merged_subscriber_data.csv"
61
+ INTL_DATA_PATH = "international_calls.csv"
62
+ CLUSTERED_DATA_PATH = "golden_table_clustered.csv"
63
+ DB_PATH = "data/database.db"
64
+ FAISS_INDEX_PATH = "data/faiss_index.bin"
65
+ EMBEDDINGS_PATH = "data/embeddings.pkl"
66
+
67
+ # Global variables
68
+ df = None
69
+ df_full = None # Full data with all fields
70
+ conn = None
71
+ embedding_model = None
72
+ faiss_index = None
73
+ gemini_model = None
74
+
75
+
76
+ # ============================================
77
+ # STARTUP / SHUTDOWN
78
+ # ============================================
79
+
80
@asynccontextmanager
async def lifespan(app: FastAPI):
    """FastAPI lifespan hook: load data and models on startup, close the
    SQLite connection on shutdown.

    Unlike the previous revision, the FAISS index is NOT built here — startup
    defers it so the server comes up quickly.
    NOTE(review): nothing visible in this file resets `faiss_building` or
    calls init_faiss_index lazily — confirm the search endpoint actually
    triggers the build on first request.
    """
    global df, df_full, conn, embedding_model, faiss_index, gemini_model

    print("🚀 Starting Enhanced Telecom API...")

    # Load full data with all fields
    if os.path.exists(MERGED_DATA_PATH):
        df_merged = pd.read_csv(MERGED_DATA_PATH)
        if os.path.exists(INTL_DATA_PATH):
            # Left join keeps every subscriber even without international calls
            df_intl = pd.read_csv(INTL_DATA_PATH)
            df_full = pd.merge(df_merged, df_intl, on='subscriberid', how='left')
        else:
            df_full = df_merged

        # Fill NaN values (zeros for join gaps and missing source fields)
        df_full = df_full.fillna(0)
        print(f"✓ Loaded {len(df_full):,} customers with enhanced data")

        # Load clustered results if available
        if os.path.exists(CLUSTERED_DATA_PATH):
            df_clustered = pd.read_csv(CLUSTERED_DATA_PATH)
            # Merge cluster labels into full data
            df_full = pd.merge(
                df_full,
                df_clustered[['subscriberid', 'kmeans_cluster', 'dbscan_cluster']],
                on='subscriberid',
                how='left'
            )

        df = df_full.copy()
    else:
        # Fall back to a deterministic synthetic dataset so the API still runs
        print("⚠ Data files not found")
        df = df_full = create_sample_data()

    # Initialize database
    init_database()

    # Load models (best-effort: search/LLM endpoints degrade when missing)
    try:
        embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
        print("✓ Loaded embedding model")
    except Exception as e:
        print(f"⚠ Embedding model error: {e}")

    if GEMINI_API_KEY:
        try:
            gemini_model = genai.GenerativeModel('gemini-2.5-flash')
            print("✓ Initialized Gemini")
        except Exception as e:
            print(f" Gemini error: {e}")

    # FAISS index will build on first search request (lazy loading)
    print("ℹ FAISS index will build on first search request")
    print("✅ API ready!")

    yield  # application serves requests here

    if conn:
        conn.close()
    print("👋 Shutdown complete")
142
+
143
+
144
+ # ============================================
145
+ # INITIALIZE APP
146
+ # ============================================
147
+
148
app = FastAPI(
    title="Enhanced Telecom Segmentation API",
    description="Advanced telecom customer analytics with time-based insights",
    version="2.0.0",
    lifespan=lifespan
)

# NOTE(review): wildcard origins together with allow_credentials=True is
# rejected by browsers for credentialed requests and is unsafe for
# production — confirm the intended deployment before shipping.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
162
+
163
+
164
+ # ============================================
165
+ # PYDANTIC MODELS
166
+ # ============================================
167
+
168
class QueryRequest(BaseModel):
    """Request body for /api/query: a natural-language question for the LLM."""
    question: str
170
+
171
class QueryResponse(BaseModel):
    """Response for /api/query: the LLM's answer plus optional structured data."""
    answer: str
    data: Optional[Dict[str, Any]] = None  # currently always None (see query_with_llm)
174
+
175
class EnhancedCustomerInfo(BaseModel):
    """Typed per-subscriber profile schema.

    Mirrors the merged dataset columns: voice usage with a time-of-day
    breakdown, SMS counts, data volumes (total plus downlink/uplink split),
    international calling details, binary user-type flags (0/1) and the
    stored cluster assignments (None when clustering output is absent).
    """
    subscriberid: int

    # Voice communication
    voice_total_duration_mins: float
    voice_total_calls: float
    voice_morning_calls: float
    voice_evening_calls: float
    voice_night_calls: float

    # SMS
    sms_total_messages: float

    # Data
    data_total_mb: float
    data_downlink_mb: float
    data_uplink_mb: float

    # International
    intl_total_calls: float
    intl_total_duration_mins: float
    intl_countries_called: float
    intl_top_country: Optional[str]

    # User types (0/1 flags)
    call_lover: int
    download_lover: int
    upload_lover: int
    data_lover: int

    # Clustering
    kmeans_cluster: Optional[int]
    dbscan_cluster: Optional[int]
208
+
209
class ClusterRequest(BaseModel):
    """Parameters for on-demand clustering (/api/cluster/run)."""
    n_clusters: int = 6  # used by k-means only; DBSCAN infers cluster count
    algorithm: str = "kmeans"  # kmeans or dbscan
212
+
213
+
214
+ # ============================================
215
+ # HELPER FUNCTIONS
216
+ # ============================================
217
+
218
def create_sample_data(n: int = 1000, seed: int = 42) -> pd.DataFrame:
    """Create a synthetic fallback dataset mimicking the merged subscriber data.

    Used at startup when the real CSV files are missing so the API can still
    serve every endpoint.

    Args:
        n: Number of synthetic subscribers to generate (previously a
           hard-coded 1000; the default preserves the old behavior).
        seed: Seed for numpy's global RNG so repeated calls are reproducible.

    Returns:
        DataFrame with one row per subscriber covering voice/SMS/data usage,
        international activity, 0/1 user-type flags and random cluster labels.
    """
    np.random.seed(seed)
    return pd.DataFrame({
        'subscriberid': range(1, n + 1),
        'voice_total_duration_mins': np.random.exponential(10, n),
        'voice_total_calls': np.random.poisson(10, n),
        'voice_morning_calls': np.random.poisson(3, n),
        'voice_evening_calls': np.random.poisson(4, n),
        'voice_night_calls': np.random.poisson(3, n),
        'sms_total_messages': np.random.poisson(5, n),
        'data_total_mb': np.random.exponential(400, n),
        'data_downlink_mb': np.random.exponential(300, n),
        'data_uplink_mb': np.random.exponential(100, n),
        'intl_total_calls': np.random.poisson(0.5, n),
        'intl_total_duration_mins': np.random.exponential(0.5, n),
        'intl_countries_called': np.random.poisson(0.3, n),
        'call_lover': np.random.choice([0, 1], n, p=[0.75, 0.25]),
        'data_lover': np.random.choice([0, 1], n, p=[0.75, 0.25]),
        'kmeans_cluster': np.random.choice(range(6), n),
        'dbscan_cluster': np.random.choice(range(12), n),
    })
241
+
242
+
243
def init_database():
    """Load the full customer DataFrame into a local SQLite database.

    Creates the `data/` directory, (re)writes the `customers` table from the
    module-level `df_full`, and indexes `subscriberid` for point lookups.
    The connection is stored in the module-level `conn` and closed at app
    shutdown (see the lifespan handler).
    """
    global conn, df_full
    os.makedirs("data", exist_ok=True)
    # check_same_thread=False: FastAPI may service requests on worker threads.
    conn = sqlite3.connect(DB_PATH, check_same_thread=False)
    df_full.to_sql('customers', conn, if_exists='replace', index=False)
    conn.execute("CREATE INDEX IF NOT EXISTS idx_subscriberid ON customers(subscriberid)")
    print(" Database initialized")
251
+
252
+
253
def init_faiss_index():
    """Build or load the FAISS index used for semantic customer search.

    First tries to load a previously saved index from FAISS_INDEX_PATH. If
    that fails (missing/corrupt file), encodes a short textual description of
    every customer with the sentence-transformer model, builds an
    inner-product index over L2-normalized vectors (equivalent to cosine
    similarity), and persists it to disk. No-op when the embedding model
    failed to load at startup.
    """
    global faiss_index, embedding_model, df

    if embedding_model is None:
        return

    if os.path.exists(FAISS_INDEX_PATH):
        try:
            faiss_index = faiss.read_index(FAISS_INDEX_PATH)
            print("✓ Loaded FAISS index")
            return
        except Exception as e:
            # BUG FIX: was a bare `except: pass`, which also swallowed
            # KeyboardInterrupt/SystemExit and hid the failure reason.
            # Log and fall through to rebuild instead.
            print(f"⚠ Failed to load FAISS index ({e}), rebuilding...")

    # Build index from per-customer usage summaries
    print("Building FAISS index...")
    descriptions = []
    for _, row in df.iterrows():
        desc = f"Customer {row['subscriberid']}: "
        desc += f"{row.get('voice_total_calls', 0):.0f} voice calls, "
        desc += f"{row.get('data_total_mb', 0):.0f} MB data, "
        desc += f"{row.get('sms_total_messages', 0):.0f} SMS, "
        if row.get('intl_total_calls', 0) > 0:
            desc += f"{row.get('intl_total_calls', 0):.0f} international calls"
        descriptions.append(desc)

    embeddings = embedding_model.encode(descriptions, show_progress_bar=True, batch_size=32)

    dimension = embeddings.shape[1]
    # Inner product on unit-normalized vectors = cosine similarity
    faiss_index = faiss.IndexFlatIP(dimension)
    faiss.normalize_L2(embeddings)
    faiss_index.add(embeddings)

    faiss.write_index(faiss_index, FAISS_INDEX_PATH)
    print(" Built FAISS index")
289
+
290
+
291
def get_cluster_label(row, frame=None):
    """Return a human-readable segment label for one customer row.

    BUG FIX: the original called `.median()` / `.quantile()` on scalar row
    fields (e.g. `row['data_total_mb'].median()`), which raises
    AttributeError at runtime. Thresholds must come from the population, so
    they are now computed from `frame` (defaulting to the module-level `df`).

    Args:
        row: One customer (a pandas Series) with the usage columns.
        frame: DataFrame supplying population medians/quantiles; defaults to
               the global `df` so existing single-argument callers still work.

    Returns:
        One of: "International Data Users", "International Callers",
        "Heavy Voice Users", "Heavy Data Users", "SMS Enthusiasts",
        "Light Users".
    """
    if frame is None:
        frame = df

    if row['intl_total_calls'] > 0:
        if row['data_total_mb'] > frame['data_total_mb'].median():
            return "International Data Users"
        return "International Callers"
    if row['voice_total_calls'] > frame['voice_total_calls'].quantile(0.75):
        return "Heavy Voice Users"
    if row['data_total_mb'] > frame['data_total_mb'].quantile(0.75):
        return "Heavy Data Users"
    if row['sms_total_messages'] > frame['sms_total_messages'].quantile(0.75):
        return "SMS Enthusiasts"
    return "Light Users"
306
+
307
+
308
+ # ============================================
309
+ # ENDPOINTS
310
+ # ============================================
311
+
312
@app.get("/")
def health_check():
    """Liveness probe: reports dataset size, columns and available features."""
    loaded = df is not None
    feature_list = [
        "time_analysis",
        "sms_insights",
        "upload_download_split",
        "international_details",
        "dynamic_clustering",
        "dynamic_visualizations",
    ]
    return {
        "status": "healthy",
        "version": "2.0",
        "customers": len(df) if loaded else 0,
        "columns": list(df.columns) if loaded else [],
        "features": feature_list,
    }
329
+
330
+
331
@app.get("/api/stats")
def get_stats():
    """Return dataset-wide usage statistics.

    Covers voice (totals, averages, time-of-day call counts), SMS, data
    (including the download/upload split) and user-type flag counts.
    Raises a 500 when the dataset failed to load at startup.
    """
    if df is None:
        raise HTTPException(status_code=500, detail="Data not loaded")

    n_customers = len(df)
    intl_mask = df['intl_total_calls'] > 0

    # Flag columns that may be absent in some datasets fall back to zero.
    download_flags = df.get('download_lover', pd.Series([0]))
    upload_flags = df.get('upload_lover', pd.Series([0]))

    return {
        "total_customers": int(n_customers),
        "international_users": int(df.loc[intl_mask, 'subscriberid'].nunique()),
        "international_percentage": float(intl_mask.sum() / n_customers * 100),

        # Voice stats
        "avg_voice_mins": float(df['voice_total_duration_mins'].mean()),
        "avg_voice_calls": float(df['voice_total_calls'].mean()),
        "total_voice_mins": float(df['voice_total_duration_mins'].sum()),

        # Time breakdown
        "morning_calls": int(df['voice_morning_calls'].sum()),
        "evening_calls": int(df['voice_evening_calls'].sum()),
        "night_calls": int(df['voice_night_calls'].sum()),

        # SMS stats
        "total_sms": int(df['sms_total_messages'].sum()),
        "avg_sms_per_user": float(df['sms_total_messages'].mean()),
        "sms_users": int((df['sms_total_messages'] > 0).sum()),

        # Data stats
        "avg_data_mb": float(df['data_total_mb'].mean()),
        "avg_download_mb": float(df['data_downlink_mb'].mean()),
        "avg_upload_mb": float(df['data_uplink_mb'].mean()),
        "total_data_gb": float(df['data_total_mb'].sum() / 1024),

        # User types
        "call_lovers": int(df['call_lover'].sum()),
        "data_lovers": int(df['data_lover'].sum()),
        "download_lovers": int(download_flags.sum()),
        "upload_lovers": int(upload_flags.sum()),
    }
369
+
370
+
371
@app.get("/api/customers/{customer_id}")
def get_customer(customer_id: int):
    """Return a detailed profile for one subscriber.

    Response groups: communication (voice totals + time-of-day split),
    international activity, internet usage (download/upload split), SMS
    (bucketed Low/Medium/High against population quartiles), and profile
    flags / stored cluster assignments.

    Raises:
        HTTPException: 500 if data is not loaded, 404 for an unknown id.
    """
    if df is None:
        raise HTTPException(status_code=500, detail="Data not loaded")

    customer = df[df['subscriberid'] == customer_id]

    if customer.empty:
        raise HTTPException(status_code=404, detail=f"Customer {customer_id} not found")

    row = customer.iloc[0]

    # Calculate time distribution (denominator for the *_pct fields below)
    total_calls_by_time = (
        row.get('voice_morning_calls', 0) +
        row.get('voice_evening_calls', 0) +
        row.get('voice_night_calls', 0)
    )

    return {
        "subscriberid": int(row['subscriberid']),

        # Communication
        "communication": {
            "voice_total_duration_mins": float(row['voice_total_duration_mins']),
            "voice_total_calls": float(row['voice_total_calls']),
            "voice_avg_duration_mins": float(row.get('voice_avg_duration_mins', 0)),
            "time_distribution": {
                "morning_calls": int(row.get('voice_morning_calls', 0)),
                "evening_calls": int(row.get('voice_evening_calls', 0)),
                "night_calls": int(row.get('voice_night_calls', 0)),
                # Percentages guard against a zero denominator (no calls at all)
                "morning_pct": float(row.get('voice_morning_calls', 0) / total_calls_by_time * 100 if total_calls_by_time > 0 else 0),
                "evening_pct": float(row.get('voice_evening_calls', 0) / total_calls_by_time * 100 if total_calls_by_time > 0 else 0),
                "night_pct": float(row.get('voice_night_calls', 0) / total_calls_by_time * 100 if total_calls_by_time > 0 else 0),
            }
        },

        # International
        "international": {
            "total_calls": float(row.get('intl_total_calls', 0)),
            "total_duration_mins": float(row.get('intl_total_duration_mins', 0)),
            "countries_called": int(row.get('intl_countries_called', 0)),
            # Country fields can be NaN for non-international users
            "top_country": str(row.get('intl_top_country', 'N/A')) if pd.notna(row.get('intl_top_country')) else 'N/A',
            "all_countries": str(row.get('intl_all_countries', 'N/A')) if pd.notna(row.get('intl_all_countries')) else 'N/A',
            "is_international_user": bool(row.get('intl_total_calls', 0) > 0)
        },

        # Internet
        "internet": {
            "total_mb": float(row['data_total_mb']),
            "download_mb": float(row.get('data_downlink_mb', 0)),
            "upload_mb": float(row.get('data_uplink_mb', 0)),
            "download_pct": float(row.get('data_downlink_mb', 0) / row['data_total_mb'] * 100 if row['data_total_mb'] > 0 else 0),
            "upload_pct": float(row.get('data_uplink_mb', 0) / row['data_total_mb'] * 100 if row['data_total_mb'] > 0 else 0),
        },

        # SMS
        "sms": {
            "total_messages": int(row.get('sms_total_messages', 0)),
            # Bucketed against the population's 25th/75th percentiles
            "frequency": "High" if row.get('sms_total_messages', 0) > df['sms_total_messages'].quantile(0.75) else
                         "Medium" if row.get('sms_total_messages', 0) > df['sms_total_messages'].quantile(0.25) else "Low"
        },

        # User profile
        "profile": {
            "call_lover": bool(row.get('call_lover', 0)),
            "data_lover": bool(row.get('data_lover', 0)),
            "download_lover": bool(row.get('download_lover', 0)),
            "upload_lover": bool(row.get('upload_lover', 0)),
            # Cluster labels are NaN when clustering output wasn't merged
            "kmeans_cluster": int(row.get('kmeans_cluster', -1)) if pd.notna(row.get('kmeans_cluster')) else None,
            "dbscan_cluster": int(row.get('dbscan_cluster', -1)) if pd.notna(row.get('dbscan_cluster')) else None,
        }
    }
445
+
446
+
447
@app.get("/api/time-analysis")
def get_time_analysis():
    """Break voice-call volume down by time of day, overall and by user type."""
    if df is None:
        raise HTTPException(status_code=500, detail="Data not loaded")

    morning = df['voice_morning_calls'].sum()
    evening = df['voice_evening_calls'].sum()
    night = df['voice_night_calls'].sum()
    grand_total = morning + evening + night

    def pct(part):
        # Guard against a dataset with no calls at all
        return float(part / grand_total * 100) if grand_total > 0 else 0.0

    # max() returns the first maximal key on ties, so Morning beats Evening
    # beats Night — same precedence as the original conditional chain.
    period_totals = {"Morning": morning, "Evening": evening, "Night": night}
    peak = max(period_totals, key=period_totals.get)

    lovers = df[df['call_lover'] == 1]
    others = df[df['call_lover'] == 0]

    return {
        "overall": {
            "morning_calls": int(morning),
            "evening_calls": int(evening),
            "night_calls": int(night),
            "morning_pct": pct(morning),
            "evening_pct": pct(evening),
            "night_pct": pct(night),
        },
        "peak_time": peak,
        "by_user_type": {
            "call_lovers": {
                "morning": int(lovers['voice_morning_calls'].sum()),
                "evening": int(lovers['voice_evening_calls'].sum()),
                "night": int(lovers['voice_night_calls'].sum()),
            },
            "others": {
                "morning": int(others['voice_morning_calls'].sum()),
                "evening": int(others['voice_evening_calls'].sum()),
                "night": int(others['voice_night_calls'].sum()),
            }
        }
    }
482
+
483
+
484
@app.get("/api/visualizations/time-distribution")
def viz_time_distribution():
    """Render the time-of-day call distribution as a Plotly bar chart (JSON)."""
    if df is None:
        raise HTTPException(status_code=500, detail="Data not loaded")

    periods = ['Morning', 'Evening', 'Night']
    totals = [
        df['voice_morning_calls'].sum(),
        df['voice_evening_calls'].sum(),
        df['voice_night_calls'].sum(),
    ]

    fig = px.bar(
        {'Time Period': periods, 'Total Calls': totals},
        x='Time Period',
        y='Total Calls',
        title='Call Distribution by Time of Day',
        color='Time Period',
        color_discrete_map={'Morning': '#FDB462', 'Evening': '#80B1D3', 'Night': '#8DD3C7'}
    )

    return JSONResponse(content={"chart": fig.to_json()})
509
+
510
+
511
@app.get("/api/visualizations/data-breakdown")
def viz_data_breakdown():
    """Render the download-vs-upload split as a Plotly pie chart (JSON)."""
    if df is None:
        raise HTTPException(status_code=500, detail="Data not loaded")

    download_gb = df['data_downlink_mb'].sum() / 1024
    upload_gb = df['data_uplink_mb'].sum() / 1024

    fig = px.pie(
        {'Type': ['Download', 'Upload'], 'Total (GB)': [download_gb, upload_gb]},
        values='Total (GB)',
        names='Type',
        title='Data Usage: Download vs Upload',
        color_discrete_sequence=['#66C2A5', '#FC8D62']
    )

    return JSONResponse(content={"chart": fig.to_json()})
534
+
535
+
536
@app.get("/api/visualizations/customer-segments")
def viz_customer_segments():
    """Render customer counts per K-Means segment as a Plotly bar chart (JSON)."""
    if df is None or 'kmeans_cluster' not in df.columns:
        raise HTTPException(status_code=500, detail="Clustering data not available")

    # Per-cluster size and mean usage; only size ('Customers') is plotted.
    summary = (
        df.groupby('kmeans_cluster')
        .agg({
            'subscriberid': 'count',
            'voice_total_calls': 'mean',
            'data_total_mb': 'mean',
            'sms_total_messages': 'mean'
        })
        .reset_index()
    )
    summary.columns = ['Cluster', 'Customers', 'Avg Calls', 'Avg Data (MB)', 'Avg SMS']

    fig = px.bar(
        summary,
        x='Cluster',
        y='Customers',
        title='Customer Distribution Across Segments',
        color='Customers',
        color_continuous_scale='viridis'
    )

    return JSONResponse(content={"chart": fig.to_json()})
562
+
563
+
564
@app.post("/api/cluster/run")
def run_clustering(request: ClusterRequest):
    """Run clustering on-demand over the in-memory dataset.

    Args:
        request: `algorithm` ("kmeans" or "dbscan") and, for k-means,
            `n_clusters`.

    Returns:
        The algorithm used, number of clusters found, the silhouette score
        (k-means only; computed on a 10k sample for large datasets) and a
        per-cluster size/usage summary.

    Raises:
        HTTPException: 500 if data is not loaded, 400 for an unknown algorithm.
    """
    if df is None:
        raise HTTPException(status_code=500, detail="Data not loaded")

    # Core usage features; international calls added when the column exists
    feature_cols = [
        'voice_total_duration_mins', 'voice_total_calls',
        'data_total_mb', 'sms_total_messages'
    ]
    if 'intl_total_calls' in df.columns:
        feature_cols.append('intl_total_calls')

    X = df[feature_cols].fillna(0)

    # Scale so no single feature dominates the distance metric
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    if request.algorithm == "kmeans":
        model = MiniBatchKMeans(n_clusters=request.n_clusters, random_state=42, batch_size=1000)
        labels = model.fit_predict(X_scaled)

        # Silhouette is quadratic in sample count; subsample large datasets
        if len(df) > 10000:
            sample_idx = np.random.choice(len(df), 10000, replace=False)
            score = silhouette_score(X_scaled[sample_idx], labels[sample_idx])
        else:
            score = silhouette_score(X_scaled, labels)
    elif request.algorithm == "dbscan":
        model = DBSCAN(eps=0.3, min_samples=10)
        labels = model.fit_predict(X_scaled)
        score = None  # not computed for DBSCAN (noise labels skew it)
    else:
        raise HTTPException(status_code=400, detail="Invalid algorithm")

    # Summarize each cluster (for DBSCAN this includes the -1 noise group)
    df_temp = df.copy()
    df_temp['cluster'] = labels

    cluster_info = []
    for cluster_id in sorted(df_temp['cluster'].unique()):
        cluster_data = df_temp[df_temp['cluster'] == cluster_id]
        cluster_info.append({
            "cluster_id": int(cluster_id),
            "size": int(len(cluster_data)),
            "percentage": float(len(cluster_data) / len(df) * 100),
            "avg_voice_calls": float(cluster_data['voice_total_calls'].mean()),
            "avg_data_mb": float(cluster_data['data_total_mb'].mean()),
            "avg_sms": float(cluster_data.get('sms_total_messages', pd.Series([0])).mean()),
        })

    return {
        "algorithm": request.algorithm,
        "n_clusters": int(labels.max() + 1),
        # BUG FIX: `float(score) if score` treated a legitimate 0.0 (or any
        # falsy) silhouette score as missing; only None means "not computed".
        "silhouette_score": float(score) if score is not None else None,
        "clusters": cluster_info
    }
627
+
628
+
629
@app.post("/api/query")
def query_with_llm(request: QueryRequest):
    """Answer a natural-language question about the dataset via Gemini.

    Builds a prompt embedding precomputed dataset statistics (with
    column-existence guards, so it works on reduced datasets) and forwards
    the user's question. Returns the model's free-text answer.

    Raises:
        HTTPException: 503 when Gemini is not configured, 500 on LLM errors.
    """
    if gemini_model is None:
        raise HTTPException(status_code=503, detail="Gemini API not configured")

    # Build context with safe column access
    def safe_col_sum(col_name, default=0):
        """Safely get column sum or return default"""
        return df[col_name].sum() if col_name in df.columns else default

    def safe_col_mean(col_name, default=0):
        """Safely get column mean or return default"""
        return df[col_name].mean() if col_name in df.columns else default

    def safe_col_count(col_name, condition_value=0):
        """Safely count rows where column > condition_value"""
        if col_name in df.columns:
            return (df[col_name] > condition_value).sum()
        return 0

    # Sections for optional columns render as empty lines when absent
    context = f"""
You are analyzing telecom customer data. Here are the key statistics:

Total Customers: {len(df):,}
International Users: {int(safe_col_count('intl_total_calls', 0)):,}

Voice Communication:
- Total Calls: {safe_col_sum('voice_total_calls'):,.0f}
- Total Duration: {safe_col_sum('voice_total_duration_mins'):,.0f} mins
- Average per User: {safe_col_mean('voice_total_calls'):.1f} calls

{'Time Distribution:' if 'voice_morning_calls' in df.columns else ''}
{f"- Morning Calls: {safe_col_sum('voice_morning_calls'):,.0f}" if 'voice_morning_calls' in df.columns else ''}
{f"- Evening Calls: {safe_col_sum('voice_evening_calls'):,.0f}" if 'voice_evening_calls' in df.columns else ''}
{f"- Night Calls: {safe_col_sum('voice_night_calls'):,.0f}" if 'voice_night_calls' in df.columns else ''}

{'SMS:' if 'sms_total_messages' in df.columns else ''}
{f"- Total Messages: {safe_col_sum('sms_total_messages'):,.0f}" if 'sms_total_messages' in df.columns else ''}
{f"- Average per User: {safe_col_mean('sms_total_messages'):.1f}" if 'sms_total_messages' in df.columns else ''}

Data Usage:
- Total Data (MB): {safe_col_sum('data_total_mb'):,.0f}
- Average per User (MB): {safe_col_mean('data_total_mb'):.1f}
{f"- Total Download (GB): {safe_col_sum('data_downlink_mb') / 1024:.1f}" if 'data_downlink_mb' in df.columns else ''}
{f"- Total Upload (GB): {safe_col_sum('data_uplink_mb') / 1024:.1f}" if 'data_uplink_mb' in df.columns else ''}

User Question: {request.question}

Provide a clear, concise answer based on the statistics above.
"""

    try:
        response = gemini_model.generate_content(context)
        return QueryResponse(answer=response.text, data=None)
    except Exception as e:
        import traceback
        error_details = traceback.format_exc()
        print(f"❌ Query error: {error_details}")
        raise HTTPException(status_code=500, detail=f"LLM error: {str(e)}")
689
+
690
+
691
@app.get("/api/search")
def semantic_search(query: str = Query(..., description="Search query"), limit: int = 10):
    """Embed the query and return the top-`limit` most similar customers."""
    global faiss_index, faiss_building

    if embedding_model is None:
        raise HTTPException(status_code=503, detail="Embedding model not available")

    # Build the index lazily on first use; the `faiss_building` flag stops
    # overlapping requests from kicking off duplicate builds.
    if faiss_index is None and not faiss_building:
        faiss_building = True
        try:
            init_faiss_index()
        finally:
            faiss_building = False

    if faiss_index is None:
        raise HTTPException(status_code=503, detail="FAISS index building, please try again in a moment")

    q_vec = embedding_model.encode([query])
    faiss.normalize_L2(q_vec)  # unit vectors so inner product = cosine similarity

    sims, ids = faiss_index.search(q_vec, limit)

    results = []
    for sim, row_idx in zip(sims[0], ids[0]):
        if row_idx >= len(df):
            continue  # skip out-of-range ids
        hit = df.iloc[row_idx]
        results.append({
            "customer_id": int(hit['subscriberid']),
            "similarity_score": float(sim),
            "voice_calls": float(hit['voice_total_calls']),
            "data_mb": float(hit['data_total_mb']),
            "sms": int(hit.get('sms_total_messages', 0)),
            "is_international": bool(hit.get('intl_total_calls', 0) > 0)
        })

    return {"results": results}
731
+
732
+
733
@app.get("/api/clusters")
def get_clusters(cluster_type: str = "kmeans"):
    """Summarize stored cluster assignments ("kmeans" or "dbscan")."""
    if df is None:
        raise HTTPException(status_code=500, detail="Data not loaded")

    cluster_col = f"{cluster_type}_cluster"
    if cluster_col not in df.columns:
        raise HTTPException(status_code=404, detail=f"{cluster_type} clusters not found")

    summaries = []
    for cid in sorted(df[cluster_col].unique()):
        if pd.isna(cid):
            continue  # customers that never received a cluster label

        members = df[df[cluster_col] == cid]
        summaries.append({
            "cluster_id": int(cid),
            "size": int(len(members)),
            "percentage": float(len(members) / len(df) * 100),
            "avg_voice_mins": float(members['voice_total_duration_mins'].mean()),
            "avg_data_mb": float(members['data_total_mb'].mean()),
            "avg_sms": float(members.get('sms_total_messages', pd.Series([0])).mean()),
            "avg_intl_calls": float(members.get('intl_total_calls', pd.Series([0])).mean()),
        })

    return {"cluster_type": cluster_type, "clusters": summaries}
760
+
761
+
762
if __name__ == "__main__":
    # Direct-run entry point: serve on all interfaces, port 7860.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)