Hamza4100 commited on
Commit
0efd294
·
verified ·
1 Parent(s): 466e3d9

Upload 6 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ golden_table_clustered.csv filter=lfs diff=lfs merge=lfs -text
37
+ merged_subscriber_data.csv filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.10-slim

WORKDIR /app

# Install system dependencies
# build-essential lets pip compile any wheel that ships without prebuilt
# binaries; apt lists are removed to keep the layer small.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies
# Copied before the app code so code-only edits reuse this cached layer.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application files
# NOTE(review): app.py also reads merged_subscriber_data.csv and
# international_calls.csv, which are NOT copied here — inside the container
# the app will fall back to its sample-data path. Confirm this is intended.
COPY app.py .
COPY golden_table_clustered.csv .

# Create data directory (SQLite DB and FAISS index are written here at runtime)
RUN mkdir -p /app/data

# Expose port (HF Spaces uses 7860)
EXPOSE 7860

# Set environment variables
ENV PYTHONUNBUFFERED=1

# Run the application
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,747 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Enhanced Telecom Customer Segmentation Backend API
3
+ =================================================
4
+ FastAPI backend with:
5
+ - Enhanced cluster analysis with ALL data fields
6
+ - Time-based analysis (morning/evening/night)
7
+ - SMS insights
8
+ - Upload/Download breakdown
9
+ - Dynamic visualization generation
10
+ - On-demand clustering
11
+ - Gemini LLM integration
12
+ - HuggingFace embeddings for semantic search
13
+ """
14
+
15
+ import os
16
+ import json
17
+ import sqlite3
18
+ import pickle
19
+ import io
20
+ import base64
21
+ from typing import Optional, List, Dict, Any
22
+ from contextlib import asynccontextmanager
23
+ from datetime import datetime
24
+
25
+ import pandas as pd
26
+ import numpy as np
27
+ from fastapi import FastAPI, HTTPException, Query
28
+ from fastapi.middleware.cors import CORSMiddleware
29
+ from fastapi.responses import JSONResponse, Response
30
+ from pydantic import BaseModel
31
+ import google.generativeai as genai
32
+ from sentence_transformers import SentenceTransformer
33
+ import faiss
34
+
35
+ # ML imports
36
+ from sklearn.cluster import MiniBatchKMeans, DBSCAN
37
+ from sklearn.preprocessing import StandardScaler
38
+ from sklearn.metrics import silhouette_score
39
+ from sklearn.decomposition import PCA
40
+
41
+ # Visualization imports
42
+ import matplotlib
43
+ matplotlib.use('Agg') # Non-interactive backend
44
+ import matplotlib.pyplot as plt
45
+ import plotly.graph_objects as go
46
+ import plotly.express as px
47
+
48
# ============================================
# CONFIGURATION
# ============================================

# Gemini key is read from the environment; when unset, /api/query returns 503
# (gemini_model stays None).
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
if GEMINI_API_KEY:
    genai.configure(api_key=GEMINI_API_KEY)

# Data paths (CSV inputs live next to the app; derived artifacts under data/)
MERGED_DATA_PATH = "merged_subscriber_data.csv"
INTL_DATA_PATH = "international_calls.csv"
CLUSTERED_DATA_PATH = "golden_table_clustered.csv"
DB_PATH = "data/database.db"
FAISS_INDEX_PATH = "data/faiss_index.bin"
EMBEDDINGS_PATH = "data/embeddings.pkl"

# Global variables — all populated during lifespan() startup
df = None                # working dataframe queried by every endpoint
df_full = None  # Full data with all fields
conn = None              # sqlite3 connection created by init_database()
embedding_model = None   # SentenceTransformer, or None if it failed to load
faiss_index = None       # FAISS index backing /api/search
gemini_model = None      # Gemini client backing /api/query
71
+
72
+
73
+ # ============================================
74
+ # STARTUP / SHUTDOWN
75
+ # ============================================
76
+
77
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Initialize resources on startup.

    Loads CSV data into module-level globals, mirrors it into SQLite,
    loads the embedding model and Gemini client (both best-effort), and
    builds the FAISS index. Everything before `yield` runs at startup;
    everything after runs at shutdown.
    """
    global df, df_full, conn, embedding_model, faiss_index, gemini_model

    print("🚀 Starting Enhanced Telecom API...")

    # Load full data with all fields
    # NOTE(review): indentation reconstructed — the nesting below assumes the
    # sample-data fallback fires only when MERGED_DATA_PATH is missing; verify.
    if os.path.exists(MERGED_DATA_PATH):
        df_merged = pd.read_csv(MERGED_DATA_PATH)
        if os.path.exists(INTL_DATA_PATH):
            df_intl = pd.read_csv(INTL_DATA_PATH)
            # Left join keeps every subscriber even without international rows
            df_full = pd.merge(df_merged, df_intl, on='subscriberid', how='left')
        else:
            df_full = df_merged

        # Fill NaN values (non-international subscribers get 0s from the join)
        df_full = df_full.fillna(0)
        print(f"✓ Loaded {len(df_full):,} customers with enhanced data")

        # Load clustered results if available
        if os.path.exists(CLUSTERED_DATA_PATH):
            df_clustered = pd.read_csv(CLUSTERED_DATA_PATH)
            # Merge cluster labels into full data
            df_full = pd.merge(
                df_full,
                df_clustered[['subscriberid', 'kmeans_cluster', 'dbscan_cluster']],
                on='subscriberid',
                how='left'
            )

        df = df_full.copy()
    else:
        print("⚠ Data files not found")
        df = df_full = create_sample_data()

    # Initialize database (mirrors df_full into SQLite)
    init_database()

    # Load models — failures are tolerated; dependent endpoints return 503
    try:
        embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
        print("✓ Loaded embedding model")
    except Exception as e:
        print(f"⚠ Embedding model error: {e}")

    if GEMINI_API_KEY:
        try:
            gemini_model = genai.GenerativeModel('gemini-2.5-flash')
            print("✓ Initialized Gemini")
        except Exception as e:
            print(f"⚠ Gemini error: {e}")

    init_faiss_index()
    print("✅ API ready!")

    yield

    # Shutdown: close the SQLite connection opened in init_database()
    if conn:
        conn.close()
    print("👋 Shutdown complete")
138
+
139
+
140
+ # ============================================
141
+ # INITIALIZE APP
142
+ # ============================================
143
+
144
# FastAPI application; `lifespan` wires up data/models on startup.
app = FastAPI(
    title="Enhanced Telecom Segmentation API",
    description="Advanced telecom customer analytics with time-based insights",
    version="2.0.0",
    lifespan=lifespan
)

# CORS: fully open (any origin/method/header). Fine for a demo Space;
# NOTE(review): allow_origins=["*"] with allow_credentials=True is rejected by
# browsers for credentialed requests — tighten if credentials are ever used.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
158
+
159
+
160
+ # ============================================
161
+ # PYDANTIC MODELS
162
+ # ============================================
163
+
164
class QueryRequest(BaseModel):
    """Request body for POST /api/query: a free-text analytics question."""
    question: str
166
+
167
class QueryResponse(BaseModel):
    """Response for POST /api/query."""
    # LLM-generated natural-language answer
    answer: str
    # Optional structured payload (currently always None — see query_with_llm)
    data: Optional[Dict[str, Any]] = None
170
+
171
class EnhancedCustomerInfo(BaseModel):
    """Flat per-customer record schema.

    NOTE(review): not referenced by the visible endpoints (get_customer
    returns a nested dict instead) — possibly kept for clients/docs; confirm
    before removing. Optional fields without defaults are still required by
    pydantic (must be provided, may be None).
    """
    subscriberid: int

    # Voice communication
    voice_total_duration_mins: float
    voice_total_calls: float
    voice_morning_calls: float
    voice_evening_calls: float
    voice_night_calls: float

    # SMS
    sms_total_messages: float

    # Data
    data_total_mb: float
    data_downlink_mb: float
    data_uplink_mb: float

    # International
    intl_total_calls: float
    intl_total_duration_mins: float
    intl_countries_called: float
    intl_top_country: Optional[str]

    # User types (0/1 flags)
    call_lover: int
    download_lover: int
    upload_lover: int
    data_lover: int

    # Clustering (None when the subscriber has no precomputed label)
    kmeans_cluster: Optional[int]
    dbscan_cluster: Optional[int]
204
+
205
class ClusterRequest(BaseModel):
    """Request body for POST /api/cluster/run."""
    # Number of clusters (used by kmeans only; ignored by dbscan)
    n_clusters: int = 6
    algorithm: str = "kmeans"  # kmeans or dbscan
208
+
209
+
210
+ # ============================================
211
+ # HELPER FUNCTIONS
212
+ # ============================================
213
+
214
def create_sample_data():
    """Create sample data.

    Deterministic (seeded) synthetic dataset of 1000 subscribers used as a
    fallback when the real CSVs are absent. Column set mirrors the merged
    CSV schema so every endpoint keeps working.
    """
    np.random.seed(42)  # fixed seed -> reproducible sample
    size = 1000
    # NOTE: the draws below must stay in this exact order — they share one
    # seeded RNG stream, so reordering would change every value.
    sample = {
        'subscriberid': range(1, size + 1),
        'voice_total_duration_mins': np.random.exponential(10, size),
        'voice_total_calls': np.random.poisson(10, size),
        'voice_morning_calls': np.random.poisson(3, size),
        'voice_evening_calls': np.random.poisson(4, size),
        'voice_night_calls': np.random.poisson(3, size),
        'sms_total_messages': np.random.poisson(5, size),
        'data_total_mb': np.random.exponential(400, size),
        'data_downlink_mb': np.random.exponential(300, size),
        'data_uplink_mb': np.random.exponential(100, size),
        'intl_total_calls': np.random.poisson(0.5, size),
        'intl_total_duration_mins': np.random.exponential(0.5, size),
        'intl_countries_called': np.random.poisson(0.3, size),
        'call_lover': np.random.choice([0, 1], size, p=[0.75, 0.25]),
        'data_lover': np.random.choice([0, 1], size, p=[0.75, 0.25]),
        'kmeans_cluster': np.random.choice(range(6), size),
        'dbscan_cluster': np.random.choice(range(12), size),
    }
    return pd.DataFrame(sample)
237
+
238
+
239
def init_database():
    """Initialize SQLite database.

    Mirrors the in-memory dataframe into a `customers` table so ad-hoc SQL
    is possible; the table is rebuilt on every startup (CSVs are the source
    of truth). Leaves the open connection in the module-global `conn`.
    """
    global conn, df_full
    os.makedirs("data", exist_ok=True)
    # check_same_thread=False: FastAPI may serve requests from worker threads
    conn = sqlite3.connect(DB_PATH, check_same_thread=False)
    df_full.to_sql('customers', conn, if_exists='replace', index=False)
    # Index the lookup key used by /api/customers/{id}-style queries
    conn.execute("CREATE INDEX IF NOT EXISTS idx_subscriberid ON customers(subscriberid)")
    print("✓ Database initialized")
247
+
248
+
249
def init_faiss_index():
    """Build (or load) the FAISS index that backs /api/search.

    Each customer is summarised as a short text description, embedded with
    the sentence-transformer model, L2-normalised, and stored in an
    inner-product index (cosine similarity on normalised vectors). No-op
    when the embedding model failed to load.

    Fixes: the original used a bare `except: pass` when loading a stale
    index, silently swallowing every error (including KeyboardInterrupt);
    narrowed to Exception and the reason is now reported. Embeddings are
    also coerced to contiguous float32, which faiss requires.
    """
    global faiss_index, embedding_model, df

    if embedding_model is None:
        return

    # Reuse a previously built index when possible
    if os.path.exists(FAISS_INDEX_PATH):
        try:
            faiss_index = faiss.read_index(FAISS_INDEX_PATH)
            print("✓ Loaded FAISS index")
            return
        except Exception as e:
            # Corrupt/incompatible file — fall through and rebuild
            print(f"⚠ Could not load FAISS index, rebuilding: {e}")

    # Build index
    print("Building FAISS index...")
    descriptions = []
    for _, row in df.iterrows():
        desc = f"Customer {row['subscriberid']}: "
        desc += f"{row.get('voice_total_calls', 0):.0f} voice calls, "
        desc += f"{row.get('data_total_mb', 0):.0f} MB data, "
        desc += f"{row.get('sms_total_messages', 0):.0f} SMS, "
        if row.get('intl_total_calls', 0) > 0:
            desc += f"{row.get('intl_total_calls', 0):.0f} international calls"
        descriptions.append(desc)

    embeddings = embedding_model.encode(descriptions, show_progress_bar=True, batch_size=32)
    # faiss APIs expect contiguous float32 arrays
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

    dimension = embeddings.shape[1]
    faiss_index = faiss.IndexFlatIP(dimension)
    faiss.normalize_L2(embeddings)  # in-place; makes IP equivalent to cosine
    faiss_index.add(embeddings)

    faiss.write_index(faiss_index, FAISS_INDEX_PATH)
    print("✓ Built FAISS index")
285
+
286
+
287
def get_cluster_label(row, frame=None):
    """Get human-readable cluster label for one customer row.

    Bug fix: the original called `.median()` / `.quantile()` on scalar row
    values (e.g. `row['data_total_mb'].median()`), which raises
    AttributeError the first time the function runs — population thresholds
    must come from the full dataframe. `frame` (new, optional, defaults to
    the module-level `df`) supplies those thresholds, keeping the original
    one-argument call signature working.

    Args:
        row: a single customer row (pandas Series).
        frame: dataframe used to compute median/quantile thresholds;
            defaults to the global `df`.
    Returns:
        One of the segment label strings.
    """
    frame = df if frame is None else frame
    if row['intl_total_calls'] > 0:
        if row['data_total_mb'] > frame['data_total_mb'].median():
            return "International Data Users"
        else:
            return "International Callers"
    elif row['voice_total_calls'] > frame['voice_total_calls'].quantile(0.75):
        return "Heavy Voice Users"
    elif row['data_total_mb'] > frame['data_total_mb'].quantile(0.75):
        return "Heavy Data Users"
    elif row['sms_total_messages'] > frame['sms_total_messages'].quantile(0.75):
        return "SMS Enthusiasts"
    else:
        return "Light Users"
302
+
303
+
304
+ # ============================================
305
+ # ENDPOINTS
306
+ # ============================================
307
+
308
@app.get("/")
def health_check():
    """Health check: reports readiness plus the loaded dataset's shape."""
    data_loaded = df is not None
    return {
        "status": "healthy",
        "version": "2.0",
        # Zero customers / empty columns signal the data never loaded
        "customers": len(df) if data_loaded else 0,
        "columns": list(df.columns) if data_loaded else [],
        "features": [
            "time_analysis",
            "sms_insights",
            "upload_download_split",
            "international_details",
            "dynamic_clustering",
            "dynamic_visualizations"
        ]
    }
325
+
326
+
327
@app.get("/api/stats")
def get_stats():
    """Get overall statistics with enhanced metrics.

    Aggregates the module-level `df`. Most columns are assumed present (true
    for both the real CSVs and the sample-data fallback); only the
    download/upload lover flags use `df.get` because they may be absent.
    All values are cast to plain int/float so the dict is JSON-serialisable.
    """
    if df is None:
        raise HTTPException(status_code=500, detail="Data not loaded")

    return {
        "total_customers": int(len(df)),
        "international_users": int(df[df['intl_total_calls'] > 0]['subscriberid'].nunique()),
        "international_percentage": float((df['intl_total_calls'] > 0).sum() / len(df) * 100),

        # Voice stats
        "avg_voice_mins": float(df['voice_total_duration_mins'].mean()),
        "avg_voice_calls": float(df['voice_total_calls'].mean()),
        "total_voice_mins": float(df['voice_total_duration_mins'].sum()),

        # Time breakdown
        "morning_calls": int(df['voice_morning_calls'].sum()),
        "evening_calls": int(df['voice_evening_calls'].sum()),
        "night_calls": int(df['voice_night_calls'].sum()),

        # SMS stats
        "total_sms": int(df['sms_total_messages'].sum()),
        "avg_sms_per_user": float(df['sms_total_messages'].mean()),
        "sms_users": int((df['sms_total_messages'] > 0).sum()),

        # Data stats
        "avg_data_mb": float(df['data_total_mb'].mean()),
        "avg_download_mb": float(df['data_downlink_mb'].mean()),
        "avg_upload_mb": float(df['data_uplink_mb'].mean()),
        "total_data_gb": float(df['data_total_mb'].sum() / 1024),

        # User types (df.get tolerates missing flag columns)
        "call_lovers": int(df['call_lover'].sum()),
        "data_lovers": int(df['data_lover'].sum()),
        "download_lovers": int(df.get('download_lover', pd.Series([0])).sum()),
        "upload_lovers": int(df.get('upload_lover', pd.Series([0])).sum()),
    }
365
+
366
+
367
@app.get("/api/customers/{customer_id}")
def get_customer(customer_id: int):
    """Get detailed customer information.

    Looks up one subscriber by id and returns a nested dict grouping
    communication, international, internet, SMS and profile attributes.
    Raises 404 when the id is unknown, 500 when data never loaded.
    Missing optional columns default to 0/'N/A' via `row.get`.
    """
    if df is None:
        raise HTTPException(status_code=500, detail="Data not loaded")

    customer = df[df['subscriberid'] == customer_id]

    if customer.empty:
        raise HTTPException(status_code=404, detail=f"Customer {customer_id} not found")

    row = customer.iloc[0]

    # Calculate time distribution (denominator for the *_pct fields below)
    total_calls_by_time = (
        row.get('voice_morning_calls', 0) +
        row.get('voice_evening_calls', 0) +
        row.get('voice_night_calls', 0)
    )

    return {
        "subscriberid": int(row['subscriberid']),

        # Communication
        "communication": {
            "voice_total_duration_mins": float(row['voice_total_duration_mins']),
            "voice_total_calls": float(row['voice_total_calls']),
            "voice_avg_duration_mins": float(row.get('voice_avg_duration_mins', 0)),
            "time_distribution": {
                "morning_calls": int(row.get('voice_morning_calls', 0)),
                "evening_calls": int(row.get('voice_evening_calls', 0)),
                "night_calls": int(row.get('voice_night_calls', 0)),
                # Percentages guard against a zero denominator
                "morning_pct": float(row.get('voice_morning_calls', 0) / total_calls_by_time * 100 if total_calls_by_time > 0 else 0),
                "evening_pct": float(row.get('voice_evening_calls', 0) / total_calls_by_time * 100 if total_calls_by_time > 0 else 0),
                "night_pct": float(row.get('voice_night_calls', 0) / total_calls_by_time * 100 if total_calls_by_time > 0 else 0),
            }
        },

        # International (pd.notna guards string fields that may be NaN)
        "international": {
            "total_calls": float(row.get('intl_total_calls', 0)),
            "total_duration_mins": float(row.get('intl_total_duration_mins', 0)),
            "countries_called": int(row.get('intl_countries_called', 0)),
            "top_country": str(row.get('intl_top_country', 'N/A')) if pd.notna(row.get('intl_top_country')) else 'N/A',
            "all_countries": str(row.get('intl_all_countries', 'N/A')) if pd.notna(row.get('intl_all_countries')) else 'N/A',
            "is_international_user": bool(row.get('intl_total_calls', 0) > 0)
        },

        # Internet
        "internet": {
            "total_mb": float(row['data_total_mb']),
            "download_mb": float(row.get('data_downlink_mb', 0)),
            "upload_mb": float(row.get('data_uplink_mb', 0)),
            "download_pct": float(row.get('data_downlink_mb', 0) / row['data_total_mb'] * 100 if row['data_total_mb'] > 0 else 0),
            "upload_pct": float(row.get('data_uplink_mb', 0) / row['data_total_mb'] * 100 if row['data_total_mb'] > 0 else 0),
        },

        # SMS — frequency is relative to dataset-wide quartiles
        "sms": {
            "total_messages": int(row.get('sms_total_messages', 0)),
            "frequency": "High" if row.get('sms_total_messages', 0) > df['sms_total_messages'].quantile(0.75) else
                         "Medium" if row.get('sms_total_messages', 0) > df['sms_total_messages'].quantile(0.25) else "Low"
        },

        # User profile (cluster ids are None when no precomputed label exists)
        "profile": {
            "call_lover": bool(row.get('call_lover', 0)),
            "data_lover": bool(row.get('data_lover', 0)),
            "download_lover": bool(row.get('download_lover', 0)),
            "upload_lover": bool(row.get('upload_lover', 0)),
            "kmeans_cluster": int(row.get('kmeans_cluster', -1)) if pd.notna(row.get('kmeans_cluster')) else None,
            "dbscan_cluster": int(row.get('dbscan_cluster', -1)) if pd.notna(row.get('dbscan_cluster')) else None,
        }
    }
441
+
442
+
443
@app.get("/api/time-analysis")
def get_time_analysis():
    """Get time-based analysis of voice calls.

    Returns overall morning/evening/night totals and percentages, the peak
    period, and the same time split for call-lovers vs everyone else.
    """
    if df is None:
        raise HTTPException(status_code=500, detail="Data not loaded")

    # Ordered dict: iteration order doubles as the peak-time tie-break
    # (Morning wins ties over Evening, which wins over Night).
    totals = {
        "Morning": df['voice_morning_calls'].sum(),
        "Evening": df['voice_evening_calls'].sum(),
        "Night": df['voice_night_calls'].sum(),
    }
    grand_total = sum(totals.values())

    def pct(part):
        # Percentage of all calls, 0 when there are no calls at all
        return float(part / grand_total * 100 if grand_total > 0 else 0)

    def time_split(subset):
        # Morning/evening/night sums for a dataframe slice
        return {
            "morning": int(subset['voice_morning_calls'].sum()),
            "evening": int(subset['voice_evening_calls'].sum()),
            "night": int(subset['voice_night_calls'].sum()),
        }

    return {
        "overall": {
            "morning_calls": int(totals["Morning"]),
            "evening_calls": int(totals["Evening"]),
            "night_calls": int(totals["Night"]),
            "morning_pct": pct(totals["Morning"]),
            "evening_pct": pct(totals["Evening"]),
            "night_pct": pct(totals["Night"]),
        },
        "peak_time": max(totals, key=totals.get),
        "by_user_type": {
            "call_lovers": time_split(df[df['call_lover'] == 1]),
            "others": time_split(df[df['call_lover'] == 0]),
        }
    }
478
+
479
+
480
@app.get("/api/visualizations/time-distribution")
def viz_time_distribution():
    """Generate time distribution chart (plotly JSON for the frontend)."""
    if df is None:
        raise HTTPException(status_code=500, detail="Data not loaded")

    periods = ['Morning', 'Evening', 'Night']
    call_totals = [
        df['voice_morning_calls'].sum(),
        df['voice_evening_calls'].sum(),
        df['voice_night_calls'].sum(),
    ]

    fig = px.bar(
        {'Time Period': periods, 'Total Calls': call_totals},
        x='Time Period',
        y='Total Calls',
        title='Call Distribution by Time of Day',
        color='Time Period',
        color_discrete_map={'Morning': '#FDB462', 'Evening': '#80B1D3', 'Night': '#8DD3C7'}
    )

    return JSONResponse(content={"chart": fig.to_json()})
505
+
506
+
507
@app.get("/api/visualizations/data-breakdown")
def viz_data_breakdown():
    """Generate upload/download breakdown chart (plotly pie as JSON)."""
    if df is None:
        raise HTTPException(status_code=500, detail="Data not loaded")

    # MB -> GB for the two slices
    gb_totals = [
        df['data_downlink_mb'].sum() / 1024,
        df['data_uplink_mb'].sum() / 1024,
    ]

    fig = px.pie(
        {'Type': ['Download', 'Upload'], 'Total (GB)': gb_totals},
        values='Total (GB)',
        names='Type',
        title='Data Usage: Download vs Upload',
        color_discrete_sequence=['#66C2A5', '#FC8D62']
    )

    return JSONResponse(content={"chart": fig.to_json()})
530
+
531
+
532
@app.get("/api/visualizations/customer-segments")
def viz_customer_segments():
    """Generate customer segments visualization (per-cluster sizes)."""
    if df is None or 'kmeans_cluster' not in df.columns:
        raise HTTPException(status_code=500, detail="Clustering data not available")

    # Per-cluster size and behavioural averages
    agg_spec = {
        'subscriberid': 'count',
        'voice_total_calls': 'mean',
        'data_total_mb': 'mean',
        'sms_total_messages': 'mean'
    }
    segment_stats = df.groupby('kmeans_cluster').agg(agg_spec).reset_index()
    segment_stats.columns = ['Cluster', 'Customers', 'Avg Calls', 'Avg Data (MB)', 'Avg SMS']

    fig = px.bar(
        segment_stats,
        x='Cluster',
        y='Customers',
        title='Customer Distribution Across Segments',
        color='Customers',
        color_continuous_scale='viridis'
    )

    return JSONResponse(content={"chart": fig.to_json()})
558
+
559
+
560
@app.post("/api/cluster/run")
def run_clustering(request: ClusterRequest):
    """Run clustering on-demand.

    Scales a fixed feature set and fits MiniBatchKMeans (with silhouette
    score, sampled to 10k rows for speed) or DBSCAN (no silhouette), then
    returns per-cluster summary statistics. Raises 400 for an unknown
    algorithm or an unusable n_clusters.

    Fixes vs original:
    - `float(score) if score else None` mapped a legitimate silhouette of
      exactly 0.0 to None (falsy check); now tests `score is not None`.
    - n_clusters < 2 would crash silhouette_score with an opaque error;
      now rejected with a 400 up front.
    """
    if df is None:
        raise HTTPException(status_code=500, detail="Data not loaded")

    # Select features
    feature_cols = [
        'voice_total_duration_mins', 'voice_total_calls',
        'data_total_mb', 'sms_total_messages'
    ]

    # Add international if exists
    if 'intl_total_calls' in df.columns:
        feature_cols.append('intl_total_calls')

    X = df[feature_cols].fillna(0)

    # Scale so distance-based clustering isn't dominated by large-magnitude columns
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Cluster
    if request.algorithm == "kmeans":
        if request.n_clusters < 2:
            raise HTTPException(status_code=400, detail="n_clusters must be >= 2")
        model = MiniBatchKMeans(n_clusters=request.n_clusters, random_state=42, batch_size=1000)
        labels = model.fit_predict(X_scaled)

        # Silhouette on a 10k sample to keep the endpoint responsive
        if len(df) > 10000:
            sample_idx = np.random.choice(len(df), 10000, replace=False)
            score = silhouette_score(X_scaled[sample_idx], labels[sample_idx])
        else:
            score = silhouette_score(X_scaled, labels)

    elif request.algorithm == "dbscan":
        model = DBSCAN(eps=0.3, min_samples=10)
        labels = model.fit_predict(X_scaled)
        score = None  # silhouette not meaningful with DBSCAN's -1 noise label

    else:
        raise HTTPException(status_code=400, detail="Invalid algorithm")

    # Get cluster stats (DBSCAN noise appears as cluster_id -1)
    df_temp = df.copy()
    df_temp['cluster'] = labels

    cluster_info = []
    for cluster_id in sorted(df_temp['cluster'].unique()):
        cluster_data = df_temp[df_temp['cluster'] == cluster_id]
        cluster_info.append({
            "cluster_id": int(cluster_id),
            "size": int(len(cluster_data)),
            "percentage": float(len(cluster_data) / len(df) * 100),
            "avg_voice_calls": float(cluster_data['voice_total_calls'].mean()),
            "avg_data_mb": float(cluster_data['data_total_mb'].mean()),
            "avg_sms": float(cluster_data.get('sms_total_messages', pd.Series([0])).mean()),
        })

    return {
        "algorithm": request.algorithm,
        # labels are 0..k-1 (DBSCAN noise is -1, so max+1 still counts clusters)
        "n_clusters": int(labels.max() + 1),
        "silhouette_score": float(score) if score is not None else None,
        "clusters": cluster_info
    }
623
+
624
+
625
@app.post("/api/query")
def query_with_llm(request: QueryRequest):
    """Query data using Gemini LLM.

    Builds a plain-text statistics context from the dataframe (tolerating
    missing columns), appends the user's question, and asks Gemini for a
    natural-language answer. 503 when Gemini isn't configured, 500 on
    generation failure. The structured `data` field is always None here.
    """
    if gemini_model is None:
        raise HTTPException(status_code=503, detail="Gemini API not configured")

    # Build context with safe column access
    def safe_col_sum(col_name, default=0):
        """Safely get column sum or return default"""
        return df[col_name].sum() if col_name in df.columns else default

    def safe_col_mean(col_name, default=0):
        """Safely get column mean or return default"""
        return df[col_name].mean() if col_name in df.columns else default

    def safe_col_count(col_name, condition_value=0):
        """Safely count rows where column > condition_value"""
        if col_name in df.columns:
            return (df[col_name] > condition_value).sum()
        return 0

    # Sections for optional columns collapse to empty lines when absent
    context = f"""
You are analyzing telecom customer data. Here are the key statistics:

Total Customers: {len(df):,}
International Users: {int(safe_col_count('intl_total_calls', 0)):,}

Voice Communication:
- Total Calls: {safe_col_sum('voice_total_calls'):,.0f}
- Total Duration: {safe_col_sum('voice_total_duration_mins'):,.0f} mins
- Average per User: {safe_col_mean('voice_total_calls'):.1f} calls

{'Time Distribution:' if 'voice_morning_calls' in df.columns else ''}
{f"- Morning Calls: {safe_col_sum('voice_morning_calls'):,.0f}" if 'voice_morning_calls' in df.columns else ''}
{f"- Evening Calls: {safe_col_sum('voice_evening_calls'):,.0f}" if 'voice_evening_calls' in df.columns else ''}
{f"- Night Calls: {safe_col_sum('voice_night_calls'):,.0f}" if 'voice_night_calls' in df.columns else ''}

{'SMS:' if 'sms_total_messages' in df.columns else ''}
{f"- Total Messages: {safe_col_sum('sms_total_messages'):,.0f}" if 'sms_total_messages' in df.columns else ''}
{f"- Average per User: {safe_col_mean('sms_total_messages'):.1f}" if 'sms_total_messages' in df.columns else ''}

Data Usage:
- Total Data (MB): {safe_col_sum('data_total_mb'):,.0f}
- Average per User (MB): {safe_col_mean('data_total_mb'):.1f}
{f"- Total Download (GB): {safe_col_sum('data_downlink_mb') / 1024:.1f}" if 'data_downlink_mb' in df.columns else ''}
{f"- Total Upload (GB): {safe_col_sum('data_uplink_mb') / 1024:.1f}" if 'data_uplink_mb' in df.columns else ''}

User Question: {request.question}

Provide a clear, concise answer based on the statistics above.
"""

    try:
        response = gemini_model.generate_content(context)
        return QueryResponse(answer=response.text, data=None)
    except Exception as e:
        # Full traceback to the server log; only the message reaches the client
        import traceback
        error_details = traceback.format_exc()
        print(f"❌ Query error: {error_details}")
        raise HTTPException(status_code=500, detail=f"LLM error: {str(e)}")
685
+
686
+
687
@app.get("/api/search")
def semantic_search(query: str = Query(..., description="Search query"), limit: int = 10):
    """Semantic search for customers.

    Embeds the free-text query, normalises it, and looks up the nearest
    customer descriptions in the FAISS inner-product index.
    """
    if embedding_model is None or faiss_index is None:
        raise HTTPException(status_code=503, detail="Search not available")

    # Embed and L2-normalise the query so inner product == cosine similarity
    query_vec = embedding_model.encode([query])
    faiss.normalize_L2(query_vec)

    # Top-`limit` nearest neighbours
    scores, indices = faiss_index.search(query_vec, limit)

    matches = []
    for sim, row_idx in zip(scores[0], indices[0]):
        # Skip padding/out-of-range ids the index may return
        if row_idx >= len(df):
            continue
        hit = df.iloc[row_idx]
        matches.append({
            "customer_id": int(hit['subscriberid']),
            "similarity_score": float(sim),
            "voice_calls": float(hit['voice_total_calls']),
            "data_mb": float(hit['data_total_mb']),
            "sms": int(hit.get('sms_total_messages', 0)),
            "is_international": bool(hit.get('intl_total_calls', 0) > 0)
        })

    return {"results": matches}
714
+
715
+
716
@app.get("/api/clusters")
def get_clusters(cluster_type: str = "kmeans"):
    """Get cluster information.

    Summarises the precomputed `{cluster_type}_cluster` labels (kmeans or
    dbscan) with per-cluster size and behavioural averages. 404 when that
    label column is absent.
    """
    if df is None:
        raise HTTPException(status_code=500, detail="Data not loaded")

    cluster_col = f"{cluster_type}_cluster"
    if cluster_col not in df.columns:
        raise HTTPException(status_code=404, detail=f"{cluster_type} clusters not found")

    summaries = []
    for cid in sorted(df[cluster_col].unique()):
        # Rows without a precomputed label carry NaN — skip that pseudo-cluster
        if pd.isna(cid):
            continue

        members = df[df[cluster_col] == cid]
        summaries.append({
            "cluster_id": int(cid),
            "size": int(len(members)),
            "percentage": float(len(members) / len(df) * 100),
            "avg_voice_mins": float(members['voice_total_duration_mins'].mean()),
            "avg_data_mb": float(members['data_total_mb'].mean()),
            "avg_sms": float(members.get('sms_total_messages', pd.Series([0])).mean()),
            "avg_intl_calls": float(members.get('intl_total_calls', pd.Series([0])).mean()),
        })

    return {"cluster_type": cluster_type, "clusters": summaries}
743
+
744
+
745
if __name__ == "__main__":
    # Local development entry point; in the container the server is started
    # by the Dockerfile CMD instead (same host/port 7860).
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)
golden_table_clustered.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0006dbc2e097a706c352677071d536a0fa3a0f7fbb84c2382e64a5f457644250
3
+ size 49627961
international_calls.csv ADDED
The diff for this file is too large to render. See raw diff
 
merged_subscriber_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad2fd497c543269f3a003a07ce1e3b4c620de752f02a13a460584943d7c2eafd
3
+ size 24336135
requirements.txt ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Backend Dependencies for HuggingFace Spaces
# FastAPI + ML + LLM Integration
# All versions pinned for reproducible builds on python:3.10-slim.

# API Framework
fastapi==0.109.0
uvicorn[standard]==0.27.0
python-multipart==0.0.6

# Data Processing
pandas==2.1.4
numpy==1.26.3

# ML & Embeddings
scikit-learn==1.4.0
sentence-transformers==2.3.1
huggingface_hub==0.20.3
faiss-cpu==1.7.4

# LLM
google-generativeai==0.3.2

# Database
# NOTE(review): app.py uses the stdlib sqlite3 module; aiosqlite appears unused.
aiosqlite==0.19.0

# Visualization
matplotlib==3.8.2
plotly==5.18.0
kaleido==0.2.1

# Utilities
python-dotenv==1.0.0
pydantic==2.5.3