| """ |
| Enhanced Telecom Customer Segmentation Backend API |
| ================================================= |
| FastAPI backend with: |
| - Enhanced cluster analysis with ALL data fields |
| - Time-based analysis (morning/evening/night) |
| - SMS insights |
| - Upload/Download breakdown |
| - Dynamic visualization generation |
| - On-demand clustering |
| - Groq LLM integration |
| - HuggingFace embeddings for semantic search |
| """ |
|
|
| import os |
| import json |
| import sqlite3 |
| import pickle |
| import io |
| import base64 |
| from typing import Optional, List, Dict, Any |
| from contextlib import asynccontextmanager |
| from datetime import datetime |
|
|
| import pandas as pd |
| import numpy as np |
| from fastapi import FastAPI, HTTPException, Query |
| from fastapi.middleware.cors import CORSMiddleware |
| from fastapi.responses import JSONResponse, Response |
| from pydantic import BaseModel |
| from groq import Groq |
| from sentence_transformers import SentenceTransformer |
| import faiss |
|
|
| |
| from sklearn.cluster import MiniBatchKMeans, DBSCAN |
| from sklearn.preprocessing import StandardScaler |
| from sklearn.metrics import silhouette_score |
| from sklearn.decomposition import PCA |
|
|
| |
| faiss_building = False |
|
|
| |
| import matplotlib |
| matplotlib.use('Agg') |
| import matplotlib.pyplot as plt |
| import plotly.graph_objects as go |
| import plotly.express as px |
|
|
| |
| |
| |
|
|
| GROQ_API_KEY = os.getenv("GROQ_API_KEY", "") |
|
|
| |
| MERGED_DATA_PATH = "merged_subscriber_data.csv" |
| INTL_DATA_PATH = "international_calls.csv" |
| CLUSTERED_DATA_PATH = "golden_table_clustered.csv" |
| DB_PATH = "data/database.db" |
| FAISS_INDEX_PATH = "data/faiss_index.bin" |
| EMBEDDINGS_PATH = "data/embeddings.pkl" |
|
|
| |
| df = None |
| df_full = None |
| conn = None |
| embedding_model = None |
| faiss_index = None |
| groq_client = None |
|
|
|
|
| |
| |
| |
|
|
| @asynccontextmanager |
| async def lifespan(app: FastAPI): |
| """Initialize resources on startup""" |
| global df, df_full, conn, embedding_model, faiss_index, groq_client |
| |
| print("π Starting Enhanced Telecom API...") |
| |
| |
| if os.path.exists(MERGED_DATA_PATH): |
| df_merged = pd.read_csv(MERGED_DATA_PATH) |
| if os.path.exists(INTL_DATA_PATH): |
| df_intl = pd.read_csv(INTL_DATA_PATH) |
| df_full = pd.merge(df_merged, df_intl, on='subscriberid', how='left') |
| else: |
| df_full = df_merged |
| |
| |
| df_full = df_full.fillna(0) |
| print(f"β Loaded {len(df_full):,} customers with enhanced data") |
| |
| |
| if os.path.exists(CLUSTERED_DATA_PATH): |
| df_clustered = pd.read_csv(CLUSTERED_DATA_PATH) |
| |
| df_full = pd.merge( |
| df_full, |
| df_clustered[['subscriberid', 'kmeans_cluster', 'dbscan_cluster']], |
| on='subscriberid', |
| how='left' |
| ) |
| |
| df = df_full.copy() |
| else: |
| print("β Data files not found") |
| df = df_full = create_sample_data() |
| |
| |
| init_database() |
| |
| |
| try: |
| embedding_model = SentenceTransformer('all-MiniLM-L6-v2') |
| print("β Loaded embedding model") |
| except Exception as e: |
| print(f"β Embedding model error: {e}") |
| |
| if GROQ_API_KEY: |
| try: |
| groq_client = Groq(api_key=GROQ_API_KEY) |
| print("β Initialized Groq") |
| except Exception as e: |
| print(f"β Groq error: {e}") |
| |
| |
| print("βΉ FAISS index will build on first search request") |
| print("β
API ready!") |
| |
| yield |
| |
| if conn: |
| conn.close() |
| print("π Shutdown complete") |
|
|
|
|
| |
| |
| |
|
|
| app = FastAPI( |
| title="Enhanced Telecom Segmentation API", |
| description="Advanced telecom customer analytics with time-based insights", |
| version="2.0.0", |
| lifespan=lifespan |
| ) |
|
|
| app.add_middleware( |
| CORSMiddleware, |
| allow_origins=["*"], |
| allow_credentials=True, |
| allow_methods=["*"], |
| allow_headers=["*"], |
| ) |
|
|
|
|
| |
| |
| |
|
|
| class QueryRequest(BaseModel): |
| question: str |
|
|
| class QueryResponse(BaseModel): |
| answer: str |
| data: Optional[Dict[str, Any]] = None |
|
|
| class EnhancedCustomerInfo(BaseModel): |
| subscriberid: int |
| |
| |
| voice_total_duration_mins: float |
| voice_total_calls: float |
| voice_morning_calls: float |
| voice_evening_calls: float |
| voice_night_calls: float |
| |
| |
| sms_total_messages: float |
| |
| |
| data_total_mb: float |
| data_downlink_mb: float |
| data_uplink_mb: float |
| |
| |
| intl_total_calls: float |
| intl_total_duration_mins: float |
| intl_countries_called: float |
| intl_top_country: Optional[str] |
| |
| |
| call_lover: int |
| download_lover: int |
| upload_lover: int |
| data_lover: int |
| |
| |
| kmeans_cluster: Optional[int] |
| dbscan_cluster: Optional[int] |
|
|
| class ClusterRequest(BaseModel): |
| n_clusters: int = 6 |
| algorithm: str = "kmeans" |
|
|
|
|
| |
| |
| |
|
|
| def create_sample_data(): |
| """Create sample data""" |
| np.random.seed(42) |
| n = 1000 |
| return pd.DataFrame({ |
| 'subscriberid': range(1, n+1), |
| 'voice_total_duration_mins': np.random.exponential(10, n), |
| 'voice_total_calls': np.random.poisson(10, n), |
| 'voice_morning_calls': np.random.poisson(3, n), |
| 'voice_evening_calls': np.random.poisson(4, n), |
| 'voice_night_calls': np.random.poisson(3, n), |
| 'sms_total_messages': np.random.poisson(5, n), |
| 'data_total_mb': np.random.exponential(400, n), |
| 'data_downlink_mb': np.random.exponential(300, n), |
| 'data_uplink_mb': np.random.exponential(100, n), |
| 'intl_total_calls': np.random.poisson(0.5, n), |
| 'intl_total_duration_mins': np.random.exponential(0.5, n), |
| 'intl_countries_called': np.random.poisson(0.3, n), |
| 'call_lover': np.random.choice([0, 1], n, p=[0.75, 0.25]), |
| 'data_lover': np.random.choice([0, 1], n, p=[0.75, 0.25]), |
| 'kmeans_cluster': np.random.choice(range(6), n), |
| 'dbscan_cluster': np.random.choice(range(12), n), |
| }) |
|
|
|
|
| def init_database(): |
| """Initialize SQLite database""" |
| global conn, df_full |
| os.makedirs("data", exist_ok=True) |
| conn = sqlite3.connect(DB_PATH, check_same_thread=False) |
| df_full.to_sql('customers', conn, if_exists='replace', index=False) |
| conn.execute("CREATE INDEX IF NOT EXISTS idx_subscriberid ON customers(subscriberid)") |
| print("β Database initialized") |
|
|
|
|
| def init_faiss_index(): |
| """Build FAISS index for semantic search""" |
| global faiss_index, embedding_model, df |
| |
| if embedding_model is None: |
| return |
| |
| if os.path.exists(FAISS_INDEX_PATH): |
| try: |
| faiss_index = faiss.read_index(FAISS_INDEX_PATH) |
| print("β Loaded FAISS index") |
| return |
| except: |
| pass |
| |
| |
| print("Building FAISS index...") |
| descriptions = [] |
| for _, row in df.iterrows(): |
| desc = f"Customer {row['subscriberid']}: " |
| desc += f"{row.get('voice_total_calls', 0):.0f} voice calls, " |
| desc += f"{row.get('data_total_mb', 0):.0f} MB data, " |
| desc += f"{row.get('sms_total_messages', 0):.0f} SMS, " |
| if row.get('intl_total_calls', 0) > 0: |
| desc += f"{row.get('intl_total_calls', 0):.0f} international calls" |
| descriptions.append(desc) |
| |
| embeddings = embedding_model.encode(descriptions, show_progress_bar=True, batch_size=32) |
| |
| dimension = embeddings.shape[1] |
| faiss_index = faiss.IndexFlatIP(dimension) |
| faiss.normalize_L2(embeddings) |
| faiss_index.add(embeddings) |
| |
| faiss.write_index(faiss_index, FAISS_INDEX_PATH) |
| print("β Built FAISS index") |
|
|
|
|
| def get_cluster_label(row): |
| """Get human-readable cluster label""" |
| if row['intl_total_calls'] > 0: |
| if row['data_total_mb'] > row['data_total_mb'].median(): |
| return "International Data Users" |
| else: |
| return "International Callers" |
| elif row['voice_total_calls'] > row['voice_total_calls'].quantile(0.75): |
| return "Heavy Voice Users" |
| elif row['data_total_mb'] > row['data_total_mb'].quantile(0.75): |
| return "Heavy Data Users" |
| elif row['sms_total_messages'] > row['sms_total_messages'].quantile(0.75): |
| return "SMS Enthusiasts" |
| else: |
| return "Light Users" |
|
|
|
|
| |
| |
| |
|
|
| @app.get("/") |
| def health_check(): |
| """Health check""" |
| return { |
| "status": "healthy", |
| "version": "2.0", |
| "customers": len(df) if df is not None else 0, |
| "columns": list(df.columns) if df is not None else [], |
| "features": [ |
| "time_analysis", |
| "sms_insights", |
| "upload_download_split", |
| "international_details", |
| "dynamic_clustering", |
| "dynamic_visualizations" |
| ] |
| } |
|
|
|
|
| @app.get("/api/stats") |
| def get_stats(): |
| """Get overall statistics with enhanced metrics""" |
| if df is None: |
| raise HTTPException(status_code=500, detail="Data not loaded") |
| |
| return { |
| "total_customers": int(len(df)), |
| "international_users": int(df[df['intl_total_calls'] > 0]['subscriberid'].nunique()), |
| "international_percentage": float((df['intl_total_calls'] > 0).sum() / len(df) * 100), |
| |
| |
| "avg_voice_mins": float(df['voice_total_duration_mins'].mean()), |
| "avg_voice_calls": float(df['voice_total_calls'].mean()), |
| "total_voice_mins": float(df['voice_total_duration_mins'].sum()), |
| |
| |
| "morning_calls": int(df['voice_morning_calls'].sum()), |
| "evening_calls": int(df['voice_evening_calls'].sum()), |
| "night_calls": int(df['voice_night_calls'].sum()), |
| |
| |
| "total_sms": int(df['sms_total_messages'].sum()), |
| "avg_sms_per_user": float(df['sms_total_messages'].mean()), |
| "avg_sms_per_active_user": float(df[df['sms_total_messages'] > 0]['sms_total_messages'].mean()) if (df['sms_total_messages'] > 0).sum() > 0 else 0, |
| "sms_users": int((df['sms_total_messages'] > 0).sum()), |
| |
| |
| "avg_data_mb": float(df['data_total_mb'].mean()), |
| "avg_download_mb": float(df['data_downlink_mb'].mean()), |
| "avg_upload_mb": float(df['data_uplink_mb'].mean()), |
| "total_data_gb": float(df['data_total_mb'].sum() / 1024), |
| |
| |
| "call_lovers": int(df['call_lover'].sum()), |
| "data_lovers": int(df['data_lover'].sum()), |
| "download_lovers": int(df.get('download_lover', pd.Series([0])).sum()), |
| "upload_lovers": int(df.get('upload_lover', pd.Series([0])).sum()), |
| } |
|
|
|
|
| @app.get("/api/customers/{customer_id}") |
| def get_customer(customer_id: int): |
| """Get detailed customer information""" |
| if df is None: |
| raise HTTPException(status_code=500, detail="Data not loaded") |
| |
| customer = df[df['subscriberid'] == customer_id] |
| |
| if customer.empty: |
| raise HTTPException(status_code=404, detail=f"Customer {customer_id} not found") |
| |
| row = customer.iloc[0] |
| |
| |
| total_calls_by_time = ( |
| row.get('voice_morning_calls', 0) + |
| row.get('voice_evening_calls', 0) + |
| row.get('voice_night_calls', 0) |
| ) |
| |
| return { |
| "subscriberid": int(row['subscriberid']), |
| |
| |
| "communication": { |
| "voice_total_duration_mins": float(row['voice_total_duration_mins']), |
| "voice_total_calls": float(row['voice_total_calls']), |
| "voice_avg_duration_mins": float(row.get('voice_avg_duration_mins', 0)), |
| "time_distribution": { |
| "morning_calls": int(row.get('voice_morning_calls', 0)), |
| "evening_calls": int(row.get('voice_evening_calls', 0)), |
| "night_calls": int(row.get('voice_night_calls', 0)), |
| "morning_pct": float(row.get('voice_morning_calls', 0) / total_calls_by_time * 100 if total_calls_by_time > 0 else 0), |
| "evening_pct": float(row.get('voice_evening_calls', 0) / total_calls_by_time * 100 if total_calls_by_time > 0 else 0), |
| "night_pct": float(row.get('voice_night_calls', 0) / total_calls_by_time * 100 if total_calls_by_time > 0 else 0), |
| } |
| }, |
| |
| |
| "international": { |
| "total_calls": float(row.get('intl_total_calls', 0)), |
| "total_duration_mins": float(row.get('intl_total_duration_mins', 0)), |
| "countries_called": int(row.get('intl_countries_called', 0)), |
| "top_country": str(row.get('intl_top_country', 'N/A')) if pd.notna(row.get('intl_top_country')) else 'N/A', |
| "all_countries": str(row.get('intl_all_countries', 'N/A')) if pd.notna(row.get('intl_all_countries')) else 'N/A', |
| "is_international_user": bool(row.get('intl_total_calls', 0) > 0) |
| }, |
| |
| |
| "internet": { |
| "total_mb": float(row['data_total_mb']), |
| "download_mb": float(row.get('data_downlink_mb', 0)), |
| "upload_mb": float(row.get('data_uplink_mb', 0)), |
| "download_pct": float(row.get('data_downlink_mb', 0) / row['data_total_mb'] * 100 if row['data_total_mb'] > 0 else 0), |
| "upload_pct": float(row.get('data_uplink_mb', 0) / row['data_total_mb'] * 100 if row['data_total_mb'] > 0 else 0), |
| }, |
| |
| |
| "sms": { |
| "total_messages": int(row.get('sms_total_messages', 0)), |
| "frequency": "High" if row.get('sms_total_messages', 0) > df['sms_total_messages'].quantile(0.75) else |
| "Medium" if row.get('sms_total_messages', 0) > df['sms_total_messages'].quantile(0.25) else "Low" |
| }, |
| |
| |
| "profile": { |
| "call_lover": bool(row.get('call_lover', 0)), |
| "data_lover": bool(row.get('data_lover', 0)), |
| "download_lover": bool(row.get('download_lover', 0)), |
| "upload_lover": bool(row.get('upload_lover', 0)), |
| "kmeans_cluster": int(row.get('kmeans_cluster', -1)) if pd.notna(row.get('kmeans_cluster')) else None, |
| "dbscan_cluster": int(row.get('dbscan_cluster', -1)) if pd.notna(row.get('dbscan_cluster')) else None, |
| } |
| } |
|
|
|
|
| @app.get("/api/time-analysis") |
| def get_time_analysis(): |
| """Get time-based analysis of voice calls""" |
| if df is None: |
| raise HTTPException(status_code=500, detail="Data not loaded") |
| |
| total_morning = df['voice_morning_calls'].sum() |
| total_evening = df['voice_evening_calls'].sum() |
| total_night = df['voice_night_calls'].sum() |
| total_all = total_morning + total_evening + total_night |
| |
| return { |
| "overall": { |
| "morning_calls": int(total_morning), |
| "evening_calls": int(total_evening), |
| "night_calls": int(total_night), |
| "morning_pct": float(total_morning / total_all * 100 if total_all > 0 else 0), |
| "evening_pct": float(total_evening / total_all * 100 if total_all > 0 else 0), |
| "night_pct": float(total_night / total_all * 100 if total_all > 0 else 0), |
| }, |
| "peak_time": "Morning" if total_morning == max(total_morning, total_evening, total_night) else |
| "Evening" if total_evening == max(total_morning, total_evening, total_night) else "Night", |
| "by_user_type": { |
| "call_lovers": { |
| "morning": int(df[df['call_lover'] == 1]['voice_morning_calls'].sum()), |
| "evening": int(df[df['call_lover'] == 1]['voice_evening_calls'].sum()), |
| "night": int(df[df['call_lover'] == 1]['voice_night_calls'].sum()), |
| }, |
| "others": { |
| "morning": int(df[df['call_lover'] == 0]['voice_morning_calls'].sum()), |
| "evening": int(df[df['call_lover'] == 0]['voice_evening_calls'].sum()), |
| "night": int(df[df['call_lover'] == 0]['voice_night_calls'].sum()), |
| } |
| } |
| } |
|
|
|
|
| @app.get("/api/visualizations/time-distribution") |
| def viz_time_distribution(): |
| """Generate time distribution chart""" |
| if df is None: |
| raise HTTPException(status_code=500, detail="Data not loaded") |
| |
| time_data = { |
| 'Time Period': ['Morning', 'Evening', 'Night'], |
| 'Total Calls': [ |
| df['voice_morning_calls'].sum(), |
| df['voice_evening_calls'].sum(), |
| df['voice_night_calls'].sum() |
| ] |
| } |
| |
| fig = px.bar( |
| time_data, |
| x='Time Period', |
| y='Total Calls', |
| title='Call Distribution by Time of Day', |
| color='Time Period', |
| color_discrete_map={'Morning': '#FDB462', 'Evening': '#80B1D3', 'Night': '#8DD3C7'} |
| ) |
| |
| return JSONResponse(content={"chart": fig.to_json()}) |
|
|
|
|
| @app.get("/api/visualizations/data-breakdown") |
| def viz_data_breakdown(): |
| """Generate upload/download breakdown chart""" |
| if df is None: |
| raise HTTPException(status_code=500, detail="Data not loaded") |
| |
| data_summary = { |
| 'Type': ['Download', 'Upload'], |
| 'Total (GB)': [ |
| df['data_downlink_mb'].sum() / 1024, |
| df['data_uplink_mb'].sum() / 1024 |
| ] |
| } |
| |
| fig = px.pie( |
| data_summary, |
| values='Total (GB)', |
| names='Type', |
| title='Data Usage: Download vs Upload', |
| color_discrete_sequence=['#66C2A5', '#FC8D62'] |
| ) |
| |
| return JSONResponse(content={"chart": fig.to_json()}) |
|
|
|
|
| @app.get("/api/visualizations/customer-segments") |
| def viz_customer_segments(): |
| """Generate customer segments visualization""" |
| if df is None or 'kmeans_cluster' not in df.columns: |
| raise HTTPException(status_code=500, detail="Clustering data not available") |
| |
| |
| cluster_stats = df.groupby('kmeans_cluster').agg({ |
| 'subscriberid': 'count', |
| 'voice_total_calls': 'mean', |
| 'data_total_mb': 'mean', |
| 'sms_total_messages': 'mean' |
| }).reset_index() |
| |
| cluster_stats.columns = ['Cluster', 'Customers', 'Avg Calls', 'Avg Data (MB)', 'Avg SMS'] |
| |
| fig = px.bar( |
| cluster_stats, |
| x='Cluster', |
| y='Customers', |
| title='Customer Distribution Across Segments', |
| color='Customers', |
| color_continuous_scale='viridis' |
| ) |
| |
| return JSONResponse(content={"chart": fig.to_json()}) |
|
|
|
|
| @app.post("/api/cluster/run") |
| def run_clustering(request: ClusterRequest): |
| """Run clustering on-demand""" |
| if df is None: |
| raise HTTPException(status_code=500, detail="Data not loaded") |
| |
| |
| feature_cols = [ |
| 'voice_total_duration_mins', 'voice_total_calls', |
| 'data_total_mb', 'sms_total_messages' |
| ] |
| |
| |
| if 'intl_total_calls' in df.columns: |
| feature_cols.append('intl_total_calls') |
| |
| X = df[feature_cols].fillna(0) |
| |
| |
| scaler = StandardScaler() |
| X_scaled = scaler.fit_transform(X) |
| |
| |
| if request.algorithm == "kmeans": |
| model = MiniBatchKMeans(n_clusters=request.n_clusters, random_state=42, batch_size=1000) |
| labels = model.fit_predict(X_scaled) |
| |
| |
| if len(df) > 10000: |
| sample_idx = np.random.choice(len(df), 10000, replace=False) |
| score = silhouette_score(X_scaled[sample_idx], labels[sample_idx]) |
| else: |
| score = silhouette_score(X_scaled, labels) |
| |
| elif request.algorithm == "dbscan": |
| model = DBSCAN(eps=0.3, min_samples=10) |
| labels = model.fit_predict(X_scaled) |
| score = None |
| else: |
| raise HTTPException(status_code=400, detail="Invalid algorithm") |
| |
| |
| df_temp = df.copy() |
| df_temp['cluster'] = labels |
| |
| cluster_info = [] |
| for cluster_id in sorted(df_temp['cluster'].unique()): |
| cluster_data = df_temp[df_temp['cluster'] == cluster_id] |
| cluster_info.append({ |
| "cluster_id": int(cluster_id), |
| "size": int(len(cluster_data)), |
| "percentage": float(len(cluster_data) / len(df) * 100), |
| "avg_voice_calls": float(cluster_data['voice_total_calls'].mean()), |
| "avg_data_mb": float(cluster_data['data_total_mb'].mean()), |
| "avg_sms": float(cluster_data.get('sms_total_messages', pd.Series([0])).mean()), |
| }) |
| |
| return { |
| "algorithm": request.algorithm, |
| "n_clusters": int(labels.max() + 1), |
| "silhouette_score": float(score) if score else None, |
| "clusters": cluster_info |
| } |
|
|
|
|
| @app.post("/api/query") |
| def query_with_llm(request: QueryRequest): |
| """Query data using Groq LLM""" |
| if groq_client is None: |
| raise HTTPException(status_code=503, detail="Groq API not configured") |
| |
| |
| def safe_col_sum(col_name, default=0): |
| """Safely get column sum or return default""" |
| return df[col_name].sum() if col_name in df.columns else default |
| |
| def safe_col_mean(col_name, default=0): |
| """Safely get column mean or return default""" |
| return df[col_name].mean() if col_name in df.columns else default |
| |
| def safe_col_count(col_name, condition_value=0): |
| """Safely count rows where column > condition_value""" |
| if col_name in df.columns: |
| return (df[col_name] > condition_value).sum() |
| return 0 |
| |
| context = f""" |
| You are a telecom analytics AI assistant analyzing Pakistani telecom customer data. Provide clear, actionable insights. |
| |
| IMPORTANT: This is Pakistani telecom data. Use PKR (Pakistani Rupees) for all pricing. Market context: Pakistan has competitive telecom pricing with packages ranging PKR 500-2500/month. |
| |
| CUSTOMER DATABASE STATISTICS: |
| |
| π Overview: |
| - Total Customers: {len(df):,} |
| - International Users: {int(safe_col_count('intl_total_calls', 0)):,} ({safe_col_count('intl_total_calls', 0)/len(df)*100:.1f}%) |
| |
| π Voice Communication: |
| - Total Calls: {safe_col_sum('voice_total_calls'):,.0f} |
| - Total Duration: {safe_col_sum('voice_total_duration_mins'):,.0f} mins |
| - Average per User: {safe_col_mean('voice_total_calls'):.1f} calls, {safe_col_mean('voice_total_duration_mins'):.1f} mins |
| |
| {'π
Time Distribution:' if 'voice_morning_calls' in df.columns else ''} |
| {f"- Morning (6am-12pm): {safe_col_sum('voice_morning_calls'):,.0f} calls" if 'voice_morning_calls' in df.columns else ''} |
| {f"- Evening (12pm-6pm): {safe_col_sum('voice_evening_calls'):,.0f} calls" if 'voice_evening_calls' in df.columns else ''} |
| {f"- Night (6pm-6am): {safe_col_sum('voice_night_calls'):,.0f} calls" if 'voice_night_calls' in df.columns else ''} |
| |
| {'π¬ SMS:' if 'sms_total_messages' in df.columns else ''} |
| {f"- Total Messages: {safe_col_sum('sms_total_messages'):,.0f}" if 'sms_total_messages' in df.columns else ''} |
| {f"- Average per User: {safe_col_mean('sms_total_messages'):.1f} messages" if 'sms_total_messages' in df.columns else ''} |
| |
| π Data Usage: |
| - Total Data: {safe_col_sum('data_total_mb'):,.0f} MB ({safe_col_sum('data_total_mb')/1024:.1f} GB) |
| - Average per User: {safe_col_mean('data_total_mb'):.1f} MB |
| {f"- Total Download: {safe_col_sum('data_downlink_mb') / 1024:.1f} GB" if 'data_downlink_mb' in df.columns else ''} |
| {f"- Total Upload: {safe_col_sum('data_uplink_mb') / 1024:.1f} GB" if 'data_uplink_mb' in df.columns else ''} |
| |
| --- |
| |
| USER QUESTION: {request.question} |
| |
| RESPONSE INSTRUCTIONS: |
| |
| π **ONLY use the 4-section package format below if:** |
| - The question explicitly contains "package", "recommend", "plan", "pricing", or "offer" |
| - AND it's about an INDIVIDUAL customer (mentions specific usage numbers for one person) |
| |
| π **For all other questions** (insights, trends, analysis, comparisons): |
| - Provide 3 concise, actionable insights with clear formatting |
| - Use markdown: **bold** for emphasis, bullet points (β’) for lists |
| - Include specific numbers and percentages |
| - Add visual separators between insights |
| - DO NOT format as package recommendations |
| - Keep it brief and data-driven |
| |
| FORMAT FOR INSIGHTS (use this structure): |
| ### π― Insight 1: [Title] |
| **Key Finding:** [Main point with numbers] |
| **Action:** [What to do] |
| |
| ### π Insight 2: [Title] |
| **Key Finding:** [Main point with numbers] |
| **Action:** [What to do] |
| |
| ### π‘ Insight 3: [Title] |
| **Key Finding:** [Main point with numbers] |
| **Action:** [What to do] |
| |
| --- |
| |
| IF PACKAGE RECOMMENDATION (Individual Customer Only): |
| |
| FORMAT: |
| ### π USAGE PROFILE |
| **Pattern:** [Describe time distribution pattern - bimodal/uniform/concentrated] |
| **Behavior:** [Commuter/Night owl/etc based on morning+night/night-heavy patterns] |
| β’ Mention ALL significant time periods (>25% is significant) |
| β’ Recognize patterns clearly |
| |
| ### π RECOMMENDED PACKAGE |
| **Package Name:** [Match actual Zong package names when possible, e.g., "Super Weekly Plus", "Monthly Social"] |
| **Details:** [Specific allocations matching customer usage] |
| **Price:** PKR [amount]/[day|week|month] |
| β’ EXCLUDE services with 0 usage (if data=0 MB, don't include data) |
| β’ **ZONG PAKISTAN ACTUAL PRICING (choose appropriate validity based on usage):** |
| |
| π
**DAILY Packages** (for ultra-low/occasional users): |
| - Call-only: PKR 5-17/day (~75-150 mins) |
| - Data-only: PKR 23/day (100MB basic, 2.5GB night) |
| - SMS: PKR 7/day (500 SMS + 30MB WhatsApp) |
| |
| π
**WEEKLY Packages** (for light-moderate users): |
| - Light: PKR 120/week (500 mins + 500 SMS + 500MB) - "Haftawar Offer" |
| - Mid: PKR 290/week (Unlimited on-net + 8GB) - "Super Weekly Plus" |
| - Premium: PKR 385-600/week (30-100GB + unlimited calls) - "Super Weekly Premium", "Weekly Digital Max" |
| |
| π
**MONTHLY Packages** (for regular-heavy users): |
| - Ultra-light: PKR 50-150/month (150-500MB data only) - "Monthly Mini/Basic" |
| - Social: PKR 130-240/month (6-12GB social data) |
| - Basic: PKR 420-575/month (1-8GB + 1000-3000 mins) - "Shandaar Mahana", "Monthly Superstar" |
| - Mid: PKR 1200-1300/month (12-20GB + unlimited on-net) - "Monthly Super Card", "Monthly Supreme" |
| - Premium: PKR 1500-2000/month (50-200GB + 3000-3500 mins) - "Monthly Diamond/Platinum" |
| - Heavy: PKR 4000/month (400GB + 5000 mins) - "Monthly Titanium" |
| |
| β’ **VALIDITY SELECTION LOGIC:** |
| - Ultra-low usage (<5 calls, <50MB) β Daily packages (PKR 5-23/day) |
| - Occasional usage (<50 calls, <500MB) β Weekly packages (PKR 120-600/week) |
| - Regular usage (>50 calls OR >500MB) β Monthly packages (PKR 150-4000/month) |
| |
| ### β¨ KEY BENEFITS |
| β’ **Benefit 1:** [Quantified if possible - "save 20%", "covers 150% of usage"] |
| β’ **Benefit 2:** [Cost savings, usage coverage, flexibility] |
| β’ **Benefit 3:** [Value match for their usage pattern] |
| β’ **Benefit 4:** [Additional value proposition] |
| |
| ### π° PRICING STRATEGY |
| **Discount Offer:** PKR [amount] discount/benefit |
| **Business Logic:** [Why this discount - ARPU increase/churn reduction] |
| **Upsell Opportunity:** [Services they could use more] |
| **Expected Impact:** [Quantified business results] |
| |
| **SPECIAL NOTES:** |
| β’ For ultra-low usage (<5 calls/day, <50 MB/day): Recommend DAILY packages (PKR 5-23/day) |
| β’ For light weekly usage: Recommend WEEKLY packages (PKR 120-600/week) |
| β’ For regular monthly usage: Recommend MONTHLY packages (PKR 150-4000/month) |
| β’ Always calculate total cost correctly: Daily packages cost ~PKR 150-690/month (30 days), Weekly packages cost ~PKR 480-2400/month (4 weeks) |
| β’ Match customer usage patterns to appropriate validity period! |
| """ |
| |
| try: |
| response = groq_client.chat.completions.create( |
| model="llama-3.3-70b-versatile", |
| messages=[{"role": "user", "content": context}], |
| temperature=0.7, |
| max_tokens=1024 |
| ) |
| return QueryResponse(answer=response.choices[0].message.content, data=None) |
| except Exception as e: |
| import traceback |
| error_details = traceback.format_exc() |
| print(f"β Query error: {error_details}") |
| raise HTTPException(status_code=500, detail=f"LLM error: {str(e)}") |
|
|
|
|
| @app.get("/api/search") |
| def semantic_search(query: str = Query(..., description="Search query"), limit: int = 10): |
| """Semantic search for customers""" |
| global faiss_index, faiss_building |
| |
| if embedding_model is None: |
| raise HTTPException(status_code=503, detail="Embedding model not available") |
| |
| |
| if faiss_index is None and not faiss_building: |
| faiss_building = True |
| try: |
| init_faiss_index() |
| finally: |
| faiss_building = False |
| |
| if faiss_index is None: |
| raise HTTPException(status_code=503, detail="FAISS index building, please try again in a moment") |
| |
| |
| query_embedding = embedding_model.encode([query]) |
| faiss.normalize_L2(query_embedding) |
| |
| |
| scores, indices = faiss_index.search(query_embedding, limit) |
| |
| results = [] |
| for score, idx in zip(scores[0], indices[0]): |
| if idx < len(df): |
| customer = df.iloc[idx] |
| results.append({ |
| "customer_id": int(customer['subscriberid']), |
| "similarity_score": float(score), |
| "voice_calls": float(customer['voice_total_calls']), |
| "data_mb": float(customer['data_total_mb']), |
| "sms": int(customer.get('sms_total_messages', 0)), |
| "is_international": bool(customer.get('intl_total_calls', 0) > 0) |
| }) |
| |
| return {"results": results} |
|
|
|
|
| @app.get("/api/clusters") |
| def get_clusters(cluster_type: str = "kmeans"): |
| """Get cluster information""" |
| if df is None: |
| raise HTTPException(status_code=500, detail="Data not loaded") |
| |
| cluster_col = f"{cluster_type}_cluster" |
| if cluster_col not in df.columns: |
| raise HTTPException(status_code=404, detail=f"{cluster_type} clusters not found") |
| |
| cluster_info = [] |
| for cluster_id in sorted(df[cluster_col].unique()): |
| if pd.isna(cluster_id): |
| continue |
| |
| cluster_data = df[df[cluster_col] == cluster_id] |
| cluster_info.append({ |
| "cluster_id": int(cluster_id), |
| "size": int(len(cluster_data)), |
| "percentage": float(len(cluster_data) / len(df) * 100), |
| "avg_voice_mins": float(cluster_data['voice_total_duration_mins'].mean()), |
| "avg_data_mb": float(cluster_data['data_total_mb'].mean()), |
| "avg_sms": float(cluster_data.get('sms_total_messages', pd.Series([0])).mean()), |
| "avg_intl_calls": float(cluster_data.get('intl_total_calls', pd.Series([0])).mean()), |
| }) |
| |
| return {"cluster_type": cluster_type, "clusters": cluster_info} |
|
|
|
|
| if __name__ == "__main__": |
| import uvicorn |
| uvicorn.run(app, host="0.0.0.0", port=7860) |
|
|