Hamza4100 commited on
Commit
0efd294
·
verified ·
1 Parent(s): 466e3d9

Upload 6 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ golden_table_clustered.csv filter=lfs diff=lfs merge=lfs -text
37
+ merged_subscriber_data.csv filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.10-slim

WORKDIR /app

# Install system dependencies
# build-essential lets pip compile any wheel that ships without prebuilt
# binaries; apt lists are removed to keep the layer small.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies
# Copied before the app code so code-only edits reuse this cached layer.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application files
# NOTE(review): app.py also reads merged_subscriber_data.csv and
# international_calls.csv, which are NOT copied here — inside the container
# the app will fall back to its sample-data path. Confirm this is intended.
COPY app.py .
COPY golden_table_clustered.csv .

# Create data directory (SQLite DB and FAISS index are written here at runtime)
RUN mkdir -p /app/data

# Expose port (HF Spaces uses 7860)
EXPOSE 7860

# Set environment variables
ENV PYTHONUNBUFFERED=1

# Run the application
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,747 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Enhanced Telecom Customer Segmentation Backend API
3
+ =================================================
4
+ FastAPI backend with:
5
+ - Enhanced cluster analysis with ALL data fields
6
+ - Time-based analysis (morning/evening/night)
7
+ - SMS insights
8
+ - Upload/Download breakdown
9
+ - Dynamic visualization generation
10
+ - On-demand clustering
11
+ - Gemini LLM integration
12
+ - HuggingFace embeddings for semantic search
13
+ """
14
+
15
+ import os
16
+ import json
17
+ import sqlite3
18
+ import pickle
19
+ import io
20
+ import base64
21
+ from typing import Optional, List, Dict, Any
22
+ from contextlib import asynccontextmanager
23
+ from datetime import datetime
24
+
25
+ import pandas as pd
26
+ import numpy as np
27
+ from fastapi import FastAPI, HTTPException, Query
28
+ from fastapi.middleware.cors import CORSMiddleware
29
+ from fastapi.responses import JSONResponse, Response
30
+ from pydantic import BaseModel
31
+ import google.generativeai as genai
32
+ from sentence_transformers import SentenceTransformer
33
+ import faiss
34
+
35
+ # ML imports
36
+ from sklearn.cluster import MiniBatchKMeans, DBSCAN
37
+ from sklearn.preprocessing import StandardScaler
38
+ from sklearn.metrics import silhouette_score
39
+ from sklearn.decomposition import PCA
40
+
41
+ # Visualization imports
42
+ import matplotlib
43
+ matplotlib.use('Agg') # Non-interactive backend
44
+ import matplotlib.pyplot as plt
45
+ import plotly.graph_objects as go
46
+ import plotly.express as px
47
+
48
# ============================================
# CONFIGURATION
# ============================================

# Gemini key is read from the environment; when unset, /api/query returns 503
# (gemini_model stays None).
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY", "")
if GEMINI_API_KEY:
    genai.configure(api_key=GEMINI_API_KEY)

# Data paths (CSV inputs live next to the app; derived artifacts under data/)
MERGED_DATA_PATH = "merged_subscriber_data.csv"
INTL_DATA_PATH = "international_calls.csv"
CLUSTERED_DATA_PATH = "golden_table_clustered.csv"
DB_PATH = "data/database.db"
FAISS_INDEX_PATH = "data/faiss_index.bin"
EMBEDDINGS_PATH = "data/embeddings.pkl"

# Global variables — all populated during lifespan() startup
df = None                # working dataframe queried by every endpoint
df_full = None  # Full data with all fields
conn = None              # sqlite3 connection created by init_database()
embedding_model = None   # SentenceTransformer, or None if it failed to load
faiss_index = None       # FAISS index backing /api/search
gemini_model = None      # Gemini client backing /api/query
71
+
72
+
73
+ # ============================================
74
+ # STARTUP / SHUTDOWN
75
+ # ============================================
76
+
77
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Initialize resources on startup.

    Loads CSV data into module-level globals, mirrors it into SQLite,
    loads the embedding model and Gemini client (both best-effort), and
    builds the FAISS index. Everything before `yield` runs at startup;
    everything after runs at shutdown.
    """
    global df, df_full, conn, embedding_model, faiss_index, gemini_model

    print("🚀 Starting Enhanced Telecom API...")

    # Load full data with all fields
    # NOTE(review): indentation reconstructed — the nesting below assumes the
    # sample-data fallback fires only when MERGED_DATA_PATH is missing; verify.
    if os.path.exists(MERGED_DATA_PATH):
        df_merged = pd.read_csv(MERGED_DATA_PATH)
        if os.path.exists(INTL_DATA_PATH):
            df_intl = pd.read_csv(INTL_DATA_PATH)
            # Left join keeps every subscriber even without international rows
            df_full = pd.merge(df_merged, df_intl, on='subscriberid', how='left')
        else:
            df_full = df_merged

        # Fill NaN values (non-international subscribers get 0s from the join)
        df_full = df_full.fillna(0)
        print(f"✓ Loaded {len(df_full):,} customers with enhanced data")

        # Load clustered results if available
        if os.path.exists(CLUSTERED_DATA_PATH):
            df_clustered = pd.read_csv(CLUSTERED_DATA_PATH)
            # Merge cluster labels into full data
            df_full = pd.merge(
                df_full,
                df_clustered[['subscriberid', 'kmeans_cluster', 'dbscan_cluster']],
                on='subscriberid',
                how='left'
            )

        df = df_full.copy()
    else:
        print("⚠ Data files not found")
        df = df_full = create_sample_data()

    # Initialize database (mirrors df_full into SQLite)
    init_database()

    # Load models — failures are tolerated; dependent endpoints return 503
    try:
        embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
        print("✓ Loaded embedding model")
    except Exception as e:
        print(f"⚠ Embedding model error: {e}")

    if GEMINI_API_KEY:
        try:
            gemini_model = genai.GenerativeModel('gemini-2.5-flash')
            print("✓ Initialized Gemini")
        except Exception as e:
            print(f"⚠ Gemini error: {e}")

    init_faiss_index()
    print("✅ API ready!")

    yield

    # Shutdown: close the SQLite connection opened in init_database()
    if conn:
        conn.close()
    print("👋 Shutdown complete")
138
+
139
+
140
+ # ============================================
141
+ # INITIALIZE APP
142
+ # ============================================
143
+
144
# FastAPI application; `lifespan` wires up data/models on startup.
app = FastAPI(
    title="Enhanced Telecom Segmentation API",
    description="Advanced telecom customer analytics with time-based insights",
    version="2.0.0",
    lifespan=lifespan
)

# CORS: fully open (any origin/method/header). Fine for a demo Space;
# NOTE(review): allow_origins=["*"] with allow_credentials=True is rejected by
# browsers for credentialed requests — tighten if credentials are ever used.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
158
+
159
+
160
+ # ============================================
161
+ # PYDANTIC MODELS
162
+ # ============================================
163
+
164
class QueryRequest(BaseModel):
    """Request body for POST /api/query: a free-text analytics question."""
    question: str
166
+
167
class QueryResponse(BaseModel):
    """Response for POST /api/query."""
    # LLM-generated natural-language answer
    answer: str
    # Optional structured payload (currently always None — see query_with_llm)
    data: Optional[Dict[str, Any]] = None
170
+
171
class EnhancedCustomerInfo(BaseModel):
    """Flat per-customer record schema.

    NOTE(review): not referenced by the visible endpoints (get_customer
    returns a nested dict instead) — possibly kept for clients/docs; confirm
    before removing. Optional fields without defaults are still required by
    pydantic (must be provided, may be None).
    """
    subscriberid: int

    # Voice communication
    voice_total_duration_mins: float
    voice_total_calls: float
    voice_morning_calls: float
    voice_evening_calls: float
    voice_night_calls: float

    # SMS
    sms_total_messages: float

    # Data
    data_total_mb: float
    data_downlink_mb: float
    data_uplink_mb: float

    # International
    intl_total_calls: float
    intl_total_duration_mins: float
    intl_countries_called: float
    intl_top_country: Optional[str]

    # User types (0/1 flags)
    call_lover: int
    download_lover: int
    upload_lover: int
    data_lover: int

    # Clustering (None when the subscriber has no precomputed label)
    kmeans_cluster: Optional[int]
    dbscan_cluster: Optional[int]
204
+
205
class ClusterRequest(BaseModel):
    """Request body for POST /api/cluster/run."""
    # Number of clusters (used by kmeans only; ignored by dbscan)
    n_clusters: int = 6
    algorithm: str = "kmeans"  # kmeans or dbscan
208
+
209
+
210
+ # ============================================
211
+ # HELPER FUNCTIONS
212
+ # ============================================
213
+
214
def create_sample_data():
    """Create sample data.

    Deterministic (seeded) synthetic dataset of 1000 subscribers used as a
    fallback when the real CSVs are absent. Column set mirrors the merged
    CSV schema so every endpoint keeps working.
    """
    np.random.seed(42)  # fixed seed -> reproducible sample
    size = 1000
    # NOTE: the draws below must stay in this exact order — they share one
    # seeded RNG stream, so reordering would change every value.
    sample = {
        'subscriberid': range(1, size + 1),
        'voice_total_duration_mins': np.random.exponential(10, size),
        'voice_total_calls': np.random.poisson(10, size),
        'voice_morning_calls': np.random.poisson(3, size),
        'voice_evening_calls': np.random.poisson(4, size),
        'voice_night_calls': np.random.poisson(3, size),
        'sms_total_messages': np.random.poisson(5, size),
        'data_total_mb': np.random.exponential(400, size),
        'data_downlink_mb': np.random.exponential(300, size),
        'data_uplink_mb': np.random.exponential(100, size),
        'intl_total_calls': np.random.poisson(0.5, size),
        'intl_total_duration_mins': np.random.exponential(0.5, size),
        'intl_countries_called': np.random.poisson(0.3, size),
        'call_lover': np.random.choice([0, 1], size, p=[0.75, 0.25]),
        'data_lover': np.random.choice([0, 1], size, p=[0.75, 0.25]),
        'kmeans_cluster': np.random.choice(range(6), size),
        'dbscan_cluster': np.random.choice(range(12), size),
    }
    return pd.DataFrame(sample)
237
+
238
+
239
def init_database():
    """Initialize SQLite database.

    Mirrors the in-memory dataframe into a `customers` table so ad-hoc SQL
    is possible; the table is rebuilt on every startup (CSVs are the source
    of truth). Leaves the open connection in the module-global `conn`.
    """
    global conn, df_full
    os.makedirs("data", exist_ok=True)
    # check_same_thread=False: FastAPI may serve requests from worker threads
    conn = sqlite3.connect(DB_PATH, check_same_thread=False)
    df_full.to_sql('customers', conn, if_exists='replace', index=False)
    # Index the lookup key used by /api/customers/{id}-style queries
    conn.execute("CREATE INDEX IF NOT EXISTS idx_subscriberid ON customers(subscriberid)")
    print("✓ Database initialized")
247
+
248
+
249
def init_faiss_index():
    """Build (or load) the FAISS index that backs /api/search.

    Each customer is summarised as a short text description, embedded with
    the sentence-transformer model, L2-normalised, and stored in an
    inner-product index (cosine similarity on normalised vectors). No-op
    when the embedding model failed to load.

    Fixes: the original used a bare `except: pass` when loading a stale
    index, silently swallowing every error (including KeyboardInterrupt);
    narrowed to Exception and the reason is now reported. Embeddings are
    also coerced to contiguous float32, which faiss requires.
    """
    global faiss_index, embedding_model, df

    if embedding_model is None:
        return

    # Reuse a previously built index when possible
    if os.path.exists(FAISS_INDEX_PATH):
        try:
            faiss_index = faiss.read_index(FAISS_INDEX_PATH)
            print("✓ Loaded FAISS index")
            return
        except Exception as e:
            # Corrupt/incompatible file — fall through and rebuild
            print(f"⚠ Could not load FAISS index, rebuilding: {e}")

    # Build index
    print("Building FAISS index...")
    descriptions = []
    for _, row in df.iterrows():
        desc = f"Customer {row['subscriberid']}: "
        desc += f"{row.get('voice_total_calls', 0):.0f} voice calls, "
        desc += f"{row.get('data_total_mb', 0):.0f} MB data, "
        desc += f"{row.get('sms_total_messages', 0):.0f} SMS, "
        if row.get('intl_total_calls', 0) > 0:
            desc += f"{row.get('intl_total_calls', 0):.0f} international calls"
        descriptions.append(desc)

    embeddings = embedding_model.encode(descriptions, show_progress_bar=True, batch_size=32)
    # faiss APIs expect contiguous float32 arrays
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)

    dimension = embeddings.shape[1]
    faiss_index = faiss.IndexFlatIP(dimension)
    faiss.normalize_L2(embeddings)  # in-place; makes IP equivalent to cosine
    faiss_index.add(embeddings)

    faiss.write_index(faiss_index, FAISS_INDEX_PATH)
    print("✓ Built FAISS index")
285
+
286
+
287
def get_cluster_label(row, frame=None):
    """Get human-readable cluster label for one customer row.

    Bug fix: the original called `.median()` / `.quantile()` on scalar row
    values (e.g. `row['data_total_mb'].median()`), which raises
    AttributeError the first time the function runs — population thresholds
    must come from the full dataframe. `frame` (new, optional, defaults to
    the module-level `df`) supplies those thresholds, keeping the original
    one-argument call signature working.

    Args:
        row: a single customer row (pandas Series).
        frame: dataframe used to compute median/quantile thresholds;
            defaults to the global `df`.
    Returns:
        One of the segment label strings.
    """
    frame = df if frame is None else frame
    if row['intl_total_calls'] > 0:
        if row['data_total_mb'] > frame['data_total_mb'].median():
            return "International Data Users"
        else:
            return "International Callers"
    elif row['voice_total_calls'] > frame['voice_total_calls'].quantile(0.75):
        return "Heavy Voice Users"
    elif row['data_total_mb'] > frame['data_total_mb'].quantile(0.75):
        return "Heavy Data Users"
    elif row['sms_total_messages'] > frame['sms_total_messages'].quantile(0.75):
        return "SMS Enthusiasts"
    else:
        return "Light Users"
302
+
303
+
304
+ # ============================================
305
+ # ENDPOINTS
306
+ # ============================================
307
+
308
@app.get("/")
def health_check():
    """Health check: reports readiness plus the loaded dataset's shape."""
    data_loaded = df is not None
    return {
        "status": "healthy",
        "version": "2.0",
        # Zero customers / empty columns signal the data never loaded
        "customers": len(df) if data_loaded else 0,
        "columns": list(df.columns) if data_loaded else [],
        "features": [
            "time_analysis",
            "sms_insights",
            "upload_download_split",
            "international_details",
            "dynamic_clustering",
            "dynamic_visualizations"
        ]
    }
325
+
326
+
327
@app.get("/api/stats")
def get_stats():
    """Get overall statistics with enhanced metrics.

    Aggregates the module-level `df`. Most columns are assumed present (true
    for both the real CSVs and the sample-data fallback); only the
    download/upload lover flags use `df.get` because they may be absent.
    All values are cast to plain int/float so the dict is JSON-serialisable.
    """
    if df is None:
        raise HTTPException(status_code=500, detail="Data not loaded")

    return {
        "total_customers": int(len(df)),
        "international_users": int(df[df['intl_total_calls'] > 0]['subscriberid'].nunique()),
        "international_percentage": float((df['intl_total_calls'] > 0).sum() / len(df) * 100),

        # Voice stats
        "avg_voice_mins": float(df['voice_total_duration_mins'].mean()),
        "avg_voice_calls": float(df['voice_total_calls'].mean()),
        "total_voice_mins": float(df['voice_total_duration_mins'].sum()),

        # Time breakdown
        "morning_calls": int(df['voice_morning_calls'].sum()),
        "evening_calls": int(df['voice_evening_calls'].sum()),
        "night_calls": int(df['voice_night_calls'].sum()),

        # SMS stats
        "total_sms": int(df['sms_total_messages'].sum()),
        "avg_sms_per_user": float(df['sms_total_messages'].mean()),
        "sms_users": int((df['sms_total_messages'] > 0).sum()),

        # Data stats
        "avg_data_mb": float(df['data_total_mb'].mean()),
        "avg_download_mb": float(df['data_downlink_mb'].mean()),
        "avg_upload_mb": float(df['data_uplink_mb'].mean()),
        "total_data_gb": float(df['data_total_mb'].sum() / 1024),

        # User types (df.get tolerates missing flag columns)
        "call_lovers": int(df['call_lover'].sum()),
        "data_lovers": int(df['data_lover'].sum()),
        "download_lovers": int(df.get('download_lover', pd.Series([0])).sum()),
        "upload_lovers": int(df.get('upload_lover', pd.Series([0])).sum()),
    }
365
+
366
+
367
@app.get("/api/customers/{customer_id}")
def get_customer(customer_id: int):
    """Get detailed customer information.

    Looks up one subscriber by id and returns a nested dict grouping
    communication, international, internet, SMS and profile attributes.
    Raises 404 when the id is unknown, 500 when data never loaded.
    Missing optional columns default to 0/'N/A' via `row.get`.
    """
    if df is None:
        raise HTTPException(status_code=500, detail="Data not loaded")

    customer = df[df['subscriberid'] == customer_id]

    if customer.empty:
        raise HTTPException(status_code=404, detail=f"Customer {customer_id} not found")

    row = customer.iloc[0]

    # Calculate time distribution (denominator for the *_pct fields below)
    total_calls_by_time = (
        row.get('voice_morning_calls', 0) +
        row.get('voice_evening_calls', 0) +
        row.get('voice_night_calls', 0)
    )

    return {
        "subscriberid": int(row['subscriberid']),

        # Communication
        "communication": {
            "voice_total_duration_mins": float(row['voice_total_duration_mins']),
            "voice_total_calls": float(row['voice_total_calls']),
            "voice_avg_duration_mins": float(row.get('voice_avg_duration_mins', 0)),
            "time_distribution": {
                "morning_calls": int(row.get('voice_morning_calls', 0)),
                "evening_calls": int(row.get('voice_evening_calls', 0)),
                "night_calls": int(row.get('voice_night_calls', 0)),
                # Percentages guard against a zero denominator
                "morning_pct": float(row.get('voice_morning_calls', 0) / total_calls_by_time * 100 if total_calls_by_time > 0 else 0),
                "evening_pct": float(row.get('voice_evening_calls', 0) / total_calls_by_time * 100 if total_calls_by_time > 0 else 0),
                "night_pct": float(row.get('voice_night_calls', 0) / total_calls_by_time * 100 if total_calls_by_time > 0 else 0),
            }
        },

        # International (pd.notna guards string fields that may be NaN)
        "international": {
            "total_calls": float(row.get('intl_total_calls', 0)),
            "total_duration_mins": float(row.get('intl_total_duration_mins', 0)),
            "countries_called": int(row.get('intl_countries_called', 0)),
            "top_country": str(row.get('intl_top_country', 'N/A')) if pd.notna(row.get('intl_top_country')) else 'N/A',
            "all_countries": str(row.get('intl_all_countries', 'N/A')) if pd.notna(row.get('intl_all_countries')) else 'N/A',
            "is_international_user": bool(row.get('intl_total_calls', 0) > 0)
        },

        # Internet
        "internet": {
            "total_mb": float(row['data_total_mb']),
            "download_mb": float(row.get('data_downlink_mb', 0)),
            "upload_mb": float(row.get('data_uplink_mb', 0)),
            "download_pct": float(row.get('data_downlink_mb', 0) / row['data_total_mb'] * 100 if row['data_total_mb'] > 0 else 0),
            "upload_pct": float(row.get('data_uplink_mb', 0) / row['data_total_mb'] * 100 if row['data_total_mb'] > 0 else 0),
        },

        # SMS — frequency is relative to dataset-wide quartiles
        "sms": {
            "total_messages": int(row.get('sms_total_messages', 0)),
            "frequency": "High" if row.get('sms_total_messages', 0) > df['sms_total_messages'].quantile(0.75) else
                         "Medium" if row.get('sms_total_messages', 0) > df['sms_total_messages'].quantile(0.25) else "Low"
        },

        # User profile (cluster ids are None when no precomputed label exists)
        "profile": {
            "call_lover": bool(row.get('call_lover', 0)),
            "data_lover": bool(row.get('data_lover', 0)),
            "download_lover": bool(row.get('download_lover', 0)),
            "upload_lover": bool(row.get('upload_lover', 0)),
            "kmeans_cluster": int(row.get('kmeans_cluster', -1)) if pd.notna(row.get('kmeans_cluster')) else None,
            "dbscan_cluster": int(row.get('dbscan_cluster', -1)) if pd.notna(row.get('dbscan_cluster')) else None,
        }
    }
441
+
442
+
443
@app.get("/api/time-analysis")
def get_time_analysis():
    """Get time-based analysis of voice calls.

    Returns overall morning/evening/night totals and percentages, the peak
    period, and the same time split for call-lovers vs everyone else.
    """
    if df is None:
        raise HTTPException(status_code=500, detail="Data not loaded")

    # Ordered dict: iteration order doubles as the peak-time tie-break
    # (Morning wins ties over Evening, which wins over Night).
    totals = {
        "Morning": df['voice_morning_calls'].sum(),
        "Evening": df['voice_evening_calls'].sum(),
        "Night": df['voice_night_calls'].sum(),
    }
    grand_total = sum(totals.values())

    def pct(part):
        # Percentage of all calls, 0 when there are no calls at all
        return float(part / grand_total * 100 if grand_total > 0 else 0)

    def time_split(subset):
        # Morning/evening/night sums for a dataframe slice
        return {
            "morning": int(subset['voice_morning_calls'].sum()),
            "evening": int(subset['voice_evening_calls'].sum()),
            "night": int(subset['voice_night_calls'].sum()),
        }

    return {
        "overall": {
            "morning_calls": int(totals["Morning"]),
            "evening_calls": int(totals["Evening"]),
            "night_calls": int(totals["Night"]),
            "morning_pct": pct(totals["Morning"]),
            "evening_pct": pct(totals["Evening"]),
            "night_pct": pct(totals["Night"]),
        },
        "peak_time": max(totals, key=totals.get),
        "by_user_type": {
            "call_lovers": time_split(df[df['call_lover'] == 1]),
            "others": time_split(df[df['call_lover'] == 0]),
        }
    }
478
+
479
+
480
@app.get("/api/visualizations/time-distribution")
def viz_time_distribution():
    """Generate time distribution chart (plotly JSON for the frontend)."""
    if df is None:
        raise HTTPException(status_code=500, detail="Data not loaded")

    periods = ['Morning', 'Evening', 'Night']
    call_totals = [
        df['voice_morning_calls'].sum(),
        df['voice_evening_calls'].sum(),
        df['voice_night_calls'].sum(),
    ]

    fig = px.bar(
        {'Time Period': periods, 'Total Calls': call_totals},
        x='Time Period',
        y='Total Calls',
        title='Call Distribution by Time of Day',
        color='Time Period',
        color_discrete_map={'Morning': '#FDB462', 'Evening': '#80B1D3', 'Night': '#8DD3C7'}
    )

    return JSONResponse(content={"chart": fig.to_json()})
505
+
506
+
507
@app.get("/api/visualizations/data-breakdown")
def viz_data_breakdown():
    """Generate upload/download breakdown chart (plotly pie as JSON)."""
    if df is None:
        raise HTTPException(status_code=500, detail="Data not loaded")

    # MB -> GB for the two slices
    gb_totals = [
        df['data_downlink_mb'].sum() / 1024,
        df['data_uplink_mb'].sum() / 1024,
    ]

    fig = px.pie(
        {'Type': ['Download', 'Upload'], 'Total (GB)': gb_totals},
        values='Total (GB)',
        names='Type',
        title='Data Usage: Download vs Upload',
        color_discrete_sequence=['#66C2A5', '#FC8D62']
    )

    return JSONResponse(content={"chart": fig.to_json()})
530
+
531
+
532
@app.get("/api/visualizations/customer-segments")
def viz_customer_segments():
    """Generate customer segments visualization (per-cluster sizes)."""
    if df is None or 'kmeans_cluster' not in df.columns:
        raise HTTPException(status_code=500, detail="Clustering data not available")

    # Per-cluster size and behavioural averages
    agg_spec = {
        'subscriberid': 'count',
        'voice_total_calls': 'mean',
        'data_total_mb': 'mean',
        'sms_total_messages': 'mean'
    }
    segment_stats = df.groupby('kmeans_cluster').agg(agg_spec).reset_index()
    segment_stats.columns = ['Cluster', 'Customers', 'Avg Calls', 'Avg Data (MB)', 'Avg SMS']

    fig = px.bar(
        segment_stats,
        x='Cluster',
        y='Customers',
        title='Customer Distribution Across Segments',
        color='Customers',
        color_continuous_scale='viridis'
    )

    return JSONResponse(content={"chart": fig.to_json()})
558
+
559
+
560
@app.post("/api/cluster/run")
def run_clustering(request: ClusterRequest):
    """Run clustering on-demand.

    Scales a fixed feature set and fits MiniBatchKMeans (with silhouette
    score, sampled to 10k rows for speed) or DBSCAN (no silhouette), then
    returns per-cluster summary statistics. Raises 400 for an unknown
    algorithm or an unusable n_clusters.

    Fixes vs original:
    - `float(score) if score else None` mapped a legitimate silhouette of
      exactly 0.0 to None (falsy check); now tests `score is not None`.
    - n_clusters < 2 would crash silhouette_score with an opaque error;
      now rejected with a 400 up front.
    """
    if df is None:
        raise HTTPException(status_code=500, detail="Data not loaded")

    # Select features
    feature_cols = [
        'voice_total_duration_mins', 'voice_total_calls',
        'data_total_mb', 'sms_total_messages'
    ]

    # Add international if exists
    if 'intl_total_calls' in df.columns:
        feature_cols.append('intl_total_calls')

    X = df[feature_cols].fillna(0)

    # Scale so distance-based clustering isn't dominated by large-magnitude columns
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Cluster
    if request.algorithm == "kmeans":
        if request.n_clusters < 2:
            raise HTTPException(status_code=400, detail="n_clusters must be >= 2")
        model = MiniBatchKMeans(n_clusters=request.n_clusters, random_state=42, batch_size=1000)
        labels = model.fit_predict(X_scaled)

        # Silhouette on a 10k sample to keep the endpoint responsive
        if len(df) > 10000:
            sample_idx = np.random.choice(len(df), 10000, replace=False)
            score = silhouette_score(X_scaled[sample_idx], labels[sample_idx])
        else:
            score = silhouette_score(X_scaled, labels)

    elif request.algorithm == "dbscan":
        model = DBSCAN(eps=0.3, min_samples=10)
        labels = model.fit_predict(X_scaled)
        score = None  # silhouette not meaningful with DBSCAN's -1 noise label

    else:
        raise HTTPException(status_code=400, detail="Invalid algorithm")

    # Get cluster stats (DBSCAN noise appears as cluster_id -1)
    df_temp = df.copy()
    df_temp['cluster'] = labels

    cluster_info = []
    for cluster_id in sorted(df_temp['cluster'].unique()):
        cluster_data = df_temp[df_temp['cluster'] == cluster_id]
        cluster_info.append({
            "cluster_id": int(cluster_id),
            "size": int(len(cluster_data)),
            "percentage": float(len(cluster_data) / len(df) * 100),
            "avg_voice_calls": float(cluster_data['voice_total_calls'].mean()),
            "avg_data_mb": float(cluster_data['data_total_mb'].mean()),
            "avg_sms": float(cluster_data.get('sms_total_messages', pd.Series([0])).mean()),
        })

    return {
        "algorithm": request.algorithm,
        # labels are 0..k-1 (DBSCAN noise is -1, so max+1 still counts clusters)
        "n_clusters": int(labels.max() + 1),
        "silhouette_score": float(score) if score is not None else None,
        "clusters": cluster_info
    }
623
+
624
+
625
@app.post("/api/query")
def query_with_llm(request: QueryRequest):
    """Query data using Gemini LLM.

    Builds a plain-text statistics context from the dataframe (tolerating
    missing columns), appends the user's question, and asks Gemini for a
    natural-language answer. 503 when Gemini isn't configured, 500 on
    generation failure. The structured `data` field is always None here.
    """
    if gemini_model is None:
        raise HTTPException(status_code=503, detail="Gemini API not configured")

    # Build context with safe column access
    def safe_col_sum(col_name, default=0):
        """Safely get column sum or return default"""
        return df[col_name].sum() if col_name in df.columns else default

    def safe_col_mean(col_name, default=0):
        """Safely get column mean or return default"""
        return df[col_name].mean() if col_name in df.columns else default

    def safe_col_count(col_name, condition_value=0):
        """Safely count rows where column > condition_value"""
        if col_name in df.columns:
            return (df[col_name] > condition_value).sum()
        return 0

    # Sections for optional columns collapse to empty lines when absent
    context = f"""
You are analyzing telecom customer data. Here are the key statistics:

Total Customers: {len(df):,}
International Users: {int(safe_col_count('intl_total_calls', 0)):,}

Voice Communication:
- Total Calls: {safe_col_sum('voice_total_calls'):,.0f}
- Total Duration: {safe_col_sum('voice_total_duration_mins'):,.0f} mins
- Average per User: {safe_col_mean('voice_total_calls'):.1f} calls

{'Time Distribution:' if 'voice_morning_calls' in df.columns else ''}
{f"- Morning Calls: {safe_col_sum('voice_morning_calls'):,.0f}" if 'voice_morning_calls' in df.columns else ''}
{f"- Evening Calls: {safe_col_sum('voice_evening_calls'):,.0f}" if 'voice_evening_calls' in df.columns else ''}
{f"- Night Calls: {safe_col_sum('voice_night_calls'):,.0f}" if 'voice_night_calls' in df.columns else ''}

{'SMS:' if 'sms_total_messages' in df.columns else ''}
{f"- Total Messages: {safe_col_sum('sms_total_messages'):,.0f}" if 'sms_total_messages' in df.columns else ''}
{f"- Average per User: {safe_col_mean('sms_total_messages'):.1f}" if 'sms_total_messages' in df.columns else ''}

Data Usage:
- Total Data (MB): {safe_col_sum('data_total_mb'):,.0f}
- Average per User (MB): {safe_col_mean('data_total_mb'):.1f}
{f"- Total Download (GB): {safe_col_sum('data_downlink_mb') / 1024:.1f}" if 'data_downlink_mb' in df.columns else ''}
{f"- Total Upload (GB): {safe_col_sum('data_uplink_mb') / 1024:.1f}" if 'data_uplink_mb' in df.columns else ''}

User Question: {request.question}

Provide a clear, concise answer based on the statistics above.
"""

    try:
        response = gemini_model.generate_content(context)
        return QueryResponse(answer=response.text, data=None)
    except Exception as e:
        # Full traceback to the server log; only the message reaches the client
        import traceback
        error_details = traceback.format_exc()
        print(f"❌ Query error: {error_details}")
        raise HTTPException(status_code=500, detail=f"LLM error: {str(e)}")
685
+
686
+
687
@app.get("/api/search")
def semantic_search(query: str = Query(..., description="Search query"), limit: int = 10):
    """Semantic search for customers.

    Embeds the free-text query, normalises it, and looks up the nearest
    customer descriptions in the FAISS inner-product index.
    """
    if embedding_model is None or faiss_index is None:
        raise HTTPException(status_code=503, detail="Search not available")

    # Embed and L2-normalise the query so inner product == cosine similarity
    query_vec = embedding_model.encode([query])
    faiss.normalize_L2(query_vec)

    # Top-`limit` nearest neighbours
    scores, indices = faiss_index.search(query_vec, limit)

    matches = []
    for sim, row_idx in zip(scores[0], indices[0]):
        # Skip padding/out-of-range ids the index may return
        if row_idx >= len(df):
            continue
        hit = df.iloc[row_idx]
        matches.append({
            "customer_id": int(hit['subscriberid']),
            "similarity_score": float(sim),
            "voice_calls": float(hit['voice_total_calls']),
            "data_mb": float(hit['data_total_mb']),
            "sms": int(hit.get('sms_total_messages', 0)),
            "is_international": bool(hit.get('intl_total_calls', 0) > 0)
        })

    return {"results": matches}
714
+
715
+
716
@app.get("/api/clusters")
def get_clusters(cluster_type: str = "kmeans"):
    """Get cluster information.

    Summarises the precomputed `{cluster_type}_cluster` labels (kmeans or
    dbscan) with per-cluster size and behavioural averages. 404 when that
    label column is absent.
    """
    if df is None:
        raise HTTPException(status_code=500, detail="Data not loaded")

    cluster_col = f"{cluster_type}_cluster"
    if cluster_col not in df.columns:
        raise HTTPException(status_code=404, detail=f"{cluster_type} clusters not found")

    summaries = []
    for cid in sorted(df[cluster_col].unique()):
        # Rows without a precomputed label carry NaN — skip that pseudo-cluster
        if pd.isna(cid):
            continue

        members = df[df[cluster_col] == cid]
        summaries.append({
            "cluster_id": int(cid),
            "size": int(len(members)),
            "percentage": float(len(members) / len(df) * 100),
            "avg_voice_mins": float(members['voice_total_duration_mins'].mean()),
            "avg_data_mb": float(members['data_total_mb'].mean()),
            "avg_sms": float(members.get('sms_total_messages', pd.Series([0])).mean()),
            "avg_intl_calls": float(members.get('intl_total_calls', pd.Series([0])).mean()),
        })

    return {"cluster_type": cluster_type, "clusters": summaries}
743
+
744
+
745
if __name__ == "__main__":
    # Local development entry point; in the container the server is started
    # by the Dockerfile CMD instead (same host/port 7860).
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)
golden_table_clustered.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0006dbc2e097a706c352677071d536a0fa3a0f7fbb84c2382e64a5f457644250
3
+ size 49627961
international_calls.csv ADDED
The diff for this file is too large to render. See raw diff
 
merged_subscriber_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad2fd497c543269f3a003a07ce1e3b4c620de752f02a13a460584943d7c2eafd
3
+ size 24336135
requirements.txt ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Backend Dependencies for HuggingFace Spaces
# FastAPI + ML + LLM Integration
# All versions pinned for reproducible builds on python:3.10-slim.

# API Framework
fastapi==0.109.0
uvicorn[standard]==0.27.0
python-multipart==0.0.6

# Data Processing
pandas==2.1.4
numpy==1.26.3

# ML & Embeddings
scikit-learn==1.4.0
sentence-transformers==2.3.1
huggingface_hub==0.20.3
faiss-cpu==1.7.4

# LLM
google-generativeai==0.3.2

# Database
# NOTE(review): app.py uses the stdlib sqlite3 module; aiosqlite appears unused.
aiosqlite==0.19.0

# Visualization
matplotlib==3.8.2
plotly==5.18.0
kaleido==0.2.1

# Utilities
python-dotenv==1.0.0
pydantic==2.5.3