pranit_churn_application / src /utils /enhanced_data_generator.py
rajkhanke's picture
Upload 45 files
1b70843 verified
"""
Enhanced Telecommunications Data Generator - Complete Implementation
Implements all data sources from Technical Requirements Document
"""
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from faker import Faker
import random
from tqdm import tqdm
# Seed NumPy's global RNG and the stdlib `random` module so both start from
# the same fixed state on every run.
np.random.seed(42)
random.seed(42)
# Module-level Faker instance (used below for random event dates), with Faker
# seeded to the same constant.
fake = Faker()
Faker.seed(42)
class EnhancedTelecomDataGenerator:
    """
    Comprehensive data generator implementing ALL technical requirements:
    - Device Data (OS, apps, performance)
    - Competitive Intelligence
    - External Data (weather, events, demographics)
    - Enhanced network metrics
    - Customer journey analytics

    All values are synthetic and drawn from the module-level ``random``,
    ``numpy.random`` and ``fake`` generators.
    """

    def __init__(self, num_customers=100000):
        """Set up the date window and the lookup tables used by the generators.

        Args:
            num_customers: Number of customers the run is sized for. It is
                stored and reported, but each ``generate_*`` method sizes its
                output from the DataFrame it is given.
        """
        self.num_customers = num_customers
        self.start_date = pd.to_datetime('2022-01-01')
        self.end_date = pd.to_datetime('2024-12-31')

        # Device configurations
        self.os_versions = {
            'iOS': ['16.0', '16.1', '16.2', '17.0', '17.1', '17.2'],
            'Android': ['12', '13', '14'],
        }
        self.popular_apps = [
            'WhatsApp', 'Facebook', 'Instagram', 'YouTube', 'TikTok',
            'Netflix', 'Spotify', 'Gmail', 'Google Maps', 'Twitter',
            'Snapchat', 'LinkedIn', 'Uber', 'Amazon', 'Zoom'
        ]

        # Competitor data
        self.competitors = ['Verizon', 'AT&T', 'T-Mobile', 'Sprint']

        # Weather conditions
        self.weather_conditions = ['Clear', 'Cloudy', 'Rainy', 'Stormy', 'Snowy', 'Foggy']

        # Event types
        self.event_types = ['Concert', 'Sports', 'Festival', 'Convention', 'Holiday']

        print(f"Initialized Enhanced Data Generator for {num_customers:,} customers")

    def generate_device_data(self, customers_df):
        """
        Generate comprehensive device data:
        - OS version & update history
        - App usage patterns
        - Device performance metrics
        - Battery health & storage

        Args:
            customers_df: DataFrame with at least the columns ``customer_id``,
                ``device_manufacturer`` and ``device_age_months``.

        Returns:
            DataFrame with one device record per row of ``customers_df``.
        """
        print("\n📱 Generating Device Performance Data...")
        device_data = []

        for _, customer in tqdm(customers_df.iterrows(), total=len(customers_df), desc="Device data"):
            # Apple devices run iOS; every other manufacturer is treated as Android.
            os_type = 'iOS' if customer['device_manufacturer'] == 'Apple' else 'Android'
            os_version = random.choice(self.os_versions[os_type])

            # App usage (select 5-12 apps per customer)
            num_apps = random.randint(5, 12)
            user_apps = random.sample(self.popular_apps, num_apps)

            # Performance metrics
            device_data.append({
                'customer_id': customer['customer_id'],
                'os_type': os_type,
                'os_version': os_version,
                # Last OS update falls within the 180 days before the window end.
                'os_last_updated': (self.end_date - timedelta(days=random.randint(0, 180))).date(),
                'storage_total_gb': random.choice([64, 128, 256, 512]),
                'storage_used_pct': random.uniform(40, 95),
                # Lose 1.5 points of battery health per month of age, floored at 70%.
                'battery_health_pct': max(70, 100 - customer['device_age_months'] * 1.5),
                'avg_battery_drain_pct_per_hour': random.uniform(3, 15),
                'apps_installed': len(user_apps),
                'top_apps': ','.join(user_apps[:5]),
                'avg_daily_screen_time_hours': random.uniform(2, 8),
                'data_saver_enabled': random.random() < 0.3,
                'background_data_restricted': random.random() < 0.25,
                'wifi_calling_enabled': random.random() < 0.6,
                'volte_enabled': random.random() < 0.8,
                '5g_enabled': random.random() < 0.7,
                'device_temperature_avg_celsius': random.uniform(25, 40),
                'crash_count_last_month': np.random.poisson(1),
                'avg_app_load_time_sec': random.uniform(0.5, 3.0),
                'memory_pressure_high_pct': random.uniform(5, 40),
            })

        df = pd.DataFrame(device_data)
        print(f"✅ Generated device data for {len(df):,} customers")
        return df

    def generate_competitive_intelligence(self):
        """
        Generate market intelligence data:
        - Competitor pricing
        - Market share trends
        - Promotional campaigns
        - Customer migration patterns

        Returns:
            DataFrame with one row per (month, competitor) pair, covering every
            month start between ``self.start_date`` and ``self.end_date``.
        """
        print("\n🏢 Generating Competitive Intelligence Data...")
        months = pd.date_range(start=self.start_date, end=self.end_date, freq='MS')
        market_data = []

        for month in tqdm(months, desc="Market analysis"):
            for competitor in self.competitors:
                # Pricing data
                base_price = random.uniform(40, 120)
                promo_active = random.random() < 0.3

                market_data.append({
                    'month': month.date(),
                    'competitor': competitor,
                    'base_plan_price': round(base_price, 2),
                    'unlimited_plan_price': round(base_price * 1.8, 2),
                    'family_plan_price': round(base_price * 2.5, 2),
                    'promotion_active': promo_active,
                    # A discount is only drawn when a promotion is running.
                    'promotion_discount_pct': random.uniform(10, 30) if promo_active else 0,
                    'market_share_pct': random.uniform(15, 30),
                    'customer_satisfaction_score': random.uniform(3.5, 4.8),
                    'network_quality_score': random.uniform(7, 9.5),
                    '5g_coverage_pct': random.uniform(40, 85),
                    'avg_download_speed_mbps': random.uniform(50, 300),
                    'churn_rate_pct': random.uniform(1.5, 3.5),
                    'new_customer_acquisitions': random.randint(50000, 200000),
                    'advertising_spend_millions': random.uniform(5, 25),
                })

        df = pd.DataFrame(market_data)
        print(f"✅ Generated {len(df):,} market intelligence records")
        return df

    def generate_external_data(self, towers_df):
        """
        Generate external data sources:
        - Weather conditions by location
        - Local events
        - Demographic/census data
        - Economic indicators

        Args:
            towers_df: DataFrame with at least the columns ``tower_id`` and
                ``city``.

        Returns:
            Dict with the keys ``'weather'``, ``'events'`` and
            ``'demographics'``, each mapping to a DataFrame.
        """
        print("\n🌍 Generating External Data Sources...")

        # Weather data (daily by tower location)
        print(" Generating weather data...")
        weather_df = self._generate_weather(towers_df)

        # A plain list (not a NumPy array) keeps random.choice working on every
        # Python version, and the unique scan runs once instead of per event.
        cities = towers_df['city'].unique().tolist()

        # Events data
        print(" Generating events data...")
        events_df = self._generate_events(cities)

        # Demographics/Census data by city
        print(" Generating demographic data...")
        demographics_df = self._generate_demographics(cities)

        print(f"✅ Weather: {len(weather_df):,} records")
        print(f"✅ Events: {len(events_df):,} records")
        print(f"✅ Demographics: {len(demographics_df):,} cities")
        return {
            'weather': weather_df,
            'events': events_df,
            'demographics': demographics_df
        }

    def _generate_weather(self, towers_df):
        """Build daily weather rows for up to 200 sampled towers over the last 90 days."""
        dates = pd.date_range(start=self.end_date - timedelta(days=90), end=self.end_date, freq='D')
        days = [stamp.date() for stamp in dates]

        # Sample subset of towers for weather
        sample_towers = towers_df.sample(n=min(200, len(towers_df)), random_state=42)

        weather_data = []
        for _, tower in tqdm(sample_towers.iterrows(), total=len(sample_towers), desc="Weather"):
            for day in days:
                weather_data.append({
                    'date': day,
                    'tower_id': tower['tower_id'],
                    'city': tower['city'],
                    'temperature_celsius': random.uniform(-10, 35),
                    'humidity_pct': random.uniform(30, 90),
                    'precipitation_mm': max(0, np.random.exponential(2)),
                    'wind_speed_kmh': random.uniform(5, 50),
                    'condition': random.choice(self.weather_conditions),
                    'severe_weather': random.random() < 0.05,
                })
        return pd.DataFrame(weather_data)

    def _generate_events(self, cities, num_events=500):
        """Build ``num_events`` event rows with dates inside the generator's date window."""
        events_data = []
        for i in range(num_events):
            event_date = fake.date_between(start_date=self.start_date, end_date=self.end_date)
            events_data.append(self._build_event(i, event_date, cities))
        return pd.DataFrame(events_data)

    def _build_event(self, index, event_date, cities):
        """Build one event record; the name is derived from the chosen event type."""
        # Draw the type once so event_name and event_type always agree.
        event_type = random.choice(self.event_types)
        return {
            'event_id': f'EVT{index + 1:05d}',
            'event_name': f'{event_type} {index + 1}',
            'event_type': event_type,
            'event_date': event_date,
            'city': random.choice(cities),
            'expected_attendance': random.randint(1000, 100000),
            'duration_hours': random.randint(2, 48),
        }

    def _generate_demographics(self, cities):
        """Build one demographics/census row per city."""
        demographics_data = [
            {
                'city': city,
                'population': random.randint(100000, 5000000),
                'median_age': random.uniform(30, 45),
                'median_income': random.randint(40000, 100000),
                'unemployment_rate_pct': random.uniform(3, 8),
                'college_educated_pct': random.uniform(25, 60),
                'homeownership_rate_pct': random.uniform(40, 70),
                'population_density_per_sqkm': random.randint(100, 10000),
                'urban_classification': random.choice(['Urban', 'Suburban', 'Rural']),
            }
            for city in cities
        ]
        return pd.DataFrame(demographics_data)

    @staticmethod
    def _lifecycle_profile(tenure):
        """Return ``(stage, low, high)``: lifecycle stage and engagement-score bounds for a tenure in months."""
        if tenure < 3:
            return 'New', 6, 9
        if tenure < 12:
            return 'Growing', 7, 10
        if tenure < 36:
            return 'Mature', 5, 9
        return 'Tenured', 4, 8

    @staticmethod
    def _late_payment_risk(payment_score):
        """Map a payment score to 'Low' (> 7), 'Medium' (> 4) or 'High' risk."""
        if payment_score > 7:
            return 'Low'
        if payment_score > 4:
            return 'Medium'
        return 'High'

    def generate_customer_journey_data(self, customers_df):
        """
        Generate customer journey analytics:
        - Lifecycle stages
        - Service interaction history
        - Payment behavior patterns
        - Customer segmentation

        Args:
            customers_df: DataFrame with at least the columns ``customer_id``
                and ``tenure_months``. ``referral_count`` is used when present
                and treated as 0 otherwise.

        Returns:
            DataFrame with one journey record per row of ``customers_df``.
        """
        print("\n👤 Generating Customer Journey Data...")
        journey_data = []

        for _, customer in tqdm(customers_df.iterrows(), total=len(customers_df), desc="Customer journeys"):
            # Lifecycle stage based on tenure
            tenure = customer['tenure_months']
            lifecycle_stage, low, high = self._lifecycle_profile(tenure)
            engagement_score = random.uniform(low, high)

            # Payment behavior
            payment_score = random.uniform(1, 10)
            late_payment_risk = self._late_payment_risk(payment_score)

            journey_data.append({
                'customer_id': customer['customer_id'],
                'lifecycle_stage': lifecycle_stage,
                'engagement_score': round(engagement_score, 2),
                'value_segment': random.choice(['High Value', 'Medium Value', 'Low Value']),
                'loyalty_tier': random.choice(['Bronze', 'Silver', 'Gold', 'Platinum']),
                'payment_behavior_score': round(payment_score, 2),
                'late_payment_risk': late_payment_risk,
                # Expected interaction count grows with tenure (0.3 per month).
                'total_interactions': np.random.poisson(tenure * 0.3),
                'positive_interactions_pct': random.uniform(60, 95),
                'nps_score': random.randint(-100, 100),
                'referrals_made': customer.get('referral_count', 0),
                'upsell_opportunities': np.random.poisson(2),
                'cross_sell_score': random.uniform(0, 10),
                'reactivation_risk': random.uniform(0, 1),
                'social_influence_score': random.uniform(0, 10),
            })

        df = pd.DataFrame(journey_data)
        print(f"✅ Generated journey data for {len(df):,} customers")
        return df
def main(num_customers=100000, num_towers=1000, output_dir='data/synthetic'):
    """Generate all enhanced datasets and write them as CSV files.

    Base customer and tower tables come from the project's
    ``TelecomDataGenerator``; the enhanced tables are derived from them and
    saved under ``output_dir``.

    Args:
        num_customers: Number of customers requested from the base generator.
        num_towers: Number of towers requested from the base generator.
        output_dir: Directory that receives the CSV files. It is created
            (including parents) if it does not exist yet.
    """
    # Local imports: pathlib is only needed here, and the base generator is a
    # project module that should only be required when running as a script.
    from pathlib import Path

    print("=" * 80)
    print("ENHANCED TELECOMMUNICATIONS DATA GENERATOR")
    print("=" * 80)

    # First generate base data using original generator
    from synthetic_data_generator import TelecomDataGenerator
    base_gen = TelecomDataGenerator(num_customers=num_customers, num_towers=num_towers)
    print("\n📊 Generating Base Data...")
    customers_df = base_gen.generate_customer_demographics()
    towers_df = base_gen.generate_network_infrastructure()

    # Initialize enhanced generator
    enhanced_gen = EnhancedTelecomDataGenerator(num_customers=len(customers_df))

    # Generate enhanced data
    device_df = enhanced_gen.generate_device_data(customers_df)
    competitive_df = enhanced_gen.generate_competitive_intelligence()
    external_data = enhanced_gen.generate_external_data(towers_df)
    journey_df = enhanced_gen.generate_customer_journey_data(customers_df)

    # Save all datasets
    print("\n💾 Saving Enhanced Datasets...")
    out_dir = Path(output_dir)
    # Ensure the target exists so a fresh checkout does not fail after all the
    # generation work has already been done.
    out_dir.mkdir(parents=True, exist_ok=True)

    datasets = [
        ('device_data.csv', device_df),
        ('competitive_intelligence.csv', competitive_df),
        ('weather_data.csv', external_data['weather']),
        ('events_data.csv', external_data['events']),
        ('demographics_data.csv', external_data['demographics']),
        ('customer_journey.csv', journey_df),
    ]
    for filename, frame in datasets:
        frame.to_csv(out_dir / filename, index=False)
        print(f" ✅ Saved {filename}")

    print("\n" + "=" * 80)
    print("ENHANCED DATA GENERATION COMPLETE")
    print("=" * 80)
    print("\n📈 Summary:")
    print(f" - Device Data: {len(device_df):,} records")
    print(f" - Competitive Intelligence: {len(competitive_df):,} records")
    print(f" - Weather Data: {len(external_data['weather']):,} records")
    print(f" - Events Data: {len(external_data['events']):,} records")
    print(f" - Demographics: {len(external_data['demographics']):,} cities")
    print(f" - Customer Journey: {len(journey_df):,} customers")
    print(f"\n✅ All enhanced datasets saved to '{out_dir.as_posix()}/' directory")


if __name__ == "__main__":
    main()