Spaces:

rajkhanke
/

pranit_churn_application

Sleeping

App Files Files Community

pranit_churn_application / src /utils /enhanced_data_generator.py

rajkhanke

Upload 45 files

1b70843 verified 24 days ago

raw

history blame contribute delete

14.5 kB

	"""
	Enhanced Telecommunications Data Generator - Complete Implementation
	Implements all data sources from Technical Requirements Document
	"""

	import pandas as pd
	import numpy as np
	from datetime import datetime, timedelta
	from faker import Faker
	import random
	from tqdm import tqdm

	# Fix the seeds of the random sources this module draws from.
	# NumPy's global RNG backs the np.random.poisson / np.random.exponential calls.
	np.random.seed(42)
	# The stdlib RNG backs every random.uniform / choice / randint / sample call.
	random.seed(42)
	# Shared Faker instance; the class uses it to pick random event dates.
	fake = Faker()
	# NOTE(review): presumably seeds the generator shared by Faker instances,
	# including `fake` created above — confirm against the Faker documentation.
	Faker.seed(42)


	class EnhancedTelecomDataGenerator:
	    """
	    Synthetic data generator for the "enhanced" telecom data sources.

	    Produces randomly generated tables for:
	    - Device data (OS, apps, performance)
	    - Competitive intelligence
	    - External data (weather, events, demographics)
	    - Customer journey analytics

	    Values are drawn from the module-level ``random``, ``np.random`` and
	    ``fake`` generators, so the output depends on how those were seeded.
	    Progress is reported through ``tqdm`` and ``print``.
	    """

	    def __init__(self, num_customers=100000):
	        """
	        Store the generation window and the lookup lists used by the generators.

	        Args:
	            num_customers: Customer count, stored on the instance and echoed in
	                the start-up message. The ``generate_*`` methods size their
	                output from the DataFrames they are given, not from this value.
	        """
	        self.num_customers = num_customers
	        self.start_date = pd.to_datetime('2022-01-01')
	        self.end_date = pd.to_datetime('2024-12-31')

	        # Device configurations
	        self.os_versions = {
	            'iOS': ['16.0', '16.1', '16.2', '17.0', '17.1', '17.2'],
	            'Android': ['12', '13', '14'],
	        }

	        self.popular_apps = [
	            'WhatsApp', 'Facebook', 'Instagram', 'YouTube', 'TikTok',
	            'Netflix', 'Spotify', 'Gmail', 'Google Maps', 'Twitter',
	            'Snapchat', 'LinkedIn', 'Uber', 'Amazon', 'Zoom'
	        ]

	        # Competitor data
	        self.competitors = ['Verizon', 'AT&T', 'T-Mobile', 'Sprint']

	        # Weather conditions
	        self.weather_conditions = ['Clear', 'Cloudy', 'Rainy', 'Stormy', 'Snowy', 'Foggy']

	        # Event types
	        self.event_types = ['Concert', 'Sports', 'Festival', 'Convention', 'Holiday']

	        print(f"Initialized Enhanced Data Generator for {num_customers:,} customers")

	    # ------------------------------------------------------------------
	    # Device data
	    # ------------------------------------------------------------------

	    def _device_record(self, customer):
	        """Build one device row for a customer (mapping with customer_id,
	        device_manufacturer and device_age_months)."""
	        # OS family follows the manufacturer: Apple -> iOS, anything else -> Android.
	        os_type = 'iOS' if customer['device_manufacturer'] == 'Apple' else 'Android'
	        os_version = random.choice(self.os_versions[os_type])

	        # Each customer gets 5-12 distinct apps; only the first five are listed.
	        user_apps = random.sample(self.popular_apps, random.randint(5, 12))

	        return {
	            'customer_id': customer['customer_id'],
	            'os_type': os_type,
	            'os_version': os_version,
	            # Last OS update falls within the 180 days before the window end.
	            'os_last_updated': (self.end_date - timedelta(days=random.randint(0, 180))).date(),
	            'storage_total_gb': random.choice([64, 128, 256, 512]),
	            'storage_used_pct': random.uniform(40, 95),
	            # Loses 1.5 points per month of device age, floored at 70.
	            'battery_health_pct': max(70, 100 - customer['device_age_months'] * 1.5),
	            'avg_battery_drain_pct_per_hour': random.uniform(3, 15),
	            'apps_installed': len(user_apps),
	            'top_apps': ','.join(user_apps[:5]),
	            'avg_daily_screen_time_hours': random.uniform(2, 8),
	            'data_saver_enabled': random.random() < 0.3,
	            'background_data_restricted': random.random() < 0.25,
	            'wifi_calling_enabled': random.random() < 0.6,
	            'volte_enabled': random.random() < 0.8,
	            '5g_enabled': random.random() < 0.7,
	            'device_temperature_avg_celsius': random.uniform(25, 40),
	            'crash_count_last_month': np.random.poisson(1),
	            'avg_app_load_time_sec': random.uniform(0.5, 3.0),
	            'memory_pressure_high_pct': random.uniform(5, 40),
	        }

	    def generate_device_data(self, customers_df: pd.DataFrame) -> pd.DataFrame:
	        """
	        Generate one device-performance row per customer.

	        Covers OS type/version and last update date, installed apps,
	        storage, battery and performance metrics.

	        Args:
	            customers_df: Customers table; the columns ``customer_id``,
	                ``device_manufacturer`` and ``device_age_months`` are read.

	        Returns:
	            DataFrame with one row per row of ``customers_df``.
	        """
	        print("\n📱 Generating Device Performance Data...")

	        rows = tqdm(customers_df.iterrows(), total=len(customers_df), desc="Device data")
	        df = pd.DataFrame([self._device_record(customer) for _, customer in rows])

	        print(f"✅ Generated device data for {len(df):,} customers")
	        return df

	    # ------------------------------------------------------------------
	    # Competitive intelligence
	    # ------------------------------------------------------------------

	    def _market_record(self, month, competitor):
	        """Build one market-intelligence row for a competitor in a given month."""
	        base_price = random.uniform(40, 120)
	        promo_active = random.random() < 0.3

	        return {
	            'month': month.date(),
	            'competitor': competitor,
	            'base_plan_price': round(base_price, 2),
	            # Other plan prices are fixed multiples of the base plan price.
	            'unlimited_plan_price': round(base_price * 1.8, 2),
	            'family_plan_price': round(base_price * 2.5, 2),
	            'promotion_active': promo_active,
	            'promotion_discount_pct': random.uniform(10, 30) if promo_active else 0,
	            'market_share_pct': random.uniform(15, 30),
	            'customer_satisfaction_score': random.uniform(3.5, 4.8),
	            'network_quality_score': random.uniform(7, 9.5),
	            '5g_coverage_pct': random.uniform(40, 85),
	            'avg_download_speed_mbps': random.uniform(50, 300),
	            'churn_rate_pct': random.uniform(1.5, 3.5),
	            'new_customer_acquisitions': random.randint(50000, 200000),
	            'advertising_spend_millions': random.uniform(5, 25),
	        }

	    def generate_competitive_intelligence(self) -> pd.DataFrame:
	        """
	        Generate monthly market-intelligence rows for every competitor.

	        Covers plan pricing, promotions, market share, quality scores,
	        churn, acquisitions and advertising spend.

	        Returns:
	            DataFrame with one row per (month start, competitor) pair between
	            ``self.start_date`` and ``self.end_date``.
	        """
	        print("\n🏢 Generating Competitive Intelligence Data...")

	        months = pd.date_range(start=self.start_date, end=self.end_date, freq='MS')

	        market_data = []
	        for month in tqdm(months, desc="Market analysis"):
	            market_data.extend(
	                self._market_record(month, competitor) for competitor in self.competitors
	            )

	        df = pd.DataFrame(market_data)
	        print(f"✅ Generated {len(df):,} market intelligence records")
	        return df

	    # ------------------------------------------------------------------
	    # External data
	    # ------------------------------------------------------------------

	    def _weather_record(self, day, tower):
	        """Build one daily weather row for a tower (mapping with tower_id and city)."""
	        return {
	            'date': day.date(),
	            'tower_id': tower['tower_id'],
	            'city': tower['city'],
	            'temperature_celsius': random.uniform(-10, 35),
	            'humidity_pct': random.uniform(30, 90),
	            'precipitation_mm': max(0, np.random.exponential(2)),
	            'wind_speed_kmh': random.uniform(5, 50),
	            'condition': random.choice(self.weather_conditions),
	            'severe_weather': random.random() < 0.05,
	        }

	    def _event_record(self, index, event_date, cities):
	        """Build one event row; ``index`` is zero-based, ``cities`` a non-empty list."""
	        # Draw the type once so the name and the type column always agree.
	        event_type = random.choice(self.event_types)
	        return {
	            'event_id': f'EVT{index + 1:05d}',
	            'event_name': f'{event_type} {index + 1}',
	            'event_type': event_type,
	            'event_date': event_date,
	            'city': random.choice(cities),
	            'expected_attendance': random.randint(1000, 100000),
	            'duration_hours': random.randint(2, 48),
	        }

	    def _demographic_record(self, city):
	        """Build one demographics row for a city."""
	        return {
	            'city': city,
	            'population': random.randint(100000, 5000000),
	            'median_age': random.uniform(30, 45),
	            'median_income': random.randint(40000, 100000),
	            'unemployment_rate_pct': random.uniform(3, 8),
	            'college_educated_pct': random.uniform(25, 60),
	            'homeownership_rate_pct': random.uniform(40, 70),
	            'population_density_per_sqkm': random.randint(100, 10000),
	            'urban_classification': random.choice(['Urban', 'Suburban', 'Rural']),
	        }

	    def generate_external_data(self, towers_df: pd.DataFrame) -> dict:
	        """
	        Generate the external data tables: weather, local events, demographics.

	        Args:
	            towers_df: Towers table; the columns ``tower_id`` and ``city``
	                are read.

	        Returns:
	            Dict with keys ``'weather'``, ``'events'`` and ``'demographics'``,
	            each mapping to a DataFrame:
	            - weather: one row per day of the last 90 days of the window for
	              each of up to 200 sampled towers;
	            - events: 500 events in cities taken from ``towers_df`` (none when
	              ``towers_df`` has no cities);
	            - demographics: one row per distinct city.
	        """
	        print("\n🌍 Generating External Data Sources...")

	        # Distinct cities are needed by both events and demographics; compute
	        # them once. A plain list is used because random.choice is specified
	        # for sequences rather than NumPy arrays.
	        cities = towers_df['city'].unique().tolist()

	        # Weather data (daily by tower location)
	        print(" Generating weather data...")
	        dates = pd.date_range(start=self.end_date - timedelta(days=90), end=self.end_date, freq='D')

	        # Sample subset of towers for weather
	        sample_towers = towers_df.sample(n=min(200, len(towers_df)), random_state=42)

	        weather_data = []
	        for _, tower in tqdm(sample_towers.iterrows(), total=len(sample_towers), desc="Weather"):
	            weather_data.extend(self._weather_record(day, tower) for day in dates)
	        weather_df = pd.DataFrame(weather_data)

	        # Events data
	        print(" Generating events data...")
	        # Events are placed in tower cities, so without any city there is
	        # nothing to place (and random.choice would fail on an empty list).
	        num_events = 500 if cities else 0

	        events_data = []
	        for i in range(num_events):
	            event_date = fake.date_between(start_date=self.start_date, end_date=self.end_date)
	            events_data.append(self._event_record(i, event_date, cities))
	        events_df = pd.DataFrame(events_data)

	        # Demographics/Census data by city
	        print(" Generating demographic data...")
	        demographics_df = pd.DataFrame([self._demographic_record(city) for city in cities])

	        print(f"✅ Weather: {len(weather_df):,} records")
	        print(f"✅ Events: {len(events_df):,} records")
	        print(f"✅ Demographics: {len(demographics_df):,} cities")

	        return {
	            'weather': weather_df,
	            'events': events_df,
	            'demographics': demographics_df
	        }

	    # ------------------------------------------------------------------
	    # Customer journey
	    # ------------------------------------------------------------------

	    @staticmethod
	    def _lifecycle_for_tenure(tenure):
	        """Map tenure in months to (lifecycle stage, (min, max) engagement score)."""
	        if tenure < 3:
	            return 'New', (6, 9)
	        if tenure < 12:
	            return 'Growing', (7, 10)
	        if tenure < 36:
	            return 'Mature', (5, 9)
	        return 'Tenured', (4, 8)

	    @staticmethod
	    def _late_payment_risk(payment_score):
	        """Bucket a payment score: above 7 is Low risk, above 4 Medium, else High."""
	        if payment_score > 7:
	            return 'Low'
	        if payment_score > 4:
	            return 'Medium'
	        return 'High'

	    def _journey_record(self, customer):
	        """Build one journey row for a customer (mapping with customer_id and
	        tenure_months; referral_count is optional)."""
	        tenure = customer['tenure_months']
	        lifecycle_stage, (low, high) = self._lifecycle_for_tenure(tenure)
	        engagement_score = random.uniform(low, high)

	        # Payment behavior
	        payment_score = random.uniform(1, 10)

	        return {
	            'customer_id': customer['customer_id'],
	            'lifecycle_stage': lifecycle_stage,
	            'engagement_score': round(engagement_score, 2),
	            'value_segment': random.choice(['High Value', 'Medium Value', 'Low Value']),
	            'loyalty_tier': random.choice(['Bronze', 'Silver', 'Gold', 'Platinum']),
	            'payment_behavior_score': round(payment_score, 2),
	            'late_payment_risk': self._late_payment_risk(payment_score),
	            # Expected interaction count grows with tenure (mean 0.3 per month).
	            'total_interactions': np.random.poisson(tenure * 0.3),
	            'positive_interactions_pct': random.uniform(60, 95),
	            'nps_score': random.randint(-100, 100),
	            # Falls back to 0 when the customers table has no referral_count.
	            'referrals_made': customer.get('referral_count', 0),
	            'upsell_opportunities': np.random.poisson(2),
	            'cross_sell_score': random.uniform(0, 10),
	            'reactivation_risk': random.uniform(0, 1),
	            'social_influence_score': random.uniform(0, 10),
	        }

	    def generate_customer_journey_data(self, customers_df: pd.DataFrame) -> pd.DataFrame:
	        """
	        Generate one customer-journey row per customer.

	        Covers lifecycle stage, engagement, segmentation, payment behavior
	        and interaction metrics.

	        Args:
	            customers_df: Customers table; the columns ``customer_id`` and
	                ``tenure_months`` are read, plus ``referral_count`` if present.

	        Returns:
	            DataFrame with one row per row of ``customers_df``.
	        """
	        print("\n👤 Generating Customer Journey Data...")

	        rows = tqdm(customers_df.iterrows(), total=len(customers_df), desc="Customer journeys")
	        df = pd.DataFrame([self._journey_record(customer) for _, customer in rows])

	        print(f"✅ Generated journey data for {len(df):,} customers")
	        return df


	def main(output_dir='data/synthetic'):
	    """
	    Generate all enhanced datasets and write them to CSV files.

	    Builds the base customer and tower tables with ``TelecomDataGenerator``,
	    derives the enhanced tables from them with ``EnhancedTelecomDataGenerator``
	    and saves each table as a CSV file.

	    Args:
	        output_dir: Directory that receives the CSV files. It is created,
	            including missing parents, if it does not exist yet.
	    """
	    # Function-scope import: pathlib is only needed by this entry point.
	    from pathlib import Path

	    print("="*80)
	    print("ENHANCED TELECOMMUNICATIONS DATA GENERATOR")
	    print("="*80)

	    # First generate base data using original generator
	    from synthetic_data_generator import TelecomDataGenerator

	    base_gen = TelecomDataGenerator(num_customers=100000, num_towers=1000)

	    print("\n📊 Generating Base Data...")
	    customers_df = base_gen.generate_customer_demographics()
	    towers_df = base_gen.generate_network_infrastructure()

	    # Initialize enhanced generator
	    enhanced_gen = EnhancedTelecomDataGenerator(num_customers=len(customers_df))

	    # Generate enhanced data
	    device_df = enhanced_gen.generate_device_data(customers_df)
	    competitive_df = enhanced_gen.generate_competitive_intelligence()
	    external_data = enhanced_gen.generate_external_data(towers_df)
	    journey_df = enhanced_gen.generate_customer_journey_data(customers_df)

	    # Save all datasets
	    print("\n💾 Saving Enhanced Datasets...")

	    # to_csv does not create missing directories, so make sure the target exists.
	    out_dir = Path(output_dir)
	    out_dir.mkdir(parents=True, exist_ok=True)

	    # File name -> table, in the order the files are written.
	    datasets = {
	        'device_data.csv': device_df,
	        'competitive_intelligence.csv': competitive_df,
	        'weather_data.csv': external_data['weather'],
	        'events_data.csv': external_data['events'],
	        'demographics_data.csv': external_data['demographics'],
	        'customer_journey.csv': journey_df,
	    }
	    for filename, frame in datasets.items():
	        frame.to_csv(out_dir / filename, index=False)
	        print(f" ✅ Saved {filename}")

	    print("\n" + "="*80)
	    print("ENHANCED DATA GENERATION COMPLETE")
	    print("="*80)
	    print("\n📈 Summary:")
	    print(f" - Device Data: {len(device_df):,} records")
	    print(f" - Competitive Intelligence: {len(competitive_df):,} records")
	    print(f" - Weather Data: {len(external_data['weather']):,} records")
	    print(f" - Events Data: {len(external_data['events']):,} records")
	    print(f" - Demographics: {len(external_data['demographics']):,} cities")
	    print(f" - Customer Journey: {len(journey_df):,} customers")
	    print(f"\n✅ All enhanced datasets saved to '{out_dir.as_posix()}/' directory")


	# Run the full generation only when executed as a script, not on import.
	if __name__ == "__main__":
	    main()