""" Enhanced Telecommunications Data Generator - Complete Implementation Implements all data sources from Technical Requirements Document """ import pandas as pd import numpy as np from datetime import datetime, timedelta from faker import Faker import random from tqdm import tqdm np.random.seed(42) random.seed(42) fake = Faker() Faker.seed(42) class EnhancedTelecomDataGenerator: """ Comprehensive data generator implementing ALL technical requirements: - Device Data (OS, apps, performance) - Competitive Intelligence - External Data (weather, events, demographics) - Enhanced network metrics - Customer journey analytics """ def __init__(self, num_customers=100000): self.num_customers = num_customers self.start_date = pd.to_datetime('2022-01-01') self.end_date = pd.to_datetime('2024-12-31') # Device configurations self.os_versions = { 'iOS': ['16.0', '16.1', '16.2', '17.0', '17.1', '17.2'], 'Android': ['12', '13', '14'], } self.popular_apps = [ 'WhatsApp', 'Facebook', 'Instagram', 'YouTube', 'TikTok', 'Netflix', 'Spotify', 'Gmail', 'Google Maps', 'Twitter', 'Snapchat', 'LinkedIn', 'Uber', 'Amazon', 'Zoom' ] # Competitor data self.competitors = ['Verizon', 'AT&T', 'T-Mobile', 'Sprint'] # Weather conditions self.weather_conditions = ['Clear', 'Cloudy', 'Rainy', 'Stormy', 'Snowy', 'Foggy'] # Event types self.event_types = ['Concert', 'Sports', 'Festival', 'Convention', 'Holiday'] print(f"Initialized Enhanced Data Generator for {num_customers:,} customers") def generate_device_data(self, customers_df): """ Generate comprehensive device data: - OS version & update history - App usage patterns - Device performance metrics - Battery health & storage """ print("\nšŸ“± Generating Device Performance Data...") device_data = [] for _, customer in tqdm(customers_df.iterrows(), total=len(customers_df), desc="Device data"): # Determine OS from device manufacturer manufacturer = customer['device_manufacturer'] if manufacturer == 'Apple': os_type = 'iOS' os_version = random.choice(self.os_versions['iOS']) else: os_type = 'Android' os_version = random.choice(self.os_versions['Android']) # App usage (select 5-12 apps per customer) num_apps = random.randint(5, 12) user_apps = random.sample(self.popular_apps, num_apps) # Performance metrics device = { 'customer_id': customer['customer_id'], 'os_type': os_type, 'os_version': os_version, 'os_last_updated': (self.end_date - timedelta(days=random.randint(0, 180))).date(), 'storage_total_gb': random.choice([64, 128, 256, 512]), 'storage_used_pct': random.uniform(40, 95), 'battery_health_pct': max(70, 100 - customer['device_age_months'] * 1.5), 'avg_battery_drain_pct_per_hour': random.uniform(3, 15), 'apps_installed': len(user_apps), 'top_apps': ','.join(user_apps[:5]), 'avg_daily_screen_time_hours': random.uniform(2, 8), 'data_saver_enabled': random.random() < 0.3, 'background_data_restricted': random.random() < 0.25, 'wifi_calling_enabled': random.random() < 0.6, 'volte_enabled': random.random() < 0.8, '5g_enabled': random.random() < 0.7, 'device_temperature_avg_celsius': random.uniform(25, 40), 'crash_count_last_month': np.random.poisson(1), 'avg_app_load_time_sec': random.uniform(0.5, 3.0), 'memory_pressure_high_pct': random.uniform(5, 40), } device_data.append(device) df = pd.DataFrame(device_data) print(f"āœ… Generated device data for {len(df):,} customers") return df def generate_competitive_intelligence(self): """ Generate market intelligence data: - Competitor pricing - Market share trends - Promotional campaigns - Customer migration patterns """ print("\nšŸ¢ Generating Competitive Intelligence Data...") months = pd.date_range(start=self.start_date, end=self.end_date, freq='MS') market_data = [] for month in tqdm(months, desc="Market analysis"): for competitor in self.competitors: # Pricing data base_price = random.uniform(40, 120) promo_active = random.random() < 0.3 market_entry = { 'month': month.date(), 'competitor': competitor, 'base_plan_price': round(base_price, 2), 'unlimited_plan_price': round(base_price * 1.8, 2), 'family_plan_price': round(base_price * 2.5, 2), 'promotion_active': promo_active, 'promotion_discount_pct': random.uniform(10, 30) if promo_active else 0, 'market_share_pct': random.uniform(15, 30), 'customer_satisfaction_score': random.uniform(3.5, 4.8), 'network_quality_score': random.uniform(7, 9.5), '5g_coverage_pct': random.uniform(40, 85), 'avg_download_speed_mbps': random.uniform(50, 300), 'churn_rate_pct': random.uniform(1.5, 3.5), 'new_customer_acquisitions': random.randint(50000, 200000), 'advertising_spend_millions': random.uniform(5, 25), } market_data.append(market_entry) df = pd.DataFrame(market_data) print(f"āœ… Generated {len(df):,} market intelligence records") return df def generate_external_data(self, towers_df): """ Generate external data sources: - Weather conditions by location - Local events - Demographic/census data - Economic indicators """ print("\nšŸŒ Generating External Data Sources...") # Weather data (daily by tower location) print(" Generating weather data...") weather_data = [] dates = pd.date_range(start=self.end_date - timedelta(days=90), end=self.end_date, freq='D') # Sample subset of towers for weather sample_towers = towers_df.sample(n=min(200, len(towers_df)), random_state=42) for tower_id, tower in tqdm(sample_towers.iterrows(), total=len(sample_towers), desc="Weather"): for date in dates: weather = { 'date': date.date(), 'tower_id': tower['tower_id'], 'city': tower['city'], 'temperature_celsius': random.uniform(-10, 35), 'humidity_pct': random.uniform(30, 90), 'precipitation_mm': max(0, np.random.exponential(2)), 'wind_speed_kmh': random.uniform(5, 50), 'condition': random.choice(self.weather_conditions), 'severe_weather': random.random() < 0.05, } weather_data.append(weather) weather_df = pd.DataFrame(weather_data) # Events data print(" Generating events data...") events_data = [] num_events = 500 for i in range(num_events): event_date = fake.date_between(start_date=self.start_date, end_date=self.end_date) event = { 'event_id': f'EVT{i+1:05d}', 'event_name': f'{random.choice(self.event_types)} {i+1}', 'event_type': random.choice(self.event_types), 'event_date': event_date, 'city': random.choice(towers_df['city'].unique()), 'expected_attendance': random.randint(1000, 100000), 'duration_hours': random.randint(2, 48), } events_data.append(event) events_df = pd.DataFrame(events_data) # Demographics/Census data by city print(" Generating demographic data...") cities = towers_df['city'].unique() demographics_data = [] for city in cities: demo = { 'city': city, 'population': random.randint(100000, 5000000), 'median_age': random.uniform(30, 45), 'median_income': random.randint(40000, 100000), 'unemployment_rate_pct': random.uniform(3, 8), 'college_educated_pct': random.uniform(25, 60), 'homeownership_rate_pct': random.uniform(40, 70), 'population_density_per_sqkm': random.randint(100, 10000), 'urban_classification': random.choice(['Urban', 'Suburban', 'Rural']), } demographics_data.append(demo) demographics_df = pd.DataFrame(demographics_data) print(f"āœ… Weather: {len(weather_df):,} records") print(f"āœ… Events: {len(events_df):,} records") print(f"āœ… Demographics: {len(demographics_df):,} cities") return { 'weather': weather_df, 'events': events_df, 'demographics': demographics_df } def generate_customer_journey_data(self, customers_df): """ Generate customer journey analytics: - Lifecycle stages - Service interaction history - Payment behavior patterns - Customer segmentation """ print("\nšŸ‘¤ Generating Customer Journey Data...") journey_data = [] for _, customer in tqdm(customers_df.iterrows(), total=len(customers_df), desc="Customer journeys"): # Lifecycle stage based on tenure tenure = customer['tenure_months'] if tenure < 3: lifecycle_stage = 'New' engagement_score = random.uniform(6, 9) elif tenure < 12: lifecycle_stage = 'Growing' engagement_score = random.uniform(7, 10) elif tenure < 36: lifecycle_stage = 'Mature' engagement_score = random.uniform(5, 9) else: lifecycle_stage = 'Tenured' engagement_score = random.uniform(4, 8) # Payment behavior payment_score = random.uniform(1, 10) late_payment_risk = 'Low' if payment_score > 7 else ('Medium' if payment_score > 4 else 'High') journey = { 'customer_id': customer['customer_id'], 'lifecycle_stage': lifecycle_stage, 'engagement_score': round(engagement_score, 2), 'value_segment': random.choice(['High Value', 'Medium Value', 'Low Value']), 'loyalty_tier': random.choice(['Bronze', 'Silver', 'Gold', 'Platinum']), 'payment_behavior_score': round(payment_score, 2), 'late_payment_risk': late_payment_risk, 'total_interactions': np.random.poisson(tenure * 0.3), 'positive_interactions_pct': random.uniform(60, 95), 'nps_score': random.randint(-100, 100), 'referrals_made': customer.get('referral_count', 0), 'upsell_opportunities': np.random.poisson(2), 'cross_sell_score': random.uniform(0, 10), 'reactivation_risk': random.uniform(0, 1), 'social_influence_score': random.uniform(0, 10), } journey_data.append(journey) df = pd.DataFrame(journey_data) print(f"āœ… Generated journey data for {len(df):,} customers") return df def main(): """Generate all enhanced datasets""" print("="*80) print("ENHANCED TELECOMMUNICATIONS DATA GENERATOR") print("="*80) # First generate base data using original generator from synthetic_data_generator import TelecomDataGenerator base_gen = TelecomDataGenerator(num_customers=100000, num_towers=1000) print("\nšŸ“Š Generating Base Data...") customers_df = base_gen.generate_customer_demographics() towers_df = base_gen.generate_network_infrastructure() # Initialize enhanced generator enhanced_gen = EnhancedTelecomDataGenerator(num_customers=len(customers_df)) # Generate enhanced data device_df = enhanced_gen.generate_device_data(customers_df) competitive_df = enhanced_gen.generate_competitive_intelligence() external_data = enhanced_gen.generate_external_data(towers_df) journey_df = enhanced_gen.generate_customer_journey_data(customers_df) # Save all datasets print("\nšŸ’¾ Saving Enhanced Datasets...") device_df.to_csv('data/synthetic/device_data.csv', index=False) print(" āœ… Saved device_data.csv") competitive_df.to_csv('data/synthetic/competitive_intelligence.csv', index=False) print(" āœ… Saved competitive_intelligence.csv") external_data['weather'].to_csv('data/synthetic/weather_data.csv', index=False) print(" āœ… Saved weather_data.csv") external_data['events'].to_csv('data/synthetic/events_data.csv', index=False) print(" āœ… Saved events_data.csv") external_data['demographics'].to_csv('data/synthetic/demographics_data.csv', index=False) print(" āœ… Saved demographics_data.csv") journey_df.to_csv('data/synthetic/customer_journey.csv', index=False) print(" āœ… Saved customer_journey.csv") print("\n" + "="*80) print("ENHANCED DATA GENERATION COMPLETE") print("="*80) print(f"\nšŸ“ˆ Summary:") print(f" - Device Data: {len(device_df):,} records") print(f" - Competitive Intelligence: {len(competitive_df):,} records") print(f" - Weather Data: {len(external_data['weather']):,} records") print(f" - Events Data: {len(external_data['events']):,} records") print(f" - Demographics: {len(external_data['demographics']):,} cities") print(f" - Customer Journey: {len(journey_df):,} customers") print("\nāœ… All enhanced datasets saved to 'data/synthetic/' directory") if __name__ == "__main__": main()