Spaces:
Sleeping
Sleeping
"""
Enhanced Telecommunications Data Generator - Complete Implementation
Implements all data sources from Technical Requirements Document
"""
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from faker import Faker
import random
from tqdm import tqdm

# Seed every RNG used in this module (NumPy, stdlib random, Faker) so that
# repeated runs produce byte-identical synthetic datasets.
np.random.seed(42)
random.seed(42)
fake = Faker()
Faker.seed(42)
class EnhancedTelecomDataGenerator:
    """
    Comprehensive data generator implementing ALL technical requirements:
    - Device Data (OS, apps, performance)
    - Competitive Intelligence
    - External Data (weather, events, demographics)
    - Enhanced network metrics
    - Customer journey analytics

    All randomness comes from the module-level seeded ``random`` / ``numpy``
    / ``Faker`` instances, so output is reproducible across runs.
    """

    def __init__(self, num_customers: int = 100000):
        """Set up static lookup tables and the simulated date window.

        Args:
            num_customers: Intended customer count (informational only;
                per-method sizes come from the DataFrames passed in).
        """
        self.num_customers = num_customers
        # Three-year simulation window shared by every generator method.
        self.start_date = pd.to_datetime('2022-01-01')
        self.end_date = pd.to_datetime('2024-12-31')
        # Device configurations: plausible OS versions per platform.
        self.os_versions = {
            'iOS': ['16.0', '16.1', '16.2', '17.0', '17.1', '17.2'],
            'Android': ['12', '13', '14'],
        }
        self.popular_apps = [
            'WhatsApp', 'Facebook', 'Instagram', 'YouTube', 'TikTok',
            'Netflix', 'Spotify', 'Gmail', 'Google Maps', 'Twitter',
            'Snapchat', 'LinkedIn', 'Uber', 'Amazon', 'Zoom'
        ]
        # Competitor data
        self.competitors = ['Verizon', 'AT&T', 'T-Mobile', 'Sprint']
        # Weather conditions
        self.weather_conditions = ['Clear', 'Cloudy', 'Rainy', 'Stormy', 'Snowy', 'Foggy']
        # Event types
        self.event_types = ['Concert', 'Sports', 'Festival', 'Convention', 'Holiday']
        print(f"Initialized Enhanced Data Generator for {num_customers:,} customers")

    def generate_device_data(self, customers_df: pd.DataFrame) -> pd.DataFrame:
        """Generate one row of device/performance metrics per customer.

        Covers OS version & update recency, app usage, battery/storage
        health, and synthetic performance counters.

        Args:
            customers_df: Must contain columns ``customer_id``,
                ``device_manufacturer`` and ``device_age_months``.

        Returns:
            DataFrame with one device record per input customer.
        """
        print("\nGenerating Device Performance Data...")
        device_data = []
        for _, customer in tqdm(customers_df.iterrows(), total=len(customers_df), desc="Device data"):
            # Derive the OS family from the handset manufacturer.
            manufacturer = customer['device_manufacturer']
            if manufacturer == 'Apple':
                os_type = 'iOS'
                os_version = random.choice(self.os_versions['iOS'])
            else:
                os_type = 'Android'
                os_version = random.choice(self.os_versions['Android'])
            # App usage: each customer installs 5-12 distinct popular apps.
            num_apps = random.randint(5, 12)
            user_apps = random.sample(self.popular_apps, num_apps)
            device = {
                'customer_id': customer['customer_id'],
                'os_type': os_type,
                'os_version': os_version,
                # Last OS update some time in the final 180 days of the window.
                'os_last_updated': (self.end_date - timedelta(days=random.randint(0, 180))).date(),
                'storage_total_gb': random.choice([64, 128, 256, 512]),
                'storage_used_pct': random.uniform(40, 95),
                # Battery degrades ~1.5%/month of device age, floored at 70%.
                'battery_health_pct': max(70, 100 - customer['device_age_months'] * 1.5),
                'avg_battery_drain_pct_per_hour': random.uniform(3, 15),
                'apps_installed': len(user_apps),
                'top_apps': ','.join(user_apps[:5]),
                'avg_daily_screen_time_hours': random.uniform(2, 8),
                'data_saver_enabled': random.random() < 0.3,
                'background_data_restricted': random.random() < 0.25,
                'wifi_calling_enabled': random.random() < 0.6,
                'volte_enabled': random.random() < 0.8,
                '5g_enabled': random.random() < 0.7,
                'device_temperature_avg_celsius': random.uniform(25, 40),
                'crash_count_last_month': np.random.poisson(1),
                'avg_app_load_time_sec': random.uniform(0.5, 3.0),
                'memory_pressure_high_pct': random.uniform(5, 40),
            }
            device_data.append(device)
        df = pd.DataFrame(device_data)
        print(f"[OK] Generated device data for {len(df):,} customers")
        return df

    def generate_competitive_intelligence(self) -> pd.DataFrame:
        """Generate monthly market-intelligence records per competitor.

        Covers competitor pricing (base/unlimited/family tiers derived from
        one sampled base price), promotions, market share, network quality
        and acquisition metrics.

        Returns:
            DataFrame with one row per (month, competitor) pair.
        """
        print("\nGenerating Competitive Intelligence Data...")
        # 'MS' = month-start frequency: one snapshot per calendar month.
        months = pd.date_range(start=self.start_date, end=self.end_date, freq='MS')
        market_data = []
        for month in tqdm(months, desc="Market analysis"):
            for competitor in self.competitors:
                base_price = random.uniform(40, 120)
                promo_active = random.random() < 0.3
                market_entry = {
                    'month': month.date(),
                    'competitor': competitor,
                    'base_plan_price': round(base_price, 2),
                    # Higher tiers are fixed multiples of the base price.
                    'unlimited_plan_price': round(base_price * 1.8, 2),
                    'family_plan_price': round(base_price * 2.5, 2),
                    'promotion_active': promo_active,
                    'promotion_discount_pct': random.uniform(10, 30) if promo_active else 0,
                    'market_share_pct': random.uniform(15, 30),
                    'customer_satisfaction_score': random.uniform(3.5, 4.8),
                    'network_quality_score': random.uniform(7, 9.5),
                    '5g_coverage_pct': random.uniform(40, 85),
                    'avg_download_speed_mbps': random.uniform(50, 300),
                    'churn_rate_pct': random.uniform(1.5, 3.5),
                    'new_customer_acquisitions': random.randint(50000, 200000),
                    'advertising_spend_millions': random.uniform(5, 25),
                }
                market_data.append(market_entry)
        df = pd.DataFrame(market_data)
        print(f"[OK] Generated {len(df):,} market intelligence records")
        return df

    def generate_external_data(self, towers_df: pd.DataFrame) -> dict:
        """Generate external data sources keyed by name.

        Produces three datasets:
        - ``weather``: daily conditions for the last 90 days of the window,
          for a sample of up to 200 towers.
        - ``events``: 500 local events spread across the full window.
        - ``demographics``: one census-style record per distinct city.

        Args:
            towers_df: Must contain columns ``tower_id`` and ``city``.

        Returns:
            Dict with keys ``'weather'``, ``'events'``, ``'demographics'``.
        """
        print("\nGenerating External Data Sources...")
        # Weather data (daily by tower location)
        print("   Generating weather data...")
        weather_data = []
        dates = pd.date_range(start=self.end_date - timedelta(days=90), end=self.end_date, freq='D')
        # Cap the weather feed at 200 towers to bound output size.
        sample_towers = towers_df.sample(n=min(200, len(towers_df)), random_state=42)
        for _, tower in tqdm(sample_towers.iterrows(), total=len(sample_towers), desc="Weather"):
            for date in dates:
                weather = {
                    'date': date.date(),
                    'tower_id': tower['tower_id'],
                    'city': tower['city'],
                    'temperature_celsius': random.uniform(-10, 35),
                    'humidity_pct': random.uniform(30, 90),
                    # Exponential gives mostly-dry days with occasional heavy rain.
                    'precipitation_mm': max(0, np.random.exponential(2)),
                    'wind_speed_kmh': random.uniform(5, 50),
                    'condition': random.choice(self.weather_conditions),
                    'severe_weather': random.random() < 0.05,
                }
                weather_data.append(weather)
        weather_df = pd.DataFrame(weather_data)
        # Events data
        print("   Generating events data...")
        events_data = []
        num_events = 500
        # Hoist the distinct-city computation out of the 500-event loop.
        cities = towers_df['city'].unique()
        for i in range(num_events):
            event_date = fake.date_between(start_date=self.start_date, end_date=self.end_date)
            event = {
                'event_id': f'EVT{i+1:05d}',
                'event_name': f'{random.choice(self.event_types)} {i+1}',
                'event_type': random.choice(self.event_types),
                'event_date': event_date,
                'city': random.choice(cities),
                'expected_attendance': random.randint(1000, 100000),
                'duration_hours': random.randint(2, 48),
            }
            events_data.append(event)
        events_df = pd.DataFrame(events_data)
        # Demographics/Census data by city
        print("   Generating demographic data...")
        demographics_data = []
        for city in cities:
            demo = {
                'city': city,
                'population': random.randint(100000, 5000000),
                'median_age': random.uniform(30, 45),
                'median_income': random.randint(40000, 100000),
                'unemployment_rate_pct': random.uniform(3, 8),
                'college_educated_pct': random.uniform(25, 60),
                'homeownership_rate_pct': random.uniform(40, 70),
                'population_density_per_sqkm': random.randint(100, 10000),
                'urban_classification': random.choice(['Urban', 'Suburban', 'Rural']),
            }
            demographics_data.append(demo)
        demographics_df = pd.DataFrame(demographics_data)
        print(f"[OK] Weather: {len(weather_df):,} records")
        print(f"[OK] Events: {len(events_df):,} records")
        print(f"[OK] Demographics: {len(demographics_df):,} cities")
        return {
            'weather': weather_df,
            'events': events_df,
            'demographics': demographics_df
        }

    def generate_customer_journey_data(self, customers_df: pd.DataFrame) -> pd.DataFrame:
        """Generate one customer-journey analytics row per customer.

        Covers lifecycle stage (derived from tenure), engagement,
        payment-behavior risk, segmentation and interaction metrics.

        Args:
            customers_df: Must contain columns ``customer_id`` and
                ``tenure_months``; ``referral_count`` is optional
                (defaults to 0 when absent).

        Returns:
            DataFrame with one journey record per input customer.
        """
        print("\nGenerating Customer Journey Data...")
        journey_data = []
        for _, customer in tqdm(customers_df.iterrows(), total=len(customers_df), desc="Customer journeys"):
            # Lifecycle stage thresholds: <3, <12, <36, else Tenured.
            tenure = customer['tenure_months']
            if tenure < 3:
                lifecycle_stage = 'New'
                engagement_score = random.uniform(6, 9)
            elif tenure < 12:
                lifecycle_stage = 'Growing'
                engagement_score = random.uniform(7, 10)
            elif tenure < 36:
                lifecycle_stage = 'Mature'
                engagement_score = random.uniform(5, 9)
            else:
                lifecycle_stage = 'Tenured'
                engagement_score = random.uniform(4, 8)
            # Payment behavior: score >7 => Low risk, >4 => Medium, else High.
            payment_score = random.uniform(1, 10)
            late_payment_risk = 'Low' if payment_score > 7 else ('Medium' if payment_score > 4 else 'High')
            journey = {
                'customer_id': customer['customer_id'],
                'lifecycle_stage': lifecycle_stage,
                'engagement_score': round(engagement_score, 2),
                'value_segment': random.choice(['High Value', 'Medium Value', 'Low Value']),
                'loyalty_tier': random.choice(['Bronze', 'Silver', 'Gold', 'Platinum']),
                'payment_behavior_score': round(payment_score, 2),
                'late_payment_risk': late_payment_risk,
                # Longer-tenured customers accumulate more interactions.
                'total_interactions': np.random.poisson(tenure * 0.3),
                'positive_interactions_pct': random.uniform(60, 95),
                'nps_score': random.randint(-100, 100),
                'referrals_made': customer.get('referral_count', 0),
                'upsell_opportunities': np.random.poisson(2),
                'cross_sell_score': random.uniform(0, 10),
                'reactivation_risk': random.uniform(0, 1),
                'social_influence_score': random.uniform(0, 10),
            }
            journey_data.append(journey)
        df = pd.DataFrame(journey_data)
        print(f"[OK] Generated journey data for {len(df):,} customers")
        return df
def main():
    """Generate all enhanced datasets and write them to ``data/synthetic/``.

    Builds base customer/tower data with the original generator, derives
    every enhanced dataset, saves each to CSV, and prints a summary.
    """
    from pathlib import Path

    print("=" * 80)
    print("ENHANCED TELECOMMUNICATIONS DATA GENERATOR")
    print("=" * 80)
    # First generate base data using the original generator.
    from synthetic_data_generator import TelecomDataGenerator
    base_gen = TelecomDataGenerator(num_customers=100000, num_towers=1000)
    print("\nGenerating Base Data...")
    customers_df = base_gen.generate_customer_demographics()
    towers_df = base_gen.generate_network_infrastructure()
    # Initialize enhanced generator sized to the base population.
    enhanced_gen = EnhancedTelecomDataGenerator(num_customers=len(customers_df))
    # Generate enhanced data
    device_df = enhanced_gen.generate_device_data(customers_df)
    competitive_df = enhanced_gen.generate_competitive_intelligence()
    external_data = enhanced_gen.generate_external_data(towers_df)
    journey_df = enhanced_gen.generate_customer_journey_data(customers_df)
    # Ensure the output directory exists before writing; previously the
    # to_csv calls crashed with FileNotFoundError on a fresh checkout.
    out_dir = Path('data/synthetic')
    out_dir.mkdir(parents=True, exist_ok=True)
    print("\nSaving Enhanced Datasets...")
    outputs = [
        (device_df, 'device_data.csv'),
        (competitive_df, 'competitive_intelligence.csv'),
        (external_data['weather'], 'weather_data.csv'),
        (external_data['events'], 'events_data.csv'),
        (external_data['demographics'], 'demographics_data.csv'),
        (journey_df, 'customer_journey.csv'),
    ]
    for df, filename in outputs:
        df.to_csv(out_dir / filename, index=False)
        print(f"   [OK] Saved {filename}")
    print("\n" + "=" * 80)
    print("ENHANCED DATA GENERATION COMPLETE")
    print("=" * 80)
    print("\nSummary:")
    print(f"   - Device Data: {len(device_df):,} records")
    print(f"   - Competitive Intelligence: {len(competitive_df):,} records")
    print(f"   - Weather Data: {len(external_data['weather']):,} records")
    print(f"   - Events Data: {len(external_data['events']):,} records")
    print(f"   - Demographics: {len(external_data['demographics']):,} cities")
    print(f"   - Customer Journey: {len(journey_df):,} customers")
    print("\n[OK] All enhanced datasets saved to 'data/synthetic/' directory")


if __name__ == "__main__":
    main()