"""Generate synthetic service, latency, traffic and placement datasets.

Running this script writes four CSV files into the ``data/`` directory:
``services.csv``, ``regional_latency.csv``, ``traffic_patterns.csv`` and
``service_placement.csv``.  Both ``random`` and ``numpy.random`` are seeded
with 42, and every random draw below comes from one of those two generators.
"""

import random
from datetime import datetime, timedelta
from pathlib import Path

import numpy as np
import pandas as pd
from faker import Faker

# Set random seed for reproducibility
np.random.seed(42)
random.seed(42)
# NOTE: `fake` is not used by any generator below; it is kept so the module
# still exposes the same names as before.
fake = Faker()

# Every CSV is written below this directory. It is created up front because
# DataFrame.to_csv does not create missing parent directories.
OUTPUT_DIR = Path('data')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# All time series start on the same day and cover the same horizon.
start_date = datetime(2024, 1, 1)
NUM_DAYS = 90  # 3 months

# Lower bound for same-region latency so a far-tail Gaussian draw can never
# produce a zero or negative measurement (cross-region values are clamped too).
MIN_SAME_REGION_LATENCY_MS = 0.1

print("Starting Data Generation...")

# ==================== PART 1: Generate Services ====================
print("\nGenerating Services Data...")

services_data = []
service_templates = [
    "auth", "cache", "database", "api", "notification",
    "search", "recommendation", "payment", "inventory", "profile",
    "order", "analytics", "logging", "metrics", "config",
    "gateway", "queue", "processor", "manager", "service",
    "worker", "scheduler", "validator", "router", "balancer",
]

# Generate 150 services by combining templates (6 rounds x 25 templates)
service_names = [
    f"{template}-service-{i+1}"
    for i in range(6)
    for template in service_templates
]

for i, name in enumerate(service_names, start=1):
    # The dict values are evaluated top to bottom, which fixes the order of
    # the random draws; keep the key order stable to keep output reproducible.
    services_data.append({
        'service_id': i,
        'service_name': name,
        'memory_mb': random.choice([256, 512, 1024, 2048, 4096]),
        'cpu_cores': random.choice([0.5, 1, 2, 4]),
        'latency_critical': random.choice([True, False]),
        'traffic_volume_rps': random.randint(1000, 100000),  # requests per second
        'dependencies': random.randint(0, 5),  # how many other services it depends on
    })

services_df = pd.DataFrame(services_data)
services_df.to_csv(OUTPUT_DIR / 'services.csv', index=False)
print(f"Generated {len(services_df)} services")
print(services_df.head())

# ==================== PART 2: Generate Regional Latency ====================
print("\nGenerating Regional Latency Data...")

regions = ['us-east-1', 'us-west-2', 'eu-west-1', 'ap-southeast-1', 'ap-northeast-1']
latency_data = []

# Create latency matrix (some regions are closer than others).
# Each unordered region pair is stored once as (min_ms, max_ms); the lookup
# below tries both orientations of the pair.
latency_matrix = {
    ('us-east-1', 'us-west-2'): (60, 80),
    ('us-east-1', 'eu-west-1'): (90, 110),
    ('us-east-1', 'ap-southeast-1'): (180, 220),
    ('us-east-1', 'ap-northeast-1'): (150, 190),
    ('us-west-2', 'eu-west-1'): (130, 160),
    ('us-west-2', 'ap-southeast-1'): (140, 170),
    ('us-west-2', 'ap-northeast-1'): (110, 140),
    ('eu-west-1', 'ap-southeast-1'): (200, 250),
    ('eu-west-1', 'ap-northeast-1'): (180, 230),
    ('ap-southeast-1', 'ap-northeast-1'): (50, 80),
}

# Generate latency measurements over time: one per ordered region pair per day
for day in range(NUM_DAYS):
    timestamp = start_date + timedelta(days=day)
    for region1 in regions:
        for region2 in regions:
            if region1 == region2:
                # same region: ~2ms, floored so it is always positive
                latency = max(random.gauss(2, 0.5), MIN_SAME_REGION_LATENCY_MS)
            else:
                bounds = latency_matrix.get((region1, region2))
                if bounds is None:
                    bounds = latency_matrix.get((region2, region1))
                if bounds is None:
                    # Pair not described in the matrix: emit no measurement.
                    continue
                min_lat, max_lat = bounds
                base_latency = np.random.uniform(min_lat, max_lat)
                # Add some noise, then ensure the value stays positive
                latency = max(base_latency + random.gauss(0, 5), 1)

            latency_data.append({
                'region1': region1,
                'region2': region2,
                'latency_ms': latency,
                'timestamp': timestamp,
            })

latency_df = pd.DataFrame(latency_data)
latency_df.to_csv(OUTPUT_DIR / 'regional_latency.csv', index=False)
print(f"Generated {len(latency_df)} latency measurements")
print(latency_df.head())

# ==================== PART 3: Generate Traffic Patterns ====================
print("\nGenerating Traffic Patterns...")

traffic_data = []

# Different regions have different traffic volumes
region_traffic_factor = {
    'us-east-1': 1.0,
    'us-west-2': 0.8,
    'eu-west-1': 0.6,
    'ap-southeast-1': 0.5,
    'ap-northeast-1': 0.4,
}

# Extract the two fields the inner loop needs once, instead of iterating the
# DataFrame row by row for every hour of every day.
service_base_traffic = [
    (svc['service_id'], svc['traffic_volume_rps']) for svc in services_data
]

for day in range(NUM_DAYS):
    for hour in range(24):
        timestamp = start_date + timedelta(days=day, hours=hour)

        # Peak hours are 9-17 (business hours)
        if 9 <= hour <= 17:
            traffic_multiplier = random.uniform(1.5, 2.5)
        elif hour >= 22 or hour <= 6:
            traffic_multiplier = random.uniform(0.2, 0.5)  # low traffic at night
        else:
            traffic_multiplier = random.uniform(0.8, 1.2)

        # Weekend traffic is lower
        if timestamp.weekday() >= 5:  # Saturday = 5, Sunday = 6
            traffic_multiplier *= 0.7

        for service_id, base_traffic in service_base_traffic:
            for region in regions:
                # Multiplication order is kept fixed so floating-point
                # results do not depend on how the factors are grouped.
                requests = int(
                    base_traffic * traffic_multiplier * region_traffic_factor[region]
                )
                traffic_data.append({
                    'service_id': service_id,
                    'region': region,
                    'hour': hour,
                    'requests': requests,
                    'timestamp': timestamp,
                })

traffic_df = pd.DataFrame(traffic_data)
traffic_df.to_csv(OUTPUT_DIR / 'traffic_patterns.csv', index=False)
print(f"Generated {len(traffic_df)} traffic records")
print(traffic_df.head())

# ==================== PART 4: Generate Placement History ====================
print("\nGenerating Service Placement History...")

placement_data = []

for day in range(NUM_DAYS):
    timestamp = start_date + timedelta(days=day)

    # Iterate the service records directly (they are in service_id order)
    # rather than filtering the DataFrame once per service per day.
    for svc in services_data:
        # Latency critical services are usually in fewer regions
        if svc['latency_critical']:
            num_regions = random.choice([1, 2])
        else:
            num_regions = random.choice([2, 3, 4])

        for region in random.sample(regions, num_regions):
            placement_data.append({
                'service_id': svc['service_id'],
                'region': region,
                'timestamp': timestamp,
                'instances': random.randint(1, 5),
                'avg_latency_ms': random.uniform(5, 100),
                'error_rate': random.uniform(0, 0.05),
            })

placement_df = pd.DataFrame(placement_data)
placement_df.to_csv(OUTPUT_DIR / 'service_placement.csv', index=False)
print(f"Generated {len(placement_df)} placement records")
print(placement_df.head())

# ==================== Summary ====================
print("\n" + "="*50)
print("ALL DATA GENERATED SUCCESSFULLY!")
print("="*50)
print("\nFiles created in 'data/' folder:")
print(f" • services.csv ({len(services_df)} rows)")
print(f" • regional_latency.csv ({len(latency_df)} rows)")
print(f" • traffic_patterns.csv ({len(traffic_df)} rows)")
print(f" • service_placement.csv ({len(placement_df)} rows)")

total_records = len(services_df) + len(latency_df) + len(traffic_df) + len(placement_df)
print(f"\nTotal records generated: {total_records:,}")