Spaces:
Sleeping
Sleeping
File size: 7,771 Bytes
035d781 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 |
import os
import random
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from faker import Faker
# Pin both random sources used by this script (the stdlib `random` module and
# NumPy's global generator) to one seed so repeated runs draw the same values.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
# Faker instance; it is not seeded here and the generation code visible in
# this file never references it.
fake = Faker()
print("Starting Data Generation...")
# ==================== PART 1: Generate Services ====================
# Builds the service catalogue: one row per synthetic service with randomly
# drawn resource and traffic attributes, then writes it to data/services.csv.
print("\nGenerating Services Data...")
service_templates = [
    "auth", "cache", "database", "api", "notification",
    "search", "recommendation", "payment", "inventory", "profile",
    "order", "analytics", "logging", "metrics", "config",
    "gateway", "queue", "processor", "manager", "service",
    "worker", "scheduler", "validator", "router", "balancer",
]
# Generate 150 services by combining templates: 6 numbered copies of each of
# the 25 templates, ordered by copy number first (all "-1" names, then "-2", ...).
service_names = [
    f"{template}-service-{copy_no}"
    for copy_no in range(1, 7)
    for template in service_templates
]
# service_id is 1-based and follows the order of service_names. The dict
# entries are evaluated top to bottom, so the order of random draws per
# service is fixed: memory, cpu, latency flag, traffic, dependencies.
services_data = [
    {
        'service_id': service_id,
        'service_name': name,
        'memory_mb': random.choice([256, 512, 1024, 2048, 4096]),
        'cpu_cores': random.choice([0.5, 1, 2, 4]),
        'latency_critical': random.choice([True, False]),
        'traffic_volume_rps': random.randint(1000, 100000),  # requests per second
        'dependencies': random.randint(0, 5),  # how many other services it depends on
    }
    for service_id, name in enumerate(service_names, start=1)
]
services_df = pd.DataFrame(services_data)
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (no-op when it is already there).
os.makedirs('data', exist_ok=True)
services_df.to_csv('data/services.csv', index=False)
print(f"Generated {len(services_df)} services")
print(services_df.head())
# ==================== PART 2: Generate Regional Latency ====================
# Produces one latency sample per ordered region pair per day (including each
# region paired with itself) and writes them to data/regional_latency.csv.
print("\nGenerating Regional Latency Data...")
regions = ['us-east-1', 'us-west-2', 'eu-west-1', 'ap-southeast-1', 'ap-northeast-1']
latency_data = []
# Create latency matrix (some regions are closer than others).
# Each unordered pair is stored once as (min_ms, max_ms); the lookup below
# tries both key orders, so (a, b) and (b, a) share the same range.
latency_matrix = {
    ('us-east-1', 'us-west-2'): (60, 80),
    ('us-east-1', 'eu-west-1'): (90, 110),
    ('us-east-1', 'ap-southeast-1'): (180, 220),
    ('us-east-1', 'ap-northeast-1'): (150, 190),
    ('us-west-2', 'eu-west-1'): (130, 160),
    ('us-west-2', 'ap-southeast-1'): (140, 170),
    ('us-west-2', 'ap-northeast-1'): (110, 140),
    ('eu-west-1', 'ap-southeast-1'): (200, 250),
    ('eu-west-1', 'ap-northeast-1'): (180, 230),
    ('ap-southeast-1', 'ap-northeast-1'): (50, 80),
}
# Lower bound for same-region samples. gauss(2, 0.5) is unbounded, so without
# a floor a rare draw could yield a zero or negative latency.
SAME_REGION_MIN_LATENCY_MS = 0.1
# Generate latency measurements over time
start_date = datetime(2024, 1, 1)
for days in range(90):  # 3 months
    timestamp = start_date + timedelta(days=days)
    for region1 in regions:
        for region2 in regions:
            if region1 == region2:
                # same region: ~2ms
                latency_ms = max(random.gauss(2, 0.5), SAME_REGION_MIN_LATENCY_MS)
            else:
                bounds = latency_matrix.get(
                    (region1, region2),
                    latency_matrix.get((region2, region1)),
                )
                if bounds is None:
                    # Pair not described by the matrix in either order: no sample.
                    continue
                min_lat, max_lat = bounds
                base_latency = np.random.uniform(min_lat, max_lat)
                # Add some noise, then ensure the result stays positive
                latency_ms = max(base_latency + random.gauss(0, 5), 1)
            latency_data.append({
                'region1': region1,
                'region2': region2,
                'latency_ms': latency_ms,
                'timestamp': timestamp,
            })
latency_df = pd.DataFrame(latency_data)
# to_csv does not create missing parent directories; ensure the folder exists.
os.makedirs('data', exist_ok=True)
latency_df.to_csv('data/regional_latency.csv', index=False)
print(f"Generated {len(latency_df)} latency measurements")
print(latency_df.head())
# ==================== PART 3: Generate Traffic Patterns ====================
# Produces one request count per (hour, service, region) over 90 days and
# writes the result to data/traffic_patterns.csv.
print("\nGenerating Traffic Patterns...")
traffic_data = []
start_date = datetime(2024, 1, 1)
# Different regions have different traffic volumes. The table is constant, so
# it is built once here instead of once per generated record.
region_factors = {
    'us-east-1': 1.0,
    'us-west-2': 0.8,
    'eu-west-1': 0.6,
    'ap-southeast-1': 0.5,
    'ap-northeast-1': 0.4,
}
# (service_id, base requests-per-second) pairs extracted once. Reading the
# real service_id column avoids assuming the DataFrame index is 0..n-1, and
# avoids re-running iterrows() for every simulated hour.
service_base_traffic = list(zip(
    services_df['service_id'].tolist(),
    services_df['traffic_volume_rps'].tolist(),
))
for days in range(90):  # 3 months
    for hour in range(24):
        timestamp = start_date + timedelta(days=days, hours=hour)
        # Peak hours are 9-17 (business hours)
        hour_of_day = timestamp.hour
        if 9 <= hour_of_day <= 17:
            traffic_multiplier = random.uniform(1.5, 2.5)
        elif 22 <= hour_of_day or hour_of_day <= 6:
            traffic_multiplier = random.uniform(0.2, 0.5)  # low traffic at night
        else:
            traffic_multiplier = random.uniform(0.8, 1.2)
        # Weekend traffic is lower
        if timestamp.weekday() >= 5:  # Saturday = 5, Sunday = 6
            traffic_multiplier *= 0.7
        # One multiplier is drawn per hour and shared by every service/region.
        for service_id, base_traffic in service_base_traffic:
            for region in regions:
                requests = int(base_traffic * traffic_multiplier * region_factors[region])
                traffic_data.append({
                    'service_id': service_id,
                    'region': region,
                    'hour': hour,
                    'requests': requests,
                    'timestamp': timestamp,
                })
traffic_df = pd.DataFrame(traffic_data)
# to_csv does not create missing parent directories; ensure the folder exists.
os.makedirs('data', exist_ok=True)
traffic_df.to_csv('data/traffic_patterns.csv', index=False)
print(f"Generated {len(traffic_df)} traffic records")
print(traffic_df.head())
# ==================== PART 4: Generate Placement History ====================
# For each day and each service, picks a random set of regions and records
# per-region instance count, average latency and error rate; the result is
# written to data/service_placement.csv.
print("\nGenerating Service Placement History...")
placement_data = []
start_date = datetime(2024, 1, 1)
# (service_id, latency_critical) pairs extracted once. The flag does not
# change between days, so there is no need to filter services_df with a
# boolean mask for every service on every day.
service_flags = list(zip(
    services_df['service_id'].tolist(),
    services_df['latency_critical'].tolist(),
))
for days in range(90):
    timestamp = start_date + timedelta(days=days)
    for service_id, latency_critical in service_flags:
        # Latency critical services are usually in fewer regions
        if latency_critical:
            num_regions = random.choice([1, 2])
        else:
            num_regions = random.choice([2, 3, 4])
        # The region set is re-drawn independently for every day.
        placement_regions = random.sample(regions, num_regions)
        for region in placement_regions:
            placement_data.append({
                'service_id': service_id,
                'region': region,
                'timestamp': timestamp,
                'instances': random.randint(1, 5),
                'avg_latency_ms': random.uniform(5, 100),
                'error_rate': random.uniform(0, 0.05),
            })
placement_df = pd.DataFrame(placement_data)
# to_csv does not create missing parent directories; ensure the folder exists.
os.makedirs('data', exist_ok=True)
placement_df.to_csv('data/service_placement.csv', index=False)
print(f"Generated {len(placement_df)} placement records")
print(placement_df.head())
# ==================== Summary ====================
# Final report: a banner, one line per CSV with its row count, and the total
# number of rows across all four DataFrames.
banner = "=" * 50
n_services = len(services_df)
n_latency = len(latency_df)
n_traffic = len(traffic_df)
n_placement = len(placement_df)
total_records = n_services + n_latency + n_traffic + n_placement
print("\n" + banner)
print("ALL DATA GENERATED SUCCESSFULLY!")
print(banner)
print(f"\nFiles created in 'data/' folder:")
print(f" • services.csv ({n_services} rows)")
print(f" • regional_latency.csv ({n_latency} rows)")
print(f" • traffic_patterns.csv ({n_traffic} rows)")
print(f" • service_placement.csv ({n_placement} rows)")
print(f"\nTotal records generated: {total_records:,}")
|