| """Network traffic session generator with realistic correlated features. |
| |
| Each session is a 22-dimensional feature vector representing metadata and |
| behavioral signals from encrypted traffic (no payload inspection). |
| |
| Feature groups: |
| - Volume & timing: bytes, duration, packet stats, inter-arrival metrics |
| - Network metadata: ports, protocol, DNS, connection reuse |
| - TLS / certificate: TLS version, JA3 cluster, cert chain, self-signed |
| - Behavioral context: geo distance, time of day, reputation, entropy |
| |
| Benign traffic is drawn from 5 profile archetypes. Malicious traffic |
| profiles vary by attack scenario AND kill-chain phase, creating real |
| distributional differences an RL agent can learn to exploit. |
| """ |
| from __future__ import annotations |
|
|
| from dataclasses import dataclass |
| from typing import Dict, List |
| import math |
|
|
| import numpy as np |
|
|
|
|
# Canonical feature ordering. Every vector derived from a session lists its
# values in exactly this order, so the position of a name is its index.
FEATURE_ORDER = [
    # Volume & timing
    "bytes_sent",
    "bytes_received",
    "duration_ms",
    "packet_count",
    "avg_packet_size",
    "packet_size_variance",
    "inter_arrival_mean",
    "inter_arrival_jitter",
    # Network metadata
    "src_port",
    "dst_port",
    "protocol",
    # TLS / certificate
    "tls_version",
    "ja3_hash_cluster",
    "cert_chain_length",
    "cert_validity_days",
    "is_self_signed",
    # Network metadata (continued)
    "dns_query_count",
    "connection_reuse",
    # Behavioral context
    "geo_distance",
    "time_of_day",
    "session_history_score",
    "entropy_score",
]
|
|
| |
# Per-feature (low, high) range used for min-max scaling in
# ``TrafficGenerator.to_observation_vector``; values outside a range are
# clipped to 0 or 1 there. The two byte counters are bounded in log1p space
# because that is how they are stored in a session's feature dict.
FEATURE_BOUNDS: Dict[str, tuple] = {
    # Volume & timing
    "bytes_sent": (4.0, 14.0),
    "bytes_received": (3.0, 13.0),
    "duration_ms": (20.0, 25000.0),
    "packet_count": (2.0, 1200.0),
    "avg_packet_size": (40.0, 1400.0),
    "packet_size_variance": (5.0, 500.0),
    "inter_arrival_mean": (0.5, 600.0),
    "inter_arrival_jitter": (0.0, 300.0),
    # Network metadata
    "src_port": (1024.0, 65535.0),
    "dst_port": (1.0, 65535.0),
    "protocol": (0.0, 2.0),
    # TLS / certificate
    "tls_version": (0.0, 2.0),
    "ja3_hash_cluster": (0.0, 255.0),
    "cert_chain_length": (0.0, 6.0),
    "cert_validity_days": (1.0, 1200.0),
    "is_self_signed": (0.0, 1.0),
    # Network metadata (continued)
    "dns_query_count": (0.0, 12.0),
    "connection_reuse": (0.0, 1.0),
    # Behavioral context
    "geo_distance": (0.0, 12000.0),
    "time_of_day": (0.0, 1.0),
    "session_history_score": (0.0, 1.0),
    "entropy_score": (0.0, 1.0),
}
|
|
|
|
@dataclass(frozen=True)
class TrafficProfile:
    """Immutable sampling parameters for one kind of traffic session.

    ``TrafficGenerator._build_session`` reads these fields to draw the
    per-session feature values.
    """

    # Label copied into the session's metadata under "profile".
    name: str
    # Mean of the normal draw for the packet count.
    packet_mean: float
    # Standard deviation of that draw, as a fraction of ``packet_mean``.
    packet_std_frac: float
    # Mean of the normal draw for duration_ms (its std is 30% of the mean).
    duration_mean: float
    # Mean and standard deviation of the normal draw for entropy_score.
    entropy_mean: float
    entropy_std: float
    # Probability that a session uses TLS at all.
    tls_probability: float
    # Probability that a TLS session is flagged as self-signed.
    self_signed_prob: float
    # Destination ports; one is chosen uniformly per session.
    common_ports: List[int]
    # Mean of the normal draw for connection_reuse (std 0.12, clipped to [0, 1]).
    connection_reuse_mean: float
    # Mean of the normal draw for geo_distance (std is 25% of the mean).
    geo_distance_mean: float
    # Mean of the normal draw for session_history_score (std 0.10, clipped to [0, 1]).
    history_score_mean: float
    # Mean of the normal draw for cert_validity_days (std is 30% of the mean).
    cert_validity_mean: float
    # (low, high) bounds for the JA3 cluster id; ``high`` is exclusive.
    ja3_cluster_range: tuple = (0, 128)
|
|
|
|
| |
# Benign traffic archetypes. The order matters: ``BENIGN_WEIGHTS`` assigns a
# sampling probability to each entry by position.
BENIGN_PROFILES = [
    TrafficProfile(
        name="WebBrowsing",
        packet_mean=50.0,
        packet_std_frac=0.35,
        duration_mean=900.0,
        entropy_mean=0.32,
        entropy_std=0.06,
        tls_probability=0.95,
        self_signed_prob=0.02,
        common_ports=[80, 443],
        connection_reuse_mean=0.72,
        geo_distance_mean=1400.0,
        history_score_mean=0.82,
        cert_validity_mean=450.0,
        ja3_cluster_range=(0, 64),
    ),
    TrafficProfile(
        name="Streaming",
        packet_mean=800.0,
        packet_std_frac=0.25,
        duration_mean=18000.0,
        entropy_mean=0.22,
        entropy_std=0.04,
        tls_probability=0.99,
        self_signed_prob=0.01,
        common_ports=[443, 8080],
        connection_reuse_mean=0.88,
        geo_distance_mean=2200.0,
        history_score_mean=0.90,
        cert_validity_mean=500.0,
        ja3_cluster_range=(0, 32),
    ),
    TrafficProfile(
        name="API",
        packet_mean=25.0,
        packet_std_frac=0.30,
        duration_mean=350.0,
        entropy_mean=0.18,
        entropy_std=0.04,
        tls_probability=0.98,
        self_signed_prob=0.01,
        common_ports=[443, 8443],
        connection_reuse_mean=0.80,
        geo_distance_mean=1000.0,
        history_score_mean=0.85,
        cert_validity_mean=500.0,
        ja3_cluster_range=(0, 48),
    ),
    TrafficProfile(
        name="IoT",
        packet_mean=10.0,
        packet_std_frac=0.40,
        duration_mean=1500.0,
        entropy_mean=0.38,
        entropy_std=0.07,
        tls_probability=0.30,
        self_signed_prob=0.08,
        common_ports=[1883, 5683, 8883],
        connection_reuse_mean=0.55,
        geo_distance_mean=800.0,
        history_score_mean=0.70,
        cert_validity_mean=300.0,
        ja3_cluster_range=(80, 128),
    ),
    TrafficProfile(
        name="Enterprise",
        packet_mean=120.0,
        packet_std_frac=0.35,
        duration_mean=1200.0,
        entropy_mean=0.28,
        entropy_std=0.06,
        tls_probability=0.85,
        self_signed_prob=0.04,
        common_ports=[443, 445, 3389],
        connection_reuse_mean=0.65,
        geo_distance_mean=500.0,
        history_score_mean=0.88,
        cert_validity_mean=400.0,
        ja3_cluster_range=(0, 96),
    ),
]
|
|
| |
| |
# Malicious traffic profiles, keyed first by attack scenario name and then by
# kill-chain phase index (0-3). Each scenario defines one profile per phase.
MALICIOUS_PROFILES: Dict[str, Dict[int, TrafficProfile]] = {
    "port_scan_exploit_c2": {
        0: TrafficProfile(
            name="PortScan_Recon",
            packet_mean=6.0, packet_std_frac=0.5, duration_mean=80.0,
            entropy_mean=0.12, entropy_std=0.04,
            tls_probability=0.05, self_signed_prob=0.60,
            common_ports=[21, 22, 23, 25, 445, 3389, 5900],
            connection_reuse_mean=0.02, geo_distance_mean=5500.0,
            history_score_mean=0.10, cert_validity_mean=60.0,
            ja3_cluster_range=(200, 255),
        ),
        1: TrafficProfile(
            name="PortScan_Exploit",
            packet_mean=45.0, packet_std_frac=0.4, duration_mean=300.0,
            entropy_mean=0.78, entropy_std=0.06,
            tls_probability=0.40, self_signed_prob=0.45,
            common_ports=[80, 443, 8080, 445],
            connection_reuse_mean=0.08, geo_distance_mean=5200.0,
            history_score_mean=0.12, cert_validity_mean=90.0,
            ja3_cluster_range=(210, 255),
        ),
        2: TrafficProfile(
            name="PortScan_C2",
            packet_mean=4.0, packet_std_frac=0.6, duration_mean=5000.0,
            entropy_mean=0.55, entropy_std=0.08,
            tls_probability=0.92, self_signed_prob=0.35,
            common_ports=[443, 53, 8443],
            connection_reuse_mean=0.15, geo_distance_mean=6000.0,
            history_score_mean=0.15, cert_validity_mean=45.0,
            ja3_cluster_range=(220, 255),
        ),
        3: TrafficProfile(
            name="PortScan_Exfil",
            packet_mean=350.0, packet_std_frac=0.3, duration_mean=12000.0,
            entropy_mean=0.88, entropy_std=0.04,
            tls_probability=0.98, self_signed_prob=0.25,
            common_ports=[443, 8443],
            connection_reuse_mean=0.10, geo_distance_mean=6500.0,
            history_score_mean=0.08, cert_validity_mean=30.0,
            ja3_cluster_range=(230, 255),
        ),
    },
    "credential_stuffing_lateral": {
        0: TrafficProfile(
            name="CredStuff_Probe",
            packet_mean=15.0, packet_std_frac=0.4, duration_mean=200.0,
            entropy_mean=0.42, entropy_std=0.06,
            tls_probability=0.90, self_signed_prob=0.10,
            common_ports=[443, 80, 8443],
            connection_reuse_mean=0.05, geo_distance_mean=3500.0,
            history_score_mean=0.25, cert_validity_mean=300.0,
            ja3_cluster_range=(140, 200),
        ),
        1: TrafficProfile(
            name="CredStuff_Auth",
            packet_mean=20.0, packet_std_frac=0.35, duration_mean=150.0,
            entropy_mean=0.50, entropy_std=0.07,
            tls_probability=0.95, self_signed_prob=0.08,
            common_ports=[443, 389, 636],
            connection_reuse_mean=0.10, geo_distance_mean=3200.0,
            history_score_mean=0.30, cert_validity_mean=350.0,
            ja3_cluster_range=(150, 210),
        ),
        2: TrafficProfile(
            name="CredStuff_Lateral",
            packet_mean=30.0, packet_std_frac=0.35, duration_mean=500.0,
            entropy_mean=0.35, entropy_std=0.06,
            tls_probability=0.80, self_signed_prob=0.12,
            common_ports=[445, 3389, 5985, 22],
            connection_reuse_mean=0.20, geo_distance_mean=300.0,
            history_score_mean=0.40, cert_validity_mean=350.0,
            ja3_cluster_range=(160, 220),
        ),
        3: TrafficProfile(
            name="CredStuff_Exfil",
            packet_mean=200.0, packet_std_frac=0.3, duration_mean=8000.0,
            entropy_mean=0.80, entropy_std=0.05,
            tls_probability=0.98, self_signed_prob=0.15,
            common_ports=[443, 8443],
            connection_reuse_mean=0.12, geo_distance_mean=4000.0,
            history_score_mean=0.18, cert_validity_mean=90.0,
            ja3_cluster_range=(180, 240),
        ),
    },
    "supply_chain_compromise": {
        0: TrafficProfile(
            name="SupplyChain_Init",
            packet_mean=40.0, packet_std_frac=0.3, duration_mean=600.0,
            entropy_mean=0.30, entropy_std=0.05,
            tls_probability=0.98, self_signed_prob=0.03,
            common_ports=[443, 8443],
            connection_reuse_mean=0.60, geo_distance_mean=1800.0,
            history_score_mean=0.70, cert_validity_mean=380.0,
            ja3_cluster_range=(30, 80),
        ),
        1: TrafficProfile(
            name="SupplyChain_Inject",
            packet_mean=60.0, packet_std_frac=0.3, duration_mean=800.0,
            entropy_mean=0.40, entropy_std=0.06,
            tls_probability=0.98, self_signed_prob=0.04,
            common_ports=[443, 8443],
            connection_reuse_mean=0.55, geo_distance_mean=2000.0,
            history_score_mean=0.65, cert_validity_mean=350.0,
            ja3_cluster_range=(35, 90),
        ),
        2: TrafficProfile(
            name="SupplyChain_Beacon",
            packet_mean=8.0, packet_std_frac=0.5, duration_mean=3000.0,
            entropy_mean=0.48, entropy_std=0.07,
            tls_probability=0.99, self_signed_prob=0.05,
            common_ports=[443],
            connection_reuse_mean=0.50, geo_distance_mean=2500.0,
            history_score_mean=0.55, cert_validity_mean=250.0,
            ja3_cluster_range=(40, 100),
        ),
        3: TrafficProfile(
            name="SupplyChain_Exfil",
            packet_mean=100.0, packet_std_frac=0.3, duration_mean=5000.0,
            entropy_mean=0.60, entropy_std=0.06,
            tls_probability=0.99, self_signed_prob=0.06,
            common_ports=[443, 8443],
            connection_reuse_mean=0.42, geo_distance_mean=3000.0,
            history_score_mean=0.45, cert_validity_mean=200.0,
            ja3_cluster_range=(50, 110),
        ),
    },
    "low_and_slow_apt": {
        0: TrafficProfile(
            name="APT_Recon",
            packet_mean=12.0, packet_std_frac=0.4, duration_mean=400.0,
            entropy_mean=0.28, entropy_std=0.05,
            tls_probability=0.92, self_signed_prob=0.05,
            common_ports=[443, 80],
            connection_reuse_mean=0.50, geo_distance_mean=2200.0,
            history_score_mean=0.55, cert_validity_mean=320.0,
            ja3_cluster_range=(60, 130),
        ),
        1: TrafficProfile(
            name="APT_Establish",
            packet_mean=18.0, packet_std_frac=0.35, duration_mean=700.0,
            entropy_mean=0.35, entropy_std=0.06,
            tls_probability=0.95, self_signed_prob=0.07,
            common_ports=[443, 53],
            connection_reuse_mean=0.45, geo_distance_mean=2600.0,
            history_score_mean=0.48, cert_validity_mean=280.0,
            ja3_cluster_range=(70, 140),
        ),
        2: TrafficProfile(
            name="APT_Persist",
            packet_mean=5.0, packet_std_frac=0.6, duration_mean=8000.0,
            entropy_mean=0.42, entropy_std=0.07,
            tls_probability=0.97, self_signed_prob=0.10,
            common_ports=[443],
            connection_reuse_mean=0.38, geo_distance_mean=3200.0,
            history_score_mean=0.38, cert_validity_mean=200.0,
            ja3_cluster_range=(80, 150),
        ),
        3: TrafficProfile(
            name="APT_Exfil",
            packet_mean=60.0, packet_std_frac=0.4, duration_mean=15000.0,
            entropy_mean=0.65, entropy_std=0.06,
            tls_probability=0.99, self_signed_prob=0.12,
            common_ports=[443, 8443],
            connection_reuse_mean=0.25, geo_distance_mean=4000.0,
            history_score_mean=0.28, cert_validity_mean=120.0,
            ja3_cluster_range=(90, 160),
        ),
    },
    "ddos_amplification": {
        0: TrafficProfile(
            name="DDoS_Probe",
            packet_mean=20.0, packet_std_frac=0.5, duration_mean=50.0,
            entropy_mean=0.15, entropy_std=0.04,
            tls_probability=0.10, self_signed_prob=0.30,
            common_ports=[53, 123, 161, 1900],
            connection_reuse_mean=0.02, geo_distance_mean=6000.0,
            history_score_mean=0.08, cert_validity_mean=60.0,
            ja3_cluster_range=(230, 255),
        ),
        1: TrafficProfile(
            name="DDoS_Amplify",
            packet_mean=500.0, packet_std_frac=0.4, duration_mean=30.0,
            entropy_mean=0.10, entropy_std=0.03,
            tls_probability=0.05, self_signed_prob=0.40,
            common_ports=[53, 123, 161, 1900, 11211],
            connection_reuse_mean=0.01, geo_distance_mean=7000.0,
            history_score_mean=0.05, cert_validity_mean=30.0,
            ja3_cluster_range=(240, 255),
        ),
        2: TrafficProfile(
            name="DDoS_Sustained",
            packet_mean=900.0, packet_std_frac=0.3, duration_mean=20.0,
            entropy_mean=0.08, entropy_std=0.02,
            tls_probability=0.03, self_signed_prob=0.50,
            common_ports=[53, 123, 80],
            connection_reuse_mean=0.00, geo_distance_mean=8000.0,
            history_score_mean=0.03, cert_validity_mean=20.0,
            ja3_cluster_range=(245, 255),
        ),
        3: TrafficProfile(
            name="DDoS_Peak",
            packet_mean=1100.0, packet_std_frac=0.25, duration_mean=15.0,
            entropy_mean=0.06, entropy_std=0.02,
            tls_probability=0.02, self_signed_prob=0.55,
            common_ports=[53, 123, 80],
            connection_reuse_mean=0.00, geo_distance_mean=9000.0,
            history_score_mean=0.02, cert_validity_mean=15.0,
            ja3_cluster_range=(248, 255),
        ),
    },
}
|
|
| |
# Phase table used when a caller names a scenario that has no entry in
# MALICIOUS_PROFILES.
_DEFAULT_MALICIOUS: Dict[int, TrafficProfile] = MALICIOUS_PROFILES[
    "port_scan_exploit_c2"
]
|
|
# Sampling probability of each benign archetype, aligned by position with
# BENIGN_PROFILES.
BENIGN_WEIGHTS = np.array([
    0.34,  # WebBrowsing
    0.16,  # Streaming
    0.18,  # API
    0.12,  # IoT
    0.20,  # Enterprise
])
|
|
|
|
class TrafficGenerator:
    """Generate correlated network-session feature vectors.

    Every session is a dict with the keys ``session_id``, ``features`` (raw
    values keyed by feature name), ``metadata`` (malicious flag, attack
    phase, scenario, profile name, attacker id and a ``revealed`` flag),
    ``created_tick`` and ``expires_tick``.

    All randomness comes from a single seeded ``numpy.random.Generator``, so
    two generators built with the same seed and driven through the same
    sequence of calls produce identical sessions.
    """

    def __init__(self, seed: int = 0) -> None:
        """Create the seeded random generator and reset the session counter."""
        self.rng = np.random.default_rng(seed)
        self.session_counter = 0

    def generate_benign_sessions(self, tick: int, count: int) -> List[Dict]:
        """Return ``count`` benign sessions created at ``tick``.

        Each session independently picks one entry of ``BENIGN_PROFILES``
        with the probabilities in ``BENIGN_WEIGHTS``. A non-positive
        ``count`` yields an empty list.
        """
        sessions: List[Dict] = []
        for _ in range(max(0, count)):
            idx = self.rng.choice(len(BENIGN_PROFILES), p=BENIGN_WEIGHTS)
            sessions.append(self._build_session(
                BENIGN_PROFILES[idx], tick=tick, malicious=False,
                attack_phase=0, scenario="benign", attacker_id=None,
            ))
        return sessions

    def generate_malicious_sessions(
        self, tick: int, count: int,
        attack_phase: int, scenario: str,
        attacker_id: str | None = None,
    ) -> List[Dict]:
        """Return ``count`` malicious sessions created at ``tick``.

        The profile is looked up by ``scenario`` and ``attack_phase`` in
        ``MALICIOUS_PROFILES``. An unknown scenario falls back to
        ``_DEFAULT_MALICIOUS``; an unknown phase falls back to the
        highest-numbered phase of the selected scenario. The session
        metadata always records the ``scenario`` and ``attack_phase`` exactly
        as passed in. A non-positive ``count`` yields an empty list.
        """
        profiles = MALICIOUS_PROFILES.get(scenario, _DEFAULT_MALICIOUS)
        profile = profiles.get(attack_phase)
        if profile is None:
            profile = profiles[max(profiles)]
        return [
            self._build_session(
                profile, tick=tick, malicious=True,
                attack_phase=attack_phase, scenario=scenario,
                attacker_id=attacker_id,
            )
            for _ in range(max(0, count))
        ]

    def to_observation_vector(self, session: Dict) -> List[float]:
        """Return the session's features min-max scaled into [0, 1].

        Values are emitted in ``FEATURE_ORDER`` and scaled with the
        ``FEATURE_BOUNDS`` range of each feature; anything outside that range
        is clipped to 0.0 or 1.0.
        """
        raw = session["features"]
        normalized: List[float] = []
        for name in FEATURE_ORDER:
            val = float(raw[name])
            lo, hi = FEATURE_BOUNDS[name]
            # The 1e-9 floor avoids a division by zero for a degenerate range.
            scaled = (val - lo) / max(hi - lo, 1e-9)
            normalized.append(max(0.0, min(1.0, scaled)))
        return normalized

    def to_raw_vector(self, session: Dict) -> List[float]:
        """Return the un-normalized features in ``FEATURE_ORDER`` (for inspection)."""
        raw = session["features"]
        return [float(raw[name]) for name in FEATURE_ORDER]

    def _build_session(
        self, profile: TrafficProfile, tick: int,
        malicious: bool, attack_phase: int, scenario: str,
        attacker_id: str | None,
    ) -> Dict:
        """Draw one session from ``profile`` and wrap it with metadata.

        Increments ``session_counter`` and uses it for the session id. The
        random draws below happen in a fixed order; reordering them would
        change the output produced for a given seed.
        """
        self.session_counter += 1
        rng = self.rng

        # --- Volume & timing ---------------------------------------------
        packet_count = int(max(3, rng.normal(
            profile.packet_mean, profile.packet_mean * profile.packet_std_frac,
        )))
        avg_packet_size = float(max(40.0, rng.normal(560.0, 160.0)))
        total_bytes = packet_count * avg_packet_size
        bytes_sent = float(max(200.0, total_bytes * rng.uniform(0.40, 0.85)))
        bytes_received = float(max(100.0, total_bytes * rng.uniform(0.20, 0.60)))
        duration_ms = float(max(10.0, rng.normal(
            profile.duration_mean, profile.duration_mean * 0.30,
        )))
        # packet_count is at least 3, so the division is always safe.
        inter_arrival_mean = float(duration_ms / packet_count)
        inter_arrival_jitter = float(abs(rng.normal(
            inter_arrival_mean * 0.30, inter_arrival_mean * 0.12,
        )))
        packet_size_variance = float(max(5.0, abs(rng.normal(
            180.0 if malicious else 130.0, 60.0,
        ))))

        # --- TLS / certificate -------------------------------------------
        tls_enabled = rng.random() < profile.tls_probability
        # tls_version 0 means no TLS; otherwise 1 (20%) or 2 (80%).
        tls_version = int(rng.choice([1, 2], p=[0.20, 0.80])) if tls_enabled else 0
        # The self-signed draw is only made for TLS sessions.
        is_self_signed = bool(rng.random() < profile.self_signed_prob) if tls_enabled else False
        chain_mean = 3.0 if (tls_enabled and not is_self_signed) else 1.0
        cert_chain_length = int(max(0, rng.normal(chain_mean, 0.8)))
        cert_validity_days = float(max(1.0, rng.normal(
            profile.cert_validity_mean, profile.cert_validity_mean * 0.30,
        )))

        # --- Network metadata --------------------------------------------
        dst_port = int(rng.choice(profile.common_ports))
        # endpoint=True makes the upper bound inclusive so that port 65535,
        # the top of FEATURE_BOUNDS["src_port"], can actually be drawn.
        src_port = int(rng.integers(1024, 65535, endpoint=True))
        protocol = int(rng.choice([0, 1, 2], p=[0.50, 0.32, 0.18]))
        dns_query_count = int(rng.poisson(3 if malicious else 1))

        # --- Behavioral context ------------------------------------------
        connection_reuse = float(np.clip(rng.normal(
            profile.connection_reuse_mean, 0.12,
        ), 0.0, 1.0))
        geo_distance = float(max(0.0, rng.normal(
            profile.geo_distance_mean, profile.geo_distance_mean * 0.25,
        )))
        session_history_score = float(np.clip(rng.normal(
            profile.history_score_mean, 0.10,
        ), 0.0, 1.0))
        entropy_score = float(np.clip(rng.normal(
            profile.entropy_mean, profile.entropy_std,
        ), 0.02, 0.99))
        # The cluster range is half-open; the max() keeps it non-empty when
        # a profile gives hi <= lo.
        ja3_lo, ja3_hi = profile.ja3_cluster_range
        ja3_hash_cluster = int(rng.integers(ja3_lo, max(ja3_lo + 1, ja3_hi)))
        # Position of the tick inside a 1440-tick cycle, as a fraction.
        time_of_day = float((tick % 1440) / 1440.0)

        features = {
            # Byte counts are stored log-compressed.
            "bytes_sent": math.log1p(bytes_sent),
            "bytes_received": math.log1p(bytes_received),
            "duration_ms": duration_ms,
            "packet_count": packet_count,
            "avg_packet_size": avg_packet_size,
            "packet_size_variance": packet_size_variance,
            "inter_arrival_mean": inter_arrival_mean,
            "inter_arrival_jitter": inter_arrival_jitter,
            "src_port": src_port,
            "dst_port": dst_port,
            "protocol": protocol,
            "tls_version": tls_version,
            "ja3_hash_cluster": ja3_hash_cluster,
            "cert_chain_length": cert_chain_length,
            "cert_validity_days": cert_validity_days,
            "is_self_signed": int(is_self_signed),
            "dns_query_count": dns_query_count,
            "connection_reuse": connection_reuse,
            "geo_distance": geo_distance,
            "time_of_day": time_of_day,
            "session_history_score": session_history_score,
            "entropy_score": entropy_score,
        }

        # Malicious sessions expire one tick sooner than benign ones.
        ttl = 2 if malicious else 3

        return {
            "session_id": f"s-{self.session_counter:07d}",
            "features": features,
            "metadata": {
                "malicious": malicious,
                "attack_phase": attack_phase,
                "scenario": scenario,
                "profile": profile.name,
                "attacker_id": attacker_id,
                "revealed": False,
            },
            "created_tick": tick,
            "expires_tick": tick + ttl,
        }
|
|