#!/usr/bin/env python3
"""
Autoencoder-based Unsupervised Breach Detection

Input logs:
timestamp,src_ip,src_port,dst_ip,dst_port,packet_size,tcp_flags,seq,ack,window

Pipeline: per-packet deltas -> per-flow statistics -> standardisation ->
dense autoencoder -> reconstruction error -> 2-component Gaussian mixture ->
breach probability and thresholded prediction, written to a CSV file.
"""

import numpy as np
import pandas as pd
from scipy.stats import entropy

# NOTE: scikit-learn and TensorFlow are imported inside the functions that use
# them so the feature-engineering helpers stay importable (and unit-testable)
# without paying the TensorFlow start-up cost.

LOG_FILE = "network_logs.csv"
OUTPUT_FILE = "breach_predictions.csv"

# Columns that identify one directional flow.
FLOW_KEY = ["src_ip", "src_port", "dst_ip", "dst_port"]

# Per-packet columns consumed by flow_features().
PACKET_COLUMNS = ["packet_size", "iat", "window", "seq_delta", "ack_delta"]

RANDOM_SEED = 42
BREACH_THRESHOLD = 0.60  # you can tune this cutoff


# ============================================================
# 1. LOAD LOG DATA
# ============================================================
def load_logs(path) -> pd.DataFrame:
    """Read the packet log and return it sorted by time.

    Args:
        path: Anything ``pandas.read_csv`` accepts (file path or file-like
            object) containing at least a ``timestamp`` column.

    Returns:
        The log sorted by ``timestamp`` with an added ``ts_float`` column
        holding seconds elapsed since the earliest packet.
    """
    df = pd.read_csv(path)
    df["timestamp"] = pd.to_datetime(df["timestamp"])
    # Stable sort: packets sharing a timestamp keep their capture order, so
    # the per-flow diffs computed later are reproducible.
    df = df.sort_values("timestamp", kind="mergesort")
    # Only differences of ts_float are used downstream, so an offset from the
    # first packet is enough. Going through a timedelta avoids assuming the
    # datetime column is stored with nanosecond resolution.
    df["ts_float"] = (df["timestamp"] - df["timestamp"].min()).dt.total_seconds()
    return df


# ============================================================
# 2. FEATURE ENGINEERING - FLOW-LEVEL
# ============================================================
def add_packet_deltas(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with per-flow packet-to-packet deltas.

    Adds ``iat`` (inter-arrival time from ``ts_float``), ``seq_delta`` and
    ``ack_delta``. The first packet of every flow has no predecessor and
    gets 0 for all three.
    """
    deltas = (
        df.groupby(FLOW_KEY, sort=False)[["ts_float", "seq", "ack"]]
        .diff()
        .fillna(0)
    )
    return df.assign(
        iat=deltas["ts_float"],
        seq_delta=deltas["seq"],
        ack_delta=deltas["ack"],
    )


def flow_features(flow):
    """Summarise the packets of one flow as a feature vector.

    Args:
        flow: DataFrame with the columns ``packet_size``, ``iat``,
            ``window``, ``seq_delta`` and ``ack_delta``.

    Returns:
        A ``pandas.Series`` with the eight flow-level statistics.
    """
    p = flow["packet_size"].to_numpy()
    iat = flow["iat"].to_numpy()
    wnd = flow["window"].to_numpy()
    sd = flow["seq_delta"].to_numpy()
    ad = flow["ack_delta"].to_numpy()

    # Packet-size entropy over a 10-bin histogram. scipy normalises the
    # counts itself and treats empty bins as contributing 0, so no smoothing
    # term is needed and a constant-size flow scores exactly 0.
    counts = np.histogram(p, bins=10)[0]
    p_entropy = entropy(counts)

    return pd.Series({
        "psize_mean": p.mean(),
        "psize_std": p.std(),
        "iat_mean": iat.mean(),
        "iat_std": iat.std(),
        "window_mean": wnd.mean(),
        "seq_delta_std": sd.std(),
        "ack_delta_std": ad.std(),
        "psize_entropy": p_entropy,
    })


def extract_flows(df: pd.DataFrame) -> pd.DataFrame:
    """Aggregate a packet table (with deltas) into one row per flow.

    The result is indexed by the FLOW_KEY columns; missing statistics are
    replaced by 0.
    """
    # Selecting the feature columns explicitly keeps the grouping columns out
    # of the frame handed to flow_features().
    grouped = df.groupby(FLOW_KEY)[PACKET_COLUMNS]
    return grouped.apply(flow_features).fillna(0)


# ============================================================
# 4. AUTOENCODER MODEL
# ============================================================
def build_autoencoder(input_dim: int, seed: int = RANDOM_SEED):
    """Build and compile the dense autoencoder (32-16-8-16-32, MSE loss).

    Args:
        input_dim: Number of input features.
        seed: Seed applied before the layers are created, so weight
            initialisation is repeatable on a best-effort basis.
    """
    import tensorflow as tf
    from tensorflow.keras import layers, models

    tf.keras.utils.set_random_seed(seed)

    inputs = layers.Input(shape=(input_dim,))
    e = layers.Dense(32, activation="relu")(inputs)
    e = layers.Dense(16, activation="relu")(e)
    latent = layers.Dense(8, activation="relu")(e)
    d = layers.Dense(16, activation="relu")(latent)
    d = layers.Dense(32, activation="relu")(d)
    outputs = layers.Dense(input_dim, activation="linear")(d)

    autoencoder = models.Model(inputs, outputs)
    autoencoder.compile(optimizer="adam", loss="mse")
    return autoencoder


# ============================================================
# 6. BREACH PROBABILITY USING GAUSSIAN MIXTURE MODEL
#    (2 clusters: normal & suspicious)
# ============================================================
def select_anomalous_component(component_means) -> int:
    """Return the index of the mixture component with the largest mean.

    The mixture is fitted on reconstruction errors, so the component centred
    on the higher error is the suspicious one. Cluster size is not used:
    the larger cluster is normally the benign traffic.
    """
    return int(np.argmax(np.asarray(component_means, dtype=float).ravel()))


def breach_probabilities(recon_error, random_state: int = RANDOM_SEED):
    """Return, per flow, the posterior probability of the high-error cluster.

    Args:
        recon_error: 1-D array-like of reconstruction errors (needs at least
            two values for a 2-component mixture).
        random_state: Seed for the Gaussian mixture fit.
    """
    from sklearn.mixture import GaussianMixture

    samples = np.asarray(recon_error, dtype=float).reshape(-1, 1)
    gmm = GaussianMixture(n_components=2, random_state=random_state)
    gmm.fit(samples)
    responsibilities = gmm.predict_proba(samples)
    return responsibilities[:, select_anomalous_component(gmm.means_)]


def main():
    """Run the full detection pipeline from LOG_FILE to OUTPUT_FILE."""
    from sklearn.preprocessing import StandardScaler

    print("[*] Loading logs...")
    df = load_logs(LOG_FILE)
    if df.empty:
        raise SystemExit(f"[!] No log records found in {LOG_FILE}")

    print("[*] Calculating flow-based features...")
    flows = extract_flows(add_packet_deltas(df))
    print(f"[*] Extracted {len(flows)} flows.")
    if len(flows) < 2:
        # Scaling, training and a 2-component mixture are meaningless (and
        # fail with obscure errors) on fewer than two samples.
        raise SystemExit("[!] Need at least 2 flows to fit the models.")

    # ========================================================
    # 3. SCALE FEATURES
    # ========================================================
    scaler = StandardScaler()
    X = scaler.fit_transform(flows.values)
    print("[*] Features scaled.")

    print("[*] Building autoencoder model...")
    autoencoder = build_autoencoder(X.shape[1])
    autoencoder.summary()

    print("[*] Training autoencoder...")
    autoencoder.fit(
        X, X,
        epochs=30,
        batch_size=32,
        validation_split=0.1,
        verbose=1,
    )

    # ========================================================
    # 5. RECONSTRUCTION ERROR = ANOMALY SCORE
    # ========================================================
    print("[*] Computing anomaly scores...")
    preds = autoencoder.predict(X)
    mse = np.mean((X - preds) ** 2, axis=1)
    flows["recon_error"] = mse

    print("[*] Fitting Gaussian Mixture Model for breach probability...")
    flows["breach_probability"] = breach_probabilities(mse)

    # ========================================================
    # 7. FINAL BREACH PREDICTION
    # ========================================================
    threshold_prob = BREACH_THRESHOLD
    flows["breach_predicted"] = flows["breach_probability"] > threshold_prob
    print(f"[+] Breach threshold probability = {threshold_prob}")
    print("[*] Breach predictions complete.")

    # ========================================================
    # 8. SAVE RESULTS
    # ========================================================
    flows.to_csv(OUTPUT_FILE)
    print(f"[+] Saved results to {OUTPUT_FILE}")

    num_breaches = flows["breach_predicted"].sum()
    print(f"[!] Predicted potential breaches: {num_breaches}")
    print("[DONE]")


if __name__ == "__main__":
    main()