# amide-models/XGB_lstm.py
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from xgboost import XGBClassifier
import argparse
import warnings
warnings.filterwarnings("ignore")  # silence library deprecation warnings
# ----------------------------------------------------------
# 1. LOAD LOGS AND FEATURE ENGINEERING
# ----------------------------------------------------------
def flow_features(flow):
    """Summarize one (src, dst, sport, dport) flow into fixed-size statistics."""
    packet_sizes = flow["packet_size"].values
    times = flow["timestamp"].values
    return pd.Series({
        "packet_count": len(packet_sizes),
        "packet_min": float(packet_sizes.min()),
        "packet_max": float(packet_sizes.max()),
        "packet_mean": float(packet_sizes.mean()),
        "packet_std": float(packet_sizes.std(ddof=0)),
        # Inter-arrival stats need at least two packets; default to 0 otherwise.
        "interarrival_mean": float(np.mean(np.diff(times))) if len(times) > 1 else 0.0,
        "interarrival_std": float(np.std(np.diff(times))) if len(times) > 1 else 0.0,
        "bytes_total": float(packet_sizes.sum()),
    })
def extract_flows(path):
    """Load raw packet logs and aggregate them into per-flow feature rows."""
    print("[*] Loading logs...")
    df = pd.read_csv(path)
    df["timestamp"] = pd.to_numeric(df["timestamp"], errors="coerce").fillna(0)
    print("[*] Calculating flow-level features...")
    # Group by the flow 4-tuple; keep the group keys in the index so that
    # reset_index() restores them as columns (group_keys=False would drop them).
    flows = (
        df.groupby(["src_ip", "dst_ip", "src_port", "dst_port"])
        .apply(flow_features)
        .reset_index()
        .fillna(0)
    )
    print(f"[*] Extracted {len(flows)} flows.")
    return flows
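# extract_flows() assumes the input CSV carries at least these columns
# (inferred from the fields referenced above; exact dtypes are an assumption):
#   src_ip, dst_ip, src_port, dst_port   -> flow 4-tuple used for grouping
#   timestamp                            -> numeric packet time (e.g. epoch seconds)
#   packet_size                          -> per-packet size in bytes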
# ----------------------------------------------------------
# 2. LSTM AUTOENCODER (2-LAYER ENCODER/DECODER, LINEAR BOTTLENECK)
# ----------------------------------------------------------
class LSTMAutoencoder(nn.Module):
    """Two-layer LSTM encoder/decoder with a linear latent bottleneck."""

    def __init__(self, input_dim, hidden_dim=64, latent_dim=32):
        super().__init__()
        self.encoder = nn.LSTM(input_dim, hidden_dim, batch_first=True, num_layers=2)
        self.enc_linear = nn.Linear(hidden_dim, latent_dim)
        self.dec_linear = nn.Linear(latent_dim, hidden_dim)
        # Decoder LSTM maps the hidden representation back to input_dim per step.
        self.decoder = nn.LSTM(hidden_dim, input_dim, batch_first=True, num_layers=2)

    def forward(self, x):
        enc_out, _ = self.encoder(x)           # (batch, seq_len, hidden_dim)
        h_last = enc_out[:, -1, :]             # last time step as the flow summary
        z = torch.relu(self.enc_linear(h_last))
        # Broadcast the latent code across the sequence length for decoding.
        dec_h = torch.relu(self.dec_linear(z)).unsqueeze(1)
        dec_out, _ = self.decoder(dec_h.repeat(1, x.size(1), 1))
        return dec_out                         # (batch, seq_len, input_dim)
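# Shape sanity check (illustrative; 8 matches the flow features used below):
#   >>> ae = LSTMAutoencoder(input_dim=8)
#   >>> x = torch.randn(4, 1, 8)           # (batch, seq_len=1, features)
#   >>> ae(x).shape
#   torch.Size([4, 1, 8])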
# ----------------------------------------------------------
# 3. TRAIN AUTOENCODER
# ----------------------------------------------------------
def train_autoencoder(model, X_tensor, device):
    """Fit the autoencoder with full-batch MSE reconstruction loss."""
    print("[*] Training LSTM autoencoder...")
    criterion = nn.MSELoss()
    optim = torch.optim.Adam(model.parameters(), lr=0.001)
    X_dev = X_tensor.to(device)  # move once instead of twice per epoch
    for epoch in range(1, 31):
        model.train()
        optim.zero_grad()
        recon = model(X_dev)
        loss = criterion(recon, X_dev)
        loss.backward()
        optim.step()
        print(f"Epoch {epoch}/30 - Loss: {loss.item():.4f}")
    print("[+] Autoencoder training completed.")
    return model
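# The trainer above runs full-batch gradient descent, which suits the small
# flow tables this script targets. For larger logs, a mini-batch variant could
# be swapped in; the sketch below is illustrative and not part of the original
# pipeline (the function name and batch_size default are assumptions).
def train_autoencoder_minibatch(model, X_tensor, device, batch_size=256):
    from torch.utils.data import DataLoader, TensorDataset
    loader = DataLoader(TensorDataset(X_tensor), batch_size=batch_size, shuffle=True)
    criterion = nn.MSELoss()
    optim = torch.optim.Adam(model.parameters(), lr=0.001)
    for epoch in range(1, 31):
        model.train()
        for (xb,) in loader:
            xb = xb.to(device)
            optim.zero_grad()
            loss = criterion(model(xb), xb)
            loss.backward()
            optim.step()
    return model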
# ----------------------------------------------------------
# 4. ANOMALY SCORING (RECONSTRUCTION ERROR)
# ----------------------------------------------------------
def compute_ae_scores(model, X_tensor, device):
    """Score each flow by its mean squared reconstruction error."""
    print("[*] Computing anomaly scores...")
    model.eval()
    with torch.no_grad():
        # Compare input and reconstruction on the same device.
        X_dev = X_tensor.to(device)
        recon = model(X_dev)
        # Average the squared error over the sequence and feature dimensions.
        mse = torch.mean((X_dev - recon) ** 2, dim=(1, 2))
    scores = mse.cpu().tolist()
    print("[+] Scores computed.")
    return scores
# ----------------------------------------------------------
# 5. MAIN PIPELINE
# ----------------------------------------------------------
def main(logfile):
    flows = extract_flows(logfile)
    features = [
        "packet_count", "packet_min", "packet_max", "packet_mean",
        "packet_std", "interarrival_mean", "interarrival_std", "bytes_total",
    ]
    print("[*] Scaling features...")
    scaler = StandardScaler()
    X = scaler.fit_transform(flows[features])
    device = "cpu"
    print(f"[*] Using device: {device}")
    # Torch input shape: (batch, seq_len=1, features)
    X_tensor = torch.tensor(X, dtype=torch.float32).unsqueeze(1)

    # Build and train the autoencoder.
    print("[*] Building LSTM autoencoder...")
    model = LSTMAutoencoder(input_dim=X.shape[1]).to(device)
    model = train_autoencoder(model, X_tensor, device)

    # Per-flow anomaly score from reconstruction error.
    flows["ae_score"] = compute_ae_scores(model, X_tensor, device)

    # Isolation Forest pseudo-labels: fit_predict returns -1 for outliers,
    # which we map to the positive class.
    print("[*] Generating anomaly labels via Isolation Forest...")
    iso = IsolationForest(contamination=0.15, random_state=42)
    labels = iso.fit_predict(flows[features])
    flows["label"] = (labels == -1).astype(int)
    print(f"[!] Pseudo anomalies: {flows['label'].sum()}")

    # Train an XGBoost classifier on the pseudo-labels.
    print("[*] Training XGBoost...")
    clf = XGBClassifier(
        n_estimators=400,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_lambda=2,
        tree_method="hist",
    )
    clf.fit(X, flows["label"])

    # Predict breach probability and apply a fixed decision threshold.
    probs = clf.predict_proba(X)[:, 1]
    flows["breach_prob"] = probs
    threshold = 0.6
    flows["breach"] = (probs >= threshold).astype(int)
    print(f"[!] Total predicted breaches: {flows['breach'].sum()}")

    flows.to_csv("xgb_lstm_predictions.csv", index=False)
    print("[+] Results saved to xgb_lstm_predictions.csv")
    print("[DONE]")
# ----------------------------------------------------------
# RUN
# ----------------------------------------------------------
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Flow-level breach prediction (LSTM autoencoder + XGBoost)"
    )
    parser.add_argument("--logfile", type=str, required=True,
                        help="Path to the packet log CSV")
    args = parser.parse_args()
    main(args.logfile)
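# Example invocation (the CSV filename here is illustrative):
#   python XGB_lstm.py --logfile packet_logs.csv
# Writes xgb_lstm_predictions.csv with per-flow ae_score, breach_prob, and
# breach columns appended to the flow features.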