# amide-models/XGB_lstm.py
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from xgboost import XGBClassifier
import argparse
import warnings
warnings.filterwarnings("ignore")  # silence library deprecation warnings
# ----------------------------------------------------------
# 1. LOAD LOGS AND FEATURE ENGINEERING
# ----------------------------------------------------------
def flow_features(flow):
    """Summarize one (src, dst, sport, dport) flow into fixed-size statistics."""
    packet_sizes = flow["packet_size"].values
    times = flow["timestamp"].values
    return pd.Series({
        "packet_count": len(packet_sizes),
        "packet_min": float(packet_sizes.min()),
        "packet_max": float(packet_sizes.max()),
        "packet_mean": float(packet_sizes.mean()),
        "packet_std": float(packet_sizes.std(ddof=0)),
        # Inter-arrival stats need at least two packets; default to 0 otherwise.
        "interarrival_mean": float(np.mean(np.diff(times))) if len(times) > 1 else 0.0,
        "interarrival_std": float(np.std(np.diff(times))) if len(times) > 1 else 0.0,
        "bytes_total": float(packet_sizes.sum()),
    })
def extract_flows(path):
    """Load raw packet logs and aggregate them into per-flow feature rows."""
    print("[*] Loading logs...")
    df = pd.read_csv(path)
    df["timestamp"] = pd.to_numeric(df["timestamp"], errors="coerce").fillna(0)
    print("[*] Calculating flow-level features...")
    # Group by the flow 4-tuple; keep the group keys in the index so that
    # reset_index() restores them as columns (group_keys=False would drop them).
    flows = (
        df.groupby(["src_ip", "dst_ip", "src_port", "dst_port"])
        .apply(flow_features)
        .reset_index()
        .fillna(0)
    )
    print(f"[*] Extracted {len(flows)} flows.")
    return flows
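# extract_flows() assumes the input CSV carries at least these columns
# (inferred from the fields referenced above; exact dtypes are an assumption):
#   src_ip, dst_ip, src_port, dst_port   -> flow 4-tuple used for grouping
#   timestamp                            -> numeric packet time (e.g. epoch seconds)
#   packet_size                          -> per-packet size in bytes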
# ----------------------------------------------------------
# 2. LSTM AUTOENCODER (2-LAYER ENCODER/DECODER, LINEAR BOTTLENECK)
# ----------------------------------------------------------
class LSTMAutoencoder(nn.Module):
    """Two-layer LSTM encoder/decoder with a linear latent bottleneck."""

    def __init__(self, input_dim, hidden_dim=64, latent_dim=32):
        super().__init__()
        self.encoder = nn.LSTM(input_dim, hidden_dim, batch_first=True, num_layers=2)
        self.enc_linear = nn.Linear(hidden_dim, latent_dim)
        self.dec_linear = nn.Linear(latent_dim, hidden_dim)
        # Decoder LSTM maps the hidden representation back to input_dim per step.
        self.decoder = nn.LSTM(hidden_dim, input_dim, batch_first=True, num_layers=2)

    def forward(self, x):
        enc_out, _ = self.encoder(x)           # (batch, seq_len, hidden_dim)
        h_last = enc_out[:, -1, :]             # last time step as the flow summary
        z = torch.relu(self.enc_linear(h_last))
        # Broadcast the latent code across the sequence length for decoding.
        dec_h = torch.relu(self.dec_linear(z)).unsqueeze(1)
        dec_out, _ = self.decoder(dec_h.repeat(1, x.size(1), 1))
        return dec_out                         # (batch, seq_len, input_dim)
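# Shape sanity check (illustrative; 8 matches the flow features used below):
#   >>> ae = LSTMAutoencoder(input_dim=8)
#   >>> x = torch.randn(4, 1, 8)           # (batch, seq_len=1, features)
#   >>> ae(x).shape
#   torch.Size([4, 1, 8])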
# ----------------------------------------------------------
# 3. TRAIN AUTOENCODER
# ----------------------------------------------------------
def train_autoencoder(model, X_tensor, device):
    """Fit the autoencoder with full-batch MSE reconstruction loss."""
    print("[*] Training LSTM autoencoder...")
    criterion = nn.MSELoss()
    optim = torch.optim.Adam(model.parameters(), lr=0.001)
    X_dev = X_tensor.to(device)  # move once instead of twice per epoch
    for epoch in range(1, 31):
        model.train()
        optim.zero_grad()
        recon = model(X_dev)
        loss = criterion(recon, X_dev)
        loss.backward()
        optim.step()
        print(f"Epoch {epoch}/30 - Loss: {loss.item():.4f}")
    print("[+] Autoencoder training completed.")
    return model
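# The trainer above runs full-batch gradient descent, which suits the small
# flow tables this script targets. For larger logs, a mini-batch variant could
# be swapped in; the sketch below is illustrative and not part of the original
# pipeline (the function name and batch_size default are assumptions).
def train_autoencoder_minibatch(model, X_tensor, device, batch_size=256):
    from torch.utils.data import DataLoader, TensorDataset
    loader = DataLoader(TensorDataset(X_tensor), batch_size=batch_size, shuffle=True)
    criterion = nn.MSELoss()
    optim = torch.optim.Adam(model.parameters(), lr=0.001)
    for epoch in range(1, 31):
        model.train()
        for (xb,) in loader:
            xb = xb.to(device)
            optim.zero_grad()
            loss = criterion(model(xb), xb)
            loss.backward()
            optim.step()
    return model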
# ----------------------------------------------------------
# 4. ANOMALY SCORING (RECONSTRUCTION ERROR)
# ----------------------------------------------------------
def compute_ae_scores(model, X_tensor, device):
    """Score each flow by its mean squared reconstruction error."""
    print("[*] Computing anomaly scores...")
    model.eval()
    with torch.no_grad():
        # Compare input and reconstruction on the same device.
        X_dev = X_tensor.to(device)
        recon = model(X_dev)
        # Average the squared error over the sequence and feature dimensions.
        mse = torch.mean((X_dev - recon) ** 2, dim=(1, 2))
    scores = mse.cpu().tolist()
    print("[+] Scores computed.")
    return scores
# ----------------------------------------------------------
# 5. MAIN PIPELINE
# ----------------------------------------------------------
def main(logfile):
    flows = extract_flows(logfile)
    features = [
        "packet_count", "packet_min", "packet_max", "packet_mean",
        "packet_std", "interarrival_mean", "interarrival_std", "bytes_total",
    ]
    print("[*] Scaling features...")
    scaler = StandardScaler()
    X = scaler.fit_transform(flows[features])
    device = "cpu"
    print(f"[*] Using device: {device}")
    # Torch input shape: (batch, seq_len=1, features)
    X_tensor = torch.tensor(X, dtype=torch.float32).unsqueeze(1)

    # Build and train the autoencoder.
    print("[*] Building LSTM autoencoder...")
    model = LSTMAutoencoder(input_dim=X.shape[1]).to(device)
    model = train_autoencoder(model, X_tensor, device)

    # Per-flow anomaly score from reconstruction error.
    flows["ae_score"] = compute_ae_scores(model, X_tensor, device)

    # Isolation Forest pseudo-labels: fit_predict returns -1 for outliers,
    # which we map to the positive class.
    print("[*] Generating anomaly labels via Isolation Forest...")
    iso = IsolationForest(contamination=0.15, random_state=42)
    labels = iso.fit_predict(flows[features])
    flows["label"] = (labels == -1).astype(int)
    print(f"[!] Pseudo anomalies: {flows['label'].sum()}")

    # Train an XGBoost classifier on the pseudo-labels.
    print("[*] Training XGBoost...")
    clf = XGBClassifier(
        n_estimators=400,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_lambda=2,
        tree_method="hist",
    )
    clf.fit(X, flows["label"])

    # Predict breach probability and apply a fixed decision threshold.
    probs = clf.predict_proba(X)[:, 1]
    flows["breach_prob"] = probs
    threshold = 0.6
    flows["breach"] = (probs >= threshold).astype(int)
    print(f"[!] Total predicted breaches: {flows['breach'].sum()}")

    flows.to_csv("xgb_lstm_predictions.csv", index=False)
    print("[+] Results saved to xgb_lstm_predictions.csv")
    print("[DONE]")
# ----------------------------------------------------------
# RUN
# ----------------------------------------------------------
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Flow-level breach prediction (LSTM autoencoder + XGBoost)"
    )
    parser.add_argument("--logfile", type=str, required=True,
                        help="Path to the packet log CSV")
    args = parser.parse_args()
    main(args.logfile)
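# Example invocation (the CSV filename here is illustrative):
#   python XGB_lstm.py --logfile packet_logs.csv
# Writes xgb_lstm_predictions.csv with per-flow ae_score, breach_prob, and
# breach columns appended to the flow features.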