Spaces:

yomnafarag95
/

Log_Classifier

Configuration error

App Files Files Community

Log_Classifier / retrain.py

yomnafarag95

Upload 6 files

7377758 verified 8 days ago

raw

history blame contribute delete

5.6 kB

	"""
	retrain.py — Firewall Log Classifier
	Trains the Random Forest model on Dataset__log2_.csv and saves:
	- model.joblib
	- scaler.joblib
	- label_encoder.joblib

	Run: python retrain.py
	Requires: pip install scikit-learn imbalanced-learn pandas joblib
	"""

	import os
	import pandas as pd
	import numpy as np
	import joblib
	from sklearn.ensemble import RandomForestClassifier
	from sklearn.preprocessing import LabelEncoder, StandardScaler
	from sklearn.model_selection import train_test_split
	from sklearn.metrics import accuracy_score, f1_score, classification_report
	from imblearn.over_sampling import SMOTE

	# ── Config ────────────────────────────────────────────────────────────────────
	# CSV to train on. The path is relative, so it is resolved against the
	# current working directory at run time.
	DATASET_PATH = "Dataset__log2_.csv"

	# Column holding the label the classifier is trained to predict.
	TARGET_COL = "Action"

	# Input columns. The order here is the column order of the feature matrix
	# that the scaler and the model are fitted on, so do not reorder it.
	FEATURE_COLS = [
	    "Source Port", "Destination Port",
	    "NAT Source Port", "NAT Destination Port",
	    "Bytes", "Bytes Sent", "Bytes Received",
	    "Packets", "Elapsed Time (sec)",
	    "pkts_sent", "pkts_received",
	]

	# Artifacts are written into the directory that contains this script,
	# regardless of where the script is launched from.
	_SCRIPT_PATH = os.path.abspath(__file__)
	OUTPUT_DIR = os.path.dirname(_SCRIPT_PATH)

	# ── 1. Load ───────────────────────────────────────────────────────────────────
	# Read the whole CSV into a DataFrame and report its size and header.
	print(f"Loading {DATASET_PATH} ...")
	df = pd.read_csv(DATASET_PATH)
	print(f" Raw rows: {len(df):,} Columns: {list(df.columns)}")

	# ── 2. Verify columns exist ───────────────────────────────────────────────────
	# Every feature column plus the target must be present. If any is absent,
	# list what is missing and what is available, then stop with exit status 1
	# before any processing starts.
	required_cols = [*FEATURE_COLS, TARGET_COL]
	missing = [name for name in required_cols if name not in df.columns]
	if missing:
	    print("\nERROR — these columns are missing from the CSV:")
	    print("\n".join(f" '{name}'" for name in missing))
	    print("\nAvailable columns:", list(df.columns))
	    raise SystemExit(1)

	# ── 3. Preprocessing ──────────────────────────────────────────────────────────
	# Remove exact duplicate rows. No IQR outlier filter is applied on purpose:
	# it eliminates the rare reset-both class entirely, so class imbalance is
	# left to SMOTE instead.
	df = df.drop_duplicates()
	print(f" After dedup: {len(df):,} rows")

	# Feature matrix (columns in FEATURE_COLS order) and raw label vector.
	feature_frame = df[FEATURE_COLS]
	X = feature_frame.values
	y = df[TARGET_COL].values

	class_counts = pd.Series(y).value_counts()
	print(f" Class distribution:\n{class_counts.to_string()}")

	# ── 4. Encode labels ──────────────────────────────────────────────────────────
	# Learn the label vocabulary from y, then map every label to its integer
	# code. The fitted encoder is kept in `le` because it is saved later so the
	# codes can be turned back into label names.
	le = LabelEncoder().fit(y)
	y_enc = le.transform(y)
	print(f" Classes: {list(le.classes_)}")

	# ── 5. Train / test split ─────────────────────────────────────────────────────
	# Hold out 30% of the rows for evaluation. The split is stratified on the
	# encoded label and uses a fixed seed so that reruns produce the same split.
	split_parts = train_test_split(
	    X,
	    y_enc,
	    test_size=0.30,
	    random_state=42,
	    stratify=y_enc,
	)
	X_train, X_test, y_train, y_test = split_parts
	print(f" Train: {len(X_train):,} Test: {len(X_test):,}")

	# ── 6. Scale ──────────────────────────────────────────────────────────────────
	# Fit the scaler on the real training rows only (never on the test split and
	# never on synthetic rows), so the saved scaler reflects the true feature
	# distribution that inference-time inputs come from.
	scaler = StandardScaler()
	X_train_scaled = scaler.fit_transform(X_train)
	X_test_sc = scaler.transform(X_test)

	# ── 7. SMOTE ──────────────────────────────────────────────────────────────────
	# Oversample AFTER scaling. SMOTE picks nearest neighbours by distance, so on
	# raw data the large-magnitude counters (bytes, packets) would dominate the
	# neighbour search and the port columns would barely count. Only the training
	# split is resampled; the test split stays untouched.
	print("Applying SMOTE ...")
	sm = SMOTE(random_state=42)
	X_train_sc, y_train_res = sm.fit_resample(X_train_scaled, y_train)
	# Kept under its previous name as well; it now refers to the scaled,
	# resampled training matrix.
	X_train_res = X_train_sc
	print(f" After SMOTE train size: {len(X_train_sc):,}")

	# ── 8. Train tuned Random Forest ──────────────────────────────────────────────
	# NOTE: the values in the progress message are written out by hand; keep
	# them in sync with rf_params when tuning.
	print("Training Random Forest (n_estimators=200, max_depth=20) ...")
	rf_params = {
	    "n_estimators": 200,
	    "max_depth": 20,
	    "min_samples_split": 2,
	    "random_state": 42,  # fixed seed so retraining is repeatable
	    "n_jobs": -1,
	}
	rf = RandomForestClassifier(**rf_params)
	# Fit on the scaled, SMOTE-resampled training data.
	rf.fit(X_train_sc, y_train_res)

	# ── 9. Evaluate ───────────────────────────────────────────────────────────────
	# Score the model on the held-out test split.
	#
	# The full list of encoded class ids is passed explicitly to the metrics.
	# Without it, classification_report infers the labels from y_test/y_pred and
	# raises ValueError when a rare class is absent from both, because the
	# inferred label count no longer matches target_names. That would abort the
	# run after training but before the artifacts are saved.
	class_ids = list(range(len(le.classes_)))  # LabelEncoder codes are 0..n_classes-1
	class_names = [str(name) for name in le.classes_]  # the report expects strings

	y_pred = rf.predict(X_test_sc)
	acc = accuracy_score(y_test, y_pred)
	# Same label set as the report, so this value matches its "macro avg" row.
	mf1 = f1_score(y_test, y_pred, labels=class_ids, average="macro")
	print(f"\nTest Accuracy : {acc*100:.2f}%")
	print(f"Macro F1 : {mf1:.4f}")
	print("\nClassification Report:")
	print(
	    classification_report(
	        y_test,
	        y_pred,
	        labels=class_ids,
	        target_names=class_names,
	    )
	)

	# ── 10. Save artifacts ────────────────────────────────────────────────────────
	# Build the three destination paths inside OUTPUT_DIR, then write the model,
	# the scaler and the label encoder there, in that order.
	model_path, scaler_path, le_path = (
	    os.path.join(OUTPUT_DIR, filename)
	    for filename in ("model.joblib", "scaler.joblib", "label_encoder.joblib")
	)

	for artifact, destination in ((rf, model_path), (scaler, scaler_path), (le, le_path)):
	    joblib.dump(artifact, destination)

	# Tell the user where each file was written.
	print("\nSaved:")
	for destination in (model_path, scaler_path, le_path):
	    print(f" {destination}")
	print("\nDone! Upload these 3 files to your repo alongside app.py.")