Spaces:
Configuration error
Configuration error
"""
retrain.py — Firewall Log Classifier

Trains the Random Forest model on Dataset__log2_.csv and saves:
  - model.joblib
  - scaler.joblib
  - label_encoder.joblib

Run:      python retrain.py
Requires: pip install scikit-learn imbalanced-learn pandas joblib
"""
import os

import joblib
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# ── Config ───────────────────────────────────────────────────────────────────
DATASET_PATH = "Dataset__log2_.csv"
FEATURE_COLS = [
    "Source Port",
    "Destination Port",
    "NAT Source Port",
    "NAT Destination Port",
    "Bytes",
    "Bytes Sent",
    "Bytes Received",
    "Packets",
    "Elapsed Time (sec)",
    "pkts_sent",
    "pkts_received",
]
TARGET_COL = "Action"
# Artifacts are written next to this script, regardless of the working directory.
OUTPUT_DIR = os.path.dirname(os.path.abspath(__file__))


def _resolve_dataset_path(path):
    """Return *path* if it exists, otherwise the same name next to this script.

    If neither location exists, *path* is returned unchanged so that the
    subsequent ``pd.read_csv`` call raises its usual ``FileNotFoundError``.
    """
    if os.path.isabs(path) or os.path.exists(path):
        return path
    beside_script = os.path.join(OUTPUT_DIR, path)
    return beside_script if os.path.exists(beside_script) else path


def _smote_k_neighbors(y_train, default=5):
    """Return a SMOTE ``k_neighbors`` the rarest training class can support.

    SMOTE looks up ``k_neighbors`` neighbours *within* a class, so a class with
    ``n`` rows supports at most ``n - 1``. The result is below 1 when some class
    has fewer than two training rows.
    """
    smallest_class = int(np.bincount(y_train).min())
    return min(default, smallest_class - 1)


def main():
    """Retrain the classifier end to end and write the three joblib artifacts.

    Steps: load the CSV, verify the required columns, drop duplicate rows,
    label-encode the target, make a stratified 70/30 split, oversample the
    training set with SMOTE, standard-scale, fit the Random Forest, print test
    metrics, and dump model / scaler / label encoder into ``OUTPUT_DIR``.

    Raises:
        SystemExit: with status 1 if a required column is missing from the CSV
            or a class is too small for SMOTE to oversample.
    """
    # ── 1. Load ──────────────────────────────────────────────────────────────
    dataset_path = _resolve_dataset_path(DATASET_PATH)
    print(f"Loading {dataset_path} ...")
    df = pd.read_csv(dataset_path)
    print(f" Raw rows: {len(df):,} Columns: {list(df.columns)}")

    # ── 2. Verify columns exist ──────────────────────────────────────────────
    missing = [c for c in FEATURE_COLS + [TARGET_COL] if c not in df.columns]
    if missing:
        print("\nERROR — these columns are missing from the CSV:")
        for m in missing:
            print(f" '{m}'")
        print("\nAvailable columns:", list(df.columns))
        raise SystemExit(1)

    # ── 3. Preprocessing ─────────────────────────────────────────────────────
    df.drop_duplicates(inplace=True)
    print(f" After dedup: {len(df):,} rows")
    # IQR filtering skipped — it eliminates the rare reset-both class entirely.
    # SMOTE handles class imbalance instead.
    X = df[FEATURE_COLS].values
    y = df[TARGET_COL].values
    print(f" Class distribution:\n{pd.Series(y).value_counts().to_string()}")

    # ── 4. Encode labels ─────────────────────────────────────────────────────
    le = LabelEncoder()
    y_enc = le.fit_transform(y)
    print(f" Classes: {list(le.classes_)}")

    # ── 5. Train / test split ────────────────────────────────────────────────
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_enc, test_size=0.30, stratify=y_enc, random_state=42
    )
    print(f" Train: {len(X_train):,} Test: {len(X_test):,}")

    # ── 6. SMOTE ─────────────────────────────────────────────────────────────
    # Only the training split is oversampled; the test split stays untouched.
    # NOTE(review): SMOTE picks neighbours by distance on the *unscaled*
    # features here, and the scaler below is fit on the resampled data —
    # consider scaling first. Order kept as-is to preserve current artifacts.
    print("Applying SMOTE ...")
    k_neighbors = _smote_k_neighbors(y_train)
    if k_neighbors < 1:
        print("\nERROR — SMOTE needs at least 2 training rows in every class.")
        raise SystemExit(1)
    # k_neighbors stays at SMOTE's default of 5 unless the rarest class is
    # too small to provide that many neighbours.
    sm = SMOTE(random_state=42, k_neighbors=k_neighbors)
    X_train_res, y_train_res = sm.fit_resample(X_train, y_train)
    print(f" After SMOTE train size: {len(X_train_res):,}")

    # ── 7. Scale ─────────────────────────────────────────────────────────────
    scaler = StandardScaler()
    X_train_sc = scaler.fit_transform(X_train_res)
    # The test split reuses the statistics learned from the training data.
    X_test_sc = scaler.transform(X_test)

    # ── 8. Train tuned Random Forest ─────────────────────────────────────────
    print("Training Random Forest (n_estimators=200, max_depth=20) ...")
    rf = RandomForestClassifier(
        n_estimators=200,
        max_depth=20,
        min_samples_split=2,
        random_state=42,
        n_jobs=-1,
    )
    rf.fit(X_train_sc, y_train_res)

    # ── 9. Evaluate ──────────────────────────────────────────────────────────
    y_pred = rf.predict(X_test_sc)
    acc = accuracy_score(y_test, y_pred)
    mf1 = f1_score(y_test, y_pred, average="macro")
    print(f"\nTest Accuracy : {acc*100:.2f}%")
    print(f"Macro F1 : {mf1:.4f}")
    print("\nClassification Report:")
    # Explicit labels keep target_names aligned with the encoder even when a
    # rare class is absent from both y_test and y_pred.
    print(
        classification_report(
            y_test,
            y_pred,
            labels=np.arange(len(le.classes_)),
            target_names=[str(c) for c in le.classes_],
            zero_division=0,
        )
    )

    # ── 10. Save artifacts ───────────────────────────────────────────────────
    model_path = os.path.join(OUTPUT_DIR, "model.joblib")
    scaler_path = os.path.join(OUTPUT_DIR, "scaler.joblib")
    le_path = os.path.join(OUTPUT_DIR, "label_encoder.joblib")
    joblib.dump(rf, model_path)
    joblib.dump(scaler, scaler_path)
    joblib.dump(le, le_path)
    print("\nSaved:")
    print(f" {model_path}")
    print(f" {scaler_path}")
    print(f" {le_path}")
    print("\nDone! Upload these 3 files to your repo alongside app.py.")


if __name__ == "__main__":
    main()