NetSentinel / src /services /preprocess_features.py
Mekam's picture
refactor(prediction): refactor the controller code
70201a5
import pandas as pd
import numpy as np
def preprocess_features(data: pd.DataFrame) -> pd.DataFrame:
    """Clean the CSV features and append derived ones before prediction.

    Rows containing NaN or +/-inf in any column are dropped, then ratio,
    size, timing, throughput and variation features are appended as new
    columns. The caller's frame is not modified: a new DataFrame is
    returned, and it keeps the original index labels of the surviving rows.

    Args:
        data: Frame that must contain the columns ``Number``, ``IAT``,
            ``Min``, ``Max``, ``Tot size``, ``Rate``, ``Std``, ``AVG``,
            ``TCP``, ``UDP`` and ``syn_count``, ``ack_count``,
            ``fin_count``, ``rst_count``.

    Returns:
        The cleaned frame with the derived feature columns appended.

    Raises:
        KeyError: If one or more required columns are missing; the message
            lists every missing column, not just the first one.
    """
    flag_names = ("syn", "ack", "fin", "rst")
    required_columns = (
        "Number", "IAT", "Min", "Max", "Tot size", "Rate", "Std", "AVG",
        "TCP", "UDP", *(f"{flag}_count" for flag in flag_names),
    )

    print("start data preprocessing")

    # Fail fast with one complete message instead of a bare KeyError raised
    # halfway through the feature computation.
    missing = [col for col in required_columns if col not in data.columns]
    if missing:
        raise KeyError(f"missing required columns: {missing}")

    # replace()/dropna() return new objects, so the input frame is untouched.
    data = data.replace([np.inf, -np.inf], np.nan).dropna()

    # Small offset so that zero denominators do not produce inf/NaN.
    eps = 1e-6
    number = data["Number"] + eps
    iat = data["IAT"] + eps
    min_val = data["Min"] + 1

    # Flag ratios: per-flag count divided by the packet count.
    for flag in flag_names:
        data[f"{flag}_ratio"] = data[f"{flag}_count"] / number

    # Packet sizes
    data["mean_pkt_size"] = data["Tot size"] / number
    data["pkt_size_range"] = data["Max"] - data["Min"]
    data["pkt_size_ratio"] = data["Max"] / min_val

    # Timing
    data["mean_iat"] = data["IAT"] / number
    data["pkt_rate"] = data["Number"] / iat

    # Throughput
    data["throughput"] = data["Tot size"] / iat
    data["bytes_per_sec"] = data["Rate"] * data["Tot size"]

    # Variation
    data["coef_var"] = data["Std"] / (data["AVG"] + eps)

    # Combined indicators
    data["tcp_udp_ratio"] = data["TCP"] / (data["UDP"] + 1)
    # NOTE: despite its name this is the plain sum of the four flag ratios,
    # not a Shannon entropy; the column name is kept for downstream users.
    data["flag_entropy"] = (
        data["syn_ratio"]
        + data["ack_ratio"]
        + data["fin_ratio"]
        + data["rst_ratio"]
    )

    print("finish data preprocessing")
    return data