import numpy as np
import pandas as pd

# Input columns that preprocess_features reads; checked up front so a bad
# schema is reported once, with every missing name, before any work is done.
_REQUIRED_COLUMNS = (
    "Number",
    "IAT",
    "Min",
    "Max",
    "Tot size",
    "Rate",
    "Std",
    "AVG",
    "TCP",
    "UDP",
    "syn_count",
    "ack_count",
    "fin_count",
    "rst_count",
)

# Flag prefixes: each "<flag>_count" column yields a "<flag>_ratio" column.
_FLAG_NAMES = ("syn", "ack", "fin", "rst")


def preprocess_features(data: pd.DataFrame, *, eps: float = 1e-6) -> pd.DataFrame:
    """Clean the input frame and append derived features before prediction.

    Rows containing ``inf``, ``-inf`` or ``NaN`` in any column are dropped,
    then the derived columns are appended in this order: ``syn_ratio``,
    ``ack_ratio``, ``fin_ratio``, ``rst_ratio``, ``mean_pkt_size``,
    ``pkt_size_range``, ``pkt_size_ratio``, ``mean_iat``, ``pkt_rate``,
    ``throughput``, ``bytes_per_sec``, ``coef_var``, ``tcp_udp_ratio``,
    ``flag_entropy``.

    The caller's frame is not modified; a new frame is returned.

    Args:
        data: Frame holding at least the columns in ``_REQUIRED_COLUMNS``.
        eps: Small constant added to the ``Number``, ``IAT`` and ``AVG``
            denominators to avoid division by zero. Defaults to ``1e-6``.

    Returns:
        A new DataFrame with the surviving rows and the derived columns.

    Raises:
        KeyError: If one or more required columns are absent; the message
            lists all of them.
    """
    print("start data preprocessing")

    missing = [col for col in _REQUIRED_COLUMNS if col not in data.columns]
    if missing:
        raise KeyError(f"missing required columns: {missing}")

    # replace() and dropna() both return new objects, so the column
    # assignments below never touch the caller's frame.
    data = data.replace([np.inf, -np.inf], np.nan).dropna()

    # Guarded denominators.
    number = data["Number"] + eps
    iat = data["IAT"] + eps
    min_val = data["Min"] + 1  # +1 rather than eps, as in the original design

    # Flag ratios
    for flag in _FLAG_NAMES:
        data[f"{flag}_ratio"] = data[f"{flag}_count"] / number

    # Packet sizes
    data["mean_pkt_size"] = data["Tot size"] / number
    data["pkt_size_range"] = data["Max"] - data["Min"]
    data["pkt_size_ratio"] = data["Max"] / min_val

    # Timing
    data["mean_iat"] = data["IAT"] / number
    data["pkt_rate"] = data["Number"] / iat

    # Throughput
    data["throughput"] = data["Tot size"] / iat
    # NOTE(review): computed literally as Rate * Tot size; whether that is
    # really "bytes per second" depends on the units of Rate — confirm.
    data["bytes_per_sec"] = data["Rate"] * data["Tot size"]

    # Variation
    data["coef_var"] = data["Std"] / (data["AVG"] + eps)

    # Combined indicators
    data["tcp_udp_ratio"] = data["TCP"] / (data["UDP"] + 1)
    # Despite the name, this is the plain sum of the four flag ratios,
    # not a Shannon entropy. The column name is kept for compatibility.
    data["flag_entropy"] = (
        data["syn_ratio"]
        + data["ack_ratio"]
        + data["fin_ratio"]
        + data["rst_ratio"]
    )

    print("finish data preprocessing")
    return data