Spaces:
Runtime error
Runtime error
File size: 1,375 Bytes
70201a5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 |
import pandas as pd
import numpy as np
def preprocess_features(data: pd.DataFrame) -> pd.DataFrame:
    """Clean the raw features and append derived ones before prediction.

    Infinite values are converted to NaN and every row that contains a NaN
    in *any* column is dropped. Derived ratio, packet-size, timing,
    throughput and variation columns are then appended to the cleaned frame.
    The frame passed in by the caller is not modified.

    Args:
        data: Frame holding at least the columns ``Number``, ``IAT``,
            ``Min``, ``Max``, ``Tot size``, ``Rate``, ``Std``, ``AVG``,
            ``TCP``, ``UDP``, ``syn_count``, ``ack_count``, ``fin_count``
            and ``rst_count``.

    Returns:
        A new frame with the surviving rows, the original columns, and the
        derived columns ``syn_ratio``, ``ack_ratio``, ``fin_ratio``,
        ``rst_ratio``, ``mean_pkt_size``, ``pkt_size_range``,
        ``pkt_size_ratio``, ``mean_iat``, ``pkt_rate``, ``throughput``,
        ``bytes_per_sec``, ``coef_var``, ``tcp_udp_ratio`` and
        ``flag_entropy`` (added in that order).

    Raises:
        KeyError: If one or more required columns are missing; the message
            lists all of them.
    """
    print("start data preprocessing")

    # Fail fast with the full list of missing columns instead of a bare
    # KeyError on whichever column happens to be touched first.
    required_columns = (
        "Number", "IAT", "Min", "Max", "Tot size", "Rate", "Std", "AVG",
        "TCP", "UDP", "syn_count", "ack_count", "fin_count", "rst_count",
    )
    missing = [col for col in required_columns if col not in data.columns]
    if missing:
        raise KeyError(
            f"preprocess_features: missing required column(s): {missing}"
        )

    # replace()/dropna() return new objects, so the caller's frame is untouched.
    data = data.replace([np.inf, -np.inf], np.nan).dropna()

    # Small offset keeps the divisions below finite when a denominator is 0.
    eps = 1e-6
    number = data["Number"] + eps
    iat = data["IAT"] + eps
    min_val = data["Min"] + 1

    # Flag ratios
    data["syn_ratio"] = data["syn_count"] / number
    data["ack_ratio"] = data["ack_count"] / number
    data["fin_ratio"] = data["fin_count"] / number
    data["rst_ratio"] = data["rst_count"] / number

    # Packet sizes
    data["mean_pkt_size"] = data["Tot size"] / number
    data["pkt_size_range"] = data["Max"] - data["Min"]
    data["pkt_size_ratio"] = data["Max"] / min_val

    # Timing
    data["mean_iat"] = data["IAT"] / number
    data["pkt_rate"] = data["Number"] / iat

    # Throughput
    data["throughput"] = data["Tot size"] / iat
    data["bytes_per_sec"] = data["Rate"] * data["Tot size"]

    # Variation
    data["coef_var"] = data["Std"] / (data["AVG"] + eps)

    # Combined indicators
    data["tcp_udp_ratio"] = data["TCP"] / (data["UDP"] + 1)
    # NOTE: despite the name, this is the plain sum of the four flag ratios,
    # not an entropy; the column name is kept for downstream compatibility.
    data["flag_entropy"] = (
        data["syn_ratio"] + data["ack_ratio"] + data["fin_ratio"] + data["rst_ratio"]
    )

    print("finish data preprocessing")
    return data
|