File size: 1,375 Bytes
70201a5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import pandas as pd
import numpy as np


def preprocess_features(data: pd.DataFrame) -> pd.DataFrame:
    """Clean the input frame and append derived features before prediction.

    Every +/-inf value is replaced by NaN and every row that then contains a
    NaN in *any* column is dropped. The derived columns are computed on the
    remaining rows and appended to a new frame; the frame passed in by the
    caller is left unmodified.

    Args:
        data: Frame containing at least the columns "Number", "IAT", "Min",
            "Max", "Tot size", "Rate", "Std", "AVG", "TCP", "UDP",
            "syn_count", "ack_count", "fin_count" and "rst_count".

    Returns:
        A new DataFrame holding the surviving rows (original index labels
        kept) plus the columns "syn_ratio", "ack_ratio", "fin_ratio",
        "rst_ratio", "mean_pkt_size", "pkt_size_range", "pkt_size_ratio",
        "mean_iat", "pkt_rate", "throughput", "bytes_per_sec", "coef_var",
        "tcp_udp_ratio" and "flag_entropy".

    Raises:
        KeyError: If one or more of the required columns are missing; the
            message lists all of them.
    """
    print("start data preprocessing")

    # Fail early with one message naming every missing column instead of a
    # bare KeyError on whichever column happens to be accessed first.
    required_columns = (
        "Number", "IAT", "Min", "Max", "Tot size", "Rate", "Std", "AVG",
        "TCP", "UDP", "syn_count", "ack_count", "fin_count", "rst_count",
    )
    missing = [col for col in required_columns if col not in data.columns]
    if missing:
        raise KeyError(f"missing required column(s): {missing}")

    # The explicit copy guarantees the column assignments below write to an
    # independent frame (no SettingWithCopyWarning, caller's frame untouched).
    data = data.replace([np.inf, -np.inf], np.nan).dropna().copy()

    # Small offsets keep the denominators away from zero.
    eps = 1e-6
    number = data["Number"] + eps
    iat = data["IAT"] + eps
    min_val = data["Min"] + 1

    # Flag ratios: each flag count divided by the (offset) "Number" column.
    for flag in ("syn", "ack", "fin", "rst"):
        data[f"{flag}_ratio"] = data[f"{flag}_count"] / number

    # Packet sizes
    data["mean_pkt_size"] = data["Tot size"] / number
    data["pkt_size_range"] = data["Max"] - data["Min"]
    data["pkt_size_ratio"] = data["Max"] / min_val

    # Timing
    data["mean_iat"] = data["IAT"] / number
    data["pkt_rate"] = data["Number"] / iat

    # Throughput
    data["throughput"] = data["Tot size"] / iat
    data["bytes_per_sec"] = data["Rate"] * data["Tot size"]

    # Variation
    data["coef_var"] = data["Std"] / (data["AVG"] + eps)

    # Combined indicators
    data["tcp_udp_ratio"] = data["TCP"] / (data["UDP"] + 1)
    # NOTE: despite the column name, this is the plain sum of the four flag
    # ratios, not an entropy; the name is kept because callers rely on it.
    data["flag_entropy"] = (
        data["syn_ratio"] + data["ack_ratio"] + data["fin_ratio"] + data["rst_ratio"]
    )

    print("finish data preprocessing")

    return data