import os
import statistics
from collections import defaultdict

import numpy as np
import pandas as pd
from scapy.all import rdpcap, IP, TCP, UDP, IPv6

# Inter-arrival gaps longer than this many seconds are treated as idle time.
_IDLE_THRESHOLD_S = 5.0

# Substituted for a zero flow duration so the per-second rates stay finite.
_MIN_DURATION_S = 1e-6

# Fixed size of the IPv6 base header, in bytes.
_IPV6_HEADER_LEN = 40

# Scapy TCP flag letter -> key in the per-flow "flags" counter.
_TCP_FLAG_NAMES = {
    "F": "FIN",
    "S": "SYN",
    "R": "RST",
    "P": "PSH",
    "A": "ACK",
    "U": "URG",
    "C": "CWR",
    "E": "ECE",
}


def safe_div(x, y):
    """Return ``x / y``, or 0 when ``y`` is zero."""
    return x / y if y != 0 else 0


def calculate_stats(values):
    """Return ``(min, max, mean, stdev)`` of *values*.

    An empty input yields four zeros. The standard deviation is the sample
    standard deviation and is reported as 0 for fewer than two values.
    """
    if not values:
        return 0, 0, 0, 0
    spread = statistics.stdev(values) if len(values) > 1 else 0
    return min(values), max(values), statistics.mean(values), spread


def _new_flow():
    """Return the empty accumulator used for one bidirectional flow."""
    return {
        "src_ip": None,
        "dst_ip": None,
        "src_port": 0,
        "dst_port": 0,
        "protocol": 0,
        "timestamps": [],
        "fwd_timestamps": [],
        "bwd_timestamps": [],
        "fwd_pkt_lens": [],
        "bwd_pkt_lens": [],
        "all_pkt_lens": [],
        "fwd_header_lens": [],
        "bwd_header_lens": [],
        "flags": {name: 0 for name in _TCP_FLAG_NAMES.values()},
        "fwd_flags": {"PSH": 0, "URG": 0},
        # None means "not seen yet", so a genuine first window of 0 is kept.
        "init_fwd_win": None,
        "init_bwd_win": None,
        "fwd_act_data_pkts": 0,
        "fwd_seg_size_min": None,
    }


def _tcp_data_len(tcp, ip_payload_len):
    """Return the number of TCP payload bytes carried by one segment.

    *ip_payload_len* is the transport-layer length taken from the IP header.
    Using it (rather than the captured bytes) keeps link-layer padding out
    of the count. When the IP length field is unusable (zero or missing),
    the captured payload size is used instead.
    """
    if ip_payload_len > 0:
        return max(ip_payload_len - (tcp.dataofs or 5) * 4, 0)
    return len(tcp.payload)


def _collect_flows(packets):
    """Group TCP/UDP packets into bidirectional flows keyed by 5-tuple.

    The first packet seen for a 5-tuple defines the forward direction;
    packets matching the reversed tuple are recorded as backward. Packets
    that are neither IPv4 nor IPv6, or carry neither TCP nor UDP, are skipped.
    """
    flows = defaultdict(_new_flow)

    for pkt in packets:
        if IP in pkt:
            net = pkt[IP]
            proto = net.proto
            header_len = net.ihl * 4
            ip_payload_len = (net.len or 0) - header_len
        elif IPv6 in pkt:
            net = pkt[IPv6]
            proto = net.nh
            header_len = _IPV6_HEADER_LEN
            ip_payload_len = net.plen or 0
        else:
            continue
        src_ip, dst_ip = net.src, net.dst

        is_tcp = TCP in pkt
        if is_tcp:
            transport = pkt[TCP]
        elif UDP in pkt:
            transport = pkt[UDP]
        else:
            continue
        src_port, dst_port = transport.sport, transport.dport

        # NOTE(review): this is the length of everything above the outermost
        # layer (headers included), not the transport payload alone.
        payload_len = len(pkt.payload)

        key = (src_ip, dst_ip, src_port, dst_port, proto)
        rev_key = (dst_ip, src_ip, dst_port, src_port, proto)
        if key in flows:
            flow, direction = flows[key], "fwd"
        elif rev_key in flows:
            flow, direction = flows[rev_key], "bwd"
        else:
            flow, direction = flows[key], "fwd"
            flow["src_ip"] = src_ip
            flow["dst_ip"] = dst_ip
            flow["src_port"] = src_port
            flow["dst_port"] = dst_port
            flow["protocol"] = proto

        timestamp = float(pkt.time)
        flow["timestamps"].append(timestamp)
        flow["all_pkt_lens"].append(payload_len)
        flow[f"{direction}_timestamps"].append(timestamp)
        flow[f"{direction}_pkt_lens"].append(payload_len)
        flow[f"{direction}_header_lens"].append(header_len)

        if not is_tcp:
            continue

        win_key = f"init_{direction}_win"
        if flow[win_key] is None:
            flow[win_key] = transport.window

        is_fwd = direction == "fwd"
        if is_fwd:
            if _tcp_data_len(transport, ip_payload_len) > 0:
                flow["fwd_act_data_pkts"] += 1
            # Approximation: tracks the IP header length, not the TCP one.
            smallest = flow["fwd_seg_size_min"]
            if smallest is None or header_len < smallest:
                flow["fwd_seg_size_min"] = header_len

        tcp_flags = transport.flags
        for letter, name in _TCP_FLAG_NAMES.items():
            if letter in tcp_flags:
                flow["flags"][name] += 1
                if is_fwd and name in flow["fwd_flags"]:
                    flow["fwd_flags"][name] += 1

    return flows


def _inter_arrival_times(timestamps):
    """Return the gaps between consecutive entries of *timestamps*."""
    return [later - earlier for earlier, later in zip(timestamps, timestamps[1:])]


def _flow_to_row(flow):
    """Turn one flow accumulator into a feature row (column name -> value)."""
    fwd_lens = flow["fwd_pkt_lens"]
    bwd_lens = flow["bwd_pkt_lens"]
    total_fwd_pkts = len(fwd_lens)
    total_bwd_pkts = len(bwd_lens)
    total_fwd_len = sum(fwd_lens)
    total_bwd_len = sum(bwd_lens)

    fwd_min, fwd_max, _, fwd_std = calculate_stats(fwd_lens)
    bwd_min, bwd_max, _, bwd_std = calculate_stats(bwd_lens)
    pkt_min, pkt_max, pkt_mean, pkt_std = calculate_stats(flow["all_pkt_lens"])

    timestamps = flow["timestamps"]
    duration = max(timestamps) - min(timestamps) if timestamps else 0
    if duration == 0:
        duration = _MIN_DURATION_S

    flow_iats = _inter_arrival_times(timestamps)
    _, flow_iat_max, flow_iat_mean, flow_iat_std = calculate_stats(flow_iats)
    _, fwd_iat_max, _, fwd_iat_std = calculate_stats(
        _inter_arrival_times(flow["fwd_timestamps"])
    )
    _, bwd_iat_max, _, bwd_iat_std = calculate_stats(
        _inter_arrival_times(flow["bwd_timestamps"])
    )

    # Simplified active/idle split: each inter-arrival gap is classified on
    # its own. calculate_stats returns zeros for an empty list, which is the
    # value wanted when a flow has no gaps of a given kind.
    idle_min, idle_max, idle_mean, idle_std = calculate_stats(
        [gap for gap in flow_iats if gap > _IDLE_THRESHOLD_S]
    )
    active_min, active_max, active_mean, active_std = calculate_stats(
        [gap for gap in flow_iats if gap <= _IDLE_THRESHOLD_S]
    )

    flags = flow["flags"]
    init_fwd_win = flow["init_fwd_win"]
    init_bwd_win = flow["init_bwd_win"]
    fwd_seg_size_min = flow["fwd_seg_size_min"]

    return {
        "Protocol": flow["protocol"],
        "Total Fwd Packets": total_fwd_pkts,
        "Total Backward Packets": total_bwd_pkts,
        "Fwd Packets Length Total": total_fwd_len,
        "Bwd Packets Length Total": total_bwd_len,
        "Fwd Packet Length Max": fwd_max,
        "Fwd Packet Length Min": fwd_min,
        "Fwd Packet Length Std": fwd_std,
        "Bwd Packet Length Max": bwd_max,
        "Bwd Packet Length Min": bwd_min,
        "Bwd Packet Length Std": bwd_std,
        "Flow Bytes/s": (total_fwd_len + total_bwd_len) / duration,
        "Flow Packets/s": (total_fwd_pkts + total_bwd_pkts) / duration,
        "Flow IAT Mean": flow_iat_mean,
        "Flow IAT Std": flow_iat_std,
        "Flow IAT Max": flow_iat_max,
        "Fwd IAT Std": fwd_iat_std,
        "Fwd IAT Max": fwd_iat_max,
        "Bwd IAT Std": bwd_iat_std,
        "Bwd IAT Max": bwd_iat_max,
        "Fwd PSH Flags": flow["fwd_flags"]["PSH"],
        "Fwd URG Flags": flow["fwd_flags"]["URG"],
        "Fwd Header Length": sum(flow["fwd_header_lens"]),
        "Bwd Header Length": sum(flow["bwd_header_lens"]),
        "Fwd Packets/s": total_fwd_pkts / duration,
        "Bwd Packets/s": total_bwd_pkts / duration,
        "Packet Length Min": pkt_min,
        "Packet Length Max": pkt_max,
        "Packet Length Mean": pkt_mean,
        "Packet Length Std": pkt_std,
        "FIN Flag Count": flags["FIN"],
        "SYN Flag Count": flags["SYN"],
        "RST Flag Count": flags["RST"],
        "PSH Flag Count": flags["PSH"],
        "ACK Flag Count": flags["ACK"],
        "URG Flag Count": flags["URG"],
        # Column name kept as-is; it is fed from the CWR counter.
        "CWE Flag Count": flags["CWR"],
        "ECE Flag Count": flags["ECE"],
        "Down/Up Ratio": safe_div(total_bwd_pkts, total_fwd_pkts),
        "Init Fwd Win Bytes": 0 if init_fwd_win is None else init_fwd_win,
        "Init Bwd Win Bytes": 0 if init_bwd_win is None else init_bwd_win,
        "Fwd Act Data Packets": flow["fwd_act_data_pkts"],
        "Fwd Seg Size Min": 0 if fwd_seg_size_min is None else fwd_seg_size_min,
        "Active Mean": active_mean,
        "Active Std": active_std,
        "Active Max": active_max,
        "Active Min": active_min,
        "Idle Mean": idle_mean,
        "Idle Std": idle_std,
        "Idle Max": idle_max,
        "Idle Min": idle_min,
        # Label columns are placeholders; this function does no classification.
        "Attack_type": "Unknown",
        "Attack_encode": 0,
        "mapped_label": "Unknown",
        "severity_raw": 0,
        "severity": "Unknown",
    }


def _empty_feature_frame():
    """Return a zero-row DataFrame that still carries every feature column."""
    # Deriving the columns from a blank flow keeps them in sync with
    # _flow_to_row without maintaining a second list of names.
    return pd.DataFrame(columns=list(_flow_to_row(_new_flow())))


def convert_pcap_to_csv(pcap_file_path):
    """
    Convert a PCAP file to a Pandas DataFrame with CIC-IDS-2017 like features
    using Scapy.

    Packets are grouped into bidirectional TCP/UDP flows and each flow becomes
    one row. Despite the name, nothing is written to disk; the caller receives
    the DataFrame.

    Args:
        pcap_file_path: Path of the capture file, passed to ``rdpcap``.

    Returns:
        A DataFrame with one row per flow. If the capture cannot be read or
        processed, or contains no TCP/UDP flows, the frame has zero rows but
        still has the full set of feature columns. Errors are printed, not
        raised.
    """
    try:
        packets = rdpcap(pcap_file_path)
        rows = [_flow_to_row(flow) for flow in _collect_flows(packets).values()]
    except Exception as e:
        # Best-effort boundary: report the problem and hand back an empty,
        # correctly-shaped frame so downstream column lookups keep working.
        print(f"Error converting PCAP: {e}")
        return _empty_feature_frame()

    if not rows:
        return _empty_feature_frame()
    return pd.DataFrame(rows)