Cyr-CK committed on
Commit
e76e281
·
1 Parent(s): d64aade

Changed the clustering approach

Browse files
Files changed (3) hide show
  1. sections/analytics.py +43 -55
  2. sections/analyze.py +7 -7
  3. sections/upload.py +3 -3
sections/analytics.py CHANGED
@@ -24,39 +24,23 @@ if st.session_state.parsed_df is None:
24
  data = st.session_state.parsed_df
25
  data = data.select(["portdst","protocole","rule","action"])
26
 
27
- # Sélectionner toutes les colonnes numériques
28
- quanti = data.select(pl.col(pl.Int64))
29
-
30
- # Sélectionner toutes les colonnes de type chaîne
31
- quali = data.select(pl.col(pl.String))
32
-
33
  ##############################################
34
  #### Preprocessing ####
35
  ##############################################
36
 
37
- # Normalisation des données quanti (Standardisation : moyenne = 0, écart-type = 1)
38
-
39
- scaler = StandardScaler()
40
- data_quanti = scaler.fit_transform(quanti.to_pandas())
41
-
42
- # Convertir de nouveau en DataFrame Polars
43
- data_quanti = pl.from_pandas(pd.DataFrame(data_quanti, columns=quanti.columns))
44
-
45
- # Encodage one-hot des données quali
46
 
47
  encoder = OneHotEncoder(sparse_output=False)
48
- data_quali = encoder.fit_transform(quali.to_pandas())
49
 
50
  col_names = [
51
  f"{feature}_{category}"
52
- for feature, categories in zip(quali.columns, encoder.categories_)
53
  for category in categories
54
  ]
55
 
56
  # Convertir de nouveau en DataFrame Polars
57
- data_quali = pl.from_pandas(pd.DataFrame(data_quali, columns=col_names))
58
-
59
- df = pl.concat([data_quanti, data_quali], how="horizontal")
60
 
61
  ###############################################
62
  #### Clustering ####
@@ -66,11 +50,16 @@ if st.button("Start clustering"):
66
  if st.session_state.parsed_df is not None:
67
  with st.spinner("Searching the clusters..."):
68
  try:
 
 
 
 
69
  # Appliquer K-Means avec k optimal choisi
70
  k_optimal = 2 # Par exemple, supposons que k = 3
71
  kmeans = KMeans(n_clusters=k_optimal, random_state=42)
72
- preds = kmeans.fit_predict(df.to_pandas())
73
- df_clust = df.with_columns(pl.Series(values=preds, name='cluster_kmeans'))
 
74
 
75
  if df_clust.shape[0] > 200000: # 200k
76
  perc = 200000/df_clust.shape[0]
@@ -88,14 +77,14 @@ if st.button("Start clustering"):
88
 
89
  # Visualisation des clusters (en 2D avec PCA)
90
 
91
- pca = PCA(n_components=2)
92
- df_pca = pca.fit_transform(df_ech.to_pandas())
93
 
94
  fig = px.scatter(
95
- x=df_pca[:, 0],
96
- y=df_pca[:, 1],
97
- color=df_ech['cluster_kmeans'],
98
- color_continuous_scale='viridis',
99
  title='Clustering coupled with PCA',
100
  labels={'x': 'Component 1', 'y': 'Component 2', 'color': 'Cluster'},
101
  )
@@ -104,21 +93,20 @@ if st.button("Start clustering"):
104
  xaxis_title='Component 1',
105
  yaxis_title='Component 2'
106
  )
107
-
108
  # fig.show()
109
  st.plotly_chart(fig, use_container_width=True)
110
 
111
  except Exception as e:
112
- st.error(f"An error occured while doing the clustering : {e}")
113
 
114
  with st.spinner("Performing some more data analysis..."):
115
  try:
116
- data = data.with_columns(pl.Series(name="cluster_kmeans", values=df_clust.select("cluster_kmeans")))
117
  # Analyse des variables qualitatives par cluster
118
- for col in quali.columns: # protocole, action
119
  fig = make_subplots(rows=1, cols=2)
120
 
121
- data_filtered = data.filter(pl.col("cluster_kmeans") == 0)
122
  freq_df = data_filtered.group_by(col).agg(pl.count(col).alias("frequency"))
123
 
124
  fig.add_trace(
@@ -127,7 +115,7 @@ if st.button("Start clustering"):
127
  row=1, col=1
128
  )
129
 
130
- data_filtered = data.filter(pl.col("cluster_kmeans") == 1)
131
  freq_df = data_filtered.group_by(col).agg(pl.count(col).alias("frequency"))
132
 
133
  fig.add_trace(
@@ -144,32 +132,32 @@ if st.button("Start clustering"):
144
  )
145
  st.plotly_chart(fig, use_container_width=True)
146
 
147
- # Analyse de la variable quantitative par cluster
148
- for col in quanti.columns: # protocole, rule, action
149
- fig = make_subplots(rows=1, cols=2)
150
 
151
- data_filtered = data.filter(pl.col("cluster_kmeans") == 0)
152
 
153
- # Ajouter le premier histogramme
154
- fig.add_trace(
155
- go.Histogram(x=data_filtered[col], name="Cluster 0", marker_color="rebeccapurple"),
156
- row=1, col=1
157
- )
158
 
159
- data_filtered = data.filter(pl.col("cluster_kmeans") == 1)
160
 
161
- # Ajouter le deuxième histogramme
162
- fig.add_trace(
163
- go.Histogram(x=data_filtered[col], name="Cluster 1", marker_color="gold"),
164
- row=1, col=2
165
- )
166
 
167
- # Mettre à jour la mise en page pour améliorer l'apparence
168
- fig.update_layout(
169
- title_text=f"Histograms of {col}",
170
- showlegend=True,
171
- )
172
- st.plotly_chart(fig, use_container_width=True)
173
 
174
  except Exception as e:
175
  st.error(f"An error occured while doing the data analysis : {e}")
 
24
  data = st.session_state.parsed_df
25
  data = data.select(["portdst","protocole","rule","action"])
26
 
 
 
 
 
 
 
27
  ##############################################
28
  #### Preprocessing ####
29
  ##############################################
30
 
31
+ # Encodage one-hot
 
 
 
 
 
 
 
 
32
 
33
  encoder = OneHotEncoder(sparse_output=False)
34
+ data_encoded = encoder.fit_transform(data.to_pandas())
35
 
36
  col_names = [
37
  f"{feature}_{category}"
38
+ for feature, categories in zip(data.columns, encoder.categories_)
39
  for category in categories
40
  ]
41
 
42
  # Convertir de nouveau en DataFrame Polars
43
+ data_encoded = pl.from_pandas(pd.DataFrame(data_encoded, columns=col_names))
 
 
44
 
45
  ###############################################
46
  #### Clustering ####
 
50
  if st.session_state.parsed_df is not None:
51
  with st.spinner("Searching the clusters..."):
52
  try:
53
+
54
+ pca = PCA(n_components=2)
55
+ df_pca = pca.fit_transform(data_encoded.to_pandas())
56
+
57
  # Appliquer K-Means avec k optimal choisi
58
  k_optimal = 2 # Par exemple, supposons que k = 3
59
  kmeans = KMeans(n_clusters=k_optimal, random_state=42)
60
+ preds = kmeans.fit_predict(df_pca)
61
+ df_pca = pl.from_pandas(pd.DataFrame(df_pca, columns=[f"Component {i+1}" for i in range(k_optimal)]))
62
+ df_clust = df_pca.with_columns(pl.Series(values=preds, name='cluster_kmeans'))
63
 
64
  if df_clust.shape[0] > 200000: # 200k
65
  perc = 200000/df_clust.shape[0]
 
77
 
78
  # Visualisation des clusters (en 2D avec PCA)
79
 
80
+ # pca = PCA(n_components=2)
81
+ # df_pca = pca.fit_transform(df_ech.to_pandas())
82
 
83
  fig = px.scatter(
84
+ x=df_ech.select("Component 1").to_numpy().flatten(),
85
+ y=df_ech.select("Component 2").to_numpy().flatten(),
86
+ color=df_ech.select('cluster_kmeans').to_numpy().flatten().astype(str),
87
+ color_discrete_map={"0": "rebeccapurple", "1": "gold"},
88
  title='Clustering coupled with PCA',
89
  labels={'x': 'Component 1', 'y': 'Component 2', 'color': 'Cluster'},
90
  )
 
93
  xaxis_title='Component 1',
94
  yaxis_title='Component 2'
95
  )
 
96
  # fig.show()
97
  st.plotly_chart(fig, use_container_width=True)
98
 
99
  except Exception as e:
100
+ st.error(f"An error occured while doing the clustering : {e.with_traceback(None)}")
101
 
102
  with st.spinner("Performing some more data analysis..."):
103
  try:
104
+ data_clust = data.with_columns(pl.Series(name="cluster_kmeans", values=df_clust.select("cluster_kmeans")))
105
  # Analyse des variables qualitatives par cluster
106
+ for col in data.columns : # portdst, protocole, rule, action
107
  fig = make_subplots(rows=1, cols=2)
108
 
109
+ data_filtered = data_clust.filter(pl.col("cluster_kmeans") == 0)
110
  freq_df = data_filtered.group_by(col).agg(pl.count(col).alias("frequency"))
111
 
112
  fig.add_trace(
 
115
  row=1, col=1
116
  )
117
 
118
+ data_filtered = data_clust.filter(pl.col("cluster_kmeans") == 1)
119
  freq_df = data_filtered.group_by(col).agg(pl.count(col).alias("frequency"))
120
 
121
  fig.add_trace(
 
132
  )
133
  st.plotly_chart(fig, use_container_width=True)
134
 
135
+ # # Analyse de la variable quantitative par cluster
136
+ # for col in quanti.columns: # protocole, rule, action
137
+ # fig = make_subplots(rows=1, cols=2)
138
 
139
+ # data_filtered = data.filter(pl.col("cluster_kmeans") == 0)
140
 
141
+ # # Ajouter le premier histogramme
142
+ # fig.add_trace(
143
+ # go.Histogram(x=data_filtered[col], name="Cluster 0", marker_color="rebeccapurple"),
144
+ # row=1, col=1
145
+ # )
146
 
147
+ # data_filtered = data.filter(pl.col("cluster_kmeans") == 1)
148
 
149
+ # # Ajouter le deuxième histogramme
150
+ # fig.add_trace(
151
+ # go.Histogram(x=data_filtered[col], name="Cluster 1", marker_color="gold"),
152
+ # row=1, col=2
153
+ # )
154
 
155
+ # # Mettre à jour la mise en page pour améliorer l'apparence
156
+ # fig.update_layout(
157
+ # title_text=f"Histograms of {col}",
158
+ # showlegend=True,
159
+ # )
160
+ # st.plotly_chart(fig, use_container_width=True)
161
 
162
  except Exception as e:
163
  st.error(f"An error occured while doing the data analysis : {e}")
sections/analyze.py CHANGED
@@ -100,8 +100,8 @@ with tab1:
100
  st.markdown("### 🔢 Port")
101
  if "portdst" in data.columns:
102
  min_port, max_port = (
103
- int(data["portdst"].min()),
104
- int(data["portdst"].max()),
105
  )
106
 
107
  # Initialize port range in session state if not present
@@ -272,7 +272,7 @@ with tab3:
272
  " (portdst < 1024 and action == 'PERMIT')"
273
  )
274
  top_ports = (
275
- data.filter((pl.col("portdst") < 1024) & (pl.col("action") == "PERMIT"))
276
  .group_by("portdst")
277
  .agg(pl.count("portdst").alias("count"))
278
  .sort("count", descending=True)
@@ -483,8 +483,8 @@ with tab5:
483
 
484
  # 🔹 Sankey entre IP source et port destination
485
  df = data_filtered.with_columns(
486
- data_filtered["portdst"].cast(pl.Utf8)
487
- ) # Convertir les ports en chaînes pour éviter les erreurs
488
  create_sankey(df, "ipsrc", "portdst")
489
 
490
  st.subheader("Connections where access were identified as : DENY")
@@ -495,6 +495,6 @@ with tab5:
495
 
496
  # 🔹 Sankey entre IP source et port destination
497
  df = data_filtered.with_columns(
498
- data_filtered["portdst"].cast(pl.Utf8)
499
- ) # Convertir les ports en chaînes pour éviter les erreurs
500
  create_sankey(df, "ipsrc", "portdst")
 
100
  st.markdown("### 🔢 Port")
101
  if "portdst" in data.columns:
102
  min_port, max_port = (
103
+ int(data["portdst"].cast(pl.Utf8).min()),
104
+ int(data["portdst"].cast(pl.Utf8).max()),
105
  )
106
 
107
  # Initialize port range in session state if not present
 
272
  " (portdst < 1024 and action == 'PERMIT')"
273
  )
274
  top_ports = (
275
+ data.filter((pl.col("portdst").cast(pl.Int64) < 1024) & (pl.col("action") == "PERMIT"))
276
  .group_by("portdst")
277
  .agg(pl.count("portdst").alias("count"))
278
  .sort("count", descending=True)
 
483
 
484
  # 🔹 Sankey entre IP source et port destination
485
  df = data_filtered.with_columns(
486
+ data_filtered["portdst"]
487
+ )
488
  create_sankey(df, "ipsrc", "portdst")
489
 
490
  st.subheader("Connections where access were identified as : DENY")
 
495
 
496
  # 🔹 Sankey entre IP source et port destination
497
  df = data_filtered.with_columns(
498
+ data_filtered["portdst"]
499
+ )
500
  create_sankey(df, "ipsrc", "portdst")
sections/upload.py CHANGED
@@ -79,9 +79,9 @@ if uploaded_file is not None:
79
  "ipsrc": pl.Utf8,
80
  "ipdst": pl.Utf8,
81
  "protocole": pl.Utf8,
82
- "portsrc": pl.Int64,
83
- "portdst": pl.Int64,
84
- "rule": pl.Int64,
85
  "action": pl.Utf8,
86
  "interface": pl.Utf8,
87
  "unknown": pl.Utf8,
 
79
  "ipsrc": pl.Utf8,
80
  "ipdst": pl.Utf8,
81
  "protocole": pl.Utf8,
82
+ "portsrc": pl.Utf8,
83
+ "portdst": pl.Utf8,
84
+ "rule": pl.Utf8,
85
  "action": pl.Utf8,
86
  "interface": pl.Utf8,
87
  "unknown": pl.Utf8,