Cyr-CK committed on
Commit
d824eb3
·
1 Parent(s): 4946e83

Added Sankey

Browse files
Files changed (2) hide show
  1. sections/analytics.py +11 -50
  2. sections/analyze.py +36 -0
sections/analytics.py CHANGED
@@ -7,7 +7,8 @@ import polars as pl
7
 
8
  from sklearn.preprocessing import StandardScaler
9
  from sklearn.preprocessing import OneHotEncoder
10
- from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
 
11
 
12
  if "parsed_df" not in st.session_state:
13
  st.session_state.parsed_df = None
@@ -21,7 +22,7 @@ if st.session_state.parsed_df is None:
21
  st.stop()
22
 
23
  data = st.session_state.parsed_df
24
- data = data.select(["portdest","protocole","regle1","status"])
25
 
26
  # Sélectionner toutes les colonnes numériques
27
  quanti = data.select(pl.col(pl.Int64))
@@ -75,21 +76,13 @@ if st.button("Start clustering"):
75
  .groupby("cluster_kmeans", group_keys=False)
76
  .apply(lambda x: x.sample(frac=0.05, random_state=42))
77
  )
78
- # dbscan = DBSCAN(eps=0.5, min_samples=10)
79
- # preds = dbscan.fit_predict(df.to_pandas())
80
- # df = df.with_columns(pl.Series(values=preds, name='cluster_dbscan'))
81
-
82
- # agg_clustering = AgglomerativeClustering(n_clusters=2)
83
- # preds = agg_clustering.fit_predict(df.to_pandas())
84
- # df = df.with_columns(pl.Series(values=preds, name='cluster_agg'))
85
 
86
  ###############################################################
87
  #### Visualisation des clusters ####
88
  ###############################################################
89
 
90
 
91
- # Visualisation des clusters (en 2D avec PCA)
92
- from sklearn.decomposition import PCA
93
 
94
  pca = PCA(n_components=2)
95
  df_pca = pca.fit_transform(df_ech.to_pandas())
@@ -117,27 +110,8 @@ if st.button("Start clustering"):
117
  with st.spinner("Performing some more data analysis..."):
118
  try:
119
  data = data.with_columns(pl.Series(name="cluster_kmeans", values=df_clust.select("cluster_kmeans")))
120
- cols = ["protocole","regle1","status"]
121
- for col in cols:
122
- # fig = px.bar(freq_df, x=col, y='frequency',
123
- # title=f'{col} frequency',
124
- # labels={'categorie': 'Category', 'frequence': 'Frequency'},
125
- # color=col)
126
- # fig.update_layout(xaxis_title='Categories', yaxis_title='Frequency')
127
- # st.plotly_chart(fig, use_container_width=True)
128
-
129
- # data_filtered = data.filter(pl.col("cluster_kmeans") == 0)
130
- # freq_df = data_filtered.group_by(col).agg(pl.count(col).alias("frequency"))
131
-
132
- # fig = px.bar(freq_df, x=col, y='frequency',
133
- # title=f'{col} frequency',
134
- # labels={'categorie': 'Category', 'frequence': 'Frequency'},
135
- # color=col)
136
- # fig.update_layout(xaxis_title='Categories', yaxis_title='Frequency')
137
- # st.plotly_chart(fig, use_container_width=True)
138
-
139
-
140
-
141
  fig = make_subplots(rows=1, cols=2)
142
 
143
  data_filtered = data.filter(pl.col("cluster_kmeans") == 0)
@@ -166,13 +140,15 @@ if st.button("Start clustering"):
166
  )
167
  st.plotly_chart(fig, use_container_width=True)
168
 
 
 
169
  fig = make_subplots(rows=1, cols=2)
170
 
171
  data_filtered = data.filter(pl.col("cluster_kmeans") == 0)
172
 
173
  # Ajouter le premier histogramme
174
  fig.add_trace(
175
- go.Histogram(x=data_filtered["portdest"], name="Cluster 0", marker_color="rebeccapurple"),
176
  row=1, col=1
177
  )
178
 
@@ -180,7 +156,7 @@ if st.button("Start clustering"):
180
 
181
  # Ajouter le deuxième histogramme
182
  fig.add_trace(
183
- go.Histogram(x=data_filtered["portdest"], name="Cluster 1", marker_color="gold"),
184
  row=1, col=2
185
  )
186
 
@@ -194,19 +170,4 @@ if st.button("Start clustering"):
194
  except Exception as e:
195
  st.error(f"An error occured while doing the data analysis : {e}")
196
  else:
197
- st.warning("Please parse the log file first.")
198
-
199
- # Choisir le nombre de clusters (méthode du coude)
200
- # inertia = []
201
- # for k in range(1, 11):
202
- # kmeans = KMeans(n_clusters=k, random_state=42)
203
- # kmeans.fit(df_scaled.to_pandas())
204
- # inertia.append(kmeans.inertia_)
205
-
206
- # # Tracer la courbe pour la méthode du coude
207
- # plt.plot(range(1, 11), inertia, marker='o')
208
- # plt.title('Méthode du coude')
209
- # plt.xlabel('Nombre de clusters')
210
- # plt.ylabel('Inertie')
211
- # plt.show()
212
-
 
7
 
8
  from sklearn.preprocessing import StandardScaler
9
  from sklearn.preprocessing import OneHotEncoder
10
+ from sklearn.cluster import KMeans
11
+ from sklearn.decomposition import PCA
12
 
13
  if "parsed_df" not in st.session_state:
14
  st.session_state.parsed_df = None
 
22
  st.stop()
23
 
24
  data = st.session_state.parsed_df
25
+ data = data.select(["portdst","protocole","regle","action"])
26
 
27
  # Sélectionner toutes les colonnes numériques
28
  quanti = data.select(pl.col(pl.Int64))
 
76
  .groupby("cluster_kmeans", group_keys=False)
77
  .apply(lambda x: x.sample(frac=0.05, random_state=42))
78
  )
 
 
 
 
 
 
 
79
 
80
  ###############################################################
81
  #### Visualisation des clusters ####
82
  ###############################################################
83
 
84
 
85
+ # Visualisation des clusters (en 2D avec PCA)
 
86
 
87
  pca = PCA(n_components=2)
88
  df_pca = pca.fit_transform(df_ech.to_pandas())
 
110
  with st.spinner("Performing some more data analysis..."):
111
  try:
112
  data = data.with_columns(pl.Series(name="cluster_kmeans", values=df_clust.select("cluster_kmeans")))
113
+ # Analyse des variables qualitatives par cluster
114
+ for col in quali.columns: # protocole, regle, action
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  fig = make_subplots(rows=1, cols=2)
116
 
117
  data_filtered = data.filter(pl.col("cluster_kmeans") == 0)
 
140
  )
141
  st.plotly_chart(fig, use_container_width=True)
142
 
143
+ # Analyse de la variable quantitative par cluster
144
+
145
  fig = make_subplots(rows=1, cols=2)
146
 
147
  data_filtered = data.filter(pl.col("cluster_kmeans") == 0)
148
 
149
  # Ajouter le premier histogramme
150
  fig.add_trace(
151
+ go.Histogram(x=data_filtered["portdst"], name="Cluster 0", marker_color="rebeccapurple"),
152
  row=1, col=1
153
  )
154
 
 
156
 
157
  # Ajouter le deuxième histogramme
158
  fig.add_trace(
159
+ go.Histogram(x=data_filtered["portdst"], name="Cluster 1", marker_color="gold"),
160
  row=1, col=2
161
  )
162
 
 
170
  except Exception as e:
171
  st.error(f"An error occured while doing the data analysis : {e}")
172
  else:
173
+ st.warning("Please parse the log file first.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
sections/analyze.py CHANGED
@@ -2,6 +2,7 @@ import polars as pl
2
  import streamlit as st
3
  import ipaddress
4
  import plotly.express as px
 
5
  import pandas as pd
6
 
7
  if "parsed_df" not in st.session_state:
@@ -316,3 +317,38 @@ with tab3:
316
  # Onglet Sankey
317
  with tab4:
318
  st.subheader("Sankey Diagram")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import streamlit as st
3
  import ipaddress
4
  import plotly.express as px
5
+ import plotly.graph_objs as go
6
  import pandas as pd
7
 
8
  if "parsed_df" not in st.session_state:
 
317
  # Onglet Sankey
318
  with tab4:
319
  st.subheader("Sankey Diagram")
320
+
321
def create_sankey(df, source_col, target_col):
    """Render a Sankey diagram of the flow between two columns of *df*.

    Counts every (source, target) pair, builds one node per distinct
    label across both columns, and draws the result with Streamlit.

    Parameters
    ----------
    df : polars DataFrame holding the two categorical columns.
    source_col, target_col : column names for the left/right sides.
    """
    # Pair frequencies; polars names the count column "len".
    pairs = df.groupby([source_col, target_col]).len().to_pandas()

    # One node per distinct label, pooled from both columns.
    labels = list(pd.concat([pairs[source_col], pairs[target_col]]).unique())
    index_of = {label: i for i, label in enumerate(labels)}

    # Links: map each pair onto node indices, weighted by its count.
    src_idx = pairs[source_col].map(index_of)
    dst_idx = pairs[target_col].map(index_of)
    weights = pairs["len"]

    # Assemble and display the Sankey figure.
    fig = go.Figure(go.Sankey(
        node=dict(
            pad=15, thickness=20, line=dict(color="black", width=0.5),
            label=labels,
        ),
        link=dict(source=src_idx, target=dst_idx, value=weights),
    ))

    fig.update_layout(title_text=f"Flux entre {source_col} et {target_col}", font_size=10)
    st.plotly_chart(fig, use_container_width=True)
347
+
348
# Sankey between source IP and destination IP.
create_sankey(data, "ip_source", "ip_destination")

# Sankey between source IP and destination port.
# Fix: the cast was previously applied to `df` while `data` was passed
# to create_sankey, so the ports stayed numeric and the intended
# "convert ports to strings to avoid errors" had no effect. Cast the
# frame that is actually plotted.
data = data.with_columns(pl.col("port_destination").cast(pl.Utf8))
create_sankey(data, "ip_source", "port_destination")