Spaces:
Running
Running
Updated sankey and clustering
Browse files
- sections/analytics.py +27 -23
- sections/analyze.py +18 -5
sections/analytics.py
CHANGED
|
@@ -22,7 +22,7 @@ if st.session_state.parsed_df is None:
|
|
| 22 |
st.stop()
|
| 23 |
|
| 24 |
data = st.session_state.parsed_df
|
| 25 |
-
data = data.select(["portdst","protocole","
|
| 26 |
|
| 27 |
# Sélectionner toutes les colonnes numériques
|
| 28 |
quanti = data.select(pl.col(pl.Int64))
|
|
@@ -72,9 +72,13 @@ if st.button("Start clustering"):
|
|
| 72 |
preds = kmeans.fit_predict(df.to_pandas())
|
| 73 |
df_clust = df.with_columns(pl.Series(values=preds, name='cluster_kmeans'))
|
| 74 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
df_ech = pl.from_pandas(df_clust.to_pandas()
|
| 76 |
.groupby("cluster_kmeans", group_keys=False)
|
| 77 |
-
.apply(lambda x: x.sample(frac=
|
| 78 |
)
|
| 79 |
|
| 80 |
###############################################################
|
|
@@ -111,7 +115,7 @@ if st.button("Start clustering"):
|
|
| 111 |
try:
|
| 112 |
data = data.with_columns(pl.Series(name="cluster_kmeans", values=df_clust.select("cluster_kmeans")))
|
| 113 |
# Analyse des variables qualitatives par cluster
|
| 114 |
-
for col in quali.columns: # protocole,
|
| 115 |
fig = make_subplots(rows=1, cols=2)
|
| 116 |
|
| 117 |
data_filtered = data.filter(pl.col("cluster_kmeans") == 0)
|
|
@@ -141,31 +145,31 @@ if st.button("Start clustering"):
|
|
| 141 |
st.plotly_chart(fig, use_container_width=True)
|
| 142 |
|
| 143 |
# Analyse de la variable quantitative par cluster
|
|
|
|
|
|
|
| 144 |
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
data_filtered = data.filter(pl.col("cluster_kmeans") == 0)
|
| 148 |
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
|
| 155 |
-
|
| 156 |
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
|
| 170 |
except Exception as e:
|
| 171 |
st.error(f"An error occured while doing the data analysis : {e}")
|
|
|
|
| 22 |
st.stop()
|
| 23 |
|
| 24 |
data = st.session_state.parsed_df
|
| 25 |
+
data = data.select(["portdst","protocole","rule","action"])
|
| 26 |
|
| 27 |
# Sélectionner toutes les colonnes numériques
|
| 28 |
quanti = data.select(pl.col(pl.Int64))
|
|
|
|
| 72 |
preds = kmeans.fit_predict(df.to_pandas())
|
| 73 |
df_clust = df.with_columns(pl.Series(values=preds, name='cluster_kmeans'))
|
| 74 |
|
| 75 |
+
if df_clust.shape[0] > 200000: # 200k
|
| 76 |
+
perc = 200000/df_clust.shape[0]
|
| 77 |
+
else:
|
| 78 |
+
perc = 1
|
| 79 |
df_ech = pl.from_pandas(df_clust.to_pandas()
|
| 80 |
.groupby("cluster_kmeans", group_keys=False)
|
| 81 |
+
.apply(lambda x: x.sample(frac=perc, random_state=42))
|
| 82 |
)
|
| 83 |
|
| 84 |
###############################################################
|
|
|
|
| 115 |
try:
|
| 116 |
data = data.with_columns(pl.Series(name="cluster_kmeans", values=df_clust.select("cluster_kmeans")))
|
| 117 |
# Analyse des variables qualitatives par cluster
|
| 118 |
+
for col in quali.columns: # protocole, action
|
| 119 |
fig = make_subplots(rows=1, cols=2)
|
| 120 |
|
| 121 |
data_filtered = data.filter(pl.col("cluster_kmeans") == 0)
|
|
|
|
| 145 |
st.plotly_chart(fig, use_container_width=True)
|
| 146 |
|
| 147 |
# Analyse de la variable quantitative par cluster
|
| 148 |
+
for col in quanti.columns: # protocole, rule, action
|
| 149 |
+
fig = make_subplots(rows=1, cols=2)
|
| 150 |
|
| 151 |
+
data_filtered = data.filter(pl.col("cluster_kmeans") == 0)
|
|
|
|
|
|
|
| 152 |
|
| 153 |
+
# Ajouter le premier histogramme
|
| 154 |
+
fig.add_trace(
|
| 155 |
+
go.Histogram(x=data_filtered[col], name="Cluster 0", marker_color="rebeccapurple"),
|
| 156 |
+
row=1, col=1
|
| 157 |
+
)
|
| 158 |
|
| 159 |
+
data_filtered = data.filter(pl.col("cluster_kmeans") == 1)
|
| 160 |
|
| 161 |
+
# Ajouter le deuxième histogramme
|
| 162 |
+
fig.add_trace(
|
| 163 |
+
go.Histogram(x=data_filtered[col], name="Cluster 1", marker_color="gold"),
|
| 164 |
+
row=1, col=2
|
| 165 |
+
)
|
| 166 |
|
| 167 |
+
# Mettre à jour la mise en page pour améliorer l'apparence
|
| 168 |
+
fig.update_layout(
|
| 169 |
+
title_text=f"Histograms of {col}",
|
| 170 |
+
showlegend=True,
|
| 171 |
+
)
|
| 172 |
+
st.plotly_chart(fig, use_container_width=True)
|
| 173 |
|
| 174 |
except Exception as e:
|
| 175 |
st.error(f"An error occured while doing the data analysis : {e}")
|
sections/analyze.py
CHANGED
|
@@ -320,7 +320,7 @@ with tab4:
|
|
| 320 |
|
| 321 |
def create_sankey(df, source_col, target_col):
|
| 322 |
""" Crée un diagramme de Sankey entre deux colonnes """
|
| 323 |
-
df_grouped = df.
|
| 324 |
|
| 325 |
# Création des nœuds
|
| 326 |
labels = list(pd.concat([df_grouped[source_col], df_grouped[target_col]]).unique())
|
|
@@ -342,13 +342,26 @@ with tab4:
|
|
| 342 |
)
|
| 343 |
))
|
| 344 |
|
| 345 |
-
fig.update_layout(title_text=f"
|
| 346 |
st.plotly_chart(fig, use_container_width=True)
|
| 347 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 348 |
# 🔹 Sankey entre IP source et IP destination
|
| 349 |
-
create_sankey(
|
| 350 |
|
| 351 |
# 🔹 Sankey entre IP source et port destination
|
| 352 |
-
df =
|
| 353 |
-
create_sankey(
|
| 354 |
|
|
|
|
| 320 |
|
| 321 |
def create_sankey(df, source_col, target_col):
|
| 322 |
""" Crée un diagramme de Sankey entre deux colonnes """
|
| 323 |
+
df_grouped = df.group_by([source_col, target_col]).len().to_pandas()
|
| 324 |
|
| 325 |
# Création des nœuds
|
| 326 |
labels = list(pd.concat([df_grouped[source_col], df_grouped[target_col]]).unique())
|
|
|
|
| 342 |
)
|
| 343 |
))
|
| 344 |
|
| 345 |
+
fig.update_layout(title_text=f"Flow between {source_col} and {target_col}", font_size=10)
|
| 346 |
st.plotly_chart(fig, use_container_width=True)
|
| 347 |
|
| 348 |
+
st.subheader("Connections where access were identified as : PERMIT")
|
| 349 |
+
|
| 350 |
+
data_filtered = data.filter(pl.col("action") == "PERMIT")
|
| 351 |
+
# 🔹 Sankey entre IP source et IP destination
|
| 352 |
+
create_sankey(data_filtered, "ipsrc", "ipdst")
|
| 353 |
+
|
| 354 |
+
# 🔹 Sankey entre IP source et port destination
|
| 355 |
+
df = data_filtered.with_columns(data_filtered["portdst"].cast(pl.Utf8)) # Convertir les ports en chaînes pour éviter les erreurs
|
| 356 |
+
create_sankey(df, "ipsrc", "portdst")
|
| 357 |
+
|
| 358 |
+
st.subheader("Connections where access were identified as : DENY")
|
| 359 |
+
|
| 360 |
+
data_filtered = data.filter(pl.col("action") == "DENY")
|
| 361 |
# 🔹 Sankey entre IP source et IP destination
|
| 362 |
+
create_sankey(data_filtered, "ipsrc", "ipdst")
|
| 363 |
|
| 364 |
# 🔹 Sankey entre IP source et port destination
|
| 365 |
+
df = data_filtered.with_columns(data_filtered["portdst"].cast(pl.Utf8)) # Convertir les ports en chaînes pour éviter les erreurs
|
| 366 |
+
create_sankey(df, "ipsrc", "portdst")
|
| 367 |
|