Spaces:
Sleeping
Sleeping
Changed the clustering approach
Browse files
- sections/analytics.py +43 -55
- sections/analyze.py +7 -7
- sections/upload.py +3 -3
sections/analytics.py
CHANGED
|
@@ -24,39 +24,23 @@ if st.session_state.parsed_df is None:
|
|
| 24 |
data = st.session_state.parsed_df
|
| 25 |
data = data.select(["portdst","protocole","rule","action"])
|
| 26 |
|
| 27 |
-
# Sélectionner toutes les colonnes numériques
|
| 28 |
-
quanti = data.select(pl.col(pl.Int64))
|
| 29 |
-
|
| 30 |
-
# Sélectionner toutes les colonnes de type chaîne
|
| 31 |
-
quali = data.select(pl.col(pl.String))
|
| 32 |
-
|
| 33 |
##############################################
|
| 34 |
#### Preprocessing ####
|
| 35 |
##############################################
|
| 36 |
|
| 37 |
-
#
|
| 38 |
-
|
| 39 |
-
scaler = StandardScaler()
|
| 40 |
-
data_quanti = scaler.fit_transform(quanti.to_pandas())
|
| 41 |
-
|
| 42 |
-
# Convertir de nouveau en DataFrame Polars
|
| 43 |
-
data_quanti = pl.from_pandas(pd.DataFrame(data_quanti, columns=quanti.columns))
|
| 44 |
-
|
| 45 |
-
# Encodage one-hot des données quali
|
| 46 |
|
| 47 |
encoder = OneHotEncoder(sparse_output=False)
|
| 48 |
-
|
| 49 |
|
| 50 |
col_names = [
|
| 51 |
f"{feature}_{category}"
|
| 52 |
-
for feature, categories in zip(
|
| 53 |
for category in categories
|
| 54 |
]
|
| 55 |
|
| 56 |
# Convertir de nouveau en DataFrame Polars
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
df = pl.concat([data_quanti, data_quali], how="horizontal")
|
| 60 |
|
| 61 |
###############################################
|
| 62 |
#### Clustering ####
|
|
@@ -66,11 +50,16 @@ if st.button("Start clustering"):
|
|
| 66 |
if st.session_state.parsed_df is not None:
|
| 67 |
with st.spinner("Searching the clusters..."):
|
| 68 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
# Appliquer K-Means avec k optimal choisi
|
| 70 |
k_optimal = 2 # Par exemple, supposons que k = 2
|
| 71 |
kmeans = KMeans(n_clusters=k_optimal, random_state=42)
|
| 72 |
-
preds = kmeans.fit_predict(
|
| 73 |
-
|
|
|
|
| 74 |
|
| 75 |
if df_clust.shape[0] > 200000: # 200k
|
| 76 |
perc = 200000/df_clust.shape[0]
|
|
@@ -88,14 +77,14 @@ if st.button("Start clustering"):
|
|
| 88 |
|
| 89 |
# Visualisation des clusters (en 2D avec PCA)
|
| 90 |
|
| 91 |
-
pca = PCA(n_components=2)
|
| 92 |
-
df_pca = pca.fit_transform(df_ech.to_pandas())
|
| 93 |
|
| 94 |
fig = px.scatter(
|
| 95 |
-
x=
|
| 96 |
-
y=
|
| 97 |
-
color=df_ech
|
| 98 |
-
|
| 99 |
title='Clustering coupled with PCA',
|
| 100 |
labels={'x': 'Component 1', 'y': 'Component 2', 'color': 'Cluster'},
|
| 101 |
)
|
|
@@ -104,21 +93,20 @@ if st.button("Start clustering"):
|
|
| 104 |
xaxis_title='Component 1',
|
| 105 |
yaxis_title='Component 2'
|
| 106 |
)
|
| 107 |
-
|
| 108 |
# fig.show()
|
| 109 |
st.plotly_chart(fig, use_container_width=True)
|
| 110 |
|
| 111 |
except Exception as e:
|
| 112 |
-
st.error(f"An error occured while doing the clustering : {e}")
|
| 113 |
|
| 114 |
with st.spinner("Performing some more data analysis..."):
|
| 115 |
try:
|
| 116 |
-
|
| 117 |
# Analyse des variables qualitatives par cluster
|
| 118 |
-
for col in
|
| 119 |
fig = make_subplots(rows=1, cols=2)
|
| 120 |
|
| 121 |
-
data_filtered =
|
| 122 |
freq_df = data_filtered.group_by(col).agg(pl.count(col).alias("frequency"))
|
| 123 |
|
| 124 |
fig.add_trace(
|
|
@@ -127,7 +115,7 @@ if st.button("Start clustering"):
|
|
| 127 |
row=1, col=1
|
| 128 |
)
|
| 129 |
|
| 130 |
-
data_filtered =
|
| 131 |
freq_df = data_filtered.group_by(col).agg(pl.count(col).alias("frequency"))
|
| 132 |
|
| 133 |
fig.add_trace(
|
|
@@ -144,32 +132,32 @@ if st.button("Start clustering"):
|
|
| 144 |
)
|
| 145 |
st.plotly_chart(fig, use_container_width=True)
|
| 146 |
|
| 147 |
-
# Analyse de la variable quantitative par cluster
|
| 148 |
-
for col in quanti.columns: # protocole, rule, action
|
| 149 |
-
|
| 150 |
|
| 151 |
-
|
| 152 |
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
|
| 159 |
-
|
| 160 |
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
|
| 174 |
except Exception as e:
|
| 175 |
st.error(f"An error occured while doing the data analysis : {e}")
|
|
|
|
| 24 |
data = st.session_state.parsed_df
|
| 25 |
data = data.select(["portdst","protocole","rule","action"])
|
| 26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
##############################################
|
| 28 |
#### Preprocessing ####
|
| 29 |
##############################################
|
| 30 |
|
| 31 |
+
# Encodage one-hot
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
encoder = OneHotEncoder(sparse_output=False)
|
| 34 |
+
data_encoded = encoder.fit_transform(data.to_pandas())
|
| 35 |
|
| 36 |
col_names = [
|
| 37 |
f"{feature}_{category}"
|
| 38 |
+
for feature, categories in zip(data.columns, encoder.categories_)
|
| 39 |
for category in categories
|
| 40 |
]
|
| 41 |
|
| 42 |
# Convertir de nouveau en DataFrame Polars
|
| 43 |
+
data_encoded = pl.from_pandas(pd.DataFrame(data_encoded, columns=col_names))
|
|
|
|
|
|
|
| 44 |
|
| 45 |
###############################################
|
| 46 |
#### Clustering ####
|
|
|
|
| 50 |
if st.session_state.parsed_df is not None:
|
| 51 |
with st.spinner("Searching the clusters..."):
|
| 52 |
try:
|
| 53 |
+
|
| 54 |
+
pca = PCA(n_components=2)
|
| 55 |
+
df_pca = pca.fit_transform(data_encoded.to_pandas())
|
| 56 |
+
|
| 57 |
# Appliquer K-Means avec k optimal choisi
|
| 58 |
k_optimal = 2 # Par exemple, supposons que k = 2
|
| 59 |
kmeans = KMeans(n_clusters=k_optimal, random_state=42)
|
| 60 |
+
preds = kmeans.fit_predict(df_pca)
|
| 61 |
+
df_pca = pl.from_pandas(pd.DataFrame(df_pca, columns=[f"Component {i+1}" for i in range(k_optimal)]))
|
| 62 |
+
df_clust = df_pca.with_columns(pl.Series(values=preds, name='cluster_kmeans'))
|
| 63 |
|
| 64 |
if df_clust.shape[0] > 200000: # 200k
|
| 65 |
perc = 200000/df_clust.shape[0]
|
|
|
|
| 77 |
|
| 78 |
# Visualisation des clusters (en 2D avec PCA)
|
| 79 |
|
| 80 |
+
# pca = PCA(n_components=2)
|
| 81 |
+
# df_pca = pca.fit_transform(df_ech.to_pandas())
|
| 82 |
|
| 83 |
fig = px.scatter(
|
| 84 |
+
x=df_ech.select("Component 1").to_numpy().flatten(),
|
| 85 |
+
y=df_ech.select("Component 2").to_numpy().flatten(),
|
| 86 |
+
color=df_ech.select('cluster_kmeans').to_numpy().flatten().astype(str),
|
| 87 |
+
color_discrete_map={"0": "rebeccapurple", "1": "gold"},
|
| 88 |
title='Clustering coupled with PCA',
|
| 89 |
labels={'x': 'Component 1', 'y': 'Component 2', 'color': 'Cluster'},
|
| 90 |
)
|
|
|
|
| 93 |
xaxis_title='Component 1',
|
| 94 |
yaxis_title='Component 2'
|
| 95 |
)
|
|
|
|
| 96 |
# fig.show()
|
| 97 |
st.plotly_chart(fig, use_container_width=True)
|
| 98 |
|
| 99 |
except Exception as e:
|
| 100 |
+
st.error(f"An error occured while doing the clustering : {e.with_traceback(None)}")
|
| 101 |
|
| 102 |
with st.spinner("Performing some more data analysis..."):
|
| 103 |
try:
|
| 104 |
+
data_clust = data.with_columns(pl.Series(name="cluster_kmeans", values=df_clust.select("cluster_kmeans")))
|
| 105 |
# Analyse des variables qualitatives par cluster
|
| 106 |
+
for col in data.columns : # portdst, protocole, rule, action
|
| 107 |
fig = make_subplots(rows=1, cols=2)
|
| 108 |
|
| 109 |
+
data_filtered = data_clust.filter(pl.col("cluster_kmeans") == 0)
|
| 110 |
freq_df = data_filtered.group_by(col).agg(pl.count(col).alias("frequency"))
|
| 111 |
|
| 112 |
fig.add_trace(
|
|
|
|
| 115 |
row=1, col=1
|
| 116 |
)
|
| 117 |
|
| 118 |
+
data_filtered = data_clust.filter(pl.col("cluster_kmeans") == 1)
|
| 119 |
freq_df = data_filtered.group_by(col).agg(pl.count(col).alias("frequency"))
|
| 120 |
|
| 121 |
fig.add_trace(
|
|
|
|
| 132 |
)
|
| 133 |
st.plotly_chart(fig, use_container_width=True)
|
| 134 |
|
| 135 |
+
# # Analyse de la variable quantitative par cluster
|
| 136 |
+
# for col in quanti.columns: # protocole, rule, action
|
| 137 |
+
# fig = make_subplots(rows=1, cols=2)
|
| 138 |
|
| 139 |
+
# data_filtered = data.filter(pl.col("cluster_kmeans") == 0)
|
| 140 |
|
| 141 |
+
# # Ajouter le premier histogramme
|
| 142 |
+
# fig.add_trace(
|
| 143 |
+
# go.Histogram(x=data_filtered[col], name="Cluster 0", marker_color="rebeccapurple"),
|
| 144 |
+
# row=1, col=1
|
| 145 |
+
# )
|
| 146 |
|
| 147 |
+
# data_filtered = data.filter(pl.col("cluster_kmeans") == 1)
|
| 148 |
|
| 149 |
+
# # Ajouter le deuxième histogramme
|
| 150 |
+
# fig.add_trace(
|
| 151 |
+
# go.Histogram(x=data_filtered[col], name="Cluster 1", marker_color="gold"),
|
| 152 |
+
# row=1, col=2
|
| 153 |
+
# )
|
| 154 |
|
| 155 |
+
# # Mettre à jour la mise en page pour améliorer l'apparence
|
| 156 |
+
# fig.update_layout(
|
| 157 |
+
# title_text=f"Histograms of {col}",
|
| 158 |
+
# showlegend=True,
|
| 159 |
+
# )
|
| 160 |
+
# st.plotly_chart(fig, use_container_width=True)
|
| 161 |
|
| 162 |
except Exception as e:
|
| 163 |
st.error(f"An error occured while doing the data analysis : {e}")
|
sections/analyze.py
CHANGED
|
@@ -100,8 +100,8 @@ with tab1:
|
|
| 100 |
st.markdown("### 🔢 Port")
|
| 101 |
if "portdst" in data.columns:
|
| 102 |
min_port, max_port = (
|
| 103 |
-
int(data["portdst"].min()),
|
| 104 |
-
int(data["portdst"].max()),
|
| 105 |
)
|
| 106 |
|
| 107 |
# Initialize port range in session state if not present
|
|
@@ -272,7 +272,7 @@ with tab3:
|
|
| 272 |
" (portdst < 1024 and action == 'PERMIT')"
|
| 273 |
)
|
| 274 |
top_ports = (
|
| 275 |
-
data.filter((pl.col("portdst") < 1024) & (pl.col("action") == "PERMIT"))
|
| 276 |
.group_by("portdst")
|
| 277 |
.agg(pl.count("portdst").alias("count"))
|
| 278 |
.sort("count", descending=True)
|
|
@@ -483,8 +483,8 @@ with tab5:
|
|
| 483 |
|
| 484 |
# 🔹 Sankey entre IP source et port destination
|
| 485 |
df = data_filtered.with_columns(
|
| 486 |
-
data_filtered["portdst"]
|
| 487 |
-
)
|
| 488 |
create_sankey(df, "ipsrc", "portdst")
|
| 489 |
|
| 490 |
st.subheader("Connections where access were identified as : DENY")
|
|
@@ -495,6 +495,6 @@ with tab5:
|
|
| 495 |
|
| 496 |
# 🔹 Sankey entre IP source et port destination
|
| 497 |
df = data_filtered.with_columns(
|
| 498 |
-
data_filtered["portdst"]
|
| 499 |
-
)
|
| 500 |
create_sankey(df, "ipsrc", "portdst")
|
|
|
|
| 100 |
st.markdown("### 🔢 Port")
|
| 101 |
if "portdst" in data.columns:
|
| 102 |
min_port, max_port = (
|
| 103 |
+
int(data["portdst"].cast(pl.Utf8).min()),
|
| 104 |
+
int(data["portdst"].cast(pl.Utf8).max()),
|
| 105 |
)
|
| 106 |
|
| 107 |
# Initialize port range in session state if not present
|
|
|
|
| 272 |
" (portdst < 1024 and action == 'PERMIT')"
|
| 273 |
)
|
| 274 |
top_ports = (
|
| 275 |
+
data.filter((pl.col("portdst").cast(pl.Int64) < 1024) & (pl.col("action") == "PERMIT"))
|
| 276 |
.group_by("portdst")
|
| 277 |
.agg(pl.count("portdst").alias("count"))
|
| 278 |
.sort("count", descending=True)
|
|
|
|
| 483 |
|
| 484 |
# 🔹 Sankey entre IP source et port destination
|
| 485 |
df = data_filtered.with_columns(
|
| 486 |
+
data_filtered["portdst"]
|
| 487 |
+
)
|
| 488 |
create_sankey(df, "ipsrc", "portdst")
|
| 489 |
|
| 490 |
st.subheader("Connections where access were identified as : DENY")
|
|
|
|
| 495 |
|
| 496 |
# 🔹 Sankey entre IP source et port destination
|
| 497 |
df = data_filtered.with_columns(
|
| 498 |
+
data_filtered["portdst"]
|
| 499 |
+
)
|
| 500 |
create_sankey(df, "ipsrc", "portdst")
|
sections/upload.py
CHANGED
|
@@ -79,9 +79,9 @@ if uploaded_file is not None:
|
|
| 79 |
"ipsrc": pl.Utf8,
|
| 80 |
"ipdst": pl.Utf8,
|
| 81 |
"protocole": pl.Utf8,
|
| 82 |
-
"portsrc": pl.
|
| 83 |
-
"portdst": pl.
|
| 84 |
-
"rule": pl.
|
| 85 |
"action": pl.Utf8,
|
| 86 |
"interface": pl.Utf8,
|
| 87 |
"unknown": pl.Utf8,
|
|
|
|
| 79 |
"ipsrc": pl.Utf8,
|
| 80 |
"ipdst": pl.Utf8,
|
| 81 |
"protocole": pl.Utf8,
|
| 82 |
+
"portsrc": pl.Utf8,
|
| 83 |
+
"portdst": pl.Utf8,
|
| 84 |
+
"rule": pl.Utf8,
|
| 85 |
"action": pl.Utf8,
|
| 86 |
"interface": pl.Utf8,
|
| 87 |
"unknown": pl.Utf8,
|