Spaces:
Sleeping
Sleeping
Commit
·
8e30e05
1
Parent(s):
c657a93
Add protocol filter and slider for dates
Browse files- sections/analyze.py +91 -48
sections/analyze.py
CHANGED
|
@@ -1,9 +1,11 @@
|
|
| 1 |
-
import
|
| 2 |
-
import streamlit as st
|
| 3 |
import ipaddress
|
|
|
|
|
|
|
| 4 |
import plotly.express as px
|
| 5 |
import plotly.graph_objs as go
|
| 6 |
-
import
|
|
|
|
| 7 |
|
| 8 |
if "parsed_df" not in st.session_state:
|
| 9 |
st.session_state.parsed_df = None
|
|
@@ -24,22 +26,24 @@ university_subnets = [
|
|
| 24 |
ipaddress.ip_network("159.84.0.0/16"),
|
| 25 |
]
|
| 26 |
|
|
|
|
| 27 |
# Fonction pour vérifier si une IP appartient aux sous-réseaux universitaires
|
| 28 |
def is_university_ip(ip):
    """Tell whether ``ip`` lies inside one of the ``university_subnets``.

    The value is parsed with ``ipaddress.ip_address``; if a ``ValueError``
    is raised while parsing or checking membership, the address is reported
    as not belonging to the university (``False``).
    """
    try:
        address = ipaddress.ip_address(ip)
        # Stop at the first subnet that contains the address.
        for network in university_subnets:
            if address in network:
                return True
    except ValueError:
        pass
    return False
|
|
|
|
| 34 |
|
| 35 |
# Créer les onglets principaux
|
| 36 |
tab1, tab2, tab3, tab4 = st.tabs(
|
| 37 |
-
["
|
| 38 |
)
|
| 39 |
|
| 40 |
# Onglet Analysis
|
| 41 |
with tab1:
|
| 42 |
-
st.subheader("
|
| 43 |
|
| 44 |
# Vérifier que la colonne timestamp existe et est bien de type datetime
|
| 45 |
if "timestamp" in data.columns and data["timestamp"].dtype == pl.Datetime:
|
|
@@ -47,14 +51,35 @@ with tab1:
|
|
| 47 |
min_date = data["timestamp"].min().date()
|
| 48 |
max_date = data["timestamp"].max().date()
|
| 49 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
# Disposition des filtres en colonnes
|
| 51 |
col1, col2, col3 = st.columns(3)
|
| 52 |
|
| 53 |
# ---- FILTRE DATE ----
|
| 54 |
with col1:
|
| 55 |
-
st.markdown("###
|
| 56 |
-
|
| 57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
|
| 59 |
# ---- FILTRE action----
|
| 60 |
with col2:
|
|
@@ -200,19 +225,16 @@ with tab2:
|
|
| 200 |
|
| 201 |
# Compter les occurrences des IP sources bloquées
|
| 202 |
blocked_ips = (
|
| 203 |
-
blocked_attempts
|
| 204 |
-
.group_by("ipsrc")
|
| 205 |
.agg(pl.count("ipsrc").alias("count"))
|
| 206 |
.sort("count", descending=True)
|
| 207 |
)
|
| 208 |
|
| 209 |
-
|
| 210 |
top_n = st.slider(" ", 5, 20, 10, key="top_n_slider")
|
| 211 |
|
| 212 |
# Sélectionner le Top N des IP bloquées
|
| 213 |
top_blocked_ips = blocked_ips.head(top_n)
|
| 214 |
|
| 215 |
-
|
| 216 |
# ---- GRAPHIQUE AVEC PLOTLY ----
|
| 217 |
color_palette = px.colors.sequential.Blues
|
| 218 |
if not top_blocked_ips.is_empty():
|
|
@@ -224,11 +246,11 @@ with tab2:
|
|
| 224 |
text="count",
|
| 225 |
title=f"Top {top_n} Most Blocked IPs",
|
| 226 |
labels={"ipsrc": "IP Source", "count": "Number of Blocked Attempts"},
|
| 227 |
-
color_discrete_sequence=["#3d85c6"]
|
| 228 |
)
|
| 229 |
|
| 230 |
# Amélioration du layout
|
| 231 |
-
fig.update_traces(texttemplate=
|
| 232 |
fig.update_layout(yaxis=dict(categoryorder="total ascending"))
|
| 233 |
|
| 234 |
# Afficher le graphique interactif
|
|
@@ -242,7 +264,9 @@ with tab2:
|
|
| 242 |
st.write("### 📊 Connection Activity Analysis")
|
| 243 |
if "timestamp" in data.columns:
|
| 244 |
# 📌 Ajout d'un sélecteur de fréquence
|
| 245 |
-
frequency = st.selectbox(
|
|
|
|
|
|
|
| 246 |
|
| 247 |
# Définition des formats selon la fréquence choisie
|
| 248 |
if frequency == "second":
|
|
@@ -261,7 +285,9 @@ with tab2:
|
|
| 261 |
# Filtrage et regroupement
|
| 262 |
activity_data = (
|
| 263 |
data.filter(pl.col("action") == "PERMIT")
|
| 264 |
-
.with_columns(
|
|
|
|
|
|
|
| 265 |
.group_by("time_period")
|
| 266 |
.agg(pl.count("time_period").alias("connection_count"))
|
| 267 |
.sort("time_period")
|
|
@@ -280,8 +306,11 @@ with tab2:
|
|
| 280 |
y="connection_count",
|
| 281 |
markers=True,
|
| 282 |
title=f"Connection Activity ({time_label} level)",
|
| 283 |
-
labels={
|
| 284 |
-
|
|
|
|
|
|
|
|
|
|
| 285 |
)
|
| 286 |
|
| 287 |
# Afficher le graphique
|
|
@@ -298,20 +327,24 @@ with tab3:
|
|
| 298 |
|
| 299 |
if "ipsrc" in data.columns and "action" in data.columns:
|
| 300 |
# Conversion des IPs en chaînes de caractères pour éviter les erreurs de type
|
| 301 |
-
data = data.with_columns(
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
|
|
|
|
|
|
| 305 |
|
| 306 |
# Vérification des IPs avec la fonction is_university_ip
|
| 307 |
-
data = data.with_columns(
|
| 308 |
-
|
| 309 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 310 |
|
| 311 |
# filtrer toutes les connexions impliquant une adresse externe
|
| 312 |
-
intrusion_attempts = data.filter(
|
| 313 |
-
(~pl.col("is_src_university_ip"))
|
| 314 |
-
)
|
| 315 |
# Ajout d'un filtre par action
|
| 316 |
selected_action = st.selectbox("Select action type", ["All", "PERMIT", "DENY"])
|
| 317 |
|
|
@@ -321,23 +354,26 @@ with tab3:
|
|
| 321 |
)
|
| 322 |
# Affichage des accès externes
|
| 323 |
st.write(f"### 🔍 External accesses: {intrusion_attempts.shape[0]} entries")
|
| 324 |
-
st.dataframe(
|
|
|
|
|
|
|
| 325 |
|
| 326 |
else:
|
| 327 |
st.warning("Columns 'ipsrc' not found.")
|
| 328 |
|
| 329 |
|
| 330 |
-
|
| 331 |
# Onglet Sankey
|
| 332 |
with tab4:
|
| 333 |
st.subheader("Sankey Diagram")
|
| 334 |
-
|
| 335 |
def create_sankey(df, source_col, target_col):
|
| 336 |
-
"""
|
| 337 |
df_grouped = df.group_by([source_col, target_col]).len().to_pandas()
|
| 338 |
|
| 339 |
# Création des nœuds
|
| 340 |
-
labels = list(
|
|
|
|
|
|
|
| 341 |
label_to_index = {label: i for i, label in enumerate(labels)}
|
| 342 |
|
| 343 |
# Création des liens
|
|
@@ -346,27 +382,33 @@ with tab4:
|
|
| 346 |
values = df_grouped["len"]
|
| 347 |
|
| 348 |
# Création du Sankey Diagram
|
| 349 |
-
fig = go.Figure(
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
|
|
|
|
|
|
|
| 356 |
)
|
| 357 |
-
)
|
| 358 |
-
|
| 359 |
-
fig.update_layout(
|
|
|
|
|
|
|
| 360 |
st.plotly_chart(fig, use_container_width=True)
|
| 361 |
|
| 362 |
st.subheader("Connections where access were identified as : PERMIT")
|
| 363 |
-
|
| 364 |
data_filtered = data.filter(pl.col("action") == "PERMIT")
|
| 365 |
# 🔹 Sankey entre IP source et IP destination
|
| 366 |
create_sankey(data_filtered, "ipsrc", "ipdst")
|
| 367 |
|
| 368 |
# 🔹 Sankey entre IP source et port destination
|
| 369 |
-
df = data_filtered.with_columns(
|
|
|
|
|
|
|
| 370 |
create_sankey(df, "ipsrc", "portdst")
|
| 371 |
|
| 372 |
st.subheader("Connections where access were identified as : DENY")
|
|
@@ -376,6 +418,7 @@ with tab4:
|
|
| 376 |
create_sankey(data_filtered, "ipsrc", "ipdst")
|
| 377 |
|
| 378 |
# 🔹 Sankey entre IP source et port destination
|
| 379 |
-
df = data_filtered.with_columns(
|
|
|
|
|
|
|
| 380 |
create_sankey(df, "ipsrc", "portdst")
|
| 381 |
-
|
|
|
|
| 1 |
+
import datetime
|
|
|
|
| 2 |
import ipaddress
|
| 3 |
+
|
| 4 |
+
import pandas as pd
|
| 5 |
import plotly.express as px
|
| 6 |
import plotly.graph_objs as go
|
| 7 |
+
import polars as pl
|
| 8 |
+
import streamlit as st
|
| 9 |
|
| 10 |
if "parsed_df" not in st.session_state:
|
| 11 |
st.session_state.parsed_df = None
|
|
|
|
| 26 |
ipaddress.ip_network("159.84.0.0/16"),
|
| 27 |
]
|
| 28 |
|
| 29 |
+
|
| 30 |
# Fonction pour vérifier si une IP appartient aux sous-réseaux universitaires
|
| 31 |
def is_university_ip(ip):
    """Tell whether ``ip`` lies inside one of the ``university_subnets``.

    The value is parsed with ``ipaddress.ip_address``; if a ``ValueError``
    is raised while parsing or checking membership, the address is reported
    as not belonging to the university (``False``).
    """
    try:
        address = ipaddress.ip_address(ip)
        # Stop at the first subnet that contains the address.
        for network in university_subnets:
            if address in network:
                return True
    except ValueError:
        pass
    return False
|
| 37 |
+
|
| 38 |
|
| 39 |
# Créer les onglets principaux
|
| 40 |
tab1, tab2, tab3, tab4 = st.tabs(
|
| 41 |
+
["Explore data", "Analysis", "Foreign IP addresses", "Sankey"]
|
| 42 |
)
|
| 43 |
|
| 44 |
# "Explore data" tab
|
| 45 |
with tab1:
|
| 46 |
+
st.subheader("Explore data")
|
| 47 |
|
| 48 |
# Vérifier que la colonne timestamp existe et est bien de type datetime
|
| 49 |
if "timestamp" in data.columns and data["timestamp"].dtype == pl.Datetime:
|
|
|
|
| 51 |
min_date = data["timestamp"].min().date()
|
| 52 |
max_date = data["timestamp"].max().date()
|
| 53 |
|
| 54 |
+
# Convertir les dates min et max en datetime pour inclure heures et minutes
|
| 55 |
+
min_date_time = datetime.datetime.combine(min_date, datetime.time(0, 0))
|
| 56 |
+
max_date_time = datetime.datetime.combine(max_date, datetime.time(23, 59))
|
| 57 |
+
|
| 58 |
+
selected_date_range = st.slider(
|
| 59 |
+
"Filter by date & time",
|
| 60 |
+
min_value=min_date_time,
|
| 61 |
+
max_value=max_date_time,
|
| 62 |
+
value=(min_date_time, max_date_time),
|
| 63 |
+
format="YYYY-MM-DD HH:mm",
|
| 64 |
+
)
|
| 65 |
+
start_date, end_date = selected_date_range
|
| 66 |
+
|
| 67 |
# Disposition des filtres en colonnes
|
| 68 |
col1, col2, col3 = st.columns(3)
|
| 69 |
|
| 70 |
# ---- PROTOCOL FILTER ----
|
| 71 |
with col1:
|
| 72 |
+
st.markdown("### 🛠 Protocol")
|
| 73 |
+
if "protocole" in data.columns:
|
| 74 |
+
unique_protocols = sorted(
|
| 75 |
+
data["protocole"].unique().cast(pl.Utf8).to_list()
|
| 76 |
+
)
|
| 77 |
+
selected_protocol = st.selectbox(
|
| 78 |
+
"Select a protocol", ["All"] + unique_protocols
|
| 79 |
+
)
|
| 80 |
+
else:
|
| 81 |
+
selected_protocol = "All"
|
| 82 |
+
st.warning("Column 'protocole' not found.")
|
| 83 |
|
| 84 |
# ---- FILTRE action----
|
| 85 |
with col2:
|
|
|
|
| 225 |
|
| 226 |
# Compter les occurrences des IP sources bloquées
|
| 227 |
blocked_ips = (
|
| 228 |
+
blocked_attempts.group_by("ipsrc")
|
|
|
|
| 229 |
.agg(pl.count("ipsrc").alias("count"))
|
| 230 |
.sort("count", descending=True)
|
| 231 |
)
|
| 232 |
|
|
|
|
| 233 |
top_n = st.slider(" ", 5, 20, 10, key="top_n_slider")
|
| 234 |
|
| 235 |
# Sélectionner le Top N des IP bloquées
|
| 236 |
top_blocked_ips = blocked_ips.head(top_n)
|
| 237 |
|
|
|
|
| 238 |
# ---- GRAPHIQUE AVEC PLOTLY ----
|
| 239 |
color_palette = px.colors.sequential.Blues
|
| 240 |
if not top_blocked_ips.is_empty():
|
|
|
|
| 246 |
text="count",
|
| 247 |
title=f"Top {top_n} Most Blocked IPs",
|
| 248 |
labels={"ipsrc": "IP Source", "count": "Number of Blocked Attempts"},
|
| 249 |
+
color_discrete_sequence=["#3d85c6"],
|
| 250 |
)
|
| 251 |
|
| 252 |
# Amélioration du layout
|
| 253 |
+
fig.update_traces(texttemplate="%{text}", textposition="inside")
|
| 254 |
fig.update_layout(yaxis=dict(categoryorder="total ascending"))
|
| 255 |
|
| 256 |
# Afficher le graphique interactif
|
|
|
|
| 264 |
st.write("### 📊 Connection Activity Analysis")
|
| 265 |
if "timestamp" in data.columns:
|
| 266 |
# 📌 Ajout d'un sélecteur de fréquence
|
| 267 |
+
frequency = st.selectbox(
|
| 268 |
+
"Select frequency", ["second", "minute", "hour", "day"], index=1
|
| 269 |
+
)
|
| 270 |
|
| 271 |
# Définition des formats selon la fréquence choisie
|
| 272 |
if frequency == "second":
|
|
|
|
| 285 |
# Filtrage et regroupement
|
| 286 |
activity_data = (
|
| 287 |
data.filter(pl.col("action") == "PERMIT")
|
| 288 |
+
.with_columns(
|
| 289 |
+
pl.col("timestamp").dt.strftime(time_format).alias("time_period")
|
| 290 |
+
)
|
| 291 |
.group_by("time_period")
|
| 292 |
.agg(pl.count("time_period").alias("connection_count"))
|
| 293 |
.sort("time_period")
|
|
|
|
| 306 |
y="connection_count",
|
| 307 |
markers=True,
|
| 308 |
title=f"Connection Activity ({time_label} level)",
|
| 309 |
+
labels={
|
| 310 |
+
"time_period": time_label,
|
| 311 |
+
"connection_count": "Number of Connections",
|
| 312 |
+
},
|
| 313 |
+
line_shape="spline",
|
| 314 |
)
|
| 315 |
|
| 316 |
# Afficher le graphique
|
|
|
|
| 327 |
|
| 328 |
if "ipsrc" in data.columns and "action" in data.columns:
|
| 329 |
# Conversion des IPs en chaînes de caractères pour éviter les erreurs de type
|
| 330 |
+
data = data.with_columns(
|
| 331 |
+
[
|
| 332 |
+
pl.col("ipsrc").cast(pl.Utf8).alias("ipsrc"),
|
| 333 |
+
pl.col("action").cast(pl.Utf8).alias("action"),
|
| 334 |
+
]
|
| 335 |
+
)
|
| 336 |
|
| 337 |
# Vérification des IPs avec la fonction is_university_ip
|
| 338 |
+
data = data.with_columns(
|
| 339 |
+
[
|
| 340 |
+
pl.col("ipsrc")
|
| 341 |
+
.map_elements(is_university_ip, return_dtype=pl.Boolean)
|
| 342 |
+
.alias("is_src_university_ip")
|
| 343 |
+
]
|
| 344 |
+
)
|
| 345 |
|
| 346 |
# filtrer toutes les connexions impliquant une adresse externe
|
| 347 |
+
intrusion_attempts = data.filter((~pl.col("is_src_university_ip")))
|
|
|
|
|
|
|
| 348 |
# Ajout d'un filtre par action
|
| 349 |
selected_action = st.selectbox("Select action type", ["All", "PERMIT", "DENY"])
|
| 350 |
|
|
|
|
| 354 |
)
|
| 355 |
# Affichage des accès externes
|
| 356 |
st.write(f"### 🔍 External accesses: {intrusion_attempts.shape[0]} entries")
|
| 357 |
+
st.dataframe(
|
| 358 |
+
intrusion_attempts.drop(["is_src_university_ip"]), use_container_width=True
|
| 359 |
+
)
|
| 360 |
|
| 361 |
else:
|
| 362 |
st.warning("Columns 'ipsrc' not found.")
|
| 363 |
|
| 364 |
|
|
|
|
| 365 |
# Onglet Sankey
|
| 366 |
with tab4:
|
| 367 |
st.subheader("Sankey Diagram")
|
| 368 |
+
|
| 369 |
def create_sankey(df, source_col, target_col):
|
| 370 |
+
"""Crée un diagramme de Sankey entre deux colonnes"""
|
| 371 |
df_grouped = df.group_by([source_col, target_col]).len().to_pandas()
|
| 372 |
|
| 373 |
# Création des nœuds
|
| 374 |
+
labels = list(
|
| 375 |
+
pd.concat([df_grouped[source_col], df_grouped[target_col]]).unique()
|
| 376 |
+
)
|
| 377 |
label_to_index = {label: i for i, label in enumerate(labels)}
|
| 378 |
|
| 379 |
# Création des liens
|
|
|
|
| 382 |
values = df_grouped["len"]
|
| 383 |
|
| 384 |
# Création du Sankey Diagram
|
| 385 |
+
fig = go.Figure(
|
| 386 |
+
go.Sankey(
|
| 387 |
+
node=dict(
|
| 388 |
+
pad=15,
|
| 389 |
+
thickness=20,
|
| 390 |
+
line=dict(color="black", width=0.5),
|
| 391 |
+
label=labels,
|
| 392 |
+
),
|
| 393 |
+
link=dict(source=sources, target=targets, value=values),
|
| 394 |
)
|
| 395 |
+
)
|
| 396 |
+
|
| 397 |
+
fig.update_layout(
|
| 398 |
+
title_text=f"Flow between {source_col} and {target_col}", font_size=10
|
| 399 |
+
)
|
| 400 |
st.plotly_chart(fig, use_container_width=True)
|
| 401 |
|
| 402 |
st.subheader("Connections where access were identified as : PERMIT")
|
| 403 |
+
|
| 404 |
data_filtered = data.filter(pl.col("action") == "PERMIT")
|
| 405 |
# 🔹 Sankey entre IP source et IP destination
|
| 406 |
create_sankey(data_filtered, "ipsrc", "ipdst")
|
| 407 |
|
| 408 |
# 🔹 Sankey entre IP source et port destination
|
| 409 |
+
df = data_filtered.with_columns(
|
| 410 |
+
data_filtered["portdst"].cast(pl.Utf8)
|
| 411 |
+
) # Convertir les ports en chaînes pour éviter les erreurs
|
| 412 |
create_sankey(df, "ipsrc", "portdst")
|
| 413 |
|
| 414 |
st.subheader("Connections where access were identified as : DENY")
|
|
|
|
| 418 |
create_sankey(data_filtered, "ipsrc", "ipdst")
|
| 419 |
|
| 420 |
# 🔹 Sankey entre IP source et port destination
|
| 421 |
+
df = data_filtered.with_columns(
|
| 422 |
+
data_filtered["portdst"].cast(pl.Utf8)
|
| 423 |
+
) # Convertir les ports en chaînes pour éviter les erreurs
|
| 424 |
create_sankey(df, "ipsrc", "portdst")
|
|
|