File size: 17,855 Bytes
8e30e05
1a52323
8e30e05
 
1a52323
d824eb3
8e30e05
 
d1fd9e1
 
 
 
 
 
 
1d4f790
d1fd9e1
 
 
 
 
 
1a52323
 
 
 
 
 
8e30e05
1a52323
 
 
 
 
 
8e30e05
 
1a52323
1d4f790
5457771
 
a6d49bf
1d4f790
 
 
8e30e05
1d4f790
 
 
 
 
 
 
8e30e05
 
 
 
 
 
 
 
 
 
 
 
 
1d4f790
 
 
 
 
8e30e05
 
 
 
 
 
 
 
 
 
 
1d4f790
a684a19
1d4f790
a684a19
 
4d7885a
 
 
 
 
 
1d4f790
4d7885a
 
1d4f790
a684a19
1d4f790
 
a684a19
4d7885a
e76e281
 
4d7885a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1d4f790
4d7885a
1d4f790
4d7885a
e2408de
1d4f790
 
4d7885a
1d4f790
 
4d7885a
 
 
 
 
 
e2408de
1d4f790
 
4d7885a
 
e2408de
 
a684a19
4d7885a
 
 
 
e2408de
a684a19
 
e2408de
4d7885a
 
e2408de
 
1d4f790
633774d
a6d49bf
e2408de
d1fd9e1
4d7885a
 
 
d1fd9e1
a6d49bf
1d4f790
5457771
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a6d49bf
 
 
 
 
 
 
 
e76e281
a6d49bf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1a52323
 
 
 
 
 
 
 
 
 
8e30e05
1a52323
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8e30e05
1a52323
 
 
8e30e05
1a52323
 
 
 
 
 
 
 
 
 
ccf68a7
1a52323
ccf68a7
8e30e05
 
 
ccf68a7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1a52323
ccf68a7
8e30e05
 
 
ccf68a7
 
 
1a52323
 
ccf68a7
1a52323
ccf68a7
1a52323
ccf68a7
1a52323
 
 
 
ccf68a7
1a52323
ccf68a7
 
8e30e05
 
 
 
 
1a52323
 
 
 
 
 
 
 
 
 
a6d49bf
5457771
1a52323
 
 
 
8e30e05
 
 
 
 
 
1a52323
 
8e30e05
 
 
 
 
 
 
1a52323
 
8e30e05
1a52323
 
 
 
 
 
 
 
 
8e30e05
 
 
1a52323
 
 
 
 
a6d49bf
5457771
1d4f790
8e30e05
d824eb3
8e30e05
b3c5e03
d824eb3
 
8e30e05
 
 
d824eb3
 
 
 
 
 
 
 
8e30e05
 
 
 
 
 
 
 
 
d824eb3
8e30e05
 
 
 
 
d824eb3
 
b3c5e03
8e30e05
b3c5e03
 
 
 
 
8e30e05
e76e281
 
b3c5e03
 
 
 
 
d824eb3
b3c5e03
d824eb3
 
8e30e05
e76e281
 
b3c5e03
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
import datetime
import ipaddress

import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
import polars as pl
import streamlit as st

if "parsed_df" not in st.session_state:
    st.session_state.parsed_df = None

# Page title
st.title("Data Analysis")

# Vérifier que les données sont chargées
if st.session_state.parsed_df is None:
    st.info("Please upload a log file on the 'Upload' page.")
    st.stop()

data = st.session_state.parsed_df

university_subnets = [
    ipaddress.ip_network("192.168.0.0/16"),
    ipaddress.ip_network("10.79.0.0/16"),
    ipaddress.ip_network("159.84.0.0/16"),
]


# Fonction pour vérifier si une IP appartient aux sous-réseaux universitaires
def is_university_ip(ip):
    try:
        ip_obj = ipaddress.ip_address(ip)
        return any(ip_obj in subnet for subnet in university_subnets)
    except ValueError:
        return False


# Créer les onglets principaux
tab1, tab2, tab3, tab4, tab5 = st.tabs(
    ["Explore data", "Dataviz", "Analysis", "Foreign IP addresses", "Sankey"]
)

# Onglet Analysis
with tab1:
    st.subheader("Explore data")

    # Vérifier que la colonne timestamp existe et est bien de type datetime
    if "timestamp" in data.columns and data["timestamp"].dtype == pl.Datetime:
        # Obtenir les valeurs min et max des dates
        min_date = data["timestamp"].min().date()
        max_date = data["timestamp"].max().date()

        # Convertir les dates min et max en datetime pour inclure heures et minutes
        min_date_time = datetime.datetime.combine(min_date, datetime.time(0, 0))
        max_date_time = datetime.datetime.combine(max_date, datetime.time(23, 59))

        selected_date_range = st.slider(
            "Filter by date & time",
            min_value=min_date_time,
            max_value=max_date_time,
            value=(min_date_time, max_date_time),
            format="YYYY-MM-DD HH:mm",
        )
        start_date, end_date = selected_date_range

        # Disposition des filtres en colonnes
        col1, col2, col3 = st.columns(3)

        # ---- FILTRE DATE ----
        with col1:
            st.markdown("### 🛠 Protocol")
            if "protocole" in data.columns:
                unique_protocols = sorted(
                    data["protocole"].unique().cast(pl.Utf8).to_list()
                )
                selected_protocol = st.selectbox(
                    "Select a protocol", ["All"] + unique_protocols
                )
            else:
                selected_protocol = "All"
                st.warning("Column 'protocole' not found.")

        # ---- FILTRE action----
        with col2:
            st.markdown("### 🔄 Action")
            if "action" in data.columns:
                unique_action = sorted(
                    data["action"].unique().cast(pl.Utf8).to_list()
                )  # S'assurer du bon format
                selected_action = st.selectbox(
                    "Select an action", ["All"] + unique_action
                )
            else:
                selected_action = "All"
                st.warning("Column 'action' not found.")

        # ---- FILTRE portdst ----
        with col3:
            st.markdown("### 🔢 Port")
            if "portdst" in data.columns:
                min_port, max_port = (
                    int(data["portdst"].cast(pl.Utf8).min()),
                    int(data["portdst"].cast(pl.Utf8).max()),
                )

                # Initialize port range in session state if not present
                if "port_range" not in st.session_state:
                    st.session_state.port_range = (min_port, max_port)

                # Quick port range selection buttons
                col_ports1, col_ports2, col_ports3 = st.columns(3)

                # Define button handlers to update session state
                def set_well_known():
                    st.session_state.port_range = (0, 1023)

                def set_registered():
                    st.session_state.port_range = (1024, 49151)

                def set_dynamic():
                    st.session_state.port_range = (49152, 65535)

                with col_ports1:
                    st.button("Well-known (0-1023)", on_click=set_well_known)

                with col_ports2:
                    st.button("Registered (1024-49151)", on_click=set_registered)

                with col_ports3:
                    st.button("Dynamic (49152-65535)", on_click=set_dynamic)

                # Custom range slider that uses and updates the session state
                selected_port = st.slider(
                    "Custom port range",
                    min_port,
                    max_port,
                    value=st.session_state.port_range,
                    key="port_slider",
                )

                # Update port_range when slider changes
                st.session_state.port_range = selected_port
            else:
                min_port, max_port = 0, 65535  # Standard TCP/IP port range
                selected_port = (min_port, max_port)
                st.warning("Column 'portdst' not found, default values applied.")

        # Vérification des dates sélectionnées
        if start_date > end_date:
            st.error("The start date cannot be later than the end date.")
        else:
            # Conversion des dates en datetime
            start_datetime = pl.datetime(
                start_date.year, start_date.month, start_date.day
            )
            end_datetime = pl.datetime(
                end_date.year, end_date.month, end_date.day, 23, 59, 59
            )

            # ---- APPLICATION DES FILTRES ----
            filtered_data = data.filter(
                (pl.col("timestamp") >= start_datetime)
                & (pl.col("timestamp") <= end_datetime)
            )

            # Correction du filtrage par action(forcer conversion Utf8)
            if "action" in data.columns and selected_action != "All":
                filtered_data = filtered_data.filter(
                    pl.col("action").cast(pl.Utf8) == selected_action
                )

            # Filtrer par portdst en prenant en compte min/max
            if "portdst" in data.columns:
                filtered_data = filtered_data.filter(
                    (pl.col("portdst").cast(pl.Int64) >= selected_port[0])
                    & (pl.col("portdst").cast(pl.Int64) <= selected_port[1])
                )

            # Affichage des données filtrées
            st.write(f"### 🔍 Data filtered : {filtered_data.shape[0]} entries")
            st.dataframe(filtered_data, use_container_width=True)

    else:
        st.warning(
            "The 'timestamp' column does not exist or is not in datetime format."
        )

# Onglet Analysis
with tab2:
    st.subheader("Dataviz")

    # Créer ici un scatter plot permettant une Visualisation interactive des données (IP source avec le nombre
    # d’occurrences de destination contactées, incluant le nombre de flux rejetés et autorisés).

    # Agréger les données par IP source
    df_agg = data.group_by("ipsrc").agg(
        [
            pl.col("ipdst").n_unique().alias("distinct_destinations"),
            ((pl.col("action") == "PERMIT").cast(pl.Int64)).sum().alias("count_permit"),
            ((pl.col("action") == "DENY").cast(pl.Int64)).sum().alias("count_deny"),
        ]
    )

    # Créer un scatter plot
    if not df_agg.is_empty():
        # We need to recreate the aggregation to count distinct destinations per action type
        permit_agg = (
            data.filter(pl.col("action") == "PERMIT")
            .group_by("ipsrc")
            .agg(
                [
                    pl.col("ipdst").n_unique().alias("distinct_destinations"),
                    pl.count("ipsrc").alias("connections"),
                ]
            )
            .with_columns(pl.lit("PERMIT").alias("action"))
        )

        deny_agg = (
            data.filter(pl.col("action") == "DENY")
            .group_by("ipsrc")
            .agg(
                [
                    pl.col("ipdst").n_unique().alias("distinct_destinations"),
                    pl.count("ipsrc").alias("connections"),
                ]
            )
            .with_columns(pl.lit("DENY").alias("action"))
        )

        # Combine both datasets
        combined_df = pl.concat([permit_agg, deny_agg])

        # Convert to pandas
        df_pandas = combined_df.to_pandas()

        # Create the scatter plot with two points per IP source (one for PERMIT, one for DENY)
        fig = px.scatter(
            df_pandas,
            x="ipsrc",
            y="distinct_destinations",
            color="action",
            size="connections",
            color_discrete_map={"PERMIT": "blue", "DENY": "red"},
            hover_data=["connections", "action"],
            title="Number of Distinct Destinations Contacted by Each IP Source",
            labels={
                "ipsrc": "Source IP Address",
                "distinct_destinations": "Number of Distinct Destinations",
                "connections": "Number of Connections",
                "action": "Action",
            },
        )

        # Improve layout for better readability
        fig.update_layout(
            xaxis={"categoryorder": "total descending"}, legend_title="Action Type"
        )

        st.plotly_chart(fig, use_container_width=True)
    else:
        st.info("No data available for scatter plot.")

# Onglet Analysis
with tab3:
    st.subheader("Analysis")

    # Afficher ici le top 10 des ports inférieurs à 1024 avec accès autorisé
    st.write(
        "### 🔢 Top 10 ports with authorized access"
        " (portdst < 1024 and action == 'PERMIT')"
    )
    top_ports = (
        data.filter((pl.col("portdst").cast(pl.Int64) < 1024) & (pl.col("action") == "PERMIT"))
        .group_by("portdst")
        .agg(pl.count("portdst").alias("count"))
        .sort("count", descending=True)
        .head(10)
    )
    st.dataframe(top_ports, use_container_width=True)

    # Afficher ici le top 5 des IP sources les plus émettrices
    st.write("### 🌐 Top 5 emitting IP addresses (ipsource and action == 'PERMIT')")
    top_ips = (
        data.filter(pl.col("action") == "PERMIT")
        .group_by("ipsrc")
        .agg(pl.count("ipsrc").alias("count"))
        .sort("count", descending=True)
        .head(5)
    )
    st.dataframe(top_ips, use_container_width=True)

    # Graphique

    st.write("### 🔴 Analysis of Blocked Attempts")

    if "ipsrc" in data.columns and "action" in data.columns:
        # Filtrer uniquement les tentatives bloquées
        blocked_attempts = data.filter(pl.col("action") == "DENY")

        # Compter les occurrences des IP sources bloquées
        blocked_ips = (
            blocked_attempts.group_by("ipsrc")
            .agg(pl.count("ipsrc").alias("count"))
            .sort("count", descending=True)
        )

        top_n = st.slider(" ", 5, 20, 10, key="top_n_slider")

        # Sélectionner le Top N des IP bloquées
        top_blocked_ips = blocked_ips.head(top_n)

        # ---- GRAPHIQUE AVEC PLOTLY ----
        color_palette = px.colors.sequential.Blues
        if not top_blocked_ips.is_empty():
            fig = px.bar(
                top_blocked_ips.to_pandas(),  # Convertir en DataFrame Pandas pour Plotly
                x="count",
                y="ipsrc",
                orientation="h",
                text="count",
                title=f"Top {top_n} Most Blocked IPs",
                labels={"ipsrc": "IP Source", "count": "Number of Blocked Attempts"},
                color_discrete_sequence=["#3d85c6"],
            )

            # Amélioration du layout
            fig.update_traces(texttemplate="%{text}", textposition="inside")
            fig.update_layout(yaxis=dict(categoryorder="total ascending"))

            # Afficher le graphique interactif
            st.plotly_chart(fig, use_container_width=True)
        else:
            st.info("No blocked attempts found.")
    else:
        st.warning("Columns 'ipsrc' or 'action' not found.")

    # Graphique de série temporelle des connexions par heure
    st.write("### 📊 Connection Activity Analysis")
    if "timestamp" in data.columns:
        # 📌 Ajout d'un sélecteur de fréquence
        frequency = st.selectbox(
            "Select frequency", ["second", "minute", "hour", "day"], index=1
        )

        # Définition des formats selon la fréquence choisie
        if frequency == "second":
            time_format = "%Y-%m-%d %H:%M:%S"
            time_label = "Second"
        elif frequency == "minute":
            time_format = "%Y-%m-%d %H:%M:00"
            time_label = "Minute"
        elif frequency == "hour":
            time_format = "%Y-%m-%d %H:00:00"
            time_label = "Hour"
        else:
            time_format = "%Y-%m-%d"
            time_label = "Day"

        # Filtrage et regroupement
        activity_data = (
            data.filter(pl.col("action") == "PERMIT")
            .with_columns(
                pl.col("timestamp").dt.strftime(time_format).alias("time_period")
            )
            .group_by("time_period")
            .agg(pl.count("time_period").alias("connection_count"))
            .sort("time_period")
        )

        # Vérifier s'il y a des données
        if not activity_data.is_empty():
            # Convertir en Pandas
            df_activity = activity_data.to_pandas()
            df_activity["time_period"] = pd.to_datetime(df_activity["time_period"])

            # Tracer le graphique
            fig = px.line(
                df_activity,
                x="time_period",
                y="connection_count",
                markers=True,
                title=f"Connection Activity ({time_label} level)",
                labels={
                    "time_period": time_label,
                    "connection_count": "Number of Connections",
                },
                line_shape="spline",
            )

            # Afficher le graphique
            st.plotly_chart(fig, use_container_width=True)
        else:
            st.info("No connection data found for the selected period.")
    else:
        st.warning("Column 'timestamp' not found.")


# Onglet Foreign IP addresses
with tab4:
    st.subheader("🚫 List of access outside the university network")

    if "ipsrc" in data.columns and "action" in data.columns:
        # Conversion des IPs en chaînes de caractères pour éviter les erreurs de type
        data = data.with_columns(
            [
                pl.col("ipsrc").cast(pl.Utf8).alias("ipsrc"),
                pl.col("action").cast(pl.Utf8).alias("action"),
            ]
        )

        # Vérification des IPs avec la fonction is_university_ip
        data = data.with_columns(
            [
                pl.col("ipsrc")
                .map_elements(is_university_ip, return_dtype=pl.Boolean)
                .alias("is_src_university_ip")
            ]
        )

        # filtrer toutes les connexions impliquant une adresse externe
        intrusion_attempts = data.filter((~pl.col("is_src_university_ip")))
        # Ajout d'un filtre par action
        selected_action = st.selectbox("Select action type", ["All", "PERMIT", "DENY"])

        if selected_action != "All":
            intrusion_attempts = intrusion_attempts.filter(
                pl.col("action") == selected_action
            )
        # Affichage des accès externes
        st.write(f"### 🔍 External accesses: {intrusion_attempts.shape[0]} entries")
        st.dataframe(
            intrusion_attempts.drop(["is_src_university_ip"]), use_container_width=True
        )

    else:
        st.warning("Columns 'ipsrc' not found.")


# Onglet Sankey
with tab5:
    st.subheader("Sankey Diagram")

    def create_sankey(df, source_col, target_col):
        """Crée un diagramme de Sankey entre deux colonnes"""
        df_grouped = df.group_by([source_col, target_col]).len().to_pandas()

        # Création des nœuds
        labels = list(
            pd.concat([df_grouped[source_col], df_grouped[target_col]]).unique()
        )
        label_to_index = {label: i for i, label in enumerate(labels)}

        # Création des liens
        sources = df_grouped[source_col].map(label_to_index)
        targets = df_grouped[target_col].map(label_to_index)
        values = df_grouped["len"]

        # Création du Sankey Diagram
        fig = go.Figure(
            go.Sankey(
                node=dict(
                    pad=15,
                    thickness=20,
                    line=dict(color="black", width=0.5),
                    label=labels,
                ),
                link=dict(source=sources, target=targets, value=values),
            )
        )

        fig.update_layout(
            title_text=f"Flow between {source_col} and {target_col}", font_size=10
        )
        st.plotly_chart(fig, use_container_width=True)

    st.subheader("Connections where access were identified as : PERMIT")

    data_filtered = data.filter(pl.col("action") == "PERMIT")
    # 🔹 Sankey entre IP source et IP destination
    create_sankey(data_filtered, "ipsrc", "ipdst")

    # 🔹 Sankey entre IP source et port destination
    df = data_filtered.with_columns(
        data_filtered["portdst"]
    ) 
    create_sankey(df, "ipsrc", "portdst")

    st.subheader("Connections where access were identified as : DENY")

    data_filtered = data.filter(pl.col("action") == "DENY")
    # 🔹 Sankey entre IP source et IP destination
    create_sankey(data_filtered, "ipsrc", "ipdst")

    # 🔹 Sankey entre IP source et port destination
    df = data_filtered.with_columns(
        data_filtered["portdst"]
    )
    create_sankey(df, "ipsrc", "portdst")