"""Streamlit "Analytics" page.

Reads the parsed log DataFrame stored in ``st.session_state.parsed_df``,
one-hot encodes four categorical columns, projects them on two principal
components, clusters the projection with K-Means and displays:

* a 2D scatter plot of the (optionally down-sampled) clustered points;
* one bar-chart figure per categorical column with its per-cluster frequencies.
"""
import pandas as pd
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objs as go
import streamlit as st
import polars as pl
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Categorical columns used as clustering features.
FEATURE_COLUMNS = ["portdst", "protocole", "rule", "action"]
# Number of principal components kept (the scatter plot is 2D).
N_COMPONENTS = 2
# Number of K-Means clusters; must match the entries of CLUSTER_COLORS.
N_CLUSTERS = 2
# Seed shared by K-Means and by the sampling step so reruns are reproducible.
RANDOM_STATE = 42
# Above this many rows the scatter plot is drawn from a per-cluster sample.
MAX_SCATTER_POINTS = 200000
# Display colour of each cluster label.
CLUSTER_COLORS = {0: "rebeccapurple", 1: "gold"}


def _cluster_rows(features: pl.DataFrame):
    """One-hot encode ``features``, reduce with PCA and cluster with K-Means.

    Returns a ``(df_clust, pca)`` pair: ``df_clust`` is a polars DataFrame with
    the columns ``Component 1`` .. ``Component N`` and ``cluster_kmeans``, in
    the same row order as ``features``; ``pca`` is the fitted PCA model.
    """
    encoder = OneHotEncoder(sparse_output=False)
    # The dense encoded matrix is fed straight to PCA; no DataFrame round trip
    # is needed because only the numeric values matter here.
    encoded = encoder.fit_transform(features.to_pandas())

    pca = PCA(n_components=N_COMPONENTS)
    components = pca.fit_transform(encoded)

    kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=RANDOM_STATE)
    labels = kmeans.fit_predict(components)

    component_names = [f"Component {i + 1}" for i in range(N_COMPONENTS)]
    df_clust = pl.from_pandas(pd.DataFrame(components, columns=component_names))
    df_clust = df_clust.with_columns(pl.Series(values=labels, name="cluster_kmeans"))
    return df_clust, pca


def _sample_per_cluster(df_clust: pl.DataFrame, max_points: int) -> pl.DataFrame:
    """Return ``df_clust`` limited to roughly ``max_points`` rows.

    When the frame is larger than ``max_points`` the same fraction of rows is
    drawn from every cluster, so cluster proportions are preserved. Smaller
    frames are returned unchanged.
    """
    n_rows = df_clust.shape[0]
    if n_rows <= max_points:
        return df_clust

    fraction = max_points / n_rows
    # DataFrameGroupBy.sample keeps every column, including the grouping one,
    # unlike groupby(...).apply(...) whose handling of the grouping column is
    # deprecated since pandas 2.2.
    sampled = (
        df_clust.to_pandas()
        .groupby("cluster_kmeans")
        .sample(frac=fraction, random_state=RANDOM_STATE)
    )
    return pl.from_pandas(sampled)


def _scatter_figure(points: pl.DataFrame, pca: PCA) -> go.Figure:
    """Build the 2D cluster scatter plot.

    ``points`` must hold the columns ``Component 1``, ``Component 2``,
    ``cluster_kmeans`` and ``ipsrc``. All plotted arrays come from this single
    frame so that every hover IP belongs to the point it is shown on.
    """
    cp1_var = round(pca.explained_variance_ratio_[0], 3)
    cp2_var = round(pca.explained_variance_ratio_[1], 3)

    fig = px.scatter(
        x=points["Component 1"].to_numpy(),
        y=points["Component 2"].to_numpy(),
        # Labels are cast to str so plotly treats them as discrete categories.
        color=points["cluster_kmeans"].to_numpy().astype(str),
        color_discrete_map={str(label): color for label, color in CLUSTER_COLORS.items()},
        title=f'Clustering coupled with PCA ({pca.explained_variance_ratio_.sum():.3f})',
        labels={'x': 'Component 1', 'y': 'Component 2', 'color': 'Cluster'},
        hover_data={"ip": points["ipsrc"].to_numpy()},
    )
    fig.update_layout(
        xaxis_title=f'Component 1 ({cp1_var})',
        yaxis_title=f'Component 2 ({cp2_var})'
    )
    return fig


def _frequency_figure(data_clust: pl.DataFrame, column: str) -> go.Figure:
    """Build a figure with one bar chart of ``column`` frequencies per cluster."""
    fig = make_subplots(rows=1, cols=len(CLUSTER_COLORS))
    for position, (label, color) in enumerate(CLUSTER_COLORS.items(), start=1):
        freq_df = (
            data_clust.filter(pl.col("cluster_kmeans") == label)
            .group_by(column)
            .agg(pl.col(column).count().alias("frequency"))
        )
        fig.add_trace(
            go.Bar(
                x=freq_df[column],
                y=freq_df['frequency'],
                name=f'Cluster {label}',
                marker=dict(color=color),
            ),
            row=1, col=position
        )

    fig.update_layout(title=f'{column} frequencies by cluster', showlegend=True)
    # update_xaxes / update_yaxes label every subplot, not only the first one.
    fig.update_xaxes(title_text='Category')
    fig.update_yaxes(title_text='Frequency')
    return fig


if "parsed_df" not in st.session_state:
    st.session_state.parsed_df = None

# Page title
st.title("Analytics")

# Loading data: without a parsed log there is nothing to analyse, so the
# script stops here and everything below can rely on parsed_df being set.
if st.session_state.parsed_df is None:
    st.info("Please upload a log file on the 'Upload' page.")
    st.stop()

parsed_df = st.session_state.parsed_df
data = parsed_df.select(FEATURE_COLUMNS)

if st.button("Start clustering"):
    # Stays None when clustering fails, so the analysis step can be skipped
    # instead of failing on an undefined name.
    df_clust = None

    with st.spinner("Searching the clusters..."):
        try:
            df_clust, pca = _cluster_rows(data)

            # Attach the source IPs BEFORE sampling: rows are dropped and
            # reordered by the sampling step, so the IPs must travel with them.
            points = df_clust.with_columns(parsed_df["ipsrc"])
            points = _sample_per_cluster(points, MAX_SCATTER_POINTS)

            fig = _scatter_figure(points, pca)
            st.plotly_chart(fig, use_container_width=True)
        except Exception as e:
            st.error(f"An error occured while doing the clustering : {e}")

    if df_clust is not None:
        with st.spinner("Performing some more data analysis..."):
            try:
                data_clust = data.with_columns(df_clust["cluster_kmeans"])

                # Frequency of each category per cluster, one figure per column
                # (portdst, protocole, rule, action).
                for col in data.columns:
                    fig = _frequency_figure(data_clust, col)
                    st.plotly_chart(fig, use_container_width=True)
            except Exception as e:
                st.error(f"An error occured while doing the data analysis : {e}")