Cyr-CK committed on
Commit
b78ea6e
·
1 Parent(s): 78f91c5

Clustering method working and post analysis

Browse files
Files changed (1) hide show
  1. sections/analytics.py +91 -6
sections/analytics.py CHANGED
@@ -1,5 +1,7 @@
1
  import pandas as pd
2
  import plotly.express as px
 
 
3
  import streamlit as st
4
  import polars as pl
5
 
@@ -11,7 +13,7 @@ if "parsed_df" not in st.session_state:
11
  st.session_state.parsed_df = None
12
 
13
  # Page title
14
- st.title("Analytiques")
15
 
16
  # Loading data
17
  if st.session_state.parsed_df is None:
@@ -67,9 +69,12 @@ if st.button("Start clustering"):
67
  k_optimal = 2 # Par exemple, supposons que k = 3
68
  kmeans = KMeans(n_clusters=k_optimal, random_state=42)
69
  preds = kmeans.fit_predict(df.to_pandas())
70
- df = df.with_columns(pl.Series(values=preds, name='cluster_kmeans'))
71
-
72
 
 
 
 
 
73
  # dbscan = DBSCAN(eps=0.5, min_samples=10)
74
  # preds = dbscan.fit_predict(df.to_pandas())
75
  # df = df.with_columns(pl.Series(values=preds, name='cluster_dbscan'))
@@ -87,12 +92,12 @@ if st.button("Start clustering"):
87
  from sklearn.decomposition import PCA
88
 
89
  pca = PCA(n_components=2)
90
- df_pca = pca.fit_transform(df.to_pandas())
91
 
92
  fig = px.scatter(
93
  x=df_pca[:, 0],
94
  y=df_pca[:, 1],
95
- color=df['cluster_kmeans'],
96
  color_continuous_scale='viridis',
97
  title='Clustering coupled with PCA',
98
  labels={'x': 'Component 1', 'y': 'Component 2', 'color': 'Cluster'},
@@ -107,7 +112,87 @@ if st.button("Start clustering"):
107
  st.plotly_chart(fig, use_container_width=True)
108
 
109
  except Exception as e:
110
- st.error(f"An error occured : {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
  else:
112
  st.warning("Please parse the log file first.")
113
 
 
1
  import pandas as pd
2
  import plotly.express as px
3
+ from plotly.subplots import make_subplots
4
+ import plotly.graph_objs as go
5
  import streamlit as st
6
  import polars as pl
7
 
 
13
  st.session_state.parsed_df = None
14
 
15
  # Page title
16
+ st.title("Analytics")
17
 
18
  # Loading data
19
  if st.session_state.parsed_df is None:
 
69
  k_optimal = 2 # Par exemple, supposons que k = 3
70
  kmeans = KMeans(n_clusters=k_optimal, random_state=42)
71
  preds = kmeans.fit_predict(df.to_pandas())
72
+ df_clust = df.with_columns(pl.Series(values=preds, name='cluster_kmeans'))
 
73
 
74
+ df_ech = pl.from_pandas(df_clust.to_pandas()
75
+ .groupby("cluster_kmeans", group_keys=False)
76
+ .apply(lambda x: x.sample(frac=0.05, random_state=42))
77
+ )
78
  # dbscan = DBSCAN(eps=0.5, min_samples=10)
79
  # preds = dbscan.fit_predict(df.to_pandas())
80
  # df = df.with_columns(pl.Series(values=preds, name='cluster_dbscan'))
 
92
  from sklearn.decomposition import PCA
93
 
94
  pca = PCA(n_components=2)
95
+ df_pca = pca.fit_transform(df_ech.to_pandas())
96
 
97
  fig = px.scatter(
98
  x=df_pca[:, 0],
99
  y=df_pca[:, 1],
100
+ color=df_ech['cluster_kmeans'],
101
  color_continuous_scale='viridis',
102
  title='Clustering coupled with PCA',
103
  labels={'x': 'Component 1', 'y': 'Component 2', 'color': 'Cluster'},
 
112
  st.plotly_chart(fig, use_container_width=True)
113
 
114
  except Exception as e:
115
+ st.error(f"An error occured while doing the clustering : {e}")
116
+
117
+ with st.spinner("Performing some more data analysis..."):
118
+ try:
119
+ data = data.with_columns(pl.Series(name="cluster_kmeans", values=df_clust.select("cluster_kmeans")))
120
+ cols = ["protocole","regle1","status"]
121
+ for col in cols:
122
+ # fig = px.bar(freq_df, x=col, y='frequency',
123
+ # title=f'{col} frequency',
124
+ # labels={'categorie': 'Category', 'frequence': 'Frequency'},
125
+ # color=col)
126
+ # fig.update_layout(xaxis_title='Categories', yaxis_title='Frequency')
127
+ # st.plotly_chart(fig, use_container_width=True)
128
+
129
+ # data_filtered = data.filter(pl.col("cluster_kmeans") == 0)
130
+ # freq_df = data_filtered.group_by(col).agg(pl.count(col).alias("frequency"))
131
+
132
+ # fig = px.bar(freq_df, x=col, y='frequency',
133
+ # title=f'{col} frequency',
134
+ # labels={'categorie': 'Category', 'frequence': 'Frequency'},
135
+ # color=col)
136
+ # fig.update_layout(xaxis_title='Categories', yaxis_title='Frequency')
137
+ # st.plotly_chart(fig, use_container_width=True)
138
+
139
+
140
+
141
+ fig = make_subplots(rows=1, cols=2)
142
+
143
+ data_filtered = data.filter(pl.col("cluster_kmeans") == 0)
144
+ freq_df = data_filtered.group_by(col).agg(pl.count(col).alias("frequency"))
145
+
146
+ fig.add_trace(
147
+ go.Bar(x=freq_df[col], y=freq_df['frequency'], name='Cluster 0',
148
+ marker=dict(color='rebeccapurple')),
149
+ row=1, col=1
150
+ )
151
+
152
+ data_filtered = data.filter(pl.col("cluster_kmeans") == 1)
153
+ freq_df = data_filtered.group_by(col).agg(pl.count(col).alias("frequency"))
154
+
155
+ fig.add_trace(
156
+ go.Bar(x=freq_df[col], y=freq_df['frequency'], name='Cluster 1',
157
+ marker=dict(color='gold')),
158
+ row=1, col=2
159
+ )
160
+
161
+ fig.update_layout(
162
+ title=f'{col} frequencies by cluster',
163
+ xaxis_title='Category',
164
+ yaxis_title='Frequency',
165
+ showlegend=True
166
+ )
167
+ st.plotly_chart(fig, use_container_width=True)
168
+
169
+ fig = make_subplots(rows=1, cols=2)
170
+
171
+ data_filtered = data.filter(pl.col("cluster_kmeans") == 0)
172
+
173
+ # Ajouter le premier histogramme
174
+ fig.add_trace(
175
+ go.Histogram(x=data_filtered["portdest"], name="Cluster 0", marker_color="rebeccapurple"),
176
+ row=1, col=1
177
+ )
178
+
179
+ data_filtered = data.filter(pl.col("cluster_kmeans") == 1)
180
+
181
+ # Ajouter le deuxième histogramme
182
+ fig.add_trace(
183
+ go.Histogram(x=data_filtered["portdest"], name="Cluster 1", marker_color="gold"),
184
+ row=1, col=2
185
+ )
186
+
187
+ # Mettre à jour la mise en page pour améliorer l'apparence
188
+ fig.update_layout(
189
+ title_text="Histograms of destination ports",
190
+ showlegend=True,
191
+ )
192
+ st.plotly_chart(fig, use_container_width=True)
193
+
194
+ except Exception as e:
195
+ st.error(f"An error occured while doing the data analysis : {e}")
196
  else:
197
  st.warning("Please parse the log file first.")
198