Cyr-CK committed on
Commit
5599cc9
·
1 Parent(s): e76e281

Update analytics.py

Browse files
Files changed (1) hide show
  1. sections/analytics.py +16 -39
sections/analytics.py CHANGED
@@ -51,14 +51,18 @@ if st.button("Start clustering"):
51
  with st.spinner("Searching the clusters..."):
52
  try:
53
 
54
- pca = PCA(n_components=2)
55
- df_pca = pca.fit_transform(data_encoded.to_pandas())
 
 
 
 
56
 
57
  # Appliquer K-Means avec k optimal choisi
58
  k_optimal = 2 # Par exemple, supposons que k = 3
59
  kmeans = KMeans(n_clusters=k_optimal, random_state=42)
60
  preds = kmeans.fit_predict(df_pca)
61
- df_pca = pl.from_pandas(pd.DataFrame(df_pca, columns=[f"Component {i+1}" for i in range(k_optimal)]))
62
  df_clust = df_pca.with_columns(pl.Series(values=preds, name='cluster_kmeans'))
63
 
64
  if df_clust.shape[0] > 200000: # 200k
@@ -75,29 +79,29 @@ if st.button("Start clustering"):
75
  ###############################################################
76
 
77
 
78
- # Visualisation des clusters (en 2D avec PCA)
79
-
80
- # pca = PCA(n_components=2)
81
- # df_pca = pca.fit_transform(df_ech.to_pandas())
82
-
83
  fig = px.scatter(
84
  x=df_ech.select("Component 1").to_numpy().flatten(),
85
  y=df_ech.select("Component 2").to_numpy().flatten(),
86
  color=df_ech.select('cluster_kmeans').to_numpy().flatten().astype(str),
87
  color_discrete_map={"0": "rebeccapurple", "1": "gold"},
88
- title='Clustering coupled with PCA',
89
  labels={'x': 'Component 1', 'y': 'Component 2', 'color': 'Cluster'},
 
 
 
90
  )
91
 
92
  fig.update_layout(
93
- xaxis_title='Component 1',
94
- yaxis_title='Component 2'
95
  )
96
  # fig.show()
97
  st.plotly_chart(fig, use_container_width=True)
98
 
99
  except Exception as e:
100
- st.error(f"An error occured while doing the clustering : {e.with_traceback(None)}")
101
 
102
  with st.spinner("Performing some more data analysis..."):
103
  try:
@@ -132,33 +136,6 @@ if st.button("Start clustering"):
132
  )
133
  st.plotly_chart(fig, use_container_width=True)
134
 
135
- # # Analyse de la variable quantitative par cluster
136
- # for col in quanti.columns: # protocole, rule, action
137
- # fig = make_subplots(rows=1, cols=2)
138
-
139
- # data_filtered = data.filter(pl.col("cluster_kmeans") == 0)
140
-
141
- # # Ajouter le premier histogramme
142
- # fig.add_trace(
143
- # go.Histogram(x=data_filtered[col], name="Cluster 0", marker_color="rebeccapurple"),
144
- # row=1, col=1
145
- # )
146
-
147
- # data_filtered = data.filter(pl.col("cluster_kmeans") == 1)
148
-
149
- # # Ajouter le deuxième histogramme
150
- # fig.add_trace(
151
- # go.Histogram(x=data_filtered[col], name="Cluster 1", marker_color="gold"),
152
- # row=1, col=2
153
- # )
154
-
155
- # # Mettre à jour la mise en page pour améliorer l'apparence
156
- # fig.update_layout(
157
- # title_text=f"Histograms of {col}",
158
- # showlegend=True,
159
- # )
160
- # st.plotly_chart(fig, use_container_width=True)
161
-
162
  except Exception as e:
163
  st.error(f"An error occured while doing the data analysis : {e}")
164
  else:
 
51
  with st.spinner("Searching the clusters..."):
52
  try:
53
 
54
+ ncp = 2
55
+ pca = PCA(n_components=ncp)
56
+ df_pca = pca.fit_transform(data_encoded.to_pandas())
57
+
58
+ cp1_var = round(pca.explained_variance_ratio_[0],3)
59
+ cp2_var = round(pca.explained_variance_ratio_[1],3)
60
 
61
  # Appliquer K-Means avec k optimal choisi
62
  k_optimal = 2 # Par exemple, supposons que k = 3
63
  kmeans = KMeans(n_clusters=k_optimal, random_state=42)
64
  preds = kmeans.fit_predict(df_pca)
65
+ df_pca = pl.from_pandas(pd.DataFrame(df_pca, columns=[f"Component {i+1}" for i in range(ncp)]))
66
  df_clust = df_pca.with_columns(pl.Series(values=preds, name='cluster_kmeans'))
67
 
68
  if df_clust.shape[0] > 200000: # 200k
 
79
  ###############################################################
80
 
81
 
82
+ # Visualisation des clusters (en 2D avec PCA)
83
+ st.write(st.session_state.parsed_df.select("ipsrc").to_numpy().flatten())
 
 
 
84
  fig = px.scatter(
85
  x=df_ech.select("Component 1").to_numpy().flatten(),
86
  y=df_ech.select("Component 2").to_numpy().flatten(),
87
  color=df_ech.select('cluster_kmeans').to_numpy().flatten().astype(str),
88
  color_discrete_map={"0": "rebeccapurple", "1": "gold"},
89
+ title=f'Clustering coupled with PCA ({pca.explained_variance_ratio_.sum():.3f})',
90
  labels={'x': 'Component 1', 'y': 'Component 2', 'color': 'Cluster'},
91
+ hover_data={
92
+ "ip": st.session_state.parsed_df.select("ipsrc").to_numpy().flatten()
93
+ }
94
  )
95
 
96
  fig.update_layout(
97
+ xaxis_title=f'Component 1 ({cp1_var})',
98
+ yaxis_title=f'Component 2 ({cp2_var})'
99
  )
100
  # fig.show()
101
  st.plotly_chart(fig, use_container_width=True)
102
 
103
  except Exception as e:
104
+ st.error(f"An error occured while doing the clustering : {e}")
105
 
106
  with st.spinner("Performing some more data analysis..."):
107
  try:
 
136
  )
137
  st.plotly_chart(fig, use_container_width=True)
138
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
139
  except Exception as e:
140
  st.error(f"An error occured while doing the data analysis : {e}")
141
  else: