Cyr-CK committed on
Commit
90f858e
·
1 Parent(s): e2408de

Updated requirements & ML section

Browse files
Files changed (2) hide show
  1. requirements.txt +2 -1
  2. sections/ml.py +76 -5
requirements.txt CHANGED
@@ -1,4 +1,5 @@
1
  pandas
2
  streamlit
3
  plotly
4
- polars
 
 
1
  pandas
2
  streamlit
3
  plotly
4
+ polars
5
+ scikit-learn
sections/ml.py CHANGED
@@ -1,6 +1,11 @@
1
  import pandas as pd
2
  import plotly.express as px
3
  import streamlit as st
 
 
 
 
 
4
 
5
  if "parsed_df" not in st.session_state:
6
  st.session_state.parsed_df = None
@@ -15,16 +20,82 @@ if st.session_state.parsed_df is None:
15
 
16
  data = st.session_state.parsed_df
17
 
18
- # Sidebar for controls
19
- st.dataframe(data)
20
-
21
-
22
  ##############################################
23
  #### Preprocessing ####
24
  ##############################################
25
 
 
 
 
 
 
 
26
 
27
 
28
  ###############################################
29
  #### Clustering ####
30
- ###############################################
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import pandas as pd
import plotly.express as px
import streamlit as st
import polars as pl

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
import matplotlib.pyplot as plt

# Ensure the session entry holding the parsed log dataframe exists
# before the rest of the page reads it.
st.session_state.setdefault("parsed_df", None)
 
data = st.session_state.parsed_df

##############################################
####             Preprocessing            ####
##############################################

# Standardise the data (mean = 0, standard deviation = 1).
# NOTE(review): assumes every column of `data` is numeric — StandardScaler
# raises on non-numeric columns; confirm against the parser's output schema.
scaler = StandardScaler()
df_scaled = scaler.fit_transform(data.to_pandas())

# Back to a Polars DataFrame, keeping the original column names.
df_scaled = pl.from_pandas(pd.DataFrame(df_scaled, columns=data.columns))

###############################################
####              Clustering              ####
###############################################

if st.button("Start clustering"):
    if st.session_state.parsed_df is not None:
        with st.spinner("Searching the clusters..."):
            try:
                # K-Means with a fixed number of clusters.
                # TODO: choose k with the elbow method (plot KMeans.inertia_
                # for k in 1..10) instead of hard-coding it.
                k_optimal = 2
                kmeans = KMeans(n_clusters=k_optimal, random_state=42)
                df_scaled = df_scaled.with_columns(
                    pl.Series(kmeans.fit_predict(df_scaled.to_pandas()),
                              name='cluster_kmeans')
                )

                # Alternative algorithms kept for experimentation:
                #   DBSCAN(eps=0.5, min_samples=10)
                #   AgglomerativeClustering(n_clusters=2)

                ###############################################################
                ####           Cluster visualisation (2-D PCA)             ####
                ###############################################################

                from sklearn.decomposition import PCA

                # FIX: project only the feature columns — the original fed the
                # freshly added 'cluster_kmeans' label column into PCA, which
                # distorts the projection with the clustering's own output.
                features = df_scaled.drop('cluster_kmeans')
                pca = PCA(n_components=2)
                df_pca = pca.fit_transform(features.to_pandas())

                fig = px.scatter(
                    x=df_pca[:, 0],
                    y=df_pca[:, 1],
                    color=df_scaled['cluster_kmeans'],
                    color_continuous_scale='viridis',
                    title='Clustering coupled with PCA',
                    labels={'x': 'Component 1', 'y': 'Component 2',
                            'color': 'Cluster'},
                )
                fig.update_layout(
                    xaxis_title='Component 1',
                    yaxis_title='Component 2'
                )

                # FIX: fig.show() opens a separate browser tab and renders
                # nothing inside the Streamlit page; st.plotly_chart embeds it.
                st.plotly_chart(fig)
            except Exception as e:
                st.error(f"An error occurred : {e}")
    else:
        st.warning("Please parse the log file first.")