kawaiipeace committed
Commit 53e2114 · 1 Parent(s): bb4e66c

update model

.gitattributes copy DELETED
@@ -1,35 +0,0 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -1,5 +1,5 @@
  .env
- models/__pycache__
+ clustering/__pycache__
  utils/__pycache__
  dataset/*
  figure/*
app.py CHANGED
@@ -1,39 +1,226 @@
  import gradio as gr
- from clustering.pure import run_kmeans, run_dbscan
- from clustering.pretrained import run_bert_clustering
- from utils.preprocess import load_csv, get_numeric, get_text_column
  import pandas as pd
+ import numpy as np
+ from clustering.pure import run_kmeans, run_dbscan, run_fuzzy_cmeans, run_som
+ from clustering.pretrained import run_bert_clustering
+ from utils.preprocess import (
+     load_csv,
+     get_numeric,
+     get_text_column,
+     normalize_data,
+     denormalize_data,
+ )
+ from utils.visualize import plot_embedding, plot_som
+ from sklearn.metrics import silhouette_score
+ import tempfile
+ import matplotlib.pyplot as plt
+ import os
+
+ trained_model = {
+     "labels": None,
+     "X": None,
+     "model_type": None,
+     "algorithm": None,
+     "centroid": None,
+     "pretrained_model": None,
+     "scaler": None,
+     "model": None,
+     "normalizer": None,
+ }
+

- def cluster(file, model_type, algorithm, n_clusters, eps, min_samples, pretrained_model):
+ def cluster(file, model_type, algorithm, n_clusters, eps, min_samples, pretrained_model,
+             missing_strategy, dim_reduce_method, n_init, max_iter, normalize_method):
      df = load_csv(file.name)
-
+
+     download_path = None
+     plot_path = None
+     embedding_path = None
+     som_plot_path = None
+
      if model_type == "Pure":
-         X = get_numeric(df)
+         X = get_numeric(df, strategy=missing_strategy)
+         trained_model["model_type"] = "Pure"
+         trained_model["algorithm"] = algorithm
+         trained_model["X"] = X
+
+         # Normalize before clustering
+         X_norm, norm_scaler = normalize_data(X, normalize_method)
+         trained_model["normalizer"] = norm_scaler
+
          if algorithm == "KMeans":
-             labels = run_kmeans(X, n_clusters)
+             model, labels, scaler, X_scaled = run_kmeans(
+                 X_norm,
+                 n_clusters,
+                 init="k-means++",
+                 n_init=n_init,
+                 max_iter=max_iter,
+                 random_state=42,
+                 algorithm="lloyd",
+             )
+             trained_model["centroid"] = model.cluster_centers_
+
+         elif algorithm == "DBSCAN":
+             model, labels, scaler, X_scaled = run_dbscan(X_norm, eps, min_samples)
+             trained_model["centroid"] = None
+
+         elif algorithm == "Fuzzy C-Means":
+             model, labels, scaler, X_scaled = run_fuzzy_cmeans(X_norm, n_clusters)
+             trained_model["centroid"] = None
+
+         elif algorithm == "SOM":
+             model, labels, scaler, X_scaled = run_som(X_norm)
+             trained_model["centroid"] = None
+
+         else:
+             raise ValueError(f"Unknown algorithm: {algorithm}")
+
+         if algorithm == "DBSCAN":
+             labels = np.where(labels == -1, 0, labels + 1)
          else:
-             labels = run_dbscan(X, eps, min_samples)
+             labels = labels + 1
+
+         trained_model["model"] = model
+         trained_model["scaler"] = scaler
+         trained_model["labels"] = labels
+
+         df_export = df.copy()
+         df_export["cluster"] = labels
+         filename = f"cluster_{algorithm.replace(' ', '_').upper()}.csv"
+         download_path = os.path.join(tempfile.gettempdir(), filename)
+         df_export.to_csv(download_path, index=False)
+
+         if X_scaled.shape[1] >= 2:
+             X_plot = denormalize_data(pd.DataFrame(X_scaled, columns=X.columns), norm_scaler)
+             plt.figure()
+             plt.scatter(X_plot.iloc[:, 0], X_plot.iloc[:, 1], c=labels, cmap="tab10", s=30)
+             if algorithm == "KMeans" and trained_model["centroid"] is not None:
+                 centroids = trained_model["centroid"]
+                 centroids_denorm = denormalize_data(
+                     pd.DataFrame(centroids, columns=X.columns), norm_scaler
+                 )
+                 plt.scatter(
+                     centroids_denorm.iloc[:, 0],
+                     centroids_denorm.iloc[:, 1],
+                     c="red",
+                     marker="X",
+                     s=100,
+                     label="Centroids",
+                 )
+                 plt.legend()
+             plt.title(f"Cluster Scatter Plot (first 2 features) - {algorithm}")
+             plt.tight_layout()
+             with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as img_tmp:
+                 plt.savefig(img_tmp.name)
+                 plot_path = img_tmp.name
+             plt.close()
+
+         if dim_reduce_method != "None" and algorithm != "SOM":
+             try:
+                 embedding_path = plot_embedding(X_scaled, labels, method=dim_reduce_method)
+             except Exception as e:
+                 print("UMAP/TSNE Failed:", e)
+
+         if algorithm == "SOM":
+             try:
+                 som_plot_path = plot_som(model, X_scaled, labels)
+             except Exception as e:
+                 print("SOM visualization failed:", e)
+
+         score = silhouette_score(X_scaled, labels) if len(set(labels)) > 1 else -1
+         df_out = pd.DataFrame({"Set": ["All Data"], "Silhouette Score": [score]})
+
+         return (
+             df_out,
+             download_path,
+             plot_path,
+             som_plot_path if algorithm == "SOM" else embedding_path,
+         )
+
      else:
+         trained_model["model_type"] = "Pretrained"
          texts = get_text_column(df)
+
+         if not texts or len(texts) == 0:
+             raise ValueError("No text data found for pretrained clustering. Check your CSV and preprocessing.")
+
          labels = run_bert_clustering(texts, n_clusters, pretrained_model)
-
-     df["Cluster"] = labels
-     return df
-
- iface = gr.Interface(
-     fn=cluster,
-     inputs=[
-         gr.File(label="Upload CSV"),
-         gr.Radio(["Pure", "Pretrained"], label="Model Type"),
-         gr.Radio(["KMeans", "DBSCAN"], label="Algorithm"),
-         gr.Slider(2, 20, step=1, label="n_clusters"),
-         gr.Slider(0.1, 5.0, step=0.1, label="DBSCAN eps"),
-         gr.Slider(1, 20, step=1, label="DBSCAN min_samples"),
-         gr.Textbox(value="all-MiniLM-L6-v2", label="Pretrained Model Name")
-     ],
-     outputs=gr.Dataframe(label="Resulting Clusters"),
-     title="Unsupervised Clustering App"
- )
+
+         trained_model["labels"] = labels
+         trained_model["X"] = texts
+         trained_model["pretrained_model"] = pretrained_model
+
+         df_export = df.copy()
+         df_export["cluster"] = labels
+         filename = f"cluster_PRETRAINED.csv"
+         download_path = os.path.join(tempfile.gettempdir(), filename)
+         df_export.to_csv(download_path, index=False)
+
+         df_out = pd.DataFrame({"Set": ["All Data"], "Clusters": [len(set(labels))]})
+         return df_out, download_path, None, None
+
+
+ with gr.Blocks() as iface:
+     gr.Markdown("## 🧠 Unsupervised Clustering App")
+
+     file = gr.File(label="Upload CSV")
+     model_type = gr.Radio(["Pure", "Pretrained"], label="Model Type", value="Pure")
+     algorithm = gr.Radio(
+         ["KMeans", "DBSCAN", "Fuzzy C-Means", "SOM"],
+         label="Algorithm",
+         value="KMeans",
+     )
+
+     n_clusters = gr.Slider(2, 20, step=1, label="Number of Clusters", visible=True)
+     n_init = gr.Slider(1, 50, step=1, label="Number of Initial Samples", value=30, visible=True)
+     max_iter = gr.Slider(100, 5000, step=100, label="Max Iteration", value=2000, visible=True)
+     eps = gr.Slider(0.1, 5.0, step=0.1, label="Epsilon", visible=False)
+     min_samples = gr.Slider(1, 20, step=1, label="Minimum Samples", visible=False)
+     pretrained_model = gr.Textbox(value="all-MiniLM-L6-v2", label="Pretrained Model Name", visible=False)
+     missing_strategy = gr.Dropdown(["Fill with Mean", "Fill with Zero", "Drop Rows"],
+                                    label="Missing Value Strategy", value="Drop Rows")
+     normalize_method = gr.Radio(["none", "mapminmax", "z-score"],
+                                 label="Normalization Method", value="none")
+     dim_reduce_method = gr.Radio(["None", "UMAP", "TSNE"], label="Dimensionality Reduction", value="None", visible=False)
+
+     def update_fields(model_type_val, algorithm_val):
+         if model_type_val == "Pure":
+             return (
+                 gr.update(visible=(algorithm_val in ["KMeans", "Fuzzy C-Means"])),  # n_clusters
+                 gr.update(visible=(algorithm_val == "KMeans")),  # n_init
+                 gr.update(visible=(algorithm_val == "KMeans")),  # max_iter
+                 gr.update(visible=(algorithm_val == "DBSCAN")),  # eps
+                 gr.update(visible=(algorithm_val == "DBSCAN")),  # min_samples
+                 gr.update(visible=False),  # pretrained_model
+             )
+         else:
+             return (
+                 gr.update(visible=True),  # n_clusters
+                 gr.update(visible=False),
+                 gr.update(visible=False),
+                 gr.update(visible=False),
+                 gr.update(visible=False),
+                 gr.update(visible=True),
+             )
+
+     model_type.change(fn=update_fields, inputs=[model_type, algorithm],
+                       outputs=[n_clusters, n_init, max_iter, eps, min_samples, pretrained_model])
+     algorithm.change(fn=update_fields, inputs=[model_type, algorithm],
+                      outputs=[n_clusters, n_init, max_iter, eps, min_samples, pretrained_model])
+
+     btn = gr.Button("Run Clustering")
+     output = gr.Dataframe(label="Resulting Clusters")
+     download_csv = gr.File(label="Download Clustered CSV")
+     cluster_plot = gr.Image(label="2D Cluster Plot")
+     dim_plot = gr.Image(label="SOM Visualization")
+
+     btn.click(fn=cluster,
+               inputs=[
+                   file, model_type, algorithm, n_clusters, eps, min_samples,
+                   pretrained_model, missing_strategy, dim_reduce_method,
+                   n_init, max_iter, normalize_method
+               ],
+               outputs=[output, download_csv, cluster_plot, dim_plot])

  if __name__ == "__main__":
      iface.launch()
clustering/pretrained.py CHANGED
@@ -4,6 +4,6 @@ from sentence_transformers import SentenceTransformer
  def run_bert_clustering(texts, n_clusters, model_name="all-MiniLM-L6-v2"):
      model = SentenceTransformer(model_name)
      embeddings = model.encode(texts, show_progress_bar=False)
-     km = KMeans(n_clusters=n_clusters, random_state=42)
+     km = KMeans(n_clusters=n_clusters, random_state=69)
      labels = km.fit_predict(embeddings)
      return labels
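
For orientation, a minimal usage sketch of this helper (not part of the diff; the example sentences are hypothetical, and the sentence-transformers model is downloaded on first use):

from clustering.pretrained import run_bert_clustering

texts = ["great battery life", "terrible support", "battery lasts long"]  # hypothetical inputs
labels = run_bert_clustering(texts, n_clusters=2)  # defaults to "all-MiniLM-L6-v2"
print(labels)  # one KMeans cluster id per input sentence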
clustering/pure.py CHANGED
@@ -1,16 +1,84 @@
  from sklearn.cluster import KMeans, DBSCAN
  from sklearn.preprocessing import StandardScaler
+ import pandas as pd
+ from minisom import MiniSom
+ import numpy as np

- def run_kmeans(data, n_clusters):
+ import numpy as np
+ import skfuzzy as fuzz
+ from sklearn.preprocessing import StandardScaler
+
+
+ def run_kmeans(
+     data: pd.DataFrame,
+     n_clusters: int,
+     init="k-means++",
+     n_init=30,
+     max_iter=2000,
+     random_state=42,
+     algorithm="lloyd",
+ ):
      scaler = StandardScaler()
      X_scaled = scaler.fit_transform(data)
-     model = KMeans(n_clusters=n_clusters, random_state=42)
+
+     model = KMeans(
+         n_clusters=n_clusters,
+         init=init,
+         n_init=n_init,
+         max_iter=max_iter,
+         random_state=random_state,
+         algorithm=algorithm,
+     )
      labels = model.fit_predict(X_scaled)
-     return labels

- def run_dbscan(data, eps=0.5, min_samples=5):
+     return model, labels, scaler, X_scaled
+
+
+ def run_dbscan(data: pd.DataFrame, eps: float = 0.5, min_samples: int = 5):
      scaler = StandardScaler()
      X_scaled = scaler.fit_transform(data)
+
      model = DBSCAN(eps=eps, min_samples=min_samples)
      labels = model.fit_predict(X_scaled)
-     return labels
+
+     return model, labels, scaler, X_scaled
+
+
+ def run_fuzzy_cmeans(data, n_clusters):
+     scaler = StandardScaler()
+     X_scaled = scaler.fit_transform(data.T).T
+     X_T = X_scaled.T
+     cntr, u, u0, d, jm, p, fpc = fuzz.cluster.cmeans(
+         X_T, c=n_clusters, m=2, error=0.005, maxiter=3000, init=None, seed=42
+     )
+     labels = np.argmax(u, axis=0)
+
+     # labels are 0-based
+     return cntr, labels, scaler, X_scaled
+
+
+ def run_som(
+     data: pd.DataFrame,
+     x: int = 10,
+     y: int = 10,
+     sigma: float = 1.0,
+     learning_rate: float = 0.5,
+     num_iteration: int = 1000,
+ ):
+     scaler = StandardScaler()
+     X_scaled = scaler.fit_transform(data)
+
+     som = MiniSom(
+         x,
+         y,
+         X_scaled.shape[1],
+         sigma=sigma,
+         learning_rate=learning_rate,
+         random_seed=42,
+     )
+     som.random_weights_init(X_scaled)
+     som.train_random(X_scaled, num_iteration)
+     win_map = np.array([som.winner(xi) for xi in X_scaled])
+     labels = np.array([w[0] * y + w[1] for w in win_map])
+
+     return som, labels, scaler, X_scaled
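
For orientation, a minimal sketch of the new four-value return signature (not part of the diff; the toy DataFrame is hypothetical and mirrors how app.py unpacks these helpers):

import pandas as pd
from clustering.pure import run_kmeans

df = pd.DataFrame({"x": [1.0, 1.2, 8.0, 8.3], "y": [0.9, 1.1, 7.8, 8.1]})  # hypothetical data

# Each run_* helper now returns the fitted estimator plus the scaler and scaled matrix,
# so callers can reuse centroids and inverse-transform coordinates for plotting.
model, labels, scaler, X_scaled = run_kmeans(df, n_clusters=2)
print(labels)                  # e.g. [0 0 1 1]
print(model.cluster_centers_)  # centroids in the standardized feature space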
requirements.txt CHANGED
@@ -2,3 +2,7 @@ gradio
  pandas
  scikit-learn
  sentence-transformers
+ umap-learn
+ matplotlib
+ minisom
+ scikit-fuzzy
utils/preprocess.py CHANGED
@@ -1,13 +1,40 @@
  import pandas as pd
+ import numpy as np
+ from sklearn.preprocessing import StandardScaler, MinMaxScaler

- def load_csv(file):
-     return pd.read_csv(file)
+ def load_csv(file_path_or_obj):
+     return pd.read_csv(file_path_or_obj)

- def get_numeric(df):
-     return df.select_dtypes(include=['number'])
+ def get_numeric(df: pd.DataFrame, strategy: str = "Fill with Mean") -> pd.DataFrame:
+     numeric_df = df.select_dtypes(include=['number'])

- def get_text_column(df):
-     for col in df.columns:
-         if df[col].dtype == 'object':
-             return df[col].dropna().astype(str).tolist()
+     if strategy == "Fill with Mean":
+         return numeric_df.fillna(numeric_df.mean(numeric_only=True))
+     elif strategy == "Fill with Zero":
+         return numeric_df.fillna(0)
+     elif strategy == "Drop Rows":
+         return numeric_df.dropna()
+     else:
+         return numeric_df
+
+ def get_text_column(df: pd.DataFrame) -> list:
+     text_columns = df.select_dtypes(include=['object']).columns
+     if not text_columns.empty:
+         return df[text_columns[0]].dropna().astype(str).tolist()
      return []
+
+ def normalize_data(data: pd.DataFrame, method: str):
+     if method == "z-score":
+         scaler = StandardScaler()
+     elif method == "mapminmax":
+         scaler = MinMaxScaler()
+     else:  # "none"
+         return data.copy(), None
+
+     scaled = scaler.fit_transform(data)
+     return pd.DataFrame(scaled, columns=data.columns), scaler
+
+ def denormalize_data(scaled_data: pd.DataFrame, scaler):
+     if scaler is None:
+         return scaled_data
+     return pd.DataFrame(scaler.inverse_transform(scaled_data), columns=scaled_data.columns)
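
For orientation, a minimal sketch of the normalize/denormalize round-trip added above (not part of the diff; the toy DataFrame is hypothetical). With method "none" the helpers are a no-op: normalize_data returns the data with a None scaler and denormalize_data passes it through unchanged.

import pandas as pd
from utils.preprocess import normalize_data, denormalize_data

df = pd.DataFrame({"age": [20, 30, 40], "income": [1000.0, 2000.0, 3000.0]})  # hypothetical data

X_norm, scaler = normalize_data(df, "z-score")  # per-column StandardScaler; "mapminmax" uses MinMaxScaler
X_back = denormalize_data(X_norm, scaler)       # inverse_transform back to the original units
print(X_norm.mean().round(3).tolist())          # approximately [0.0, 0.0]
print(X_back)                                   # approximately the original df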
utils/visualize.py ADDED
@@ -0,0 +1,75 @@
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import umap
+ from sklearn.manifold import TSNE
+ import tempfile
+
+ def plot_embedding(X, labels, method="UMAP", title="Clustering Visualization") -> str:
+     if method.upper() == "NONE":
+         # No dimensionality reduction: just scatter-plot the first two original dimensions
+         if X.shape[1] < 2:
+             raise ValueError("Data must have at least 2 features for plotting without dimensionality reduction.")
+         plt.figure(figsize=(8, 6))
+         scatter = plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='tab10', s=30)
+         plt.title(f"No Dimensionality Reduction - {title}")
+         plt.colorbar(scatter, label="Cluster ID")
+         plt.tight_layout()
+         with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp_img:
+             plt.savefig(tmp_img.name)
+         plt.close()
+         return tmp_img.name
+
+     elif method.upper() == "UMAP":
+         reducer = umap.UMAP(random_state=69)
+     elif method.upper() == "TSNE":
+         reducer = TSNE(random_state=69, perplexity=30, max_iter=1000)
+     else:
+         raise ValueError(f"Unknown method: {method}. Use 'UMAP', 'TSNE', or 'None'.")
+
+     X_embedded = reducer.fit_transform(X)
+
+     plt.figure(figsize=(8, 6))
+     scatter = plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=labels, cmap='tab10', s=30)
+     plt.title(f"{method.upper()} - {title}")
+     plt.colorbar(scatter, label="Cluster ID")
+     plt.tight_layout()
+
+     with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp_img:
+         plt.savefig(tmp_img.name)
+     plt.close()
+     return tmp_img.name
+
+ def plot_som(som_model, X_scaled, labels):
+     """
+     Visualize SOM clustering result with U-Matrix + labeled points.
+     som_model: trained SOM object (e.g. MiniSom)
+     X_scaled: scaled data array
+     labels: cluster labels assigned for each point
+     """
+
+     plt.figure(figsize=(8, 8))
+
+     # Draw the U-Matrix (distance map)
+     plt.pcolor(som_model.distance_map().T, cmap='bone_r')
+     plt.colorbar(label='Distance')
+
+     # Plot the data points on the SOM grid
+     markers = ['o', 's', 'D', '^', 'v', 'p', '*', 'h', 'x', '+']  # markers for up to 10 clusters
+     colors = plt.cm.tab10.colors
+
+     for cnt, x in enumerate(X_scaled):
+         w = som_model.winner(x)  # position of the winning node (winner neuron)
+         cluster_id = labels[cnt] - 1  # adjust label to zero-based index
+         plt.plot(w[0] + 0.5, w[1] + 0.5, markers[cluster_id % len(markers)],
+                  markerfacecolor=colors[cluster_id % len(colors)],
+                  markeredgecolor='k',
+                  markersize=12,
+                  markeredgewidth=1.5)
+
+     plt.title("SOM Clustering Visualization (U-Matrix + Clustered Data Points)")
+     plt.tight_layout()
+
+     with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp_img:
+         plt.savefig(tmp_img.name)
+     plt.close()
+     return tmp_img.name