mboukabous commited on
Commit
4c91838
·
1 Parent(s): a9ab4a2

first commit

Browse files
app.py ADDED
@@ -0,0 +1,323 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ """
3
+ Gradio Interface for Unsupervised Learning (Clustering, Dimensionality Reduction, Anomaly Detection)
4
+
5
+ This script provides a single Gradio-based interface to run three unsupervised tasks:
6
+ 1. Clustering
7
+ 2. Dimensionality Reduction
8
+ 3. Anomaly (Outlier) Detection
9
+
10
+ Each task is placed in its own Gradio Tab. The user can:
11
+ - Choose a model from the relevant unsupervised directory (clustering/dimred/anomaly).
12
+ - Specify dataset input (upload, local path, or Kaggle).
13
+ - Select columns to drop or keep.
14
+ - Execute the relevant training script (train_clustering_model.py, train_dimred_model.py, or train_anomaly_detection.py).
15
+ - View logs and optional plots.
16
+
17
+ Project Requirements:
18
+ - Python 3.7+.
19
+ - Gradio, scikit-learn, pandas, etc. in requirements.txt.
20
+ - Properly structured project with:
21
+ - scripts/train_clustering_model.py
22
+ - scripts/train_dimred_model.py
23
+ - scripts/train_anomaly_detection.py
24
+ - models/unsupervised/<task>/<model>.py
25
+ - data/datasets/kaggle_data.py (optional for Kaggle usage).
26
+ """
27
+
28
+ import gradio as gr
29
+ import pandas as pd
30
+ import os
31
+ import subprocess
32
+ import sys
33
+ import glob
34
+ import re
35
+
36
+ #####################################
37
+ # Helper Functions
38
+ #####################################
39
+
40
def get_model_modules(task_type, base_dir=None):
    """
    Dynamically fetch model modules from the unsupervised subdirectories:
    - clustering
    - dimred
    - anomaly

    Parameters:
        task_type: Name of the task subdirectory under models/unsupervised.
        base_dir: Optional project root to search under. Defaults to a
            module-level ``project_root`` global if one exists, otherwise
            the directory containing this file.

    Returns:
        List of model module names (file stems), excluding __init__.
    """
    if base_dir is None:
        # Bug fix: the original referenced a global ``project_root`` that is
        # never defined anywhere in this module, raising NameError at runtime.
        # Fall back to this file's directory so the app works when run directly.
        base_dir = globals().get('project_root') or os.path.dirname(os.path.abspath(__file__))
    models_dir = os.path.join(base_dir, 'models', 'unsupervised', task_type)
    if not os.path.exists(models_dir):
        print(f"Directory does not exist: {models_dir}")
        return []
    return [
        os.path.splitext(os.path.basename(path))[0]
        for path in glob.glob(os.path.join(models_dir, '*.py'))
        if not path.endswith('__init__.py')
    ]
57
+
58
def download_kaggle_data(json_path, dataset_name, is_competition):
    """
    Fetch a Kaggle dataset or competition via the project helper.

    Returns the local directory path produced by ``get_kaggle_data``
    (or None if that helper reports failure).
    """
    # Imported lazily so the Kaggle dependency is only required when
    # the user actually selects the Kaggle download option.
    from data.datasets.kaggle_data import get_kaggle_data

    return get_kaggle_data(
        json_path=json_path,
        data_name=dataset_name,
        is_competition=is_competition,
    )
62
+
63
def run_subprocess(script_path, script_args):
    """
    Execute ``script_args`` as a child process and collect its output.

    Parameters:
        script_path: Path of the training script (kept for interface
            compatibility; the command actually executed is ``script_args``).
        script_args: Full argv list handed to subprocess.run.

    Returns:
        Tuple of (output_text, plot_image_path_or_None).
    """
    try:
        proc = subprocess.run(script_args, capture_output=True, text=True)
        if proc.returncode != 0:
            return f"Error during training:\n{proc.stderr}", None

        # Drop matplotlib repr noise such as "Figure(640x480)" from stdout.
        cleaned = re.sub(r"Figure\(\d+x\d+\)", "", proc.stdout).strip()

        # Prefer an explicit "Plot saved to ..." line; otherwise fall back
        # to the first ".png" token appearing anywhere in the output.
        plot_file = None
        saved_line = re.search(r"Plot saved to (.+)", cleaned)
        if saved_line:
            plot_file = saved_line.group(1).strip()
        else:
            png_token = re.search(r"(\S+\.png)", cleaned)
            if png_token:
                plot_file = png_token.group(1)

        # Only surface the image if it actually exists on disk.
        if plot_file and os.path.exists(plot_file):
            return f"Completed successfully.\n\n{cleaned}", plot_file
        return f"Completed successfully.\n\n{cleaned}", None
    except Exception as e:
        return f"An error occurred:\n{str(e)}", None
94
+
95
def get_columns_from_data(data_option, data_file, data_path,
                          kaggle_json_file, kaggle_competition_name, kaggle_data_name,
                          is_competition):
    """
    Resolve the dataset location according to ``data_option``, read it as a
    CSV, and return its column names. Returns an empty list on any failure.
    """
    # --- Resolve the CSV path from the selected input mode (guard clauses) ---
    if data_option == "Upload Data File":
        if data_file is None:
            return []
        final_path = data_file
    elif data_option == "Provide Data Path":
        if not os.path.exists(data_path):
            print("Provided path does not exist.")
            return []
        final_path = data_path
    elif data_option == "Download from Kaggle":
        if kaggle_json_file is None:
            print("No kaggle.json uploaded.")
            return []
        import shutil
        config_dir = os.path.expanduser('~/.kaggle')
        os.makedirs(config_dir, exist_ok=True)
        json_dest = os.path.join(config_dir, 'kaggle.json')
        # NOTE(review): with gr.File(type="filepath") the component value is a
        # plain str, which has no usable ``.name`` path attribute — confirm
        # this branch works with the installed gradio version.
        shutil.copy(kaggle_json_file.name, json_dest)
        os.chmod(json_dest, 0o600)

        downloaded_dir = download_kaggle_data(json_dest, kaggle_competition_name, is_competition)
        if downloaded_dir is None:
            print("Failed to download from Kaggle.")
            return []
        final_path = os.path.join(downloaded_dir, kaggle_data_name)
        if not os.path.exists(final_path):
            print(f"{kaggle_data_name} not found in Kaggle data.")
            return []
    else:
        print("Invalid data option.")
        return []

    # --- Read the header and report the column names ---
    try:
        return pd.read_csv(final_path).columns.tolist()
    except Exception as e:
        print(f"Error reading {final_path}: {e}")
        return []
141
+
142
+ #####################################
143
+ # Creating the Gradio Tab
144
+ #####################################
145
+
146
def create_task_tab(task_name, model_modules, script_path):
    """
    Creates a Gradio Tab for a specific unsupervised task (Clustering, DimRed, Anomaly).

    Parameters:
        task_name: Display name of the tab (also used in labels/buttons).
        model_modules: list of model modules from get_model_modules(task_type).
        script_path: project-relative training script,
            e.g. 'scripts/train_clustering_model.py'.

    Returns:
        None. Components are registered on the enclosing gr.Blocks context.
    """

    with gr.Tab(task_name):
        gr.Markdown(f"## {task_name} Task")

        # Model selection
        model_select = gr.Dropdown(choices=model_modules, label=f"{task_name} Model Module")

        # Data input approach
        data_option = gr.Radio(
            choices=["Upload Data File", "Provide Data Path", "Download from Kaggle"],
            label="Data Input Option",
            value="Upload Data File"
        )

        # Exactly one of the three columns below is visible at a time;
        # visibility is switched by toggle_data_input further down.
        with gr.Column(visible=True) as upload_data_col:
            data_file = gr.File(label="Upload CSV Data File", type="filepath")

        with gr.Column(visible=False) as path_data_col:
            data_path_txt = gr.Textbox(label="Data File Path")

        with gr.Column(visible=False) as kaggle_data_col:
            kaggle_json = gr.File(label="Upload kaggle.json File", type="filepath")
            kaggle_competition_name = gr.Textbox(value='', label="Kaggle Competition/Dataset Name")
            kaggle_data_name = gr.Textbox(value='data.csv', label="Data File Name in Kaggle dataset")
            kaggle_is_competition = gr.Checkbox(label="Is Kaggle Competition?", value=False)

        # Toggle data input columns: returns one visibility update per column,
        # in the same order as the outputs list of data_option.change below.
        def toggle_data_input(choice):
            if choice == "Upload Data File":
                return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False)
            elif choice == "Provide Data Path":
                return gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
            elif choice == "Download from Kaggle":
                return gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)
            else:
                return gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)

        data_option.change(
            toggle_data_input,
            inputs=[data_option],
            outputs=[upload_data_col, path_data_col, kaggle_data_col]
        )

        # Update columns button
        update_cols_btn = gr.Button("Update Columns")

        # We remove "Columns in Data (for reference)" as requested
        drop_cols_chk = gr.CheckboxGroup(choices=[], label="Columns to Drop")
        select_cols_chk = gr.CheckboxGroup(choices=[], label="Columns to Keep (if empty, keep all)")

        # Visualization param
        visualize_chk = gr.Checkbox(label="Visualize 2D (using PCA if needed)", value=True)

        # Model / results path with empty default, and label "(optional)"
        model_path_txt = gr.Textbox(label="Model Save Path (optional)", value="")
        results_path_txt = gr.Textbox(label="Results Save Path (optional)", value="")

        # The Train button
        train_btn = gr.Button(f"Train {task_name}")

        # Logs/Output
        output_box = gr.Textbox(label="Logs / Output")
        image_display = gr.Image(label="Plot Output", visible=True)

        # Function to update columns: reads the CSV (via get_columns_from_data)
        # and refreshes both CheckboxGroups with the discovered column names.
        def update_columns_fn(dataopt, f, p, kagfile, kcname, kdname, iscomp):
            cols = get_columns_from_data(dataopt, f, p, kagfile, kcname, kdname, iscomp)
            # Return updated choices for drop_cols_chk, select_cols_chk
            if cols:
                return gr.update(choices=cols), gr.update(choices=cols)
            else:
                return gr.update(choices=[]), gr.update(choices=[])

        update_cols_btn.click(
            fn=update_columns_fn,
            inputs=[
                data_option, data_file, data_path_txt,
                kaggle_json, kaggle_competition_name, kaggle_data_name,
                kaggle_is_competition
            ],
            outputs=[drop_cols_chk, select_cols_chk]
        )

        # Builds the CLI invocation for the task's training script and runs it.
        def run_task(model_mod, dataopt, f, p, kagfile, kcname, kdname, iscomp,
                     drop_cols, select_cols, visualize, mpath, rpath):
            # Build the command for the relevant script.
            # NOTE(review): ``project_root`` is never defined in this module, so
            # clicking Train raises NameError — confirm where it should be set.
            script_cmd = [sys.executable, os.path.join(project_root, script_path)]
            script_cmd.extend(["--model_module", model_mod])

            # Minimal approach for data path logic
            final_path = None
            if dataopt == "Upload Data File" and f is not None:
                final_path = f
            elif dataopt == "Provide Data Path" and os.path.exists(p):
                final_path = p
            else:
                # For Kaggle or other complexities, skipping for brevity.
                # Could handle it similarly to get_columns_from_data approach
                final_path = ""

            if final_path:
                script_cmd.extend(["--data_path", final_path])

            # drop cols
            if drop_cols and len(drop_cols) > 0:
                script_cmd.extend(["--drop_columns", ",".join(drop_cols)])
            # select cols
            if select_cols and len(select_cols) > 0:
                script_cmd.extend(["--select_columns", ",".join(select_cols)])
            # visualize
            if visualize:
                script_cmd.append("--visualize")

            # model_path
            if mpath.strip():
                script_cmd.extend(["--model_path", mpath.strip()])
            # results_path
            if rpath.strip():
                script_cmd.extend(["--results_path", rpath.strip()])

            print("Executing command:", " ".join(script_cmd))
            # run_subprocess ignores its first argument; the executed command
            # is script_cmd.
            out_text, plot_path = run_subprocess(script_path, script_cmd)
            return out_text, plot_path

        # The Train button is above logs, so let's define the click function
        train_btn.click(
            fn=run_task,
            inputs=[
                model_select, data_option, data_file, data_path_txt,
                kaggle_json, kaggle_competition_name, kaggle_data_name, kaggle_is_competition,
                drop_cols_chk, select_cols_chk, visualize_chk,
                model_path_txt, results_path_txt
            ],
            outputs=[output_box, image_display]
        )

    return  # end create_task_tab
289
+
290
+
291
+ #####################################
292
+ # Build the Main Gradio App
293
+ #####################################
294
+
295
# Top-level UI assembly: one tab per unsupervised task, each wired to its
# dedicated training script by create_task_tab.
with gr.Blocks() as demo:
    gr.Markdown("# Unsupervised Learning Gradio Interface")

    # 1) Clustering Tab
    clustering_modules = get_model_modules("clustering")
    create_task_tab(
        task_name="Clustering",
        model_modules=clustering_modules,
        script_path="scripts/train_clustering_model.py"
    )

    # 2) Dimensionality Reduction Tab
    dimred_modules = get_model_modules("dimred")
    create_task_tab(
        task_name="Dimensionality Reduction",
        model_modules=dimred_modules,
        script_path="scripts/train_dimred_model.py"
    )

    # 3) Anomaly Detection Tab
    anomaly_modules = get_model_modules("anomaly")
    create_task_tab(
        task_name="Anomaly Detection",
        model_modules=anomaly_modules,
        script_path="scripts/train_anomaly_detection.py"
    )

# Launch the app only when run as a script (not when imported).
if __name__ == "__main__":
    demo.launch()
data/README.md ADDED
@@ -0,0 +1 @@
 
 
1
+ # data
data/datasets/README.md ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Datasets Utilities
2
+
3
+ This folder contains utility scripts for handling datasets, including downloading data from Kaggle.
4
+
5
+ ## 📄 Scripts
6
+
7
+ ### `kaggle_data.py`
8
+
9
+ - **Description**: A Python script to download Kaggle datasets or competition data seamlessly, supporting Google Colab, local Linux/Mac, and Windows environments.
10
+ - **Path**: [`data/datasets/kaggle_data.py`](kaggle_data.py)
11
+ - **Key Function**: `get_kaggle_data(json_path, data_name, is_competition=False, output_dir='data/raw')`
12
+ - **Example**:
13
+
14
+ ```python
15
+ from kaggle_data import get_kaggle_data
16
+
17
+ # Download a standard Kaggle dataset
18
+ dataset_path = get_kaggle_data("kaggle.json", "paultimothymooney/chest-xray-pneumonia")
19
+
20
+ # Download competition data
21
+ competition_path = get_kaggle_data("kaggle.json", "house-prices-advanced-regression-techniques", is_competition=True)
+ ```
data/datasets/kaggle_data.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This module provides a utility function to download Kaggle datasets or competition data.
3
+
4
+ The function automatically detects whether it is running in a Google Colab environment, a local Linux/Mac environment, or a Windows environment, and sets up the Kaggle API accordingly.
5
+
6
+ Requirements:
7
+ - Kaggle API installed (`pip install kaggle`)
8
+ - Kaggle API key (`kaggle.json`) with appropriate permissions.
9
+
10
+ Environment Detection:
11
+ - Google Colab: Uses `/root/.config/kaggle/kaggle.json`.
12
+ - Local Linux/Mac: Uses `~/.kaggle/kaggle.json`.
13
+ - Windows: Uses `C:\\Users\\<Username>\\.kaggle\\kaggle.json`.
14
+
15
+ Functions:
16
+ get_kaggle_data(json_path: str, data_name: str, is_competition: bool = False, output_dir: str = "data/raw") -> str
17
+ """
18
+
19
+ import os
20
+ import zipfile
21
+ import sys
22
+ import shutil
23
+ import platform
24
+
25
def get_kaggle_data(json_path: str, data_name: str, is_competition: bool = False, output_dir: str = "data/raw") -> str:
    """
    Downloads a Kaggle dataset or competition data using the Kaggle API in Google Colab, local Linux/Mac, or Windows environment.

    Parameters:
        json_path (str): Path to your 'kaggle.json' file.
        data_name (str): Kaggle dataset or competition name (e.g., 'paultimothymooney/chest-xray-pneumonia' or 'house-prices-advanced-regression-techniques').
        is_competition (bool): Set to True if downloading competition data. Default is False (for datasets).
        output_dir (str): Directory to save and extract the data. Default is 'data/raw'.

    Returns:
        str: Path to the extracted dataset folder, or None if the download
             or extraction could not be completed.

    Raises:
        OSError: If 'kaggle.json' is not found or cannot be copied.

    Example of Usage:
        # For downloading a standard dataset
        dataset_path = get_kaggle_data("kaggle.json", "paultimothymooney/chest-xray-pneumonia")
        print(f"Dataset is available at: {dataset_path}")

        # For downloading competition data
        competition_path = get_kaggle_data("kaggle.json", "house-prices-advanced-regression-techniques", is_competition=True)
        print(f"Competition data is available at: {competition_path}")
    """
    import subprocess  # local import: only needed when a download is requested

    # Detect environment (Colab, local Linux/Mac, or Windows)
    is_colab = "google.colab" in sys.modules
    is_windows = platform.system() == "Windows"

    # Step 1: Setup Kaggle API credentials in the location the CLI expects
    try:
        if is_colab:
            config_dir = "/root/.config/kaggle"
            os.makedirs(config_dir, exist_ok=True)
            print("Setting up Kaggle API credentials for Colab environment.")
            shutil.copy(json_path, os.path.join(config_dir, "kaggle.json"))
            os.chmod(os.path.join(config_dir, "kaggle.json"), 0o600)
        else:
            # For both local Linux/Mac and Windows, use the home directory
            config_dir = os.path.join(os.path.expanduser("~"), ".kaggle")
            os.makedirs(config_dir, exist_ok=True)
            print("Setting up Kaggle API credentials for local environment.")
            kaggle_json_dest = os.path.join(config_dir, "kaggle.json")
            if not os.path.exists(kaggle_json_dest):
                shutil.copy(json_path, kaggle_json_dest)
            if not is_windows:
                # The Kaggle CLI refuses world-readable credentials on POSIX.
                os.chmod(kaggle_json_dest, 0o600)
    except Exception as e:
        raise OSError(f"Could not set up Kaggle API credentials: {e}")

    # Step 2: Create output directory
    dataset_dir = os.path.join(output_dir, data_name.split('/')[-1])
    os.makedirs(dataset_dir, exist_ok=True)

    # Step 3: Download the dataset or competition data.
    # Bug fix: the original built a shell string and ran it with os.system,
    # which never reports failure and is open to shell injection through
    # ``data_name``. Use an argv list with subprocess.run and download
    # directly into ``dataset_dir`` (removes the fragile os.chdir dance).
    if is_competition:
        print(f"Downloading competition data: {data_name}")
        cmd = ["kaggle", "competitions", "download", "-c", data_name]
    else:
        print(f"Downloading dataset: {data_name}")
        cmd = ["kaggle", "datasets", "download", "-d", data_name]
    try:
        result = subprocess.run(cmd, cwd=dataset_dir)
        if result.returncode != 0:
            print(f"Error during download: kaggle exited with code {result.returncode}")
            return None
    except Exception as e:
        print(f"Error during download: {e}")
        return None

    # Step 4: Unzip all downloaded files
    zip_files = [f for f in os.listdir(dataset_dir) if f.endswith(".zip")]
    if not zip_files:
        print("No zip files found. Please check the dataset or competition name.")
        return None

    for zip_file in zip_files:
        zip_path = os.path.join(dataset_dir, zip_file)
        try:
            with zipfile.ZipFile(zip_path, "r") as zip_ref:
                zip_ref.extractall(dataset_dir)
            print(f"Extracted: {zip_file}")
            # Remove the archive once extracted to save disk space.
            os.remove(zip_path)
        except Exception as e:
            print(f"Error extracting {zip_file}: {e}")

    return dataset_dir
data/raw/README.md ADDED
@@ -0,0 +1 @@
 
 
1
+ # raw
models/unsupervised/anomaly/README.md ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Anomaly (Outlier) Detection Models
2
+
3
+ This directory hosts scripts defining **anomaly detection** estimators (e.g., Isolation Forest, One-Class SVM, etc.) for use with `train_anomaly_detection.py`. Each file specifies a scikit-learn–compatible outlier detector and, if applicable, a parameter grid.
4
+
5
+ **Key Points**:
6
+ - **Estimator**: Must allow `.fit(X)` and `.predict(X)` or similar. Typically returns +1 / −1 for inliers / outliers (we unify to 0 / 1).
7
+ - **Parameter Grid**: You can define hyperparameters (like `n_estimators`, `contamination`) for potential searching.
8
+ - **Default Approach**: We do not rely on labeled anomalies (unsupervised). The script will produce a predictions CSV with 0 = normal, 1 = outlier.
9
+
10
+ **Note**: The main script `train_anomaly_detection.py` handles data loading, label encoding, dropping/selecting columns, the `.fit(X)`, `.predict(X)` steps, saving the outlier predictions, and (optionally) a 2D plot with outliers in red.
11
+
12
+ ## Available Anomaly Detection Models
13
+
14
+ - [Isolation Forest](isolation_forest.py)
15
+ - [One-Class SVM](one_class_svm.py)
16
+ - [Local Outlier Factor (LOF)](local_outlier_factor.py)
17
+
18
+ ### Usage
19
+
20
+ For example, to detect outliers with an Isolation Forest:
21
+
22
+ ```bash
23
+ python scripts/train_anomaly_detection.py \
24
+ --model_module isolation_forest \
25
+ --data_path data/breast_cancer/data.csv \
26
+ --drop_columns "id,diagnosis" \
27
+ --visualize
28
+ ```
29
+
30
+ This:
31
+ 1. Loads `isolation_forest.py`, sets up `IsolationForest(...)`.
32
+ 2. Fits the model to the data, saves it, then `predict(...)`.
33
+ 3. Saves a `predictions.csv` with `OutlierPrediction`.
34
+ 4. If `--visualize`, does a 2D PCA scatter, coloring outliers red.
models/unsupervised/anomaly/isolation_forest.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ """
3
+ isolation_forest.py
4
+
5
+ This module defines an Isolation Forest model for anomaly detection.
6
+ Isolation Forest is an efficient and effective algorithm for identifying
7
+ outliers in high-dimensional datasets.
8
+
9
+ Key Features:
10
+ - Utilizes a tree-based approach to isolate anomalies.
11
+ - Efficient for both large datasets and high-dimensional spaces.
12
+ - Automatically determines the expected proportion of anomalies.
13
+
14
+ Parameters:
15
+ - n_estimators (int): Number of base estimators in the ensemble.
16
+ - Default: 100.
17
+ - contamination (str or float): Expected proportion of outliers in the data.
18
+ - Default: 'auto' (automatically inferred based on dataset size).
19
+ - max_samples (int or float): Number of samples to draw for training each estimator.
20
+ - Default: 'auto' (uses min(256, number of samples)).
21
+
22
+ Default Configuration:
23
+ - n_estimators=100: Adequate for most datasets.
24
+ - contamination='auto': Automatically estimates the proportion of outliers.
25
+ """
26
+
27
from sklearn.ensemble import IsolationForest

# Define the Isolation Forest estimator.
# Consumed by scripts/train_anomaly_detection.py, which loads this module by
# name (--model_module isolation_forest) and reads the ``estimator`` attribute.
estimator = IsolationForest(
    n_estimators=100,      # Default number of trees
    contamination='auto',  # Automatically estimates the contamination proportion
    random_state=42        # Ensures reproducibility
)
models/unsupervised/anomaly/local_outlier_factor.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ """
3
+ local_outlier_factor.py
4
+
5
+ This module defines a Local Outlier Factor (LOF) model for anomaly detection.
6
+ LOF identifies anomalies by comparing the local density of a sample to the density
7
+ of its neighbors. Samples with significantly lower density are flagged as outliers.
8
+
9
+ Key Features:
10
+ - Detects local anomalies in datasets with varying densities.
11
+ - Effective for datasets where the notion of an outlier is context-dependent.
12
+ - Non-parametric method that adapts to the data's structure.
13
+
14
+ Parameters:
15
+ - n_neighbors (int): Number of neighbors used to calculate local density.
16
+ - Default: 20. Higher values smooth out anomalies but may miss local patterns.
17
+ - contamination (str or float): Proportion of outliers in the data.
18
+ - 'auto': Automatically estimates the proportion based on the dataset size.
19
+ - float: Manually set the expected proportion (e.g., 0.1 for 10%).
20
+ - novelty (bool): If True, allows the model to be applied to new unseen data.
21
+
22
+ Limitations:
23
+ - LOF directly computes predictions during `fit_predict()` and does not support `predict()`
24
+ unless `novelty=True`.
25
+
26
+ Default Configuration:
27
+ - n_neighbors=20: Uses 20 neighbors for density comparison.
28
+ - contamination='auto': Automatically estimates the proportion of outliers.
29
+ - novelty=True: Enables predictions on unseen data.
30
+ """
31
+
32
from sklearn.neighbors import LocalOutlierFactor

# Define the Local Outlier Factor estimator.
# novelty=True is required so the training script can call .predict() on data
# after fitting; plain LOF only supports fit_predict() (see module docstring).
estimator = LocalOutlierFactor(
    n_neighbors=20,        # Number of neighbors to calculate density
    contamination='auto',  # Auto-detect the proportion of outliers
    novelty=True           # Enables prediction on new data
)
models/unsupervised/anomaly/one_class_svm.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ """
3
+ one_class_svm.py
4
+
5
+ This module defines a One-Class SVM model for anomaly detection.
6
+ One-Class SVM identifies a decision boundary that separates normal data points from potential outliers
7
+ in a high-dimensional feature space.
8
+
9
+ Key Features:
10
+ - Effective for detecting anomalies in high-dimensional datasets.
11
+ - Flexible kernel options for nonlinear decision boundaries.
12
+ - Suitable for datasets with a small proportion of outliers.
13
+
14
+ Parameters:
15
+ - kernel (str): Specifies the kernel type used in the algorithm.
16
+ - Common options: 'linear', 'poly', 'rbf' (default), and 'sigmoid'.
17
+ - gamma (str or float): Kernel coefficient. Determines the influence of each sample.
18
+ - Default: 'scale' (1 / (n_features * X.var())).
19
+ - nu (float): Approximate fraction of outliers in the dataset.
20
+ - Must be in the range (0, 1]. Default: 0.05 (5% of data considered outliers).
21
+
22
+ Default Configuration:
23
+ - kernel='rbf': Radial Basis Function for nonlinear separation.
24
+ - gamma='scale': Automatically adjusts kernel influence based on dataset features.
25
+ - nu=0.05: Assumes approximately 5% of data points are outliers.
26
+ """
27
+
28
from sklearn.svm import OneClassSVM

# Define the One-Class SVM estimator.
# Consumed by scripts/train_anomaly_detection.py via --model_module one_class_svm.
estimator = OneClassSVM(
    kernel='rbf',    # Radial Basis Function kernel for nonlinear boundaries
    gamma='scale',   # Adjusts kernel influence based on dataset variance
    nu=0.05          # Assumes 5% of the data are outliers
)
models/unsupervised/clustering/README.md ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Clustering Models
2
+
3
+ This directory contains Python scripts defining various **clustering** models and their associated hyperparameter grids. Each model file sets up a scikit-learn–compatible clustering estimator (e.g., `KMeans`, `DBSCAN`, `GaussianMixture`) and defines a param grid for the `train_clustering_model.py` script.
4
+
5
+ **Key Points**:
6
+ - **Estimator**: Usually supports `.fit(X)` for unsupervised training, and either `.labels_` or `.predict(X)` to retrieve cluster assignments.
7
+ - **Parameter Grid (`param_grid`)**: Used for silhouette-based hyperparameter tuning in `train_clustering_model.py`.
8
+ - **Default Scoring**: Often `'silhouette'`, but can be changed if you adapt your tuning logic.
9
+
10
+ **Note**: Preprocessing (dropping columns, label encoding) and any hyperparameter loop is handled externally by the script/utility. These model definition files simply define:
11
+ - An **estimator** (like `KMeans(n_clusters=3, random_state=42)`).
12
+ - A **`param_grid`** for silhouette tuning (e.g., `{'model__n_clusters':[2,3,4]}`).
13
+ - Optionally, a **`default_scoring`** set to `'silhouette'`.
14
+
15
+ ## Available Clustering Models
16
+
17
+ - [KMeans](kmeans.py)
18
+ - [DBSCAN](dbscan.py)
19
+ - [Gaussian Mixture](gaussian_mixture.py)
20
+ - [Agglomerative Clustering (Hierarchical)](hierarchical_clustering.py)
21
+
22
+ ### Usage
23
+
24
+ To train or tune any clustering model, specify the `--model_module` argument with the appropriate model name (e.g., `kmeans`) when running `train_clustering_model.py`, for example:
25
+
26
+ ```bash
27
+ python scripts/train_clustering_model.py \
28
+ --model_module kmeans \
29
+ --data_path data/mall_customer/Mall_Customers.csv \
30
+ --tune \
31
+ --visualize
32
+ ```
33
+
34
+ This will:
35
+ 1. Load the chosen model definition (`kmeans.py`).
36
+ 2. Perform optional silhouette-based hyperparameter tuning if `--tune` is used.
37
+ 3. Fit the final model, save it, and optionally generate a 2D scatter plot if requested.
models/unsupervised/clustering/dbscan.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ """
3
+ dbscan.py
4
+
5
+ This module defines a DBSCAN clustering model and a parameter grid for hyperparameter tuning.
6
+
7
+ DBSCAN (Density-Based Spatial Clustering of Applications with Noise) is a density-based clustering algorithm.
8
+ It groups points closely packed together and marks as outliers those points in low-density regions.
9
+
10
+ Parameters:
11
+ - eps (float): The maximum distance between two samples for them to be considered as in the same neighborhood.
12
+ - min_samples (int): The number of samples (or total weight) in a neighborhood for a point to be considered a core point.
13
+ """
14
+
15
from sklearn.cluster import DBSCAN

# Define the DBSCAN estimator (defaults: eps=0.5 neighborhood radius,
# min_samples=5 points to form a core point).
estimator = DBSCAN(eps=0.5, min_samples=5)

# Define the hyperparameter grid for tuning.
# Keys are prefixed with 'model__' to address the estimator step inside the
# pipeline built by train_clustering_model.py.
param_grid = {
    'model__eps': [0.2, 0.5, 1.0, 1.5, 2.0],  # Explore a wide range of neighborhood radii
    'model__min_samples': [3, 5, 10, 20]      # Adjust density thresholds for core points
}

# Default scoring metric
# Note: Silhouette score works best for convex clusters and may not always be ideal for DBSCAN.
# For more complex shapes, consider custom evaluation metrics.
default_scoring = 'silhouette'
models/unsupervised/clustering/gaussian_mixture.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ """
3
+ gaussian_mixture.py
4
+
5
+ This module defines a GaussianMixture model for clustering, along with a parameter grid for hyperparameter tuning.
6
+
7
+ Gaussian Mixture Models (GMM) assume that data is generated from a mixture of several Gaussian distributions
8
+ with unknown parameters. It's a probabilistic model and can handle clusters of varying sizes and shapes.
9
+
10
+ Parameters:
11
+ - n_components (int): Number of mixture components (clusters).
12
+ - covariance_type (str): Determines the shape of each cluster.
13
+ - 'full': Each cluster has its own general covariance matrix.
14
+ - 'tied': All clusters share the same covariance matrix.
15
+ - 'diag': Each cluster has its own diagonal covariance matrix.
16
+ - 'spherical': Each cluster has its own single variance.
17
+ """
18
+
19
from sklearn.mixture import GaussianMixture

# Define the GaussianMixture estimator (3 components; fixed seed for
# reproducible EM initialization).
estimator = GaussianMixture(n_components=3, random_state=42)

# Define the hyperparameter grid for tuning.
# 'model__' prefix targets the estimator step of the training pipeline.
param_grid = {
    'model__n_components': [2, 3, 4],                                # Experiment with 2 to 4 clusters
    'model__covariance_type': ['full', 'tied', 'diag', 'spherical']  # Different shapes for cluster covariance
}

# Default scoring metric
# Note: Silhouette score works better for convex clusters. For GMMs with non-convex clusters, consider other metrics like BIC or AIC.
default_scoring = 'silhouette'
models/unsupervised/clustering/hierarchical_clustering.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ """
3
+ hierarchical_clustering.py
4
+
5
+ This module defines an AgglomerativeClustering model for hierarchical clustering,
6
+ along with a parameter grid for hyperparameter tuning.
7
+
8
+ Hierarchical clustering creates a tree-like structure (dendrogram) to represent the nested grouping of data points
9
+ and their similarity levels. Agglomerative clustering starts with each data point as its own cluster and iteratively merges them.
10
+
11
+ Parameters:
12
+ - n_clusters (int): The number of clusters to form.
13
+ - linkage (str): Determines how distances between clusters are computed.
14
+ - 'ward': Minimizes the variance of clusters (requires Euclidean distance).
15
+ - 'complete': Maximum linkage, i.e., uses the farthest points between clusters.
16
+ - 'average': Average linkage, i.e., uses the mean distances between clusters.
17
+ - 'single': Minimum linkage, i.e., uses the closest points between clusters.
18
+ """
19
+
20
from sklearn.cluster import AgglomerativeClustering

# Define the AgglomerativeClustering estimator (3 clusters; default 'ward'
# linkage, which requires Euclidean distances).
estimator = AgglomerativeClustering(n_clusters=3)

# Define the hyperparameter grid for tuning.
# 'model__' prefix targets the estimator step of the training pipeline.
param_grid = {
    'model__n_clusters': [2, 3, 4],                                # Experiment with 2 to 4 clusters
    'model__linkage': ['ward', 'complete', 'average', 'single']    # Different linkage methods for clustering
}

# Default scoring metric
# Note: Silhouette score works well for evaluating convex clusters formed by hierarchical clustering.
default_scoring = 'silhouette'
models/unsupervised/clustering/kmeans.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
"""KMeans clustering model definition.

KMeans partitions the data into k clusters, each represented by the centroid
of its members; centroids are refined iteratively to minimize within-cluster
variance.

Exported names (consumed by scripts/train_clustering_model.py):
    estimator       -- KMeans(n_clusters=3, random_state=42)
    param_grid      -- search space for the 'model__' pipeline step
    default_scoring -- 'silhouette'

Grid notes:
    n_clusters -- 2 through 5 clusters
    init       -- 'k-means++' (smart seeding) vs. plain 'random' seeding
    n_init     -- number of restarts with different centroid seeds; the best
                  run (lowest inertia) is kept
"""

from sklearn.cluster import KMeans

# random_state pins centroid seeding so repeated runs are reproducible.
estimator = KMeans(n_clusters=3, random_state=42)

# Hyperparameter search space for the silhouette-based tuner.
param_grid = {
    'model__n_clusters': [2, 3, 4, 5],
    'model__init': ['k-means++', 'random'],
    'model__n_init': [10, 20, 50],
}

# Silhouette rewards compact, well-separated clusters.
default_scoring = 'silhouette'
models/unsupervised/dimred/README.md ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Dimensionality Reduction Models
2
+
3
+ This directory contains Python scripts defining **dimensionality reduction** techniques (e.g., PCA, t-SNE, UMAP). Each model file sets up a scikit-learn–compatible estimator or follows a similar interface, making it easy to swap in `train_dimred_model.py`.
4
+
5
+ **Key Points**:
6
+ - **Estimator**: Typically supports `.fit_transform(X)` for dimension reduction.
7
+ - **Default Settings**: e.g., PCA might default to `n_components=2`; t-SNE might set `n_components=2` and `perplexity=30`; UMAP might define `n_neighbors=15` or `n_components=2`.
8
+ - **No Supervised Tuning**: Usually we pick hyperparameters based on interpretability or domain. A manual approach or specialized metric can be used if needed.
9
+
10
+ **Note**: The `train_dimred_model.py` script handles dropping columns, label encoding, performing `.fit_transform(X)`, and optionally saving a 2D/3D scatter plot if `--visualize` is used.
11
+
12
+ ## Available Dimensionality Reduction Models
13
+
14
+ - [PCA](pca.py)
15
+ - [t-SNE](tsne.py)
16
+ - [UMAP](umap.py)
17
+
18
+ ### Usage
19
+
20
+ To reduce data dimensions:
21
+
22
+ ```bash
23
+ python scripts/train_dimred_model.py \
24
+ --model_module pca \
25
+ --data_path data/breast_cancer/data.csv \
26
+ --select_columns "radius_mean, texture_mean, area_mean, smoothness_mean" \
27
+ --visualize
28
+ ```
29
+
30
+ This:
31
+ 1. Loads `pca.py`, which defines a `PCA(n_components=2)` estimator by default.
32
+ 2. Applies `.fit_transform(...)` to produce a 2D embedding.
33
+ 3. Saves the model (`dimred_model.pkl`) and the transformed data (`X_transformed.csv`).
34
+ 4. If `--visualize` is set and `n_components=2`, it scatter-plots the result.
models/unsupervised/dimred/pca.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
"""PCA dimensionality-reduction model definition.

Principal Component Analysis projects the data onto the orthogonal directions
of maximum variance, producing a lower-dimensional view that preserves as
much variance as possible. Useful for visualization and as a preprocessing
step before clustering or classification.

Exported names (consumed by scripts/train_dimred_model.py):
    estimator -- PCA(n_components=2)

n_components semantics (scikit-learn):
    int   -- keep exactly that many components
    float -- keep enough components to explain that fraction of variance
             (e.g. 0.95 for 95% variance)
    None  -- keep all components
"""

from sklearn.decomposition import PCA

# Two components by default so the embedding can be scatter-plotted directly.
estimator = PCA(n_components=2)
models/unsupervised/dimred/tsne.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
"""
tsne.py

This module defines a t-Distributed Stochastic Neighbor Embedding (t-SNE)
model for dimensionality reduction. t-SNE is primarily used for visualizing
high-dimensional data by projecting it into a lower-dimensional space
(typically 2D or 3D) while preserving local neighborhood structure.

Parameters:
    - n_components (int): Number of dimensions for projection (2 for plots).
    - perplexity (float): Balances local vs. global structure; typical
      values range between 5 and 50.
    - random_state (int): Seeds the stochastic embedding initialization so
      repeated runs on the same data produce the same layout.

Default:
    - n_components=2, perplexity=30, random_state=42.
"""

from sklearn.manifold import TSNE

# Define the t-SNE estimator.
# FIX: random_state added — the docstring promises reproducibility and the
# sibling kmeans.py pins random_state=42, but the original estimator was
# unseeded, so every run produced a different embedding for the same data.
estimator = TSNE(n_components=2, perplexity=30, random_state=42)
models/unsupervised/dimred/umap.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
"""UMAP dimensionality-reduction model definition.

Uniform Manifold Approximation and Projection (UMAP) is a nonlinear
technique that scales to larger datasets better than t-SNE and preserves
both local and, in some cases, global structure. Suitable for exploratory
analysis and clustering.

Exported names (consumed by scripts/train_dimred_model.py):
    estimator -- umap.UMAP(n_components=2, n_neighbors=15, min_dist=0.1)

Parameter notes:
    n_components -- output dimensionality (2 for plotting)
    n_neighbors  -- neighborhood size for manifold approximation
                    (typical range 5-50; balances local vs. global detail)
    min_dist     -- minimum spacing of points in the embedding
                    (smaller values yield tighter clusters)

Requires the third-party `umap-learn` package.
"""

# NOTE(review): the submodule is imported directly, presumably to sidestep a
# name clash since this file is itself named umap.py — confirm if refactoring.
import umap.umap_ as umap

# Default configuration for a 2D projection.
estimator = umap.UMAP(n_components=2, n_neighbors=15, min_dist=0.1)
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pandas==2.2.2
2
+ numpy==1.26.4
3
+ matplotlib==3.8.0
4
+ seaborn==0.13.2
5
+ kaggle==1.6.17
6
+ scikit-learn==1.5.2
7
+ catboost==1.2.7
8
+ dask[dataframe]==2024.10.0
9
+ xgboost==2.1.2
10
+ lightgbm==4.5.0
11
+ joblib==1.4.2
12
+ gradio==5.7.1
13
+ umap-learn==0.5.7
scripts/README.md ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Scripts
2
+
3
+ This directory contains executable scripts for training, testing, and other tasks related to model development and evaluation.
4
+
5
+ ## Contents
6
+
7
+ Supervised Learning:
8
+ - [train_regression_model.py](#train_regression_modelpy)
9
+ - [train_classification_model.py](#train_classification_modelpy)
10
+
11
+ Unsupervised Learning:
12
+ - [train_clustering_model.py](#train_clustering_modelpy)
13
+ - [train_dimred_model.py](#train_dimred_modelpy)
14
+ - [train_anomaly_detection.py](#train_anomaly_detectionpy)
15
+
16
+ ---
17
+
18
+ ## `train_regression_model.py`
19
+
20
+ A script for training supervised learning **regression** models using scikit-learn. It handles data loading, preprocessing, optional log transformation, hyperparameter tuning, model evaluation, and saving of models, metrics, and visualizations.
21
+
22
+ ### Features
23
+
24
+ - Supports various regression models defined in `models/supervised/regression`.
25
+ - Performs hyperparameter tuning using grid search cross-validation.
26
+ - Saves trained models and evaluation metrics.
27
+ - Generates visualizations if specified.
28
+
29
+ ### Usage
30
+
31
+ ```bash
32
+ python train_regression_model.py --model_module MODEL_MODULE \
33
+ --data_path DATA_PATH/DATA_NAME.csv \
34
+ --target_variable TARGET_VARIABLE [OPTIONS]
35
+ ```
36
+
37
+ **Required Arguments**:
38
+ - `model_module`: Name of the regression model module to import (e.g., `linear_regression`).
39
+ - `data_path`: Path to the dataset directory, including the data file name.
40
+ - `target_variable`: Name of the target variable.
41
+
42
+ **Optional Arguments**:
43
+ - `test_size`: Proportion of the dataset to include in the test split (default: `0.2`).
44
+ - `random_state`: Random seed for reproducibility (default: `42`).
45
+ - `log_transform`: Apply log transformation to the target variable (regression only).
46
+ - `cv_folds`: Number of cross-validation folds (default: `5`).
47
+ - `scoring_metric`: Scoring metric for model evaluation.
48
+ - `model_path`: Path to save the trained model.
49
+ - `results_path`: Path to save results and metrics.
50
+ - `visualize`: Generate and save visualizations (e.g., scatter or actual vs. predicted).
51
+ - `drop_columns`: Comma-separated column names to drop from the dataset.
52
+
53
+ ### Usage Example
54
+
55
+ ```bash
56
+ python train_regression_model.py --model_module linear_regression \
57
+ --data_path data/house_prices/train.csv \
58
+ --target_variable SalePrice --drop_columns Id \
59
+ --log_transform --visualize
60
+ ```
61
+
62
+ ---
63
+
64
+ ## `train_classification_model.py`
65
+
66
+ A script for training supervised learning **classification** models using scikit-learn. It handles data loading, preprocessing, hyperparameter tuning (via grid search CV), model evaluation using classification metrics, and saving of models, metrics, and visualizations.
67
+
68
+ ### Features
69
+
70
+ - Supports various classification models defined in `models/supervised/classification`.
71
+ - Performs hyperparameter tuning using grid search cross-validation (via `classification_hyperparameter_tuning`).
72
+ - Saves trained models and evaluation metrics (accuracy, precision, recall, F1).
73
+ - If `visualize` is enabled, it generates a metrics bar chart and a confusion matrix plot.
74
+
75
+ ### Usage
76
+
77
+ ```bash
78
+ python train_classification_model.py --model_module MODEL_MODULE \
79
+ --data_path DATA_PATH/DATA_NAME.csv \
80
+ --target_variable TARGET_VARIABLE [OPTIONS]
81
+ ```
82
+
83
+ **Required Arguments**:
84
+ - `model_module`: Name of the classification model module to import (e.g., `logistic_regression`).
85
+ - `data_path`: Path to the dataset directory, including the data file name.
86
+ - `target_variable`: Name of the target variable (categorical).
87
+
88
+ **Optional Arguments**:
89
+ - `test_size`: Proportion of the dataset to include in the test split (default: `0.2`).
90
+ - `random_state`: Random seed for reproducibility (default: `42`).
91
+ - `cv_folds`: Number of cross-validation folds (default: `5`).
92
+ - `scoring_metric`: Scoring metric for model evaluation (e.g., `accuracy`, `f1`, `roc_auc`).
93
+ - `model_path`: Path to save the trained model.
94
+ - `results_path`: Path to save results and metrics.
95
+ - `visualize`: Generate and save visualizations (metrics bar chart, confusion matrix).
96
+ - `drop_columns`: Comma-separated column names to drop from the dataset.
97
+
98
+ ### Usage Example
99
+
100
+ ```bash
101
+ python train_classification_model.py --model_module logistic_regression \
102
+ --data_path data/adult_income/train.csv \
103
+ --target_variable income_bracket \
104
+ --scoring_metric accuracy --visualize
105
+ ```
106
+
107
+ ---
108
+
109
+ ## `train_clustering_model.py`
110
+
111
+ A script for training **clustering** models (K-Means, DBSCAN, Gaussian Mixture, etc.) in an unsupervised manner. It supports data loading, optional drop/select of columns, label encoding for non-numeric features, optional hyperparameter tuning (silhouette-based), saving the final model, and generating a 2D cluster plot if needed.
112
+
113
+ ### Features
114
+
115
+ - Supports various clustering models defined in `models/unsupervised/clustering`.
116
+ - Optional hyperparameter tuning (silhouette score) via `clustering_hyperparameter_tuning`.
117
+ - Saves the trained clustering model and optional silhouette metrics.
118
+ - Generates a 2D scatter plot if `visualize` is enabled (using PCA if needed).
119
+
120
+ ### Usage
121
+
122
+ ```bash
123
+ python train_clustering_model.py --model_module MODEL_MODULE \
124
+ --data_path DATA_PATH/DATA_NAME.csv [OPTIONS]
125
+ ```
126
+
127
+ **Key Arguments**:
128
+ - `model_module`: Name of the clustering model module (e.g., `kmeans`, `dbscan`, `gaussian_mixture`).
129
+ - `data_path`: Path to the CSV dataset.
130
+
131
+ **Optional Arguments**:
132
+ - `drop_columns`: Comma-separated column names to drop.
133
+ - `select_columns`: Comma-separated column names to keep.
134
+ - `tune`: If set, performs silhouette-based hyperparameter tuning.
135
+ - `cv_folds`: Number of folds or times for silhouette-based repeated runs (basic approach).
136
+ - `scoring_metric`: Typically `'silhouette'`.
137
+ - `visualize`: If set, attempts a 2D scatter, using PCA if more than 2 features remain.
138
+ - `model_path`: Path to save the trained model.
139
+ - `results_path`: Path to save results (metrics, plots).
140
+
141
+ ### Usage Example
142
+
143
+ ```bash
144
+ python train_clustering_model.py \
145
+ --model_module kmeans \
146
+ --data_path data/mall_customer/Mall_Customers.csv \
147
+ --drop_columns "Gender" \
148
+ --select_columns "Annual Income (k$),Spending Score (1-100)" \
149
+ --visualize
150
+ ```
151
+
152
+ ---
153
+
154
+ ## `train_dimred_model.py`
155
+
156
+ A script for **dimensionality reduction** tasks (e.g., PCA, t-SNE, UMAP). It loads data, optionally drops or selects columns, label-encodes categorical features, fits the chosen dimensionality reduction model, saves the transformed data, and can visualize 2D/3D outputs.
157
+
158
+ ### Features
159
+
160
+ - Supports various dimension reduction models in `models/unsupervised/dimred`.
161
+ - Saves the fitted model and the transformed data (in CSV).
162
+ - Optionally creates a 2D or 3D scatter plot if the output dimension is 2 or 3.
163
+
164
+ ### Usage
165
+
166
+ ```bash
167
+ python train_dimred_model.py --model_module MODEL_MODULE \
168
+ --data_path DATA_PATH/DATA_NAME.csv [OPTIONS]
169
+ ```
170
+
171
+ **Key Arguments**:
172
+ - `model_module`: Name of the dimension reduction module (e.g., `pca`, `tsne`, `umap`).
173
+ - `data_path`: Path to the CSV dataset.
174
+
175
+ **Optional Arguments**:
176
+ - `drop_columns`: Comma-separated column names to drop.
177
+ - `select_columns`: Comma-separated column names to keep.
178
+ - `visualize`: If set, plots the 2D or 3D embedding.
179
+ - `model_path`: Path to save the trained model.
180
+ - `results_path`: Path to save the transformed data and any plots.
181
+
182
+ ### Usage Example
183
+
184
+ ```bash
185
+ python train_dimred_model.py \
186
+ --model_module pca \
187
+ --data_path data/breast_cancer/data.csv \
188
+ --drop_columns "id,diagnosis" \
189
+ --visualize
190
+ ```
191
+
192
+ ---
193
+
194
+ ## `train_anomaly_detection.py`
195
+
196
+ A script for training **anomaly/outlier detection** models (Isolation Forest, One-Class SVM, etc.). It supports dropping/selecting columns, label-encoding, saving anomaly predictions (0 = normal, 1 = outlier), and optionally visualizing points in 2D with outliers colored differently.
197
+
198
+ ### Features
199
+
200
+ - Supports various anomaly models in `models/unsupervised/anomaly`.
201
+ - Saves the model and an outlier predictions CSV.
202
+ - If `visualize` is enabled, performs PCA → 2D for plotting normal vs. outliers.
203
+
204
+ ### Usage
205
+
206
+ ```bash
207
+ python train_anomaly_detection.py --model_module MODEL_MODULE \
208
+ --data_path DATA_PATH/DATA_NAME.csv [OPTIONS]
209
+ ```
210
+
211
+ **Key Arguments**:
212
+ - `model_module`: Name of the anomaly detection module (e.g., `isolation_forest`, `one_class_svm`, `local_outlier_factor`).
213
+ - `data_path`: Path to the CSV dataset.
214
+
215
+ **Optional Arguments**:
216
+ - `drop_columns`: Comma-separated column names to drop.
217
+ - `select_columns`: Comma-separated column names to keep.
218
+ - `visualize`: If set, attempts a 2D scatter (via PCA) and colors outliers in red.
219
+ - `model_path`: Path to save the anomaly model.
220
+ - `results_path`: Path to save outlier predictions and plots.
221
+
222
+ ### Usage Example
223
+
224
+ ```bash
225
+ python train_anomaly_detection.py \
226
+ --model_module isolation_forest \
227
+ --data_path data/breast_cancer/data.csv \
228
+ --drop_columns "id,diagnosis" \
229
+ --visualize
230
+ ```
scripts/train_anomaly_detection.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ """
3
+ train_anomaly_detection.py
4
+
5
+ Trains an anomaly detection model (Isolation Forest, One-Class SVM, etc.) on a dataset.
6
+ Allows dropping or selecting columns, label-encoding for non-numeric data,
7
+ saves predictions (0 = normal, 1 = outlier) and optionally visualizes in 2D.
8
+
9
+ Usage Example:
10
+ --------------
11
+ python scripts/train_anomaly_detection.py \
12
+ --model_module isolation_forest \
13
+ --data_path data/raw/my_dataset.csv \
14
+ --drop_columns "unwanted_col" \
15
+ --select_columns "feat1,feat2,feat3" \
16
+ --visualize
17
+ """
18
+
19
+ import os
20
+ import sys
21
+ import argparse
22
+ import importlib
23
+ import pandas as pd
24
+ import numpy as np
25
+ import joblib
26
+
27
+ from sklearn.preprocessing import LabelEncoder
28
+ import matplotlib.pyplot as plt
29
+ from timeit import default_timer as timer
30
+
31
def main(args):
    """Train an anomaly-detection model and save its outlier predictions.

    Steps:
      1. cd to the project root and import
         models.unsupervised.anomaly.<model_module>.
      2. Load the CSV, drop all-empty columns, then apply
         --drop_columns / --select_columns.
      3. Label-encode remaining non-numeric columns.
      4. Fit the estimator, save it, and write 0/1 outlier predictions
         (0 = normal, 1 = outlier) to results_path.
      5. Optionally plot the data in 2D (PCA-projected if needed), with
         outliers in red.

    Args:
        args: argparse.Namespace with model_module, data_path, model_path,
              results_path, drop_columns, select_columns, visualize.
    """
    # Run from the project root so relative module and data paths resolve.
    project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
    os.chdir(project_root)
    sys.path.insert(0, project_root)

    # Dynamically import the chosen anomaly model module.
    model_module_path = f"models.unsupervised.anomaly.{args.model_module}"
    model_module = importlib.import_module(model_module_path)

    # Retrieve the estimator from the model file.
    estimator = model_module.estimator

    # Prepare results directory, e.g. results/IsolationForest_Anomaly.
    if args.results_path is None:
        args.results_path = os.path.join("results", f"{estimator.__class__.__name__}_Anomaly")
    os.makedirs(args.results_path, exist_ok=True)

    # Load data.
    df = pd.read_csv(args.data_path)
    print(f"Data loaded from {args.data_path}, initial shape: {df.shape}")

    # Drop columns that are entirely NaN.
    df = df.dropna(axis='columns', how='all')
    print("After dropping empty columns:", df.shape)

    # Drop user-specified columns, silently ignoring absent names.
    if args.drop_columns:
        drop_cols = [c.strip() for c in args.drop_columns.split(',') if c.strip()]
        df.drop(columns=drop_cols, inplace=True, errors='ignore')
        print(f"Dropped columns: {drop_cols} | New shape: {df.shape}")

    # Keep only user-selected columns (KeyError if a name is missing).
    if args.select_columns:
        keep_cols = [c.strip() for c in args.select_columns.split(',') if c.strip()]
        df = df[keep_cols]
        print(f"Selected columns: {keep_cols} | New shape: {df.shape}")

    # Label-encode non-numeric columns so the estimator sees numbers only.
    for col in df.columns:
        if df[col].dtype == 'object':
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col])

    # Convert DataFrame to numpy array.
    X = df.values
    print(f"Final data shape after dropping/selecting columns and encoding: {X.shape}")

    # Fit the anomaly model and time the fit.
    start_time = timer()
    estimator.fit(X)
    end_time = timer()
    train_time = end_time - start_time
    print(f"Anomaly detection training with {args.model_module} completed in {train_time:.2f} seconds.")

    # FIX: the original created args.model_path but then dumped the model into
    # results_path, so --model_path ("Path to save the trained model") was
    # silently ignored. Save where the CLI documents.
    os.makedirs(args.model_path, exist_ok=True)
    model_output_path = os.path.join(args.model_path, "anomaly_model.pkl")
    joblib.dump(estimator, model_output_path)
    print(f"Model saved to {model_output_path}")

    # Most sklearn detectors (e.g. IsolationForest) return +1 for inliers and
    # -1 for outliers; normalize to 0 = normal, 1 = outlier.
    raw_preds = estimator.predict(X)
    preds_binary = np.where(raw_preds == 1, 0, 1)

    outlier_count = np.sum(preds_binary)
    inlier_count = len(preds_binary) - outlier_count
    print(f"Detected {outlier_count} outliers out of {len(X)} samples. ({inlier_count} normal)")

    # Save predictions alongside the other results.
    pred_df = pd.DataFrame({
        'OutlierPrediction': preds_binary
    })
    pred_path = os.path.join(args.results_path, "predictions.csv")
    pred_df.to_csv(pred_path, index=False)
    print(f"Predictions saved to {pred_path}")

    # Optional 2D visualization (PCA projection when >2 features remain).
    if args.visualize:
        print("Creating anomaly detection visualization...")
        if X.shape[1] > 2:
            from sklearn.decomposition import PCA
            pca = PCA(n_components=2)
            X_2d = pca.fit_transform(X)
            x_label = "PC1"
            y_label = "PC2"
        elif X.shape[1] == 2:
            X_2d = X
            x_label = df.columns[0] if df.shape[1] == 2 else "Feature 1"
            y_label = df.columns[1] if df.shape[1] == 2 else "Feature 2"
        else:
            # 0 or 1 feature: a 2D scatter is impossible.
            print("Only 1 feature or none; can't create 2D scatter. Skipping.")
            return

        # Plot: outliers red, normal points blue.
        plt.figure(figsize=(6, 5))
        colors = np.where(preds_binary == 1, 'r', 'b')
        plt.scatter(X_2d[:, 0], X_2d[:, 1], c=colors, s=30, alpha=0.7)
        plt.title(f"{estimator.__class__.__name__} Anomaly Detection")
        plt.xlabel(x_label)
        plt.ylabel(y_label)

        # Save, then show for interactive runs.
        plot_path = os.path.join(args.results_path, "anomaly_plot.png")
        plt.savefig(plot_path)
        plt.show()
        print(f"Anomaly plot saved to {plot_path}")
144
+
145
+
146
if __name__ == "__main__":
    # CLI entry point for standalone use; parse flags and hand off to main().
    cli = argparse.ArgumentParser(description="Train an anomaly detection model.")
    cli.add_argument('--model_module', type=str, required=True,
                     help='Name of the anomaly detection model (e.g. isolation_forest, one_class_svm).')
    cli.add_argument('--data_path', type=str, required=True,
                     help='Path to the CSV dataset file.')
    cli.add_argument('--model_path', type=str, default='saved_models/Anomaly',
                     help='Path to save the trained model.')
    cli.add_argument('--results_path', type=str, default=None,
                     help='Directory to save results (predictions, plots).')
    cli.add_argument('--drop_columns', type=str, default='',
                     help='Comma-separated column names to drop.')
    cli.add_argument('--select_columns', type=str, default='',
                     help='Comma-separated column names to keep (ignore the rest).')
    cli.add_argument('--visualize', action='store_true',
                     help='If set, reduce to 2D (via PCA if needed) and color outliers vs. normal points.')
    main(cli.parse_args())
scripts/train_clustering_model.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ """
3
+ train_clustering_model.py
4
+
5
+ A script to train clustering models (K-Means, DBSCAN, Gaussian Mixture, etc.).
6
+ It can optionally perform hyperparameter tuning using silhouette score,
7
+ trains the model, saves it, and visualizes clusters if requested.
8
+ """
9
+
10
+ import os
11
+ import sys
12
+ import argparse
13
+ import importlib
14
+ import pandas as pd
15
+ import numpy as np
16
+ import joblib
17
+
18
+ from sklearn import datasets
19
+ from sklearn.metrics import silhouette_score
20
+ from sklearn.preprocessing import LabelEncoder
21
+ import matplotlib.pyplot as plt
22
+ import seaborn as sns
23
+ from timeit import default_timer as timer
24
+
25
def main(args):
    """Train a clustering model, score it with silhouette, and plot clusters.

    Steps:
      1. cd to the project root; import
         models.unsupervised.clustering.<model_module>.
      2. Load the CSV, drop all-empty columns, then apply
         --drop_columns / --select_columns.
      3. Label-encode remaining non-numeric columns.
      4. Fit directly, or tune hyperparameters with a silhouette-based
         search when --tune is set.
      5. Save the model, write a silhouette metric (when defined), and
         optionally render a 2D cluster scatter.

    Args:
        args: argparse.Namespace produced by the CLI parser below.
    """
    # Run from the project root so relative imports and paths resolve.
    project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
    os.chdir(project_root)
    sys.path.insert(0, project_root)

    # Imported here because utils lives under the project root set up above.
    from utils.unsupervised_hyperparameter_tuning import clustering_hyperparameter_tuning

    # Dynamically import the chosen clustering model module.
    model_module_path = f"models.unsupervised.clustering.{args.model_module}"
    model_module = importlib.import_module(model_module_path)

    # Retrieve the estimator and optional tuning metadata from the model file.
    estimator = model_module.estimator
    param_grid = getattr(model_module, 'param_grid', {})
    default_scoring = getattr(model_module, 'default_scoring', 'silhouette')  # fallback

    # Prepare results directory, e.g. results/KMeans_Clustering.
    if args.results_path is None:
        args.results_path = os.path.join('results', f"{estimator.__class__.__name__}_Clustering")
    os.makedirs(args.results_path, exist_ok=True)

    # Load data from CSV.
    df = pd.read_csv(args.data_path)
    print(f"Data loaded from {args.data_path}, initial shape: {df.shape}")

    # Drop columns that are entirely NaN.
    df = df.dropna(axis='columns', how='all')
    print("After dropping empty columns:", df.shape)

    # Drop user-specified columns, silently ignoring absent names.
    if args.drop_columns:
        drop_cols = [col.strip() for col in args.drop_columns.split(',') if col.strip()]
        df = df.drop(columns=drop_cols, errors='ignore')
        print(f"Dropped columns: {drop_cols} | New shape: {df.shape}")

    # Keep only user-selected columns.
    if args.select_columns:
        keep_cols = [col.strip() for col in args.select_columns.split(',') if col.strip()]
        df = df[keep_cols]
        print(f"Selected columns: {keep_cols} | New shape: {df.shape}")

    # Label-encode non-numeric columns.
    for col in df.columns:
        if df[col].dtype == 'object':
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col])

    # Convert DataFrame to NumPy array for clustering.
    X = df.values
    print(f"Final shape after dropping/selecting columns and encoding: {X.shape}")

    if args.tune:
        # Silhouette-driven hyperparameter search; returns a fitted best model.
        print("Performing hyperparameter tuning...")
        best_model, best_params = clustering_hyperparameter_tuning(
            X, estimator, param_grid, scoring=default_scoring, cv=args.cv_folds
        )
        estimator = best_model
        print("Best Params:", best_params)
    else:
        # No tuning: fit the default configuration and time it.
        print("No hyperparameter tuning; fitting model with default parameters...")
        start_time = timer()
        estimator.fit(X)
        end_time = timer()
        print(f"Training time (no tuning): {end_time - start_time:.2f}s")

    # FIX: the original created args.model_path but dumped best_model.pkl into
    # results_path, leaving --model_path ("Path to save the trained model")
    # unused. Save where the CLI documents.
    os.makedirs(args.model_path, exist_ok=True)
    model_output_path = os.path.join(args.model_path, "best_model.pkl")
    joblib.dump(estimator, model_output_path)
    print(f"Model saved to {model_output_path}")

    # Cluster assignments: some estimators expose labels_ after fit (DBSCAN,
    # AgglomerativeClustering); others need predict (KMeans, GaussianMixture).
    if hasattr(estimator, 'labels_'):
        labels = estimator.labels_
    else:
        labels = estimator.predict(X)

    # Silhouette is undefined when only one cluster was found.
    unique_labels = set(labels)
    if len(unique_labels) > 1:
        sil = silhouette_score(X, labels)
        print(f"Silhouette Score: {sil:.4f}")
        pd.DataFrame({"Silhouette": [sil]}).to_csv(
            os.path.join(args.results_path, "metrics.csv"), index=False
        )
    else:
        print("Only one cluster found; silhouette score not meaningful.")

    # Optional 2D visualization (PCA when more than two features remain).
    if args.visualize:
        print("Creating cluster visualization...")

        if X.shape[1] > 2:
            from sklearn.decomposition import PCA
            pca = PCA(n_components=2)
            X_2d = pca.fit_transform(X)
            var_ratio = pca.explained_variance_ratio_
            pc1_var = var_ratio[0] * 100
            pc2_var = var_ratio[1] * 100
            x_label = f"PC1 ({pc1_var:.2f}% var)"
            y_label = f"PC2 ({pc2_var:.2f}% var)"
        elif X.shape[1] == 2:
            # Label axes with the real column names when df still matches X.
            if df.shape[1] == 2:
                x_label = df.columns[0]
                y_label = df.columns[1]
            else:
                x_label = "Feature 1"
                y_label = "Feature 2"
            X_2d = X
        else:
            # 0 or 1 feature: a 2D scatter is impossible.
            if X.shape[1] == 1:
                print("Only 1 feature available; cannot create a 2D scatter plot.")
            else:
                print("No features available for plotting.")
            return

        plt.figure(figsize=(6, 5))
        plt.scatter(X_2d[:, 0], X_2d[:, 1], c=labels, cmap='viridis', s=30)
        plt.title(f"{estimator.__class__.__name__} Clusters")
        plt.xlabel(x_label)
        plt.ylabel(y_label)

        # Save the figure, then show for interactive runs.
        plot_path = os.path.join(args.results_path, "clusters.png")
        plt.savefig(plot_path)
        plt.show()
        print(f"Cluster plot saved to {plot_path}")
161
+
162
if __name__ == "__main__":
    # CLI entry point for standalone use; parse flags and hand off to main().
    cli = argparse.ArgumentParser(description="Train a clustering model.")
    cli.add_argument('--model_module', type=str, required=True,
                     help='Name of the clustering model module (e.g. kmeans, dbscan, etc.).')
    cli.add_argument('--data_path', type=str, required=True,
                     help='Path to the CSV dataset.')
    cli.add_argument('--model_path', type=str, default='saved_models/Clustering',
                     help='Path to save the trained model.')
    cli.add_argument('--results_path', type=str, default=None,
                     help='Directory to save results (metrics, plots).')
    cli.add_argument('--cv_folds', type=int, default=5,
                     help='Number of folds for hyperparam tuning.')
    cli.add_argument('--tune', action='store_true',
                     help='Perform hyperparameter tuning with silhouette score.')
    cli.add_argument('--visualize', action='store_true',
                     help='Generate a 2D visualization of the clusters.')
    cli.add_argument('--drop_columns', type=str, default='',
                     help='Comma-separated column names to drop from the dataset.')
    cli.add_argument('--select_columns', type=str, default='',
                     help='Comma-separated column names to keep (ignore all others).')
    main(cli.parse_args())
scripts/train_dimred_model.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ """
3
+ train_dimred_model.py
4
+
5
+ Trains a dimensionality reduction model (e.g., PCA, t-SNE, UMAP) on a dataset.
6
+ It can drop or select specific columns, perform label encoding on any non-numeric columns,
7
+ and optionally visualize the reduced data (2D or 3D).
8
+
9
+ Example Usage:
10
+ --------------
11
+ python scripts/train_dimred_model.py \
12
+ --model_module pca \
13
+ --data_path data/raw/breast-cancer-wisconsin-data/data.csv \
14
+ --drop_columns "id" \
15
+ --select_columns "radius_mean, texture_mean, perimeter_mean, area_mean" \
16
+ --visualize
17
+ """
18
+
19
+ import os
20
+ import sys
21
+ import argparse
22
+ import importlib
23
+ import pandas as pd
24
+ import numpy as np
25
+ import joblib
26
+
27
+ from sklearn.impute import SimpleImputer
28
+ from sklearn.preprocessing import LabelEncoder
29
+ import matplotlib.pyplot as plt
30
+
31
def main(args):
    """Train a dimensionality reduction model on a CSV dataset.

    Pipeline: load the CSV, drop/select columns, label-encode non-numeric
    columns, mean-impute missing values, fit-transform with the chosen
    estimator, persist the fitted model and the transformed data, and
    optionally plot the projection (2D or 3D only).

    Args:
        args (argparse.Namespace): Parsed CLI options; see the argparse
            definitions at the bottom of this script.
    """
    # Move to the project root so relative paths (models/, results/) resolve,
    # and make project packages importable.
    project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
    os.chdir(project_root)
    sys.path.insert(0, project_root)

    # Dynamically import the chosen model module (pca.py, tsne.py, umap.py, etc.)
    # and pull its pre-configured estimator object.
    model_module_path = f"models.unsupervised.dimred.{args.model_module}"
    model_module = importlib.import_module(model_module_path)
    estimator = model_module.estimator

    # Prepare results directory (default: results/<EstimatorName>_DimRed).
    if args.results_path is None:
        args.results_path = os.path.join('results', f"{estimator.__class__.__name__}_DimRed")
    os.makedirs(args.results_path, exist_ok=True)

    # Load data from CSV.
    df = pd.read_csv(args.data_path)
    print(f"Data loaded from {args.data_path}, initial shape: {df.shape}")

    # Drop columns that are entirely empty.
    df = df.dropna(axis='columns', how='all')
    print("After dropping empty columns:", df.shape)

    # Drop user-specified columns, if any (unknown names are ignored).
    if args.drop_columns:
        drop_cols = [col.strip() for col in args.drop_columns.split(',') if col.strip()]
        df = df.drop(columns=drop_cols, errors='ignore')
        print(f"Dropped columns: {drop_cols} | New shape: {df.shape}")

    # Keep only user-specified columns, if any.
    if args.select_columns:
        keep_cols = [col.strip() for col in args.select_columns.split(',') if col.strip()]
        df = df[keep_cols]
        print(f"Selected columns: {keep_cols} | New shape: {df.shape}")

    # Label-encode non-numeric columns so the estimator receives numbers only.
    for col in df.columns:
        if df[col].dtype == 'object':
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col])

    # Mean-impute remaining missing values.
    imputer = SimpleImputer(strategy='mean')  # or 'median'
    df_array = imputer.fit_transform(df)
    df_imputed = pd.DataFrame(df_array, columns=df.columns)
    print("After label-encoding and imputation:", df_imputed.shape)

    # Convert DataFrame to a numpy array for the estimator.
    X = df_imputed.values
    print(f"Final data shape after dropping/selecting columns and encoding: {X.shape}")

    # Fit-transform the data (typical for dimensionality reduction).
    X_transformed = estimator.fit_transform(X)
    print(f"Dimensionality reduction done using {args.model_module}. Output shape: {X_transformed.shape}")

    # Save the fitted model.
    # BUGFIX: write the model under args.model_path. Previously the
    # args.model_path directory was created but the file was saved under
    # args.results_path, leaving the --model_path option without effect.
    os.makedirs(args.model_path, exist_ok=True)
    model_output_path = os.path.join(args.model_path, "dimred_model.pkl")
    joblib.dump(estimator, model_output_path)
    print(f"Model saved to {model_output_path}")

    # Save the transformed data alongside the other results.
    transformed_path = os.path.join(args.results_path, "X_transformed.csv")
    pd.DataFrame(X_transformed).to_csv(transformed_path, index=False)
    print(f"Transformed data saved to {transformed_path}")

    # Optional visualization; only meaningful for 2D or 3D output.
    if args.visualize:
        n_dims = X_transformed.shape[1]
        if n_dims == 2:
            plt.figure(figsize=(6, 5))
            plt.scatter(X_transformed[:, 0], X_transformed[:, 1], s=30, alpha=0.7, c='blue')
            plt.title(f"{estimator.__class__.__name__} 2D Projection")
            plt.xlabel("Component 1")
            plt.ylabel("Component 2")
            plot_path = os.path.join(args.results_path, "dimred_plot_2D.png")
            plt.savefig(plot_path)
            plt.show()
            print(f"2D plot saved to {plot_path}")
        elif n_dims == 3:
            # Import registers the '3d' projection on older matplotlib versions.
            from mpl_toolkits.mplot3d import Axes3D
            fig = plt.figure()
            ax = fig.add_subplot(111, projection='3d')
            ax.scatter(X_transformed[:, 0], X_transformed[:, 1], X_transformed[:, 2],
                       s=30, alpha=0.7, c='blue')
            ax.set_title(f"{estimator.__class__.__name__} 3D Projection")
            ax.set_xlabel("Component 1")
            ax.set_ylabel("Component 2")
            ax.set_zlabel("Component 3")
            plot_path = os.path.join(args.results_path, "dimred_plot_3D.png")
            plt.savefig(plot_path)
            plt.show()
            print(f"3D plot saved to {plot_path}")
        else:
            print(f"Visualization only supported for 2D or 3D outputs. Got {n_dims}D, skipping.")
130
+
131
+
132
if __name__ == "__main__":
    # CLI entry point for the dimensionality reduction trainer.
    arg_parser = argparse.ArgumentParser(description="Train a dimensionality reduction model.")

    # Model selection and input data.
    arg_parser.add_argument('--model_module', type=str, required=True,
                            help='Name of the dimred model module (e.g. pca, tsne, umap).')
    arg_parser.add_argument('--data_path', type=str, required=True,
                            help='Path to the CSV dataset file.')

    # Output locations.
    arg_parser.add_argument('--model_path', type=str, default='saved_models/DimRed',
                            help='Where to save the fitted model.')
    arg_parser.add_argument('--results_path', type=str, default=None,
                            help='Directory to store results (transformed data, plots).')

    # Optional plotting and column filtering.
    arg_parser.add_argument('--visualize', action='store_true',
                            help='Plot the transformed data if 2D or 3D.')
    arg_parser.add_argument('--drop_columns', type=str, default='',
                            help='Comma-separated column names to drop from the dataset.')
    arg_parser.add_argument('--select_columns', type=str, default='',
                            help='Comma-separated column names to keep (ignore the rest).')

    main(arg_parser.parse_args())
utils/README.md ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Utils
2
+
3
+ This directory contains utility scripts and helper functions that are used throughout the project. These scripts provide common functionalities such as data preprocessing, hyperparameter tuning, and other support functions that assist in model training and evaluation for **supervised** (regression and classification) as well as **unsupervised** (clustering) tasks.
4
+
5
+ ## Contents
6
+
7
+ - [supervised_hyperparameter_tuning.py](#supervised_hyperparameter_tuningpy)
8
+ - [unsupervised_hyperparameter_tuning.py](#unsupervised_hyperparameter_tuningpy)
9
+
10
+ ---
11
+
12
+ ## `supervised_hyperparameter_tuning.py`
13
+
14
+ This script contains functions for performing hyperparameter tuning on **supervised learning** models (both regression and classification) using scikit-learn's `Pipeline` and `GridSearchCV`.
15
+
16
+ ### Functions
17
+
18
+ #### `regression_hyperparameter_tuning(X, y, estimator, param_grid, cv=5, scoring=None)`
19
+
20
+ Performs hyperparameter tuning for **regression** models.
21
+
22
+ - **Parameters**:
23
+ - `X (pd.DataFrame)`: Feature matrix.
24
+ - `y (pd.Series)`: Numeric target variable.
25
+ - `estimator`: A scikit-learn regressor (e.g., `LinearRegression()`).
26
+ - `param_grid (dict)`: Parameter names and lists of values (e.g. `{'model__fit_intercept': [True, False]}`).
27
+ - `cv (int)`: Number of cross-validation folds (default 5).
28
+ - `scoring (str)`: Scoring metric (e.g., `'neg_root_mean_squared_error'`).
29
+ - **Returns**:
30
+ - `best_model`: The pipeline with the best hyperparameters.
31
+ - `best_params (dict)`: The dictionary of best hyperparameters.
32
+
33
+ **Example**:
34
+
35
+ ```python
36
+ from utils.supervised_hyperparameter_tuning import regression_hyperparameter_tuning
37
+ from sklearn.linear_model import LinearRegression
38
+
39
+ X = ... # Your regression features
40
+ y = ... # Your numeric target variable
41
+ param_grid = {
42
+ 'model__fit_intercept': [True, False]
43
+ }
44
+
45
+ best_model, best_params = regression_hyperparameter_tuning(
46
+ X, y, LinearRegression(), param_grid, scoring='neg_root_mean_squared_error'
47
+ )
48
+ ```
49
+
50
+ ---
51
+
52
+ #### `classification_hyperparameter_tuning(X, y, estimator, param_grid, cv=5, scoring=None)`
53
+
54
+ Performs hyperparameter tuning for **classification** models.
55
+
56
+ - **Parameters**:
57
+ - `X (pd.DataFrame)`: Feature matrix.
58
+ - `y (pd.Series)`: Target variable (binary or multi-class).
59
+ - `estimator`: A scikit-learn classifier (e.g., `LogisticRegression()`, `RandomForestClassifier()`).
60
+ - `param_grid (dict)`: Parameter names and lists of values (e.g. `{'model__n_estimators': [100, 200]}`).
61
+ - `cv (int)`: Number of cross-validation folds (default 5).
62
+ - `scoring (str)`: Scoring metric (e.g., `'accuracy'`, `'f1'`, `'roc_auc'`).
63
+ - **Returns**:
64
+ - `best_model`: The pipeline with the best hyperparameters.
65
+ - `best_params (dict)`: The dictionary of best hyperparameters.
66
+
67
+ **Example**:
68
+
69
+ ```python
70
+ from utils.supervised_hyperparameter_tuning import classification_hyperparameter_tuning
71
+ from sklearn.ensemble import RandomForestClassifier
72
+
73
+ X = ... # Your classification features
74
+ y = ... # Binary or multi-class labels
75
+ param_grid = {
76
+ 'model__n_estimators': [100, 200],
77
+ 'model__max_depth': [None, 10]
78
+ }
79
+
80
+ best_model, best_params = classification_hyperparameter_tuning(
81
+ X, y, RandomForestClassifier(), param_grid, scoring='accuracy'
82
+ )
83
+ ```
84
+
85
+ ---
86
+
87
+ ## `unsupervised_hyperparameter_tuning.py`
88
+
89
+ This script provides a function for **hyperparameter tuning of clustering models** using **silhouette score** as the objective metric. Unlike supervised approaches, clustering does not have labeled data, so the silhouette score is used to measure how well-separated the clusters are.
90
+
91
+ ### Functions
92
+
93
+ #### `clustering_hyperparameter_tuning(X, estimator, param_grid, scoring='silhouette', cv=5)`
94
+
95
+ A simple manual hyperparameter search for clustering models.
96
+
97
+ - **Parameters**:
98
+ - `X (array-like)`: Feature matrix for clustering.
99
+ - `estimator`: A scikit-learn clustering estimator supporting `.fit(X)` and either `.labels_` or `.predict(X)` (e.g., `KMeans`, `DBSCAN`, `GaussianMixture`).
100
+ - `param_grid (dict)`: Dictionary of hyperparams (e.g., `{'model__n_clusters': [2,3,4]}`).
101
+ - `scoring (str)`: Only `'silhouette'` is supported.
102
+ - `cv (int)`: Optionally, you could do repeated subsampling or advanced logic for more stable estimates, but the default implementation does a single fit.
103
+ - **Returns**:
104
+ - `best_estimator`: The fitted estimator with the best silhouette score.
105
+ - `best_params (dict)`: The dictionary of best hyperparameters found.
106
+
107
+ **Key Steps**:
108
+ 1. **Parameter Loop**: For each combination of parameters in `ParameterGrid(param_grid)`, clone and fit the estimator.
109
+ 2. **Retrieve Labels**: If the estimator has `.labels_`, use it; otherwise use `.predict(X)`.
110
+ 3. **Compute Silhouette**: If more than one cluster is found, calculate `silhouette_score(X, labels)`.
111
+ 4. **Track the Best**: Keep track of the parameter set yielding the highest silhouette score.
112
+ 5. **Fallback**: If no valid parameter combos produce more than one cluster, it falls back to the original estimator.
113
+
114
+ **Example**:
115
+
116
+ ```python
117
+ from utils.unsupervised_hyperparameter_tuning import clustering_hyperparameter_tuning
118
+ from sklearn.cluster import KMeans
119
+
120
+ X = ... # Your numeric data for clustering
121
+ param_grid = {
122
+ 'model__n_clusters': [2, 3, 4],
123
+ 'model__init': ['k-means++', 'random']
124
+ }
125
+
126
+ best_model, best_params = clustering_hyperparameter_tuning(
127
+ X, KMeans(random_state=42), param_grid, scoring='silhouette'
128
+ )
129
+ print("Best Silhouette Score found:", best_params)
130
+ ```
131
+
132
+ **Note**: This approach is simpler than using `GridSearchCV` for clustering because unsupervised tasks do not have a “true” label. The silhouette score is a common measure, but you could adapt the function for other internal cluster metrics if desired.
utils/unsupervised_hyperparameter_tuning.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ """
3
+ unsupervised_hyperparameter_tuning.py
4
+
5
+ Provides a function for hyperparameter tuning of clustering models
6
+ using silhouette score as an objective.
7
+ """
8
+
9
+ import numpy as np
10
+ from sklearn.model_selection import ParameterGrid
11
+ from sklearn.metrics import silhouette_score
12
+ import copy
13
+
14
def clustering_hyperparameter_tuning(X, estimator, param_grid, scoring='silhouette', cv=5):
    """
    A simple manual hyperparameter search for clustering models,
    using silhouette_score for evaluation.

    Args:
        X (array-like): Feature data for clustering.
        estimator: An estimator with .fit() and .predict() or .labels_ attribute.
        param_grid (dict): Dictionary of hyperparams, e.g. {'model__n_clusters': [2,3,4]}.
        scoring (str): Only 'silhouette' is supported here.
        cv (int): Currently unused; kept for interface compatibility
            (repeated subsampling could give more stable silhouette estimates).

    Returns:
        best_estimator: The estimator with best silhouette score.
        best_params: Dictionary of best parameters found.
    """
    # Hoisted out of the search loop (was re-imported on every iteration).
    from sklearn.base import clone

    if not param_grid:
        # Nothing to search: fit once and return the estimator unchanged.
        estimator.fit(X)
        return estimator, {}

    # Silhouette ranges over [-1, 1]; start below any valid score so a
    # legitimate score of exactly -1 can still be selected (previously the
    # -1 initialization made such a score unreachable).
    best_score = float('-inf')
    best_params = None
    best_estimator = None

    for params in ParameterGrid(param_grid):
        current_estimator = clone(estimator)

        # Apply params. Keys may carry a pipeline-style 'model__' prefix;
        # in that case set the bare attribute name on the estimator.
        for param, val in params.items():
            parts = param.split('__')
            target_attr = parts[1] if len(parts) > 1 else param
            setattr(current_estimator, target_attr, val)

        # Single fit per combination (no resampling) to keep it simple.
        current_estimator.fit(X)

        # Retrieve cluster labels from the fitted estimator.
        if hasattr(current_estimator, 'labels_') and current_estimator.labels_ is not None:
            labels = current_estimator.labels_
        elif hasattr(current_estimator, 'predict'):
            labels = current_estimator.predict(X)
        else:
            raise ValueError("No valid way to retrieve cluster labels for this estimator.")

        # Silhouette is undefined for fewer than two clusters; skip the combo.
        if len(set(labels)) <= 1:
            continue

        score = silhouette_score(X, labels)
        if score > best_score:
            best_score = score
            best_params = params
            best_estimator = current_estimator

    if best_estimator is None:
        # No combination produced more than one cluster: keep the original
        # fallback behavior of fitting and returning the input estimator.
        print("No valid parameter combination produced more than 1 cluster. Falling back to original estimator.")
        estimator.fit(X)
        return estimator, {}

    print(f"Best silhouette score: {best_score:.4f}")
    return best_estimator, best_params