Spaces:
Sleeping
Sleeping
Commit ·
4c91838
1
Parent(s): a9ab4a2
first commit
Browse files- app.py +323 -0
- data/README.md +1 -0
- data/datasets/README.md +21 -0
- data/datasets/kaggle_data.py +115 -0
- data/raw/README.md +1 -0
- models/unsupervised/anomaly/README.md +34 -0
- models/unsupervised/anomaly/isolation_forest.py +34 -0
- models/unsupervised/anomaly/local_outlier_factor.py +39 -0
- models/unsupervised/anomaly/one_class_svm.py +35 -0
- models/unsupervised/clustering/README.md +37 -0
- models/unsupervised/clustering/dbscan.py +29 -0
- models/unsupervised/clustering/gaussian_mixture.py +32 -0
- models/unsupervised/clustering/hierarchical_clustering.py +33 -0
- models/unsupervised/clustering/kmeans.py +32 -0
- models/unsupervised/dimred/README.md +34 -0
- models/unsupervised/dimred/pca.py +28 -0
- models/unsupervised/dimred/tsne.py +30 -0
- models/unsupervised/dimred/umap.py +35 -0
- requirements.txt +13 -0
- scripts/README.md +230 -0
- scripts/train_anomaly_detection.py +163 -0
- scripts/train_clustering_model.py +183 -0
- scripts/train_dimred_model.py +150 -0
- utils/README.md +132 -0
- utils/unsupervised_hyperparameter_tuning.py +86 -0
app.py
ADDED
|
@@ -0,0 +1,323 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
"""
|
| 3 |
+
Gradio Interface for Unsupervised Learning (Clustering, Dimensionality Reduction, Anomaly Detection)
|
| 4 |
+
|
| 5 |
+
This script provides a single Gradio-based interface to run three unsupervised tasks:
|
| 6 |
+
1. Clustering
|
| 7 |
+
2. Dimensionality Reduction
|
| 8 |
+
3. Anomaly (Outlier) Detection
|
| 9 |
+
|
| 10 |
+
Each task is placed in its own Gradio Tab. The user can:
|
| 11 |
+
- Choose a model from the relevant unsupervised directory (clustering/dimred/anomaly).
|
| 12 |
+
- Specify dataset input (upload, local path, or Kaggle).
|
| 13 |
+
- Select columns to drop or keep.
|
| 14 |
+
- Execute the relevant training script (train_clustering_model.py, train_dimred_model.py, or train_anomaly_detection.py).
|
| 15 |
+
- View logs and optional plots.
|
| 16 |
+
|
| 17 |
+
Project Requirements:
|
| 18 |
+
- Python 3.7+.
|
| 19 |
+
- Gradio, scikit-learn, pandas, etc. in requirements.txt.
|
| 20 |
+
- Properly structured project with:
|
| 21 |
+
- scripts/train_clustering_model.py
|
| 22 |
+
- scripts/train_dimred_model.py
|
| 23 |
+
- scripts/train_anomaly_detection.py
|
| 24 |
+
- models/unsupervised/<task>/<model>.py
|
| 25 |
+
- data/datasets/kaggle_data.py (optional for Kaggle usage).
|
| 26 |
+
"""
|
| 27 |
+
|
| 28 |
+
import gradio as gr
|
| 29 |
+
import pandas as pd
|
| 30 |
+
import os
|
| 31 |
+
import subprocess
|
| 32 |
+
import sys
|
| 33 |
+
import glob
|
| 34 |
+
import re
|
| 35 |
+
|
| 36 |
+
#####################################
|
| 37 |
+
# Helper Functions
|
| 38 |
+
#####################################
|
| 39 |
+
|
| 40 |
+
def get_model_modules(task_type):
    """
    Return the model module names available for a given unsupervised task.

    Scans ``models/unsupervised/<task_type>/`` for ``*.py`` files and returns
    their base names without the extension, excluding ``__init__.py``.

    Parameters:
        task_type (str): One of 'clustering', 'dimred', or 'anomaly'.

    Returns:
        list[str]: Module names (file stems), or an empty list when the
        directory does not exist.
    """
    # Bug fix: the original code read a module-level ``project_root`` that was
    # never defined anywhere in app.py, so this function raised NameError the
    # first time it was called (at import time, while building the UI).
    # app.py lives at the project root, so derive it from this file's location.
    project_root = os.path.dirname(os.path.abspath(__file__))
    models_dir = os.path.join(project_root, 'models', 'unsupervised', task_type)
    if not os.path.exists(models_dir):
        print(f"Directory does not exist: {models_dir}")
        return []
    model_files = glob.glob(os.path.join(models_dir, '*.py'))
    modules = [
        os.path.splitext(os.path.basename(f))[0]
        for f in model_files if not f.endswith('__init__.py')
    ]
    return modules
| 57 |
+
|
| 58 |
+
def download_kaggle_data(json_path, dataset_name, is_competition):
    """
    Thin wrapper around ``get_kaggle_data`` from ``data/datasets/kaggle_data.py``.

    The import is deferred so the Kaggle dependency is only needed when the
    "Download from Kaggle" option is actually used.

    Returns the local directory containing the downloaded data, or None on
    failure (mirroring ``get_kaggle_data``).
    """
    from data.datasets.kaggle_data import get_kaggle_data

    return get_kaggle_data(
        json_path=json_path,
        data_name=dataset_name,
        is_competition=is_competition,
    )
|
| 62 |
+
|
| 63 |
+
def run_subprocess(script_path, script_args):
    """
    Execute a training script as a subprocess and collect its output.

    Parameters:
        script_path: Path of the script being run. Kept for interface
            compatibility; the actual command executed is ``script_args``.
        script_args: Full argv list handed to ``subprocess.run``.

    Returns:
        tuple[str, str | None]: (output_text, plot_image_path_or_None).
        The image path is only returned when a generated plot file can be
        located on disk.
    """
    try:
        proc = subprocess.run(script_args, capture_output=True, text=True)
    except Exception as exc:
        return f"An error occurred:\n{str(exc)}", None

    if proc.returncode != 0:
        return f"Error during training:\n{proc.stderr}", None

    # Drop matplotlib's "Figure(WxH)" repr noise that scripts may print.
    cleaned = re.sub(r"Figure\(\d+x\d+\)", "", proc.stdout).strip()

    # Locate a saved plot: prefer an explicit "Plot saved to ..." line,
    # otherwise fall back to the first ".png" token in the output.
    plot_file = None
    saved_line = re.search(r"Plot saved to (.+)", cleaned)
    if saved_line:
        plot_file = saved_line.group(1).strip()
    else:
        png_token = re.search(r"(\S+\.png)", cleaned)
        if png_token:
            plot_file = png_token.group(1)

    # Only hand a path back to the UI if the file really exists.
    if not (plot_file and os.path.exists(plot_file)):
        plot_file = None
    return f"Completed successfully.\n\n{cleaned}", plot_file
|
| 94 |
+
|
| 95 |
+
def get_columns_from_data(data_option, data_file, data_path,
                          kaggle_json_file, kaggle_competition_name, kaggle_data_name,
                          is_competition):
    """
    Resolve the dataset location for the chosen input option and return the
    CSV's column names.

    Returns an empty list on any failure (missing file, bad option, Kaggle
    download failure, unreadable CSV); failures are reported via ``print``
    so they show up in the server log rather than crashing the UI.
    """
    csv_path = None

    if data_option == "Upload Data File":
        if data_file is None:
            return []
        csv_path = data_file
    elif data_option == "Provide Data Path":
        if not os.path.exists(data_path):
            print("Provided path does not exist.")
            return []
        csv_path = data_path
    elif data_option == "Download from Kaggle":
        if kaggle_json_file is None:
            print("No kaggle.json uploaded.")
            return []
        # Install the uploaded credentials where the Kaggle CLI expects them.
        import shutil
        kaggle_config_dir = os.path.expanduser('~/.kaggle')
        os.makedirs(kaggle_config_dir, exist_ok=True)
        kaggle_json_path = os.path.join(kaggle_config_dir, 'kaggle.json')
        shutil.copy(kaggle_json_file.name, kaggle_json_path)
        os.chmod(kaggle_json_path, 0o600)

        data_dir = download_kaggle_data(kaggle_json_path, kaggle_competition_name, is_competition)
        if data_dir is None:
            print("Failed to download from Kaggle.")
            return []
        csv_path = os.path.join(data_dir, kaggle_data_name)
        if not os.path.exists(csv_path):
            print(f"{kaggle_data_name} not found in Kaggle data.")
            return []
    else:
        print("Invalid data option.")
        return []

    try:
        return pd.read_csv(csv_path).columns.tolist()
    except Exception as exc:
        print(f"Error reading {csv_path}: {exc}")
        return []
|
| 141 |
+
|
| 142 |
+
#####################################
|
| 143 |
+
# Creating the Gradio Tab
|
| 144 |
+
#####################################
|
| 145 |
+
|
| 146 |
+
def create_task_tab(task_name, model_modules, script_path):
    """
    Build one Gradio Tab for an unsupervised task (Clustering, Dimensionality
    Reduction, or Anomaly Detection) and wire up its event handlers.

    Parameters:
        task_name (str): Display name used for the tab, headings, and buttons.
        model_modules (list[str]): Model module names from get_model_modules().
        script_path (str): Training script, relative to the project root,
            e.g. 'scripts/train_clustering_model.py'.

    Returns:
        None. The components are registered on the enclosing gr.Blocks context.
    """

    with gr.Tab(task_name):
        gr.Markdown(f"## {task_name} Task")

        # Model selection
        model_select = gr.Dropdown(choices=model_modules, label=f"{task_name} Model Module")

        # Data input approach: exactly one of the three columns below is shown.
        data_option = gr.Radio(
            choices=["Upload Data File", "Provide Data Path", "Download from Kaggle"],
            label="Data Input Option",
            value="Upload Data File"
        )

        with gr.Column(visible=True) as upload_data_col:
            data_file = gr.File(label="Upload CSV Data File", type="filepath")

        with gr.Column(visible=False) as path_data_col:
            data_path_txt = gr.Textbox(label="Data File Path")

        with gr.Column(visible=False) as kaggle_data_col:
            kaggle_json = gr.File(label="Upload kaggle.json File", type="filepath")
            kaggle_competition_name = gr.Textbox(value='', label="Kaggle Competition/Dataset Name")
            kaggle_data_name = gr.Textbox(value='data.csv', label="Data File Name in Kaggle dataset")
            kaggle_is_competition = gr.Checkbox(label="Is Kaggle Competition?", value=False)

        # Toggle data input columns: show the column matching the radio choice,
        # hide the other two.
        def toggle_data_input(choice):
            if choice == "Upload Data File":
                return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False)
            elif choice == "Provide Data Path":
                return gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
            elif choice == "Download from Kaggle":
                return gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)
            else:
                return gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)

        data_option.change(
            toggle_data_input,
            inputs=[data_option],
            outputs=[upload_data_col, path_data_col, kaggle_data_col]
        )

        # Update columns button: re-reads the CSV header and refreshes the
        # drop/keep checkbox groups.
        update_cols_btn = gr.Button("Update Columns")

        # We remove "Columns in Data (for reference)" as requested
        drop_cols_chk = gr.CheckboxGroup(choices=[], label="Columns to Drop")
        select_cols_chk = gr.CheckboxGroup(choices=[], label="Columns to Keep (if empty, keep all)")

        # Visualization param
        visualize_chk = gr.Checkbox(label="Visualize 2D (using PCA if needed)", value=True)

        # Model / results path with empty default, and label "(optional)" —
        # empty means the training script falls back to its own defaults.
        model_path_txt = gr.Textbox(label="Model Save Path (optional)", value="")
        results_path_txt = gr.Textbox(label="Results Save Path (optional)", value="")

        # The Train button
        train_btn = gr.Button(f"Train {task_name}")

        # Logs/Output
        output_box = gr.Textbox(label="Logs / Output")
        image_display = gr.Image(label="Plot Output", visible=True)

        # Function to update columns: delegates to get_columns_from_data and
        # pushes the resulting column list into both checkbox groups.
        def update_columns_fn(dataopt, f, p, kagfile, kcname, kdname, iscomp):
            cols = get_columns_from_data(dataopt, f, p, kagfile, kcname, kdname, iscomp)
            # Return updated choices for drop_cols_chk, select_cols_chk
            if cols:
                return gr.update(choices=cols), gr.update(choices=cols)
            else:
                return gr.update(choices=[]), gr.update(choices=[])

        update_cols_btn.click(
            fn=update_columns_fn,
            inputs=[
                data_option, data_file, data_path_txt,
                kaggle_json, kaggle_competition_name, kaggle_data_name,
                kaggle_is_competition
            ],
            outputs=[drop_cols_chk, select_cols_chk]
        )

        # Assemble the CLI invocation of the training script from the UI state
        # and run it in a subprocess; returns (log text, plot path or None).
        def run_task(model_mod, dataopt, f, p, kagfile, kcname, kdname, iscomp,
                     drop_cols, select_cols, visualize, mpath, rpath):
            # Build the command for the relevant script
            # NOTE(review): ``project_root`` is not defined anywhere in this
            # module — this raises NameError when the Train button is clicked.
            # TODO confirm it was meant to be a module-level constant.
            script_cmd = [sys.executable, os.path.join(project_root, script_path)]
            script_cmd.extend(["--model_module", model_mod])

            # Minimal approach for data path logic
            # NOTE(review): os.path.exists(p) may raise TypeError if the
            # textbox value is None — presumably gradio always supplies a
            # string; verify.
            final_path = None
            if dataopt == "Upload Data File" and f is not None:
                final_path = f
            elif dataopt == "Provide Data Path" and os.path.exists(p):
                final_path = p
            else:
                # For Kaggle or other complexities, skipping for brevity.
                # Could handle it similarly to get_columns_from_data approach
                final_path = ""

            if final_path:
                script_cmd.extend(["--data_path", final_path])

            # drop cols
            if drop_cols and len(drop_cols) > 0:
                script_cmd.extend(["--drop_columns", ",".join(drop_cols)])
            # select cols
            if select_cols and len(select_cols) > 0:
                script_cmd.extend(["--select_columns", ",".join(select_cols)])
            # visualize
            if visualize:
                script_cmd.append("--visualize")

            # model_path
            if mpath.strip():
                script_cmd.extend(["--model_path", mpath.strip()])
            # results_path
            if rpath.strip():
                script_cmd.extend(["--results_path", rpath.strip()])

            print("Executing command:", " ".join(script_cmd))
            # NOTE(review): run_subprocess ignores its first argument; only
            # script_cmd is executed.
            out_text, plot_path = run_subprocess(script_path, script_cmd)
            return out_text, plot_path

        # The Train button is above logs, so let's define the click function
        train_btn.click(
            fn=run_task,
            inputs=[
                model_select, data_option, data_file, data_path_txt,
                kaggle_json, kaggle_competition_name, kaggle_data_name, kaggle_is_competition,
                drop_cols_chk, select_cols_chk, visualize_chk,
                model_path_txt, results_path_txt
            ],
            outputs=[output_box, image_display]
        )

    return  # end create_task_tab
|
| 289 |
+
|
| 290 |
+
|
| 291 |
+
#####################################
|
| 292 |
+
# Build the Main Gradio App
|
| 293 |
+
#####################################
|
| 294 |
+
|
| 295 |
+
# Top-level UI assembly: one tab per task, all sharing the same layout via
# create_task_tab. The model lists are discovered from disk at import time.
with gr.Blocks() as demo:
    gr.Markdown("# Unsupervised Learning Gradio Interface")

    # 1) Clustering Tab
    clustering_modules = get_model_modules("clustering")
    create_task_tab(
        task_name="Clustering",
        model_modules=clustering_modules,
        script_path="scripts/train_clustering_model.py"
    )

    # 2) Dimensionality Reduction Tab
    dimred_modules = get_model_modules("dimred")
    create_task_tab(
        task_name="Dimensionality Reduction",
        model_modules=dimred_modules,
        script_path="scripts/train_dimred_model.py"
    )

    # 3) Anomaly Detection Tab
    anomaly_modules = get_model_modules("anomaly")
    create_task_tab(
        task_name="Anomaly Detection",
        model_modules=anomaly_modules,
        script_path="scripts/train_anomaly_detection.py"
    )

# Launch the app only when run as a script (not when imported).
if __name__ == "__main__":
    demo.launch()
|
data/README.md
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# data
|
data/datasets/README.md
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Datasets Utilities
|
| 2 |
+
|
| 3 |
+
This folder contains utility scripts for handling datasets, including downloading data from Kaggle.
|
| 4 |
+
|
| 5 |
+
## 📄 Scripts
|
| 6 |
+
|
| 7 |
+
### `kaggle_data.py`
|
| 8 |
+
|
| 9 |
+
- **Description**: A Python script to download Kaggle datasets or competition data seamlessly, supporting Google Colab, local Linux/Mac, and Windows environments.
|
| 10 |
+
- **Path**: [`data/datasets/kaggle_data.py`](kaggle_data.py)
|
| 11 |
+
- **Key Function**: `get_kaggle_data(json_path, data_name, is_competition=False, output_dir='data/raw')`
|
| 12 |
+
- **Example**:
|
| 13 |
+
|
| 14 |
+
```python
|
| 15 |
+
from kaggle_data import get_kaggle_data
|
| 16 |
+
|
| 17 |
+
# Download a standard Kaggle dataset
|
| 18 |
+
dataset_path = get_kaggle_data("kaggle.json", "paultimothymooney/chest-xray-pneumonia")
|
| 19 |
+
|
| 20 |
+
# Download competition data
|
| 21 |
+
competition_path = get_kaggle_data("kaggle.json", "house-prices-advanced-regression-techniques", is_competition=True)
|
data/datasets/kaggle_data.py
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
This module provides a utility function to download Kaggle datasets or competition data.
|
| 3 |
+
|
| 4 |
+
The function automatically detects whether it is running in a Google Colab environment, a local Linux/Mac environment, or a Windows environment, and sets up the Kaggle API accordingly.
|
| 5 |
+
|
| 6 |
+
Requirements:
|
| 7 |
+
- Kaggle API installed (`pip install kaggle`)
|
| 8 |
+
- Kaggle API key (`kaggle.json`) with appropriate permissions.
|
| 9 |
+
|
| 10 |
+
Environment Detection:
|
| 11 |
+
- Google Colab: Uses `/root/.config/kaggle/kaggle.json`.
|
| 12 |
+
- Local Linux/Mac: Uses `~/.kaggle/kaggle.json`.
|
| 13 |
+
- Windows: Uses `C:\\Users\\<Username>\\.kaggle\\kaggle.json`.
|
| 14 |
+
|
| 15 |
+
Functions:
|
| 16 |
+
get_kaggle_data(json_path: str, data_name: str, is_competition: bool = False, output_dir: str = "data/raw") -> str
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
import os
|
| 20 |
+
import zipfile
|
| 21 |
+
import sys
|
| 22 |
+
import shutil
|
| 23 |
+
import platform
|
| 24 |
+
|
| 25 |
+
def get_kaggle_data(json_path: str, data_name: str, is_competition: bool = False, output_dir: str = "data/raw") -> str:
    """
    Download a Kaggle dataset or competition archive and extract it locally.

    Works in Google Colab, local Linux/Mac, and Windows environments by
    installing the supplied 'kaggle.json' credentials where the Kaggle CLI
    expects them, then invoking the CLI.

    Parameters:
        json_path (str): Path to your 'kaggle.json' file.
        data_name (str): Kaggle dataset or competition name
            (e.g., 'paultimothymooney/chest-xray-pneumonia' or
            'house-prices-advanced-regression-techniques').
        is_competition (bool): True to download competition data. Default False.
        output_dir (str): Directory to save and extract the data.
            Default 'data/raw'.

    Returns:
        str: Path to the extracted dataset folder, or None if the download
        or extraction produced nothing usable.

    Raises:
        OSError: If 'kaggle.json' is not found or cannot be copied.

    Example:
        dataset_path = get_kaggle_data("kaggle.json", "paultimothymooney/chest-xray-pneumonia")
        competition_path = get_kaggle_data(
            "kaggle.json", "house-prices-advanced-regression-techniques", is_competition=True)
    """
    # Local import: keeps the module's top-level import list unchanged.
    import subprocess

    # Detect environment (Colab, local Linux/Mac, or Windows)
    is_colab = "google.colab" in sys.modules
    is_windows = platform.system() == "Windows"

    # Step 1: Setup Kaggle API credentials in the CLI's expected location.
    try:
        if is_colab:
            config_dir = "/root/.config/kaggle"
            os.makedirs(config_dir, exist_ok=True)
            print("Setting up Kaggle API credentials for Colab environment.")
            shutil.copy(json_path, os.path.join(config_dir, "kaggle.json"))
            os.chmod(os.path.join(config_dir, "kaggle.json"), 0o600)
        else:
            # For both local Linux/Mac and Windows, use the home directory.
            config_dir = os.path.join(os.path.expanduser("~"), ".kaggle")
            os.makedirs(config_dir, exist_ok=True)
            print("Setting up Kaggle API credentials for local environment.")
            kaggle_json_dest = os.path.join(config_dir, "kaggle.json")
            if not os.path.exists(kaggle_json_dest):
                shutil.copy(json_path, kaggle_json_dest)
            if not is_windows:
                # chmod is a no-op concept on Windows; elsewhere the Kaggle
                # CLI refuses world-readable credentials.
                os.chmod(kaggle_json_dest, 0o600)
    except Exception as e:
        raise OSError(f"Could not set up Kaggle API credentials: {e}")

    # Step 2: Create output directory named after the dataset slug.
    dataset_dir = os.path.join(output_dir, data_name.split('/')[-1])
    os.makedirs(dataset_dir, exist_ok=True)
    original_dir = os.getcwd()
    os.chdir(dataset_dir)

    # Steps 3-4 run inside the dataset directory; the finally clause
    # guarantees the working directory is restored on every exit path
    # (the original implementation could leave the process chdir'ed on error).
    try:
        # Step 3: Download via the Kaggle CLI. A list argv with shell=False
        # avoids shell injection through data_name (the original built a
        # shell string for os.system and ignored its exit status).
        if is_competition:
            print(f"Downloading competition data: {data_name}")
            cmd = ["kaggle", "competitions", "download", "-c", data_name]
        else:
            print(f"Downloading dataset: {data_name}")
            cmd = ["kaggle", "datasets", "download", "-d", data_name]
        try:
            result = subprocess.run(cmd)
        except Exception as e:
            print(f"Error during download: {e}")
            return None
        if result.returncode != 0:
            print(f"Kaggle download failed with exit code {result.returncode}.")
            return None

        # Step 4: Unzip all downloaded archives, deleting each zip afterwards.
        zip_files = [f for f in os.listdir() if f.endswith(".zip")]
        if not zip_files:
            print("No zip files found. Please check the dataset or competition name.")
            return None

        for zip_file in zip_files:
            try:
                with zipfile.ZipFile(zip_file, "r") as zip_ref:
                    zip_ref.extractall()
                print(f"Extracted: {zip_file}")
                os.remove(zip_file)
            except Exception as e:
                print(f"Error extracting {zip_file}: {e}")

        return dataset_dir
    finally:
        # Step 5: Always navigate back to the original directory.
        os.chdir(original_dir)
|
data/raw/README.md
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# raw
|
models/unsupervised/anomaly/README.md
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Anomaly (Outlier) Detection Models
|
| 2 |
+
|
| 3 |
+
This directory hosts scripts defining **anomaly detection** estimators (e.g., Isolation Forest, One-Class SVM, etc.) for use with `train_anomaly_detection.py`. Each file specifies a scikit-learn–compatible outlier detector and, if applicable, a parameter grid.
|
| 4 |
+
|
| 5 |
+
**Key Points**:
|
| 6 |
+
- **Estimator**: Must allow `.fit(X)` and `.predict(X)` or similar. Typically returns +1 / −1 for inliers / outliers (we unify to 0 / 1).
|
| 7 |
+
- **Parameter Grid**: You can define hyperparameters (like `n_estimators`, `contamination`) for potential searching.
|
| 8 |
+
- **Default Approach**: We do not rely on labeled anomalies (unsupervised). The script will produce a predictions CSV with 0 = normal, 1 = outlier.
|
| 9 |
+
|
| 10 |
+
**Note**: The main script `train_anomaly_detection.py` handles data loading, label encoding, dropping/selecting columns, the `.fit(X)`, `.predict(X)` steps, saving the outlier predictions, and (optionally) a 2D plot with outliers in red.
|
| 11 |
+
|
| 12 |
+
## Available Anomaly Detection Models
|
| 13 |
+
|
| 14 |
+
- [Isolation Forest](isolation_forest.py)
|
| 15 |
+
- [One-Class SVM](one_class_svm.py)
|
| 16 |
+
- [Local Outlier Factor (LOF)](local_outlier_factor.py)
|
| 17 |
+
|
| 18 |
+
### Usage
|
| 19 |
+
|
| 20 |
+
For example, to detect outliers with an Isolation Forest:
|
| 21 |
+
|
| 22 |
+
```bash
|
| 23 |
+
python scripts/train_anomaly_detection.py \
|
| 24 |
+
--model_module isolation_forest \
|
| 25 |
+
--data_path data/breast_cancer/data.csv \
|
| 26 |
+
--drop_columns "id,diagnosis" \
|
| 27 |
+
--visualize
|
| 28 |
+
```
|
| 29 |
+
|
| 30 |
+
This:
|
| 31 |
+
1. Loads `isolation_forest.py`, sets up `IsolationForest(...)`.
|
| 32 |
+
2. Fits the model to the data, saves it, then `predict(...)`.
|
| 33 |
+
3. Saves a `predictions.csv` with `OutlierPrediction`.
|
| 34 |
+
4. If `--visualize`, does a 2D PCA scatter, coloring outliers red.
|
models/unsupervised/anomaly/isolation_forest.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
"""
|
| 3 |
+
isolation_forest.py
|
| 4 |
+
|
| 5 |
+
This module defines an Isolation Forest model for anomaly detection.
|
| 6 |
+
Isolation Forest is an efficient and effective algorithm for identifying
|
| 7 |
+
outliers in high-dimensional datasets.
|
| 8 |
+
|
| 9 |
+
Key Features:
|
| 10 |
+
- Utilizes a tree-based approach to isolate anomalies.
|
| 11 |
+
- Efficient for both large datasets and high-dimensional spaces.
|
| 12 |
+
- Automatically determines the expected proportion of anomalies.
|
| 13 |
+
|
| 14 |
+
Parameters:
|
| 15 |
+
- n_estimators (int): Number of base estimators in the ensemble.
|
| 16 |
+
- Default: 100.
|
| 17 |
+
- contamination (str or float): Expected proportion of outliers in the data.
|
| 18 |
+
- Default: 'auto' (automatically inferred based on dataset size).
|
| 19 |
+
- max_samples (int or float): Number of samples to draw for training each estimator.
|
| 20 |
+
- Default: 'auto' (uses min(256, number of samples)).
|
| 21 |
+
|
| 22 |
+
Default Configuration:
|
| 23 |
+
- n_estimators=100: Adequate for most datasets.
|
| 24 |
+
- contamination='auto': Automatically estimates the proportion of outliers.
|
| 25 |
+
"""
|
| 26 |
+
|
| 27 |
+
from sklearn.ensemble import IsolationForest
|
| 28 |
+
|
| 29 |
+
# Define the Isolation Forest estimator
|
| 30 |
+
estimator = IsolationForest(
|
| 31 |
+
n_estimators=100, # Default number of trees
|
| 32 |
+
contamination='auto', # Automatically estimates the contamination proportion
|
| 33 |
+
random_state=42 # Ensures reproducibility
|
| 34 |
+
)
|
models/unsupervised/anomaly/local_outlier_factor.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
"""
|
| 3 |
+
local_outlier_factor.py
|
| 4 |
+
|
| 5 |
+
This module defines a Local Outlier Factor (LOF) model for anomaly detection.
|
| 6 |
+
LOF identifies anomalies by comparing the local density of a sample to the density
|
| 7 |
+
of its neighbors. Samples with significantly lower density are flagged as outliers.
|
| 8 |
+
|
| 9 |
+
Key Features:
|
| 10 |
+
- Detects local anomalies in datasets with varying densities.
|
| 11 |
+
- Effective for datasets where the notion of an outlier is context-dependent.
|
| 12 |
+
- Non-parametric method that adapts to the data's structure.
|
| 13 |
+
|
| 14 |
+
Parameters:
|
| 15 |
+
- n_neighbors (int): Number of neighbors used to calculate local density.
|
| 16 |
+
- Default: 20. Higher values smooth out anomalies but may miss local patterns.
|
| 17 |
+
- contamination (str or float): Proportion of outliers in the data.
|
| 18 |
+
- 'auto': Automatically estimates the proportion based on the dataset size.
|
| 19 |
+
- float: Manually set the expected proportion (e.g., 0.1 for 10%).
|
| 20 |
+
- novelty (bool): If True, allows the model to be applied to new unseen data.
|
| 21 |
+
|
| 22 |
+
Limitations:
|
| 23 |
+
- LOF directly computes predictions during `fit_predict()` and does not support `predict()`
|
| 24 |
+
unless `novelty=True`.
|
| 25 |
+
|
| 26 |
+
Default Configuration:
|
| 27 |
+
- n_neighbors=20: Uses 20 neighbors for density comparison.
|
| 28 |
+
- contamination='auto': Automatically estimates the proportion of outliers.
|
| 29 |
+
- novelty=True: Enables predictions on unseen data.
|
| 30 |
+
"""
|
| 31 |
+
|
| 32 |
+
from sklearn.neighbors import LocalOutlierFactor
|
| 33 |
+
|
| 34 |
+
# Define the Local Outlier Factor estimator
|
| 35 |
+
estimator = LocalOutlierFactor(
|
| 36 |
+
n_neighbors=20, # Number of neighbors to calculate density
|
| 37 |
+
contamination='auto', # Auto-detect the proportion of outliers
|
| 38 |
+
novelty=True # Enables prediction on new data
|
| 39 |
+
)
|
models/unsupervised/anomaly/one_class_svm.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
"""
|
| 3 |
+
one_class_svm.py
|
| 4 |
+
|
| 5 |
+
This module defines a One-Class SVM model for anomaly detection.
|
| 6 |
+
One-Class SVM identifies a decision boundary that separates normal data points from potential outliers
|
| 7 |
+
in a high-dimensional feature space.
|
| 8 |
+
|
| 9 |
+
Key Features:
|
| 10 |
+
- Effective for detecting anomalies in high-dimensional datasets.
|
| 11 |
+
- Flexible kernel options for nonlinear decision boundaries.
|
| 12 |
+
- Suitable for datasets with a small proportion of outliers.
|
| 13 |
+
|
| 14 |
+
Parameters:
|
| 15 |
+
- kernel (str): Specifies the kernel type used in the algorithm.
|
| 16 |
+
- Common options: 'linear', 'poly', 'rbf' (default), and 'sigmoid'.
|
| 17 |
+
- gamma (str or float): Kernel coefficient. Determines the influence of each sample.
|
| 18 |
+
- Default: 'scale' (1 / (n_features * X.var())).
|
| 19 |
+
- nu (float): Approximate fraction of outliers in the dataset.
|
| 20 |
+
- Must be in the range (0, 1]. Default: 0.05 (5% of data considered outliers).
|
| 21 |
+
|
| 22 |
+
Default Configuration:
|
| 23 |
+
- kernel='rbf': Radial Basis Function for nonlinear separation.
|
| 24 |
+
- gamma='scale': Automatically adjusts kernel influence based on dataset features.
|
| 25 |
+
- nu=0.05: Assumes approximately 5% of data points are outliers.
|
| 26 |
+
"""
|
| 27 |
+
|
| 28 |
+
from sklearn.svm import OneClassSVM
|
| 29 |
+
|
| 30 |
+
# Define the One-Class SVM estimator
|
| 31 |
+
estimator = OneClassSVM(
|
| 32 |
+
kernel='rbf', # Radial Basis Function kernel for nonlinear boundaries
|
| 33 |
+
gamma='scale', # Adjusts kernel influence based on dataset variance
|
| 34 |
+
nu=0.05 # Assumes 5% of the data are outliers
|
| 35 |
+
)
|
models/unsupervised/clustering/README.md
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Clustering Models
|
| 2 |
+
|
| 3 |
+
This directory contains Python scripts defining various **clustering** models and their associated hyperparameter grids. Each model file sets up a scikit-learn–compatible clustering estimator (e.g., `KMeans`, `DBSCAN`, `GaussianMixture`) and defines a param grid for the `train_clustering_model.py` script.
|
| 4 |
+
|
| 5 |
+
**Key Points**:
|
| 6 |
+
- **Estimator**: Usually supports `.fit(X)` for unsupervised training, and either `.labels_` or `.predict(X)` to retrieve cluster assignments.
|
| 7 |
+
- **Parameter Grid (`param_grid`)**: Used for silhouette-based hyperparameter tuning in `train_clustering_model.py`.
|
| 8 |
+
- **Default Scoring**: Often `'silhouette'`, but can be changed if you adapt your tuning logic.
|
| 9 |
+
|
| 10 |
+
**Note**: Preprocessing (dropping columns, label encoding) and any hyperparameter loop is handled externally by the script/utility. These model definition files simply define:
|
| 11 |
+
- An **estimator** (like `KMeans(n_clusters=3, random_state=42)`).
|
| 12 |
+
- A **`param_grid`** for silhouette tuning (e.g., `{'model__n_clusters':[2,3,4]}`).
|
| 13 |
+
- Optionally, a **`default_scoring`** set to `'silhouette'`.
|
| 14 |
+
|
| 15 |
+
## Available Clustering Models
|
| 16 |
+
|
| 17 |
+
- [KMeans](kmeans.py)
|
| 18 |
+
- [DBSCAN](dbscan.py)
|
| 19 |
+
- [Gaussian Mixture](gaussian_mixture.py)
|
| 20 |
+
- [Agglomerative Clustering (Hierarchical)](hierarchical_clustering.py)
|
| 21 |
+
|
| 22 |
+
### Usage
|
| 23 |
+
|
| 24 |
+
To train or tune any clustering model, specify the `--model_module` argument with the appropriate model name (e.g., `kmeans`) when running `train_clustering_model.py`, for example:
|
| 25 |
+
|
| 26 |
+
```bash
|
| 27 |
+
python scripts/train_clustering_model.py \
|
| 28 |
+
--model_module kmeans \
|
| 29 |
+
--data_path data/mall_customer/Mall_Customers.csv \
|
| 30 |
+
--tune \
|
| 31 |
+
--visualize
|
| 32 |
+
```
|
| 33 |
+
|
| 34 |
+
This will:
|
| 35 |
+
1. Load the chosen model definition (`kmeans.py`).
|
| 36 |
+
2. Perform optional silhouette-based hyperparameter tuning if `--tune` is used.
|
| 37 |
+
3. Fit the final model, save it, and optionally generate a 2D scatter plot if requested.
|
models/unsupervised/clustering/dbscan.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
"""
|
| 3 |
+
dbscan.py
|
| 4 |
+
|
| 5 |
+
This module defines a DBSCAN clustering model and a parameter grid for hyperparameter tuning.
|
| 6 |
+
|
| 7 |
+
DBSCAN (Density-Based Spatial Clustering of Applications with Noise) is a density-based clustering algorithm.
|
| 8 |
+
It groups points closely packed together and marks as outliers those points in low-density regions.
|
| 9 |
+
|
| 10 |
+
Parameters:
|
| 11 |
+
- eps (float): The maximum distance between two samples for them to be considered as in the same neighborhood.
|
| 12 |
+
- min_samples (int): The number of samples (or total weight) in a neighborhood for a point to be considered a core point.
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
from sklearn.cluster import DBSCAN
|
| 16 |
+
|
| 17 |
+
# Define the DBSCAN estimator
|
| 18 |
+
estimator = DBSCAN(eps=0.5, min_samples=5)
|
| 19 |
+
|
| 20 |
+
# Define the hyperparameter grid for tuning
|
| 21 |
+
param_grid = {
|
| 22 |
+
'model__eps': [0.2, 0.5, 1.0, 1.5, 2.0], # Explore a wide range of neighborhood radii
|
| 23 |
+
'model__min_samples': [3, 5, 10, 20] # Adjust density thresholds for core points
|
| 24 |
+
}
|
| 25 |
+
|
| 26 |
+
# Default scoring metric
|
| 27 |
+
# Note: Silhouette score works best for convex clusters and may not always be ideal for DBSCAN.
|
| 28 |
+
# For more complex shapes, consider custom evaluation metrics.
|
| 29 |
+
default_scoring = 'silhouette'
|
models/unsupervised/clustering/gaussian_mixture.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
"""
|
| 3 |
+
gaussian_mixture.py
|
| 4 |
+
|
| 5 |
+
This module defines a GaussianMixture model for clustering, along with a parameter grid for hyperparameter tuning.
|
| 6 |
+
|
| 7 |
+
Gaussian Mixture Models (GMM) assume that data is generated from a mixture of several Gaussian distributions
|
| 8 |
+
with unknown parameters. It's a probabilistic model and can handle clusters of varying sizes and shapes.
|
| 9 |
+
|
| 10 |
+
Parameters:
|
| 11 |
+
- n_components (int): Number of mixture components (clusters).
|
| 12 |
+
- covariance_type (str): Determines the shape of each cluster.
|
| 13 |
+
- 'full': Each cluster has its own general covariance matrix.
|
| 14 |
+
- 'tied': All clusters share the same covariance matrix.
|
| 15 |
+
- 'diag': Each cluster has its own diagonal covariance matrix.
|
| 16 |
+
- 'spherical': Each cluster has its own single variance.
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
from sklearn.mixture import GaussianMixture
|
| 20 |
+
|
| 21 |
+
# Define the GaussianMixture estimator
|
| 22 |
+
estimator = GaussianMixture(n_components=3, random_state=42)
|
| 23 |
+
|
| 24 |
+
# Define the hyperparameter grid for tuning
|
| 25 |
+
param_grid = {
|
| 26 |
+
'model__n_components': [2, 3, 4], # Experiment with 2 to 4 clusters
|
| 27 |
+
'model__covariance_type': ['full', 'tied', 'diag', 'spherical'] # Different shapes for cluster covariance
|
| 28 |
+
}
|
| 29 |
+
|
| 30 |
+
# Default scoring metric
|
| 31 |
+
# Note: Silhouette score works better for convex clusters. For GMMs with non-convex clusters, consider other metrics like BIC or AIC.
|
| 32 |
+
default_scoring = 'silhouette'
|
models/unsupervised/clustering/hierarchical_clustering.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
"""
|
| 3 |
+
hierarchical_clustering.py
|
| 4 |
+
|
| 5 |
+
This module defines an AgglomerativeClustering model for hierarchical clustering,
|
| 6 |
+
along with a parameter grid for hyperparameter tuning.
|
| 7 |
+
|
| 8 |
+
Hierarchical clustering creates a tree-like structure (dendrogram) to represent the nested grouping of data points
|
| 9 |
+
and their similarity levels. Agglomerative clustering starts with each data point as its own cluster and iteratively merges them.
|
| 10 |
+
|
| 11 |
+
Parameters:
|
| 12 |
+
- n_clusters (int): The number of clusters to form.
|
| 13 |
+
- linkage (str): Determines how distances between clusters are computed.
|
| 14 |
+
- 'ward': Minimizes the variance of clusters (requires Euclidean distance).
|
| 15 |
+
- 'complete': Maximum linkage, i.e., uses the farthest points between clusters.
|
| 16 |
+
- 'average': Average linkage, i.e., uses the mean distances between clusters.
|
| 17 |
+
- 'single': Minimum linkage, i.e., uses the closest points between clusters.
|
| 18 |
+
"""
|
| 19 |
+
|
| 20 |
+
from sklearn.cluster import AgglomerativeClustering
|
| 21 |
+
|
| 22 |
+
# Define the AgglomerativeClustering estimator
|
| 23 |
+
estimator = AgglomerativeClustering(n_clusters=3)
|
| 24 |
+
|
| 25 |
+
# Define the hyperparameter grid for tuning
|
| 26 |
+
param_grid = {
|
| 27 |
+
'model__n_clusters': [2, 3, 4], # Experiment with 2 to 4 clusters
|
| 28 |
+
'model__linkage': ['ward', 'complete', 'average', 'single'] # Different linkage methods for clustering
|
| 29 |
+
}
|
| 30 |
+
|
| 31 |
+
# Default scoring metric
|
| 32 |
+
# Note: Silhouette score works well for evaluating convex clusters formed by hierarchical clustering.
|
| 33 |
+
default_scoring = 'silhouette'
|
models/unsupervised/clustering/kmeans.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
"""
|
| 3 |
+
kmeans.py
|
| 4 |
+
|
| 5 |
+
This module defines a KMeans clustering model and a parameter grid for hyperparameter tuning.
|
| 6 |
+
|
| 7 |
+
KMeans is a popular clustering algorithm that partitions data into k clusters. Each cluster is represented by the centroid of its members, and the algorithm iteratively refines the centroids to minimize the within-cluster variance.
|
| 8 |
+
|
| 9 |
+
Parameters:
|
| 10 |
+
- n_clusters (int): Number of clusters to form.
|
| 11 |
+
- init (str): Initialization method for centroids. Common options:
|
| 12 |
+
- 'k-means++' (default): Optimized centroid initialization.
|
| 13 |
+
- 'random': Random initialization.
|
| 14 |
+
- n_init (int): Number of times the algorithm runs with different centroid seeds.
|
| 15 |
+
- random_state (int): Ensures reproducibility of results.
|
| 16 |
+
"""
|
| 17 |
+
|
| 18 |
+
from sklearn.cluster import KMeans
|
| 19 |
+
|
| 20 |
+
# Define the KMeans estimator
|
| 21 |
+
estimator = KMeans(n_clusters=3, random_state=42)
|
| 22 |
+
|
| 23 |
+
# Define the hyperparameter grid for tuning
|
| 24 |
+
param_grid = {
|
| 25 |
+
'model__n_clusters': [2, 3, 4, 5], # Experiment with 2 to 5 clusters
|
| 26 |
+
'model__init': ['k-means++', 'random'], # Compare optimized and random initialization
|
| 27 |
+
'model__n_init': [10, 20, 50] # Test different numbers of initializations for stability
|
| 28 |
+
}
|
| 29 |
+
|
| 30 |
+
# Use silhouette score as the default scoring metric
|
| 31 |
+
# Silhouette score evaluates how well clusters are separated and compact
|
| 32 |
+
default_scoring = 'silhouette'
|
models/unsupervised/dimred/README.md
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Dimensionality Reduction Models
|
| 2 |
+
|
| 3 |
+
This directory contains Python scripts defining **dimensionality reduction** techniques (e.g., PCA, t-SNE, UMAP). Each model file sets up a scikit-learn–compatible estimator or follows a similar interface, making it easy to swap in `train_dimred_model.py`.
|
| 4 |
+
|
| 5 |
+
**Key Points**:
|
| 6 |
+
- **Estimator**: Typically supports `.fit_transform(X)` for dimension reduction.
|
| 7 |
+
- **Default Settings**: e.g., PCA might default to `n_components=2`; t-SNE might set `n_components=2` and `perplexity=30`; UMAP might define `n_neighbors=15` or `n_components=2`.
|
| 8 |
+
- **No Supervised Tuning**: Usually we pick hyperparameters based on interpretability or domain. A manual approach or specialized metric can be used if needed.
|
| 9 |
+
|
| 10 |
+
**Note**: The `train_dimred_model.py` script handles dropping columns, label encoding, performing `.fit_transform(X)`, and optionally saving a 2D/3D scatter plot if `--visualize` is used.
|
| 11 |
+
|
| 12 |
+
## Available Dimensionality Reduction Models
|
| 13 |
+
|
| 14 |
+
- [PCA](pca.py)
|
| 15 |
+
- [t-SNE](tsne.py)
|
| 16 |
+
- [UMAP](umap.py)
|
| 17 |
+
|
| 18 |
+
### Usage
|
| 19 |
+
|
| 20 |
+
To reduce data dimensions:
|
| 21 |
+
|
| 22 |
+
```bash
|
| 23 |
+
python scripts/train_dimred_model.py \
|
| 24 |
+
--model_module pca \
|
| 25 |
+
--data_path data/breast_cancer/data.csv \
|
| 26 |
+
--select_columns "radius_mean, texture_mean, area_mean, smoothness_mean" \
|
| 27 |
+
--visualize
|
| 28 |
+
```
|
| 29 |
+
|
| 30 |
+
This:
|
| 31 |
+
1. Loads `pca.py`, which defines a `PCA(n_components=2)` estimator by default.
|
| 32 |
+
2. Applies `.fit_transform(...)` to produce a 2D embedding.
|
| 33 |
+
3. Saves the model (`dimred_model.pkl`) and the transformed data (`X_transformed.csv`).
|
| 34 |
+
4. If `--visualize` is set and `n_components=2`, it scatter-plots the result.
|
models/unsupervised/dimred/pca.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
"""
|
| 3 |
+
pca.py
|
| 4 |
+
|
| 5 |
+
This module defines a Principal Component Analysis (PCA) model for dimensionality reduction.
|
| 6 |
+
PCA is a widely used technique to reduce the dimensionality of large datasets by projecting the data
|
| 7 |
+
onto a lower-dimensional subspace while preserving as much variance as possible.
|
| 8 |
+
|
| 9 |
+
Key Features:
|
| 10 |
+
- Reduces computational complexity for high-dimensional data.
|
| 11 |
+
- Helps in visualizing data in 2D or 3D space.
|
| 12 |
+
- Useful as a preprocessing step for clustering or classification.
|
| 13 |
+
|
| 14 |
+
Parameters:
|
| 15 |
+
- n_components (int, float, or None): Number of principal components to keep.
|
| 16 |
+
- int: Specifies the exact number of components.
|
| 17 |
+
- float: Keeps enough components to explain the specified fraction of variance (e.g., 0.95 for 95% variance).
|
| 18 |
+
- None: Keeps all components (default).
|
| 19 |
+
|
| 20 |
+
Default:
|
| 21 |
+
- n_components=2: Projects the data onto 2 dimensions for visualization purposes.
|
| 22 |
+
|
| 23 |
+
"""
|
| 24 |
+
|
| 25 |
+
from sklearn.decomposition import PCA
|
| 26 |
+
|
| 27 |
+
# Define the PCA estimator
|
| 28 |
+
estimator = PCA(n_components=2) # Default to 2D projection for visualization
|
models/unsupervised/dimred/tsne.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
"""
|
| 3 |
+
tsne.py
|
| 4 |
+
|
| 5 |
+
This module defines a t-Distributed Stochastic Neighbor Embedding (t-SNE) model
|
| 6 |
+
for dimensionality reduction. t-SNE is primarily used for visualizing high-dimensional
|
| 7 |
+
data by projecting it into a lower-dimensional space (typically 2D or 3D).
|
| 8 |
+
|
| 9 |
+
Key Features:
|
| 10 |
+
- Nonlinear dimensionality reduction technique.
|
| 11 |
+
- Preserves local relationships within the data.
|
| 12 |
+
- Useful for exploring clustering structures in high-dimensional datasets.
|
| 13 |
+
|
| 14 |
+
Parameters:
|
| 15 |
+
- n_components (int): Number of dimensions for projection (default: 2 for visualization).
|
| 16 |
+
- perplexity (float): Controls the balance between local and global data structure.
|
| 17 |
+
- Typical values range between 5 and 50.
|
| 18 |
+
- learning_rate (float, optional): Learning rate for optimization (default: 'auto').
|
| 19 |
+
- random_state (int, optional): Ensures reproducibility of the results.
|
| 20 |
+
|
| 21 |
+
Default:
|
| 22 |
+
- n_components=2: Projects the data into a 2D space for visualization purposes.
|
| 23 |
+
- perplexity=30: A good starting point for most datasets.
|
| 24 |
+
|
| 25 |
+
"""
|
| 26 |
+
|
| 27 |
+
from sklearn.manifold import TSNE
|
| 28 |
+
|
| 29 |
+
# Define the t-SNE estimator
|
| 30 |
+
estimator = TSNE(n_components=2, perplexity=30) # Default to 2D projection with a reasonable perplexity
|
models/unsupervised/dimred/umap.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
"""
|
| 3 |
+
umap.py
|
| 4 |
+
|
| 5 |
+
This module defines a Uniform Manifold Approximation and Projection (UMAP) model
|
| 6 |
+
for dimensionality reduction. UMAP is a nonlinear dimensionality reduction technique
|
| 7 |
+
that is efficient for visualizing and analyzing high-dimensional data.
|
| 8 |
+
|
| 9 |
+
Key Features:
|
| 10 |
+
- Preserves both local and global data structures better than t-SNE in some cases.
|
| 11 |
+
- Scales efficiently to larger datasets compared to t-SNE.
|
| 12 |
+
- Suitable for exploratory data analysis and clustering.
|
| 13 |
+
|
| 14 |
+
Parameters:
|
| 15 |
+
- n_components (int): Number of dimensions for projection (default: 2 for visualization).
|
| 16 |
+
- n_neighbors (int): Determines the size of the local neighborhood to consider for manifold approximation.
|
| 17 |
+
- Typical values range between 5 and 50.
|
| 18 |
+
- min_dist (float): Minimum distance between points in the low-dimensional space.
|
| 19 |
+
- Smaller values maintain tighter clusters.
|
| 20 |
+
- metric (str): Distance metric for computing similarity (default: 'euclidean').
|
| 21 |
+
|
| 22 |
+
Default:
|
| 23 |
+
- n_components=2: Projects the data into a 2D space for visualization purposes.
|
| 24 |
+
- n_neighbors=15: Balances local and global structure preservation.
|
| 25 |
+
- min_dist=0.1: Provides moderate clustering while preserving distances.
|
| 26 |
+
|
| 27 |
+
Requirements:
|
| 28 |
+
- umap-learn library must be installed.
|
| 29 |
+
"""
|
| 30 |
+
|
| 31 |
+
# Import UMAP from the umap-learn library
|
| 32 |
+
import umap.umap_ as umap
|
| 33 |
+
|
| 34 |
+
# Define the UMAP estimator
|
| 35 |
+
estimator = umap.UMAP(n_components=2, n_neighbors=15, min_dist=0.1) # Default configuration for 2D projection
|
requirements.txt
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pandas==2.2.2
|
| 2 |
+
numpy==1.26.4
|
| 3 |
+
matplotlib==3.8.0
|
| 4 |
+
seaborn==0.13.2
|
| 5 |
+
kaggle==1.6.17
|
| 6 |
+
scikit-learn==1.5.2
|
| 7 |
+
catboost==1.2.7
|
| 8 |
+
dask[dataframe]==2024.10.0
|
| 9 |
+
xgboost==2.1.2
|
| 10 |
+
lightgbm==4.5.0
|
| 11 |
+
joblib==1.4.2
|
| 12 |
+
gradio==5.7.1
|
| 13 |
+
umap-learn==0.5.7
|
scripts/README.md
ADDED
|
@@ -0,0 +1,230 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Scripts
|
| 2 |
+
|
| 3 |
+
This directory contains executable scripts for training, testing, and other tasks related to model development and evaluation.
|
| 4 |
+
|
| 5 |
+
## Contents
|
| 6 |
+
|
| 7 |
+
Supervised Learning:
|
| 8 |
+
- [train_regression_model.py](#train_regression_modelpy)
|
| 9 |
+
- [train_classification_model.py](#train_classification_modelpy)
|
| 10 |
+
|
| 11 |
+
Unsupervised Learning:
|
| 12 |
+
- [train_clustering_model.py](#train_clustering_modelpy)
|
| 13 |
+
- [train_dimred_model.py](#train_dimred_modelpy)
|
| 14 |
+
- [train_anomaly_detection.py](#train_anomaly_detectionpy)
|
| 15 |
+
|
| 16 |
+
---
|
| 17 |
+
|
| 18 |
+
## `train_regression_model.py`
|
| 19 |
+
|
| 20 |
+
A script for training supervised learning **regression** models using scikit-learn. It handles data loading, preprocessing, optional log transformation, hyperparameter tuning, model evaluation, and saving of models, metrics, and visualizations.
|
| 21 |
+
|
| 22 |
+
### Features
|
| 23 |
+
|
| 24 |
+
- Supports various regression models defined in `models/supervised/regression`.
|
| 25 |
+
- Performs hyperparameter tuning using grid search cross-validation.
|
| 26 |
+
- Saves trained models and evaluation metrics.
|
| 27 |
+
- Generates visualizations if specified.
|
| 28 |
+
|
| 29 |
+
### Usage
|
| 30 |
+
|
| 31 |
+
```bash
|
| 32 |
+
python train_regression_model.py --model_module MODEL_MODULE \
|
| 33 |
+
--data_path DATA_PATH/DATA_NAME.csv \
|
| 34 |
+
--target_variable TARGET_VARIABLE [OPTIONS]
|
| 35 |
+
```
|
| 36 |
+
|
| 37 |
+
**Required Arguments**:
|
| 38 |
+
- `model_module`: Name of the regression model module to import (e.g., `linear_regression`).
|
| 39 |
+
- `data_path`: Path to the dataset directory, including the data file name.
|
| 40 |
+
- `target_variable`: Name of the target variable.
|
| 41 |
+
|
| 42 |
+
**Optional Arguments**:
|
| 43 |
+
- `test_size`: Proportion of the dataset to include in the test split (default: `0.2`).
|
| 44 |
+
- `random_state`: Random seed for reproducibility (default: `42`).
|
| 45 |
+
- `log_transform`: Apply log transformation to the target variable (regression only).
|
| 46 |
+
- `cv_folds`: Number of cross-validation folds (default: `5`).
|
| 47 |
+
- `scoring_metric`: Scoring metric for model evaluation.
|
| 48 |
+
- `model_path`: Path to save the trained model.
|
| 49 |
+
- `results_path`: Path to save results and metrics.
|
| 50 |
+
- `visualize`: Generate and save visualizations (e.g., scatter or actual vs. predicted).
|
| 51 |
+
- `drop_columns`: Comma-separated column names to drop from the dataset.
|
| 52 |
+
|
| 53 |
+
### Usage Example
|
| 54 |
+
|
| 55 |
+
```bash
|
| 56 |
+
python train_regression_model.py --model_module linear_regression \
|
| 57 |
+
--data_path data/house_prices/train.csv \
|
| 58 |
+
--target_variable SalePrice --drop_columns Id \
|
| 59 |
+
--log_transform --visualize
|
| 60 |
+
```
|
| 61 |
+
|
| 62 |
+
---
|
| 63 |
+
|
| 64 |
+
## `train_classification_model.py`
|
| 65 |
+
|
| 66 |
+
A script for training supervised learning **classification** models using scikit-learn. It handles data loading, preprocessing, hyperparameter tuning (via grid search CV), model evaluation using classification metrics, and saving of models, metrics, and visualizations.
|
| 67 |
+
|
| 68 |
+
### Features
|
| 69 |
+
|
| 70 |
+
- Supports various classification models defined in `models/supervised/classification`.
|
| 71 |
+
- Performs hyperparameter tuning using grid search cross-validation (via `classification_hyperparameter_tuning`).
|
| 72 |
+
- Saves trained models and evaluation metrics (accuracy, precision, recall, F1).
|
| 73 |
+
- If `visualize` is enabled, it generates a metrics bar chart and a confusion matrix plot.
|
| 74 |
+
|
| 75 |
+
### Usage
|
| 76 |
+
|
| 77 |
+
```bash
|
| 78 |
+
python train_classification_model.py --model_module MODEL_MODULE \
|
| 79 |
+
--data_path DATA_PATH/DATA_NAME.csv \
|
| 80 |
+
--target_variable TARGET_VARIABLE [OPTIONS]
|
| 81 |
+
```
|
| 82 |
+
|
| 83 |
+
**Required Arguments**:
|
| 84 |
+
- `model_module`: Name of the classification model module to import (e.g., `logistic_regression`).
|
| 85 |
+
- `data_path`: Path to the dataset directory, including the data file name.
|
| 86 |
+
- `target_variable`: Name of the target variable (categorical).
|
| 87 |
+
|
| 88 |
+
**Optional Arguments**:
|
| 89 |
+
- `test_size`: Proportion of the dataset to include in the test split (default: `0.2`).
|
| 90 |
+
- `random_state`: Random seed for reproducibility (default: `42`).
|
| 91 |
+
- `cv_folds`: Number of cross-validation folds (default: `5`).
|
| 92 |
+
- `scoring_metric`: Scoring metric for model evaluation (e.g., `accuracy`, `f1`, `roc_auc`).
|
| 93 |
+
- `model_path`: Path to save the trained model.
|
| 94 |
+
- `results_path`: Path to save results and metrics.
|
| 95 |
+
- `visualize`: Generate and save visualizations (metrics bar chart, confusion matrix).
|
| 96 |
+
- `drop_columns`: Comma-separated column names to drop from the dataset.
|
| 97 |
+
|
| 98 |
+
### Usage Example
|
| 99 |
+
|
| 100 |
+
```bash
|
| 101 |
+
python train_classification_model.py --model_module logistic_regression \
|
| 102 |
+
--data_path data/adult_income/train.csv \
|
| 103 |
+
--target_variable income_bracket \
|
| 104 |
+
--scoring_metric accuracy --visualize
|
| 105 |
+
```
|
| 106 |
+
|
| 107 |
+
---
|
| 108 |
+
|
| 109 |
+
## `train_clustering_model.py`
|
| 110 |
+
|
| 111 |
+
A script for training **clustering** models (K-Means, DBSCAN, Gaussian Mixture, etc.) in an unsupervised manner. It supports data loading, optional drop/select of columns, label encoding for non-numeric features, optional hyperparameter tuning (silhouette-based), saving the final model, and generating a 2D cluster plot if needed.
|
| 112 |
+
|
| 113 |
+
### Features
|
| 114 |
+
|
| 115 |
+
- Supports various clustering models defined in `models/unsupervised/clustering`.
|
| 116 |
+
- Optional hyperparameter tuning (silhouette score) via `clustering_hyperparameter_tuning`.
|
| 117 |
+
- Saves the trained clustering model and optional silhouette metrics.
|
| 118 |
+
- Generates a 2D scatter plot if `visualize` is enabled (using PCA if needed).
|
| 119 |
+
|
| 120 |
+
### Usage
|
| 121 |
+
|
| 122 |
+
```bash
|
| 123 |
+
python train_clustering_model.py --model_module MODEL_MODULE \
|
| 124 |
+
--data_path DATA_PATH/DATA_NAME.csv [OPTIONS]
|
| 125 |
+
```
|
| 126 |
+
|
| 127 |
+
**Key Arguments**:
|
| 128 |
+
- `model_module`: Name of the clustering model module (e.g., `kmeans`, `dbscan`, `gaussian_mixture`).
|
| 129 |
+
- `data_path`: Path to the CSV dataset.
|
| 130 |
+
|
| 131 |
+
**Optional Arguments**:
|
| 132 |
+
- `drop_columns`: Comma-separated column names to drop.
|
| 133 |
+
- `select_columns`: Comma-separated column names to keep.
|
| 134 |
+
- `tune`: If set, performs silhouette-based hyperparameter tuning.
|
| 135 |
+
- `cv_folds`: Number of folds or times for silhouette-based repeated runs (basic approach).
|
| 136 |
+
- `scoring_metric`: Typically `'silhouette'`.
|
| 137 |
+
- `visualize`: If set, attempts a 2D scatter, using PCA if more than 2 features remain.
|
| 138 |
+
- `model_path`: Path to save the trained model.
|
| 139 |
+
- `results_path`: Path to save results (metrics, plots).
|
| 140 |
+
|
| 141 |
+
### Usage Example
|
| 142 |
+
|
| 143 |
+
```bash
|
| 144 |
+
python train_clustering_model.py \
|
| 145 |
+
--model_module kmeans \
|
| 146 |
+
--data_path data/mall_customer/Mall_Customers.csv \
|
| 147 |
+
--drop_columns "Gender" \
|
| 148 |
+
--select_columns "Annual Income (k$),Spending Score (1-100)" \
|
| 149 |
+
--visualize
|
| 150 |
+
```
|
| 151 |
+
|
| 152 |
+
---
|
| 153 |
+
|
| 154 |
+
## `train_dimred_model.py`
|
| 155 |
+
|
| 156 |
+
A script for **dimensionality reduction** tasks (e.g., PCA, t-SNE, UMAP). It loads data, optionally drops or selects columns, label-encodes categorical features, fits the chosen dimensionality reduction model, saves the transformed data, and can visualize 2D/3D outputs.
|
| 157 |
+
|
| 158 |
+
### Features
|
| 159 |
+
|
| 160 |
+
- Supports various dimension reduction models in `models/unsupervised/dimred`.
|
| 161 |
+
- Saves the fitted model and the transformed data (in CSV).
|
| 162 |
+
- Optionally creates a 2D or 3D scatter plot if the output dimension is 2 or 3.
|
| 163 |
+
|
| 164 |
+
### Usage
|
| 165 |
+
|
| 166 |
+
```bash
|
| 167 |
+
python train_dimred_model.py --model_module MODEL_MODULE \
|
| 168 |
+
--data_path DATA_PATH/DATA_NAME.csv [OPTIONS]
|
| 169 |
+
```
|
| 170 |
+
|
| 171 |
+
**Key Arguments**:
|
| 172 |
+
- `model_module`: Name of the dimension reduction module (e.g., `pca`, `tsne`, `umap`).
|
| 173 |
+
- `data_path`: Path to the CSV dataset.
|
| 174 |
+
|
| 175 |
+
**Optional Arguments**:
|
| 176 |
+
- `drop_columns`: Comma-separated column names to drop.
|
| 177 |
+
- `select_columns`: Comma-separated column names to keep.
|
| 178 |
+
- `visualize`: If set, plots the 2D or 3D embedding.
|
| 179 |
+
- `model_path`: Path to save the trained model.
|
| 180 |
+
- `results_path`: Path to save the transformed data and any plots.
|
| 181 |
+
|
| 182 |
+
### Usage Example
|
| 183 |
+
|
| 184 |
+
```bash
|
| 185 |
+
python train_dimred_model.py \
|
| 186 |
+
--model_module pca \
|
| 187 |
+
--data_path data/breast_cancer/data.csv \
|
| 188 |
+
--drop_columns "id,diagnosis" \
|
| 189 |
+
--visualize
|
| 190 |
+
```
|
| 191 |
+
|
| 192 |
+
---
|
| 193 |
+
|
| 194 |
+
## `train_anomaly_detection.py`
|
| 195 |
+
|
| 196 |
+
A script for training **anomaly/outlier detection** models (Isolation Forest, One-Class SVM, etc.). It supports dropping/selecting columns, label-encoding, saving anomaly predictions (0 = normal, 1 = outlier), and optionally visualizing points in 2D with outliers colored differently.
|
| 197 |
+
|
| 198 |
+
### Features
|
| 199 |
+
|
| 200 |
+
- Supports various anomaly models in `models/unsupervised/anomaly`.
|
| 201 |
+
- Saves the model and an outlier predictions CSV.
|
| 202 |
+
- If `visualize` is enabled, performs PCA → 2D for plotting normal vs. outliers.
|
| 203 |
+
|
| 204 |
+
### Usage
|
| 205 |
+
|
| 206 |
+
```bash
|
| 207 |
+
python train_anomaly_detection.py --model_module MODEL_MODULE \
|
| 208 |
+
--data_path DATA_PATH/DATA_NAME.csv [OPTIONS]
|
| 209 |
+
```
|
| 210 |
+
|
| 211 |
+
**Key Arguments**:
|
| 212 |
+
- `model_module`: Name of the anomaly detection module (e.g., `isolation_forest`, `one_class_svm`, `local_outlier_factor`).
|
| 213 |
+
- `data_path`: Path to the CSV dataset.
|
| 214 |
+
|
| 215 |
+
**Optional Arguments**:
|
| 216 |
+
- `drop_columns`: Comma-separated column names to drop.
|
| 217 |
+
- `select_columns`: Comma-separated column names to keep.
|
| 218 |
+
- `visualize`: If set, attempts a 2D scatter (via PCA) and colors outliers in red.
|
| 219 |
+
- `model_path`: Path to save the anomaly model.
|
| 220 |
+
- `results_path`: Path to save outlier predictions and plots.
|
| 221 |
+
|
| 222 |
+
### Usage Example
|
| 223 |
+
|
| 224 |
+
```bash
|
| 225 |
+
python train_anomaly_detection.py \
|
| 226 |
+
--model_module isolation_forest \
|
| 227 |
+
--data_path data/breast_cancer/data.csv \
|
| 228 |
+
--drop_columns "id,diagnosis" \
|
| 229 |
+
--visualize
|
| 230 |
+
```
|
scripts/train_anomaly_detection.py
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
"""
|
| 3 |
+
train_anomaly_detection.py
|
| 4 |
+
|
| 5 |
+
Trains an anomaly detection model (Isolation Forest, One-Class SVM, etc.) on a dataset.
|
| 6 |
+
Allows dropping or selecting columns, label-encoding for non-numeric data,
|
| 7 |
+
saves predictions (0 = normal, 1 = outlier) and optionally visualizes in 2D.
|
| 8 |
+
|
| 9 |
+
Usage Example:
|
| 10 |
+
--------------
|
| 11 |
+
python scripts/train_anomaly_detection.py \
|
| 12 |
+
--model_module isolation_forest \
|
| 13 |
+
--data_path data/raw/my_dataset.csv \
|
| 14 |
+
--drop_columns "unwanted_col" \
|
| 15 |
+
--select_columns "feat1,feat2,feat3" \
|
| 16 |
+
--visualize
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
import os
|
| 20 |
+
import sys
|
| 21 |
+
import argparse
|
| 22 |
+
import importlib
|
| 23 |
+
import pandas as pd
|
| 24 |
+
import numpy as np
|
| 25 |
+
import joblib
|
| 26 |
+
|
| 27 |
+
from sklearn.preprocessing import LabelEncoder
|
| 28 |
+
import matplotlib.pyplot as plt
|
| 29 |
+
from timeit import default_timer as timer
|
| 30 |
+
|
| 31 |
+
def main(args):
    """Train an anomaly detection model on a CSV dataset.

    Loads the data, optionally drops/keeps columns, label-encodes any
    non-numeric columns, fits the chosen estimator, saves the model, writes
    binary outlier predictions (0 = normal, 1 = outlier), and can render a
    2D scatter plot of the result.

    Args:
        args: Parsed command-line arguments (see the argparse setup in
            this script's ``__main__`` block).
    """
    # Run from the project root so relative paths and module imports resolve.
    project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
    os.chdir(project_root)
    sys.path.insert(0, project_root)

    # Dynamically import the chosen anomaly model module.
    model_module_path = f"models.unsupervised.anomaly.{args.model_module}"
    model_module = importlib.import_module(model_module_path)

    # Retrieve the estimator from the model file.
    estimator = model_module.estimator

    # Prepare results directory (e.g. 'results/IsolationForest_Anomaly').
    if args.results_path is None:
        args.results_path = os.path.join("results", f"{estimator.__class__.__name__}_Anomaly")
    os.makedirs(args.results_path, exist_ok=True)

    # Load data.
    df = pd.read_csv(args.data_path)
    print(f"Data loaded from {args.data_path}, initial shape: {df.shape}")

    # Drop columns that are entirely empty.
    df = df.dropna(axis='columns', how='all')
    print("After dropping empty columns:", df.shape)

    # Drop specified columns if any.
    if args.drop_columns:
        drop_cols = [c.strip() for c in args.drop_columns.split(',') if c.strip()]
        df.drop(columns=drop_cols, inplace=True, errors='ignore')
        print(f"Dropped columns: {drop_cols} | New shape: {df.shape}")

    # Select specified columns if any.
    if args.select_columns:
        keep_cols = [c.strip() for c in args.select_columns.split(',') if c.strip()]
        df = df[keep_cols]
        print(f"Selected columns: {keep_cols} | New shape: {df.shape}")

    # Label-encode non-numeric columns so the estimator gets numeric input.
    for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = LabelEncoder().fit_transform(df[col])

    # Convert DataFrame to numpy array.
    X = df.values
    print(f"Final data shape after dropping/selecting columns and encoding: {X.shape}")

    # Fit the anomaly model.
    start_time = timer()
    estimator.fit(X)
    train_time = timer() - start_time
    print(f"Anomaly detection training with {args.model_module} completed in {train_time:.2f} seconds.")

    # Save the model. The copy in results_path is kept for backward
    # compatibility; --model_path was previously created but never used,
    # so we now also honor it (bug fix).
    model_output_path = os.path.join(args.results_path, "anomaly_model.pkl")
    joblib.dump(estimator, model_output_path)
    os.makedirs(args.model_path, exist_ok=True)
    joblib.dump(estimator, os.path.join(args.model_path, "anomaly_model.pkl"))
    print(f"Model saved to {model_output_path}")

    # Predict outliers. scikit-learn detectors (e.g. IsolationForest) return
    # +1 for inliers and -1 for outliers; normalize to 0 = normal, 1 = outlier.
    raw_preds = estimator.predict(X)
    preds_binary = np.where(raw_preds == 1, 0, 1)

    outlier_count = int(np.sum(preds_binary))
    inlier_count = len(preds_binary) - outlier_count
    print(f"Detected {outlier_count} outliers out of {len(X)} samples. ({inlier_count} normal)")

    # Save predictions.
    pred_df = pd.DataFrame({
        'OutlierPrediction': preds_binary
    })
    pred_path = os.path.join(args.results_path, "predictions.csv")
    pred_df.to_csv(pred_path, index=False)
    print(f"Predictions saved to {pred_path}")

    # Optional 2D visualization (PCA projection when more than 2 features).
    if args.visualize:
        print("Creating anomaly detection visualization...")
        if X.shape[1] > 2:
            from sklearn.decomposition import PCA
            X_2d = PCA(n_components=2).fit_transform(X)
            x_label, y_label = "PC1", "PC2"
        elif X.shape[1] == 2:
            X_2d = X
            x_label = df.columns[0] if df.shape[1] == 2 else "Feature 1"
            y_label = df.columns[1] if df.shape[1] == 2 else "Feature 2"
        else:
            # 1D or 0D => nothing sensible to scatter.
            print("Only 1 feature or none; can't create 2D scatter. Skipping.")
            return

        plt.figure(figsize=(6, 5))
        # Red = outlier, blue = normal.
        colors = np.where(preds_binary == 1, 'r', 'b')
        plt.scatter(X_2d[:, 0], X_2d[:, 1], c=colors, s=30, alpha=0.7)
        plt.title(f"{estimator.__class__.__name__} Anomaly Detection")
        plt.xlabel(x_label)
        plt.ylabel(y_label)

        plot_path = os.path.join(args.results_path, "anomaly_plot.png")
        plt.savefig(plot_path)
        plt.show()
        print(f"Anomaly plot saved to {plot_path}")
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
def _parse_args():
    """Build the command-line interface and return the parsed arguments."""
    parser = argparse.ArgumentParser(description="Train an anomaly detection model.")
    # (flag, argparse keyword arguments) — added in display order.
    cli_options = [
        ('--model_module', dict(type=str, required=True,
                                help='Name of the anomaly detection model (e.g. isolation_forest, one_class_svm).')),
        ('--data_path', dict(type=str, required=True,
                             help='Path to the CSV dataset file.')),
        ('--model_path', dict(type=str, default='saved_models/Anomaly',
                              help='Path to save the trained model.')),
        ('--results_path', dict(type=str, default=None,
                                help='Directory to save results (predictions, plots).')),
        ('--drop_columns', dict(type=str, default='',
                                help='Comma-separated column names to drop.')),
        ('--select_columns', dict(type=str, default='',
                                  help='Comma-separated column names to keep (ignore the rest).')),
        ('--visualize', dict(action='store_true',
                             help='If set, reduce to 2D (via PCA if needed) and color outliers vs. normal points.')),
    ]
    for flag, kwargs in cli_options:
        parser.add_argument(flag, **kwargs)
    return parser.parse_args()


if __name__ == "__main__":
    main(_parse_args())
|
scripts/train_clustering_model.py
ADDED
|
@@ -0,0 +1,183 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
"""
|
| 3 |
+
train_clustering_model.py
|
| 4 |
+
|
| 5 |
+
A script to train clustering models (K-Means, DBSCAN, Gaussian Mixture, etc.).
|
| 6 |
+
It can optionally perform hyperparameter tuning using silhouette score,
|
| 7 |
+
trains the model, saves it, and visualizes clusters if requested.
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import os
|
| 11 |
+
import sys
|
| 12 |
+
import argparse
|
| 13 |
+
import importlib
|
| 14 |
+
import pandas as pd
|
| 15 |
+
import numpy as np
|
| 16 |
+
import joblib
|
| 17 |
+
|
| 18 |
+
from sklearn import datasets
|
| 19 |
+
from sklearn.metrics import silhouette_score
|
| 20 |
+
from sklearn.preprocessing import LabelEncoder
|
| 21 |
+
import matplotlib.pyplot as plt
|
| 22 |
+
import seaborn as sns
|
| 23 |
+
from timeit import default_timer as timer
|
| 24 |
+
|
| 25 |
+
def main(args):
    """Train a clustering model on a CSV dataset.

    Optionally tunes hyperparameters via silhouette score, fits the model,
    saves it, reports the silhouette score when meaningful, and can render
    a 2D scatter plot of the clusters.

    Args:
        args: Parsed command-line arguments (see the argparse setup in
            this script's ``__main__`` block).
    """
    # Run from the project root so relative paths and module imports resolve.
    project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
    os.chdir(project_root)
    sys.path.insert(0, project_root)

    # Imported here so the project root is on sys.path before the import.
    from utils.unsupervised_hyperparameter_tuning import clustering_hyperparameter_tuning

    # Dynamically import the chosen clustering model module.
    model_module_path = f"models.unsupervised.clustering.{args.model_module}"
    model_module = importlib.import_module(model_module_path)

    # Retrieve the estimator and (optional) param grid from the model file.
    estimator = model_module.estimator
    param_grid = getattr(model_module, 'param_grid', {})
    default_scoring = getattr(model_module, 'default_scoring', 'silhouette')  # fallback metric

    # Prepare results directory, e.g. 'results/KMeans_Clustering'.
    if args.results_path is None:
        args.results_path = os.path.join('results', f"{estimator.__class__.__name__}_Clustering")
    os.makedirs(args.results_path, exist_ok=True)

    # Load data from CSV.
    df = pd.read_csv(args.data_path)
    print(f"Data loaded from {args.data_path}, initial shape: {df.shape}")

    # Drop columns that are entirely empty.
    df = df.dropna(axis='columns', how='all')
    print("After dropping empty columns:", df.shape)

    # Drop specified columns if any.
    if args.drop_columns:
        drop_cols = [col.strip() for col in args.drop_columns.split(',') if col.strip()]
        df = df.drop(columns=drop_cols, errors='ignore')
        print(f"Dropped columns: {drop_cols} | New shape: {df.shape}")

    # Select specified columns if any.
    if args.select_columns:
        keep_cols = [col.strip() for col in args.select_columns.split(',') if col.strip()]
        df = df[keep_cols]
        print(f"Selected columns: {keep_cols} | New shape: {df.shape}")

    # Label-encode non-numeric columns so the clusterer gets numeric input.
    for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = LabelEncoder().fit_transform(df[col])

    # Convert DataFrame to NumPy array for clustering.
    X = df.values
    print(f"Final shape after dropping/selecting columns and encoding: {X.shape}")

    if args.tune:
        print("Performing hyperparameter tuning...")
        best_model, best_params = clustering_hyperparameter_tuning(
            X, estimator, param_grid, scoring=default_scoring, cv=args.cv_folds
        )
        estimator = best_model  # already fitted by the tuner
        print("Best Params:", best_params)
    else:
        print("No hyperparameter tuning; fitting model with default parameters...")
        start_time = timer()
        estimator.fit(X)
        print(f"Training time (no tuning): {timer() - start_time:.2f}s")

    # Save the fitted model. The copy in results_path is kept for backward
    # compatibility; --model_path was previously created but never used,
    # so we now also honor it (bug fix).
    model_output_path = os.path.join(args.results_path, "best_model.pkl")
    joblib.dump(estimator, model_output_path)
    os.makedirs(args.model_path, exist_ok=True)
    joblib.dump(estimator, os.path.join(args.model_path, "best_model.pkl"))
    print(f"Model saved to {model_output_path}")

    # Retrieve cluster labels: DBSCAN/AgglomerativeClustering expose
    # .labels_; KMeans/GaussianMixture use .predict(X).
    if hasattr(estimator, 'labels_'):
        labels = estimator.labels_
    else:
        labels = estimator.predict(X)

    # Silhouette needs at least two distinct clusters to be defined.
    if len(set(labels)) > 1:
        sil = silhouette_score(X, labels)
        print(f"Silhouette Score: {sil:.4f}")
        pd.DataFrame({"Silhouette": [sil]}).to_csv(
            os.path.join(args.results_path, "metrics.csv"), index=False
        )
    else:
        print("Only one cluster found; silhouette score not meaningful.")

    # Optional 2D visualization (PCA projection when more than 2 features).
    if args.visualize:
        print("Creating cluster visualization...")

        if X.shape[1] > 2:
            from sklearn.decomposition import PCA
            pca = PCA(n_components=2)
            X_2d = pca.fit_transform(X)
            var_ratio = pca.explained_variance_ratio_
            x_label = f"PC1 ({var_ratio[0] * 100:.2f}% var)"
            y_label = f"PC2 ({var_ratio[1] * 100:.2f}% var)"
        elif X.shape[1] == 2:
            # Use column names for axis labels when we still know them.
            if df.shape[1] == 2:
                x_label = df.columns[0]
                y_label = df.columns[1]
            else:
                x_label = "Feature 1"
                y_label = "Feature 2"
            X_2d = X
        else:
            # 1D or 0D => nothing sensible to scatter.
            if X.shape[1] == 1:
                print("Only 1 feature available; cannot create a 2D scatter plot.")
            else:
                print("No features available for plotting.")
            return

        plt.figure(figsize=(6, 5))
        plt.scatter(X_2d[:, 0], X_2d[:, 1], c=labels, cmap='viridis', s=30)
        plt.title(f"{estimator.__class__.__name__} Clusters")
        plt.xlabel(x_label)
        plt.ylabel(y_label)

        # Save the figure.
        plot_path = os.path.join(args.results_path, "clusters.png")
        plt.savefig(plot_path)
        plt.show()
        print(f"Cluster plot saved to {plot_path}")
|
| 161 |
+
|
| 162 |
+
def _parse_args():
    """Build the command-line interface and return the parsed arguments."""
    parser = argparse.ArgumentParser(description="Train a clustering model.")
    # (flag, argparse keyword arguments) — added in display order.
    cli_options = [
        ('--model_module', dict(type=str, required=True,
                                help='Name of the clustering model module (e.g. kmeans, dbscan, etc.).')),
        ('--data_path', dict(type=str, required=True,
                             help='Path to the CSV dataset.')),
        ('--model_path', dict(type=str, default='saved_models/Clustering',
                              help='Path to save the trained model.')),
        ('--results_path', dict(type=str, default=None,
                                help='Directory to save results (metrics, plots).')),
        ('--cv_folds', dict(type=int, default=5,
                            help='Number of folds for hyperparam tuning.')),
        ('--tune', dict(action='store_true',
                        help='Perform hyperparameter tuning with silhouette score.')),
        ('--visualize', dict(action='store_true',
                             help='Generate a 2D visualization of the clusters.')),
        ('--drop_columns', dict(type=str, default='',
                                help='Comma-separated column names to drop from the dataset.')),
        ('--select_columns', dict(type=str, default='',
                                  help='Comma-separated column names to keep (ignore all others).')),
    ]
    for flag, kwargs in cli_options:
        parser.add_argument(flag, **kwargs)
    return parser.parse_args()


if __name__ == "__main__":
    main(_parse_args())
|
scripts/train_dimred_model.py
ADDED
|
@@ -0,0 +1,150 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
"""
|
| 3 |
+
train_dimred_model.py
|
| 4 |
+
|
| 5 |
+
Trains a dimensionality reduction model (e.g., PCA, t-SNE, UMAP) on a dataset.
|
| 6 |
+
It can drop or select specific columns, perform label encoding on any non-numeric columns,
|
| 7 |
+
and optionally visualize the reduced data (2D or 3D).
|
| 8 |
+
|
| 9 |
+
Example Usage:
|
| 10 |
+
--------------
|
| 11 |
+
python scripts/train_dimred_model.py \
|
| 12 |
+
--model_module pca \
|
| 13 |
+
--data_path data/raw/breast-cancer-wisconsin-data/data.csv \
|
| 14 |
+
--drop_columns "id" \
|
| 15 |
+
--select_columns "radius_mean, texture_mean, perimeter_mean, area_mean" \
|
| 16 |
+
--visualize
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
import os
|
| 20 |
+
import sys
|
| 21 |
+
import argparse
|
| 22 |
+
import importlib
|
| 23 |
+
import pandas as pd
|
| 24 |
+
import numpy as np
|
| 25 |
+
import joblib
|
| 26 |
+
|
| 27 |
+
from sklearn.impute import SimpleImputer
|
| 28 |
+
from sklearn.preprocessing import LabelEncoder
|
| 29 |
+
import matplotlib.pyplot as plt
|
| 30 |
+
|
| 31 |
+
def main(args):
    """Fit a dimensionality reduction model on a CSV dataset.

    Loads the data, optionally drops/keeps columns, label-encodes
    non-numeric columns, mean-imputes missing values, fit-transforms the
    estimator, saves both the model and the transformed data, and can plot
    the projection when it is 2D or 3D.

    Args:
        args: Parsed command-line arguments (see the argparse setup in
            this script's ``__main__`` block).
    """
    # Run from the project root so relative paths and module imports resolve.
    project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
    os.chdir(project_root)
    sys.path.insert(0, project_root)

    # Dynamically import the chosen model module (pca.py, tsne.py, umap.py, etc.).
    model_module_path = f"models.unsupervised.dimred.{args.model_module}"
    model_module = importlib.import_module(model_module_path)

    # Retrieve the estimator from the model file.
    estimator = model_module.estimator

    # Prepare results directory, e.g. 'results/PCA_DimRed'.
    if args.results_path is None:
        args.results_path = os.path.join('results', f"{estimator.__class__.__name__}_DimRed")
    os.makedirs(args.results_path, exist_ok=True)

    # Load data from CSV.
    df = pd.read_csv(args.data_path)
    print(f"Data loaded from {args.data_path}, initial shape: {df.shape}")

    # Drop columns that are entirely empty.
    df = df.dropna(axis='columns', how='all')
    print("After dropping empty columns:", df.shape)

    # Drop specified columns if any.
    if args.drop_columns:
        drop_cols = [col.strip() for col in args.drop_columns.split(',') if col.strip()]
        df = df.drop(columns=drop_cols, errors='ignore')
        print(f"Dropped columns: {drop_cols} | New shape: {df.shape}")

    # Select specified columns if any.
    if args.select_columns:
        keep_cols = [col.strip() for col in args.select_columns.split(',') if col.strip()]
        df = df[keep_cols]
        print(f"Selected columns: {keep_cols} | New shape: {df.shape}")

    # Label-encode non-numeric columns so the estimator gets numeric input.
    for col in df.columns:
        if df[col].dtype == 'object':
            df[col] = LabelEncoder().fit_transform(df[col])

    # Mean-impute remaining missing values (PCA/t-SNE/UMAP reject NaNs).
    imputer = SimpleImputer(strategy='mean')  # or 'median'
    df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)
    print("After label-encoding and imputation:", df_imputed.shape)

    # Convert DataFrame to numpy array.
    X = df_imputed.values
    print(f"Final data shape after dropping/selecting columns and encoding: {X.shape}")

    # Fit-transform the data (typical usage for dimensionality reduction).
    X_transformed = estimator.fit_transform(X)
    print(f"Dimensionality reduction done using {args.model_module}. Output shape: {X_transformed.shape}")

    # Save the fitted model. The copy in results_path is kept for backward
    # compatibility; --model_path was previously created but never used,
    # so we now also honor it (bug fix).
    model_output_path = os.path.join(args.results_path, "dimred_model.pkl")
    joblib.dump(estimator, model_output_path)
    os.makedirs(args.model_path, exist_ok=True)
    joblib.dump(estimator, os.path.join(args.model_path, "dimred_model.pkl"))
    print(f"Model saved to {model_output_path}")

    # Save the transformed data.
    transformed_path = os.path.join(args.results_path, "X_transformed.csv")
    pd.DataFrame(X_transformed).to_csv(transformed_path, index=False)
    print(f"Transformed data saved to {transformed_path}")

    # Visualization (only if the projection is 2D or 3D).
    if args.visualize:
        n_dims = X_transformed.shape[1]
        if n_dims == 2:
            plt.figure(figsize=(6, 5))
            plt.scatter(X_transformed[:, 0], X_transformed[:, 1], s=30, alpha=0.7, c='blue')
            plt.title(f"{estimator.__class__.__name__} 2D Projection")
            plt.xlabel("Component 1")
            plt.ylabel("Component 2")
            plot_path = os.path.join(args.results_path, "dimred_plot_2D.png")
            plt.savefig(plot_path)
            plt.show()
            print(f"2D plot saved to {plot_path}")
        elif n_dims == 3:
            # Import registers the '3d' projection on older matplotlib versions.
            from mpl_toolkits.mplot3d import Axes3D
            fig = plt.figure()
            ax = fig.add_subplot(111, projection='3d')
            ax.scatter(X_transformed[:, 0], X_transformed[:, 1], X_transformed[:, 2], s=30, alpha=0.7, c='blue')
            ax.set_title(f"{estimator.__class__.__name__} 3D Projection")
            ax.set_xlabel("Component 1")
            ax.set_ylabel("Component 2")
            ax.set_zlabel("Component 3")
            plot_path = os.path.join(args.results_path, "dimred_plot_3D.png")
            plt.savefig(plot_path)
            plt.show()
            print(f"3D plot saved to {plot_path}")
        else:
            print(f"Visualization only supported for 2D or 3D outputs. Got {n_dims}D, skipping.")
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
def _parse_args():
    """Build the command-line interface and return the parsed arguments."""
    parser = argparse.ArgumentParser(description="Train a dimensionality reduction model.")
    # (flag, argparse keyword arguments) — added in display order.
    cli_options = [
        ('--model_module', dict(type=str, required=True,
                                help='Name of the dimred model module (e.g. pca, tsne, umap).')),
        ('--data_path', dict(type=str, required=True,
                             help='Path to the CSV dataset file.')),
        ('--model_path', dict(type=str, default='saved_models/DimRed',
                              help='Where to save the fitted model.')),
        ('--results_path', dict(type=str, default=None,
                                help='Directory to store results (transformed data, plots).')),
        ('--visualize', dict(action='store_true',
                             help='Plot the transformed data if 2D or 3D.')),
        ('--drop_columns', dict(type=str, default='',
                                help='Comma-separated column names to drop from the dataset.')),
        ('--select_columns', dict(type=str, default='',
                                  help='Comma-separated column names to keep (ignore the rest).')),
    ]
    for flag, kwargs in cli_options:
        parser.add_argument(flag, **kwargs)
    return parser.parse_args()


if __name__ == "__main__":
    main(_parse_args())
|
utils/README.md
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Utils
|
| 2 |
+
|
| 3 |
+
This directory contains utility scripts and helper functions that are used throughout the project. These scripts provide common functionalities such as data preprocessing, hyperparameter tuning, and other support functions that assist in model training and evaluation for **supervised** (regression and classification) as well as **unsupervised** (clustering) tasks.
|
| 4 |
+
|
| 5 |
+
## Contents
|
| 6 |
+
|
| 7 |
+
- [supervised_hyperparameter_tuning.py](#supervised_hyperparameter_tuningpy)
|
| 8 |
+
- [unsupervised_hyperparameter_tuning.py](#unsupervised_hyperparameter_tuningpy)
|
| 9 |
+
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
## `supervised_hyperparameter_tuning.py`
|
| 13 |
+
|
| 14 |
+
This script contains functions for performing hyperparameter tuning on **supervised learning** models (both regression and classification) using scikit-learn's `Pipeline` and `GridSearchCV`.
|
| 15 |
+
|
| 16 |
+
### Functions
|
| 17 |
+
|
| 18 |
+
#### `regression_hyperparameter_tuning(X, y, estimator, param_grid, cv=5, scoring=None)`
|
| 19 |
+
|
| 20 |
+
Performs hyperparameter tuning for **regression** models.
|
| 21 |
+
|
| 22 |
+
- **Parameters**:
|
| 23 |
+
- `X (pd.DataFrame)`: Feature matrix.
|
| 24 |
+
- `y (pd.Series)`: Numeric target variable.
|
| 25 |
+
- `estimator`: A scikit-learn regressor (e.g., `LinearRegression()`).
|
| 26 |
+
- `param_grid (dict)`: Parameter names and lists of values (e.g. `{'model__fit_intercept': [True, False]}`).
|
| 27 |
+
- `cv (int)`: Number of cross-validation folds (default 5).
|
| 28 |
+
- `scoring (str)`: Scoring metric (e.g., `'neg_root_mean_squared_error'`).
|
| 29 |
+
- **Returns**:
|
| 30 |
+
- `best_model`: The pipeline with the best hyperparameters.
|
| 31 |
+
- `best_params (dict)`: The dictionary of best hyperparameters.
|
| 32 |
+
|
| 33 |
+
**Example**:
|
| 34 |
+
|
| 35 |
+
```python
|
| 36 |
+
from utils.supervised_hyperparameter_tuning import regression_hyperparameter_tuning
|
| 37 |
+
from sklearn.linear_model import LinearRegression
|
| 38 |
+
|
| 39 |
+
X = ... # Your regression features
|
| 40 |
+
y = ... # Your numeric target variable
|
| 41 |
+
param_grid = {
|
| 42 |
+
'model__fit_intercept': [True, False]
|
| 43 |
+
}
|
| 44 |
+
|
| 45 |
+
best_model, best_params = regression_hyperparameter_tuning(
|
| 46 |
+
X, y, LinearRegression(), param_grid, scoring='neg_root_mean_squared_error'
|
| 47 |
+
)
|
| 48 |
+
```
|
| 49 |
+
|
| 50 |
+
---
|
| 51 |
+
|
| 52 |
+
#### `classification_hyperparameter_tuning(X, y, estimator, param_grid, cv=5, scoring=None)`
|
| 53 |
+
|
| 54 |
+
Performs hyperparameter tuning for **classification** models.
|
| 55 |
+
|
| 56 |
+
- **Parameters**:
|
| 57 |
+
- `X (pd.DataFrame)`: Feature matrix.
|
| 58 |
+
- `y (pd.Series)`: Target variable (binary or multi-class).
|
| 59 |
+
- `estimator`: A scikit-learn classifier (e.g., `LogisticRegression()`, `RandomForestClassifier()`).
|
| 60 |
+
- `param_grid (dict)`: Parameter names and lists of values (e.g. `{'model__n_estimators': [100, 200]}`).
|
| 61 |
+
- `cv (int)`: Number of cross-validation folds (default 5).
|
| 62 |
+
- `scoring (str)`: Scoring metric (e.g., `'accuracy'`, `'f1'`, `'roc_auc'`).
|
| 63 |
+
- **Returns**:
|
| 64 |
+
- `best_model`: The pipeline with the best hyperparameters.
|
| 65 |
+
- `best_params (dict)`: The dictionary of best hyperparameters.
|
| 66 |
+
|
| 67 |
+
**Example**:
|
| 68 |
+
|
| 69 |
+
```python
|
| 70 |
+
from utils.supervised_hyperparameter_tuning import classification_hyperparameter_tuning
|
| 71 |
+
from sklearn.ensemble import RandomForestClassifier
|
| 72 |
+
|
| 73 |
+
X = ... # Your classification features
|
| 74 |
+
y = ... # Binary or multi-class labels
|
| 75 |
+
param_grid = {
|
| 76 |
+
'model__n_estimators': [100, 200],
|
| 77 |
+
'model__max_depth': [None, 10]
|
| 78 |
+
}
|
| 79 |
+
|
| 80 |
+
best_model, best_params = classification_hyperparameter_tuning(
|
| 81 |
+
X, y, RandomForestClassifier(), param_grid, scoring='accuracy'
|
| 82 |
+
)
|
| 83 |
+
```
|
| 84 |
+
|
| 85 |
+
---
|
| 86 |
+
|
| 87 |
+
## `unsupervised_hyperparameter_tuning.py`
|
| 88 |
+
|
| 89 |
+
This script provides a function for **hyperparameter tuning of clustering models** using **silhouette score** as the objective metric. Unlike supervised approaches, clustering does not have labeled data, so the silhouette score is used to measure how well-separated the clusters are.
|
| 90 |
+
|
| 91 |
+
### Functions
|
| 92 |
+
|
| 93 |
+
#### `clustering_hyperparameter_tuning(X, estimator, param_grid, scoring='silhouette', cv=5)`
|
| 94 |
+
|
| 95 |
+
A simple manual hyperparameter search for clustering models.
|
| 96 |
+
|
| 97 |
+
- **Parameters**:
|
| 98 |
+
- `X (array-like)`: Feature matrix for clustering.
|
| 99 |
+
- `estimator`: A scikit-learn clustering estimator supporting `.fit(X)` and either `.labels_` or `.predict(X)` (e.g., `KMeans`, `DBSCAN`, `GaussianMixture`).
|
| 100 |
+
- `param_grid (dict)`: Dictionary of hyperparams (e.g., `{'model__n_clusters': [2,3,4]}`).
|
| 101 |
+
- `scoring (str)`: Only `'silhouette'` is supported.
|
| 102 |
+
- `cv (int)`: Optionally, you could do repeated subsampling or advanced logic for more stable estimates, but the default implementation does a single fit.
|
| 103 |
+
- **Returns**:
|
| 104 |
+
- `best_estimator`: The fitted estimator with the best silhouette score.
|
| 105 |
+
- `best_params (dict)`: The dictionary of best hyperparameters found.
|
| 106 |
+
|
| 107 |
+
**Key Steps**:
|
| 108 |
+
1. **Parameter Loop**: For each combination of parameters in `ParameterGrid(param_grid)`, clone and fit the estimator.
|
| 109 |
+
2. **Retrieve Labels**: If the estimator has `.labels_`, use it; otherwise use `.predict(X)`.
|
| 110 |
+
3. **Compute Silhouette**: If more than one cluster is found, calculate `silhouette_score(X, labels)`.
|
| 111 |
+
4. **Track the Best**: Keep track of the parameter set yielding the highest silhouette score.
|
| 112 |
+
5. **Fallback**: If no valid parameter combos produce more than one cluster, it falls back to the original estimator.
|
| 113 |
+
|
| 114 |
+
**Example**:
|
| 115 |
+
|
| 116 |
+
```python
|
| 117 |
+
from utils.unsupervised_hyperparameter_tuning import clustering_hyperparameter_tuning
|
| 118 |
+
from sklearn.cluster import KMeans
|
| 119 |
+
|
| 120 |
+
X = ... # Your numeric data for clustering
|
| 121 |
+
param_grid = {
|
| 122 |
+
'model__n_clusters': [2, 3, 4],
|
| 123 |
+
'model__init': ['k-means++', 'random']
|
| 124 |
+
}
|
| 125 |
+
|
| 126 |
+
best_model, best_params = clustering_hyperparameter_tuning(
|
| 127 |
+
X, KMeans(random_state=42), param_grid, scoring='silhouette'
|
| 128 |
+
)
|
| 129 |
+
print("Best parameters found:", best_params)
|
| 130 |
+
```
|
| 131 |
+
|
| 132 |
+
**Note**: This approach is simpler than using `GridSearchCV` for clustering because unsupervised tasks do not have a “true” label. The silhouette score is a common measure, but you could adapt the function for other internal cluster metrics if desired.
|
utils/unsupervised_hyperparameter_tuning.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
"""
|
| 3 |
+
unsupervised_hyperparameter_tuning.py
|
| 4 |
+
|
| 5 |
+
Provides a function for hyperparameter tuning of clustering models
|
| 6 |
+
using silhouette score as an objective.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import numpy as np
|
| 10 |
+
from sklearn.model_selection import ParameterGrid
|
| 11 |
+
from sklearn.metrics import silhouette_score
|
| 12 |
+
import copy
|
| 13 |
+
|
| 14 |
+
def clustering_hyperparameter_tuning(X, estimator, param_grid, scoring='silhouette', cv=5):
    """
    Manual hyperparameter search for clustering models, selecting by
    silhouette score.

    Each parameter combination in ``param_grid`` is applied to a fresh
    clone of ``estimator``, the clone is fitted on ``X``, and the
    silhouette score of the resulting labels is computed. The fitted
    estimator with the highest score is returned.

    Args:
        X (array-like): Feature data for clustering, shape
            (n_samples, n_features).
        estimator: A clustering estimator exposing ``fit`` and either a
            ``labels_`` attribute or a ``predict`` method.
        param_grid (dict): Hyperparameter grid, e.g.
            ``{'model__n_clusters': [2, 3, 4]}``. A single pipeline-style
            prefix (``model__``) is stripped before setting the attribute
            on the estimator; deeper nesting is not supported.
        scoring (str): Only ``'silhouette'`` is supported; any other
            value raises ``ValueError``.
        cv (int): Currently unused; kept for interface compatibility.
            Repeated subsampling for more stable silhouette estimates
            could be layered on top in a future version.

    Returns:
        tuple: ``(best_estimator, best_params)`` — the fitted estimator
        with the best silhouette score and its parameter dict. If
        ``param_grid`` is empty, or no combination yields more than one
        cluster, the original estimator is fitted on ``X`` and returned
        with an empty dict.

    Raises:
        ValueError: If ``scoring`` is not ``'silhouette'``, or if cluster
            labels cannot be retrieved from a fitted candidate.
    """
    from sklearn.base import clone  # local import, hoisted out of the search loop

    if scoring != 'silhouette':
        raise ValueError(
            f"Unsupported scoring '{scoring}'; only 'silhouette' is available."
        )

    if not param_grid:
        # Nothing to search over: fit once and return the estimator as-is.
        estimator.fit(X)
        return estimator, {}

    # Silhouette ranges over [-1, 1]; -inf ensures even a score of exactly
    # -1.0 from a valid combination can still be selected.
    best_score = -np.inf
    best_params = None
    best_estimator = None

    for params in ParameterGrid(param_grid):
        candidate = clone(estimator)

        # Apply parameters, stripping an optional single pipeline-style
        # prefix (e.g. 'model__n_clusters' -> 'n_clusters').
        for name, value in params.items():
            attr = name.split('__', 1)[1] if '__' in name else name
            setattr(candidate, attr, value)

        # Single fit per combination; cv is not used here (see docstring).
        candidate.fit(X)

        # Retrieve cluster assignments from the fitted candidate.
        if getattr(candidate, 'labels_', None) is not None:
            labels = candidate.labels_
        elif hasattr(candidate, 'predict'):
            labels = candidate.predict(X)
        else:
            raise ValueError("No valid way to retrieve cluster labels for this estimator.")

        # Silhouette is undefined for fewer than two clusters; skip such combos.
        if len(set(labels)) < 2:
            continue

        score = silhouette_score(X, labels)
        if score > best_score:
            best_score = score
            best_params = params
            best_estimator = candidate

    if best_estimator is None:
        print("No valid parameter combination produced more than 1 cluster. Falling back to original estimator.")
        estimator.fit(X)
        return estimator, {}

    print(f"Best silhouette score: {best_score:.4f}")
    return best_estimator, best_params
|