"""
Gradio Interface for Unsupervised Learning (Clustering, Dimensionality Reduction, Anomaly Detection)
This script provides a single Gradio-based interface to run three unsupervised tasks:
1. Clustering
2. Dimensionality Reduction
3. Anomaly (Outlier) Detection
Each task is placed in its own Gradio Tab. The user can:
- Choose a model from the relevant unsupervised directory (clustering/dimred/anomaly).
- Specify dataset input (upload, local path, or Kaggle).
- Select columns to drop or keep.
- Execute the relevant training script (train_clustering_model.py, train_dimred_model.py, or train_anomaly_detection.py).
- View logs and optional plots.
Project Requirements:
- Python 3.7+.
- Gradio, scikit-learn, pandas, etc. in requirements.txt.
- Properly structured project with:
- scripts/train_clustering_model.py
- scripts/train_dimred_model.py
- scripts/train_anomaly_detection.py
- models/unsupervised/<task>/<model>.py
- data/datasets/kaggle_data.py (optional for Kaggle usage).
"""
import gradio as gr
import pandas as pd
import os
import subprocess
import sys
import glob
import re
#####################################
# Helper Functions
#####################################
def get_model_modules(task_type):
    """
    List the model modules available for one unsupervised task.

    Scans models/unsupervised/<task_type> (task_type is one of
    'clustering', 'dimred', or 'anomaly') for ``*.py`` files and returns
    their module names (file name without extension), excluding
    ``__init__.py``. Returns an empty list if the directory is missing.
    """
    models_dir = os.path.join('models', 'unsupervised', task_type)
    if not os.path.exists(models_dir):
        print(f"Directory does not exist: {models_dir}")
        return []
    modules = []
    for file_path in glob.glob(os.path.join(models_dir, '*.py')):
        if file_path.endswith('__init__.py'):
            continue  # package marker, not a model
        modules.append(os.path.splitext(os.path.basename(file_path))[0])
    return modules
def download_kaggle_data(json_path, dataset_name, is_competition):
    """
    Fetch a Kaggle dataset/competition via the project's kaggle_data helper.

    Returns the local path the data was downloaded to; callers treat a
    ``None`` return as a download failure.
    """
    # Imported lazily so the Kaggle API dependency is only needed when
    # the user actually selects the Kaggle data option.
    from data.datasets.kaggle_data import get_kaggle_data
    return get_kaggle_data(
        json_path=json_path,
        data_name=dataset_name,
        is_competition=is_competition,
    )
def run_subprocess(script_path, script_args):
    """
    Execute a training script in a child process and collect its output.

    Parameters
    ----------
    script_path : str
        Path of the script being run. NOTE(review): currently unused here —
        the complete command (interpreter + script + flags) arrives via
        ``script_args``; the parameter is kept for caller compatibility.
    script_args : list[str]
        Full argv handed to ``subprocess.run``.

    Returns
    -------
    tuple
        ``(log_text, image_path_or_None)`` — the image path is returned only
        when it is mentioned in stdout and actually exists on disk.
    """
    try:
        proc = subprocess.run(script_args, capture_output=True, text=True)
    except Exception as exc:
        return f"An error occurred:\n{str(exc)}", None

    if proc.returncode != 0:
        return f"Error during training:\n{proc.stderr}", None

    # Drop matplotlib's "Figure(WxH)" repr noise that scripts may print.
    log_text = re.sub(r"Figure\(\d+x\d+\)", "", proc.stdout).strip()

    # Recover a plot path: prefer an explicit "Plot saved to ..." line,
    # otherwise fall back to the first *.png token in the output.
    plot_path = None
    saved_line = re.search(r"Plot saved to (.+)", log_text)
    if saved_line:
        plot_path = saved_line.group(1).strip()
    else:
        png_token = re.search(r"(\S+\.png)", log_text)
        if png_token:
            plot_path = png_token.group(1)

    if plot_path and os.path.exists(plot_path):
        return f"Completed successfully.\n\n{log_text}", plot_path
    return f"Completed successfully.\n\n{log_text}", None
def get_columns_from_data(data_option, data_file, data_path,
                          kaggle_json_file, kaggle_competition_name, kaggle_data_name,
                          is_competition):
    """
    Resolve the dataset location for the chosen input option and return the
    CSV's column names.

    Parameters
    ----------
    data_option : str
        One of "Upload Data File", "Provide Data Path", "Download from Kaggle".
    data_file : str or file-like or None
        Uploaded CSV; ``gr.File(type="filepath")`` delivers a plain str path.
    data_path : str
        Local path, used when data_option == "Provide Data Path".
    kaggle_json_file : str or file-like or None
        Uploaded kaggle.json credentials file.
    kaggle_competition_name : str
        Kaggle competition/dataset identifier passed to the downloader.
    kaggle_data_name : str
        File name expected inside the downloaded Kaggle data directory.
    is_competition : bool
        Whether the Kaggle name refers to a competition.

    Returns
    -------
    list[str]
        Column names of the CSV, or [] on any failure. Errors are printed
        rather than raised so the Gradio callback never crashes.
    """
    final_path = None
    if data_option == "Upload Data File":
        if data_file is None:
            return []
        final_path = data_file
    elif data_option == "Provide Data Path":
        if os.path.exists(data_path):
            final_path = data_path
        else:
            print("Provided path does not exist.")
            return []
    elif data_option == "Download from Kaggle":
        if kaggle_json_file is None:
            print("No kaggle.json uploaded.")
            return []
        import shutil
        kaggle_config_dir = os.path.expanduser('~/.kaggle')
        os.makedirs(kaggle_config_dir, exist_ok=True)
        kaggle_json_path = os.path.join(kaggle_config_dir, 'kaggle.json')
        # BUGFIX: gr.File(type="filepath") yields a plain str path, which has
        # no ``.name`` attribute; older Gradio versions yield a tempfile
        # wrapper that does. Support both instead of reading .name blindly.
        src = getattr(kaggle_json_file, 'name', kaggle_json_file)
        shutil.copy(src, kaggle_json_path)
        os.chmod(kaggle_json_path, 0o600)  # Kaggle API requires private perms
        data_dir = download_kaggle_data(kaggle_json_path, kaggle_competition_name, is_competition)
        if data_dir is None:
            print("Failed to download from Kaggle.")
            return []
        final_path = os.path.join(data_dir, kaggle_data_name)
        if not os.path.exists(final_path):
            print(f"{kaggle_data_name} not found in Kaggle data.")
            return []
    else:
        print("Invalid data option.")
        return []
    try:
        df = pd.read_csv(final_path)
        return df.columns.tolist()
    except Exception as e:
        print(f"Error reading {final_path}: {e}")
        return []
#####################################
# Creating the Gradio Tab
#####################################
def create_task_tab(task_name, model_modules, script_path):
    """
    Build one Gradio Tab for an unsupervised task (Clustering, DimRed, Anomaly).

    Parameters
    ----------
    task_name : str
        Display name used in the tab title, labels and button text.
    model_modules : list[str]
        Model module names from get_model_modules(task_type), shown in a dropdown.
    script_path : str
        Training script to execute, e.g. 'scripts/train_clustering_model.py'.

    Side effects: registers components and event handlers on the currently
    open gr.Blocks context; returns nothing.
    """
    with gr.Tab(task_name):
        gr.Markdown(f"## {task_name} Task")
        # Model selection
        model_select = gr.Dropdown(choices=model_modules, label=f"{task_name} Model Module")
        # Data input approach: exactly one of the three columns below is shown
        data_option = gr.Radio(
            choices=["Upload Data File", "Provide Data Path", "Download from Kaggle"],
            label="Data Input Option",
            value="Upload Data File"
        )
        with gr.Column(visible=True) as upload_data_col:
            # type="filepath" => the component's value is a str path
            data_file = gr.File(label="Upload CSV Data File", type="filepath")
        with gr.Column(visible=False) as path_data_col:
            data_path_txt = gr.Textbox(label="Data File Path")
        with gr.Column(visible=False) as kaggle_data_col:
            kaggle_json = gr.File(label="Upload kaggle.json File", type="filepath")
            kaggle_competition_name = gr.Textbox(value='', label="Kaggle Competition/Dataset Name")
            kaggle_data_name = gr.Textbox(value='data.csv', label="Data File Name in Kaggle dataset")
            kaggle_is_competition = gr.Checkbox(label="Is Kaggle Competition?", value=False)
        # Show only the column matching the selected data input option
        def toggle_data_input(choice):
            if choice == "Upload Data File":
                return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False)
            elif choice == "Provide Data Path":
                return gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
            elif choice == "Download from Kaggle":
                return gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)
            else:
                # Unknown option: hide everything
                return gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
        data_option.change(
            toggle_data_input,
            inputs=[data_option],
            outputs=[upload_data_col, path_data_col, kaggle_data_col]
        )
        # Update columns button: reads the CSV and refreshes both checkbox groups
        update_cols_btn = gr.Button("Update Columns")
        # We remove "Columns in Data (for reference)" as requested
        drop_cols_chk = gr.CheckboxGroup(choices=[], label="Columns to Drop")
        select_cols_chk = gr.CheckboxGroup(choices=[], label="Columns to Keep (if empty, keep all)")
        # Visualization param
        visualize_chk = gr.Checkbox(label="Visualize 2D (using PCA if needed)", value=True)
        # Model / results path with empty default, and label "(optional)"
        model_path_txt = gr.Textbox(label="Model Save Path (optional)", value="")
        results_path_txt = gr.Textbox(label="Results Save Path (optional)", value="")
        # The Train button
        train_btn = gr.Button(f"Train {task_name}")
        # Logs/Output
        output_box = gr.Textbox(label="Logs / Output")
        image_display = gr.Image(label="Plot Output", visible=True)
        # Callback: resolve the dataset and push its column names into both
        # checkbox groups (empty choices on any failure).
        def update_columns_fn(dataopt, f, p, kagfile, kcname, kdname, iscomp):
            cols = get_columns_from_data(dataopt, f, p, kagfile, kcname, kdname, iscomp)
            # Return updated choices for drop_cols_chk, select_cols_chk
            if cols:
                return gr.update(choices=cols), gr.update(choices=cols)
            else:
                return gr.update(choices=[]), gr.update(choices=[])
        update_cols_btn.click(
            fn=update_columns_fn,
            inputs=[
                data_option, data_file, data_path_txt,
                kaggle_json, kaggle_competition_name, kaggle_data_name,
                kaggle_is_competition
            ],
            outputs=[drop_cols_chk, select_cols_chk]
        )
        # Callback: assemble the CLI for the training script and run it via
        # run_subprocess; returns (log text, optional plot image path).
        def run_task(model_mod, dataopt, f, p, kagfile, kcname, kdname, iscomp,
                     drop_cols, select_cols, visualize, mpath, rpath):
            # Build the command for the relevant script
            script_cmd = [sys.executable, os.path.join(script_path)]
            script_cmd.extend(["--model_module", model_mod])
            # Minimal approach for data path logic
            final_path = None
            if dataopt == "Upload Data File" and f is not None:
                final_path = f
            elif dataopt == "Provide Data Path" and os.path.exists(p):
                final_path = p
            else:
                # For Kaggle or other complexities, skipping for brevity.
                # Could handle it similarly to get_columns_from_data approach
                final_path = ""
            if final_path:
                script_cmd.extend(["--data_path", final_path])
            # drop cols (comma-joined list expected by the training scripts)
            if drop_cols and len(drop_cols) > 0:
                script_cmd.extend(["--drop_columns", ",".join(drop_cols)])
            # select cols
            if select_cols and len(select_cols) > 0:
                script_cmd.extend(["--select_columns", ",".join(select_cols)])
            # visualize (boolean flag, no value)
            if visualize:
                script_cmd.append("--visualize")
            # model_path (only when non-blank)
            if mpath.strip():
                script_cmd.extend(["--model_path", mpath.strip()])
            # results_path (only when non-blank)
            if rpath.strip():
                script_cmd.extend(["--results_path", rpath.strip()])
            print("Executing command:", " ".join(script_cmd))
            out_text, plot_path = run_subprocess(script_path, script_cmd)
            return out_text, plot_path
        # The Train button is above logs, so let's define the click function
        train_btn.click(
            fn=run_task,
            inputs=[
                model_select, data_option, data_file, data_path_txt,
                kaggle_json, kaggle_competition_name, kaggle_data_name, kaggle_is_competition,
                drop_cols_chk, select_cols_chk, visualize_chk,
                model_path_txt, results_path_txt
            ],
            outputs=[output_box, image_display]
        )
    return  # end create_task_tab
#####################################
# Build the Main Gradio App
#####################################
# Top-level UI assembly: one tab per unsupervised task, each wired to its
# model directory (models/unsupervised/<task>) and its training script.
with gr.Blocks() as demo:
    gr.Markdown("# Unsupervised Learning Gradio Interface")
    # 1) Clustering Tab
    clustering_modules = get_model_modules("clustering")
    create_task_tab(
        task_name="Clustering",
        model_modules=clustering_modules,
        script_path="scripts/train_clustering_model.py"
    )
    # 2) Dimensionality Reduction Tab
    dimred_modules = get_model_modules("dimred")
    create_task_tab(
        task_name="Dimensionality Reduction",
        model_modules=dimred_modules,
        script_path="scripts/train_dimred_model.py"
    )
    # 3) Anomaly Detection Tab
    anomaly_modules = get_model_modules("anomaly")
    create_task_tab(
        task_name="Anomaly Detection",
        model_modules=anomaly_modules,
        script_path="scripts/train_anomaly_detection.py"
    )
# Launch the app only when run as a script (not when imported).
if __name__ == "__main__":
    demo.launch()