mboukabous's picture
fix project root
0922d39
"""
Gradio Interface for Unsupervised Learning (Clustering, Dimensionality Reduction, Anomaly Detection)
This script provides a single Gradio-based interface to run three unsupervised tasks:
1. Clustering
2. Dimensionality Reduction
3. Anomaly (Outlier) Detection
Each task is placed in its own Gradio Tab. The user can:
- Choose a model from the relevant unsupervised directory (clustering/dimred/anomaly).
- Specify dataset input (upload, local path, or Kaggle).
- Select columns to drop or keep.
- Execute the relevant training script (train_clustering_model.py, train_dimred_model.py, or train_anomaly_detection.py).
- View logs and optional plots.
Project Requirements:
- Python 3.7+.
- Gradio, scikit-learn, pandas, etc. in requirements.txt.
- Properly structured project with:
- scripts/train_clustering_model.py
- scripts/train_dimred_model.py
- scripts/train_anomaly_detection.py
- models/unsupervised/<task>/<model>.py
- data/datasets/kaggle_data.py (optional for Kaggle usage).
"""
import gradio as gr
import pandas as pd
import os
import subprocess
import sys
import glob
import re
#####################################
# Helper Functions
#####################################
def get_model_modules(task_type):
    """
    Discover the model modules available for one unsupervised task.

    Looks for ``*.py`` files directly inside ``models/unsupervised/<task_type>``
    (resolved relative to the current working directory) and returns their
    names without the ``.py`` extension, skipping ``__init__.py``.

    Args:
        task_type: Sub-directory name, e.g. ``"clustering"``, ``"dimred"``
            or ``"anomaly"``.

    Returns:
        Alphabetically sorted list of module names. An empty list is returned
        (and a message printed) when the directory does not exist.
    """
    models_dir = os.path.join('models', 'unsupervised', task_type)
    if not os.path.isdir(models_dir):
        print(f"Directory does not exist: {models_dir}")
        return []
    module_names = [
        os.path.splitext(os.path.basename(path))[0]
        for path in glob.glob(os.path.join(models_dir, '*.py'))
        # Compare the exact file name so that a module such as
        # "my__init__.py" is not mistaken for the package initialiser.
        if os.path.basename(path) != '__init__.py'
    ]
    # glob yields entries in arbitrary, filesystem-dependent order; sorting
    # keeps the resulting choices stable across runs and platforms.
    return sorted(module_names)
def download_kaggle_data(json_path, dataset_name, is_competition):
    """
    Fetch Kaggle data through the project's ``get_kaggle_data`` helper.

    Args:
        json_path: Path to the ``kaggle.json`` credentials file.
        dataset_name: Kaggle dataset or competition name, forwarded as
            ``data_name``.
        is_competition: Forwarded unchanged to the helper.

    Returns:
        Whatever ``get_kaggle_data`` returns (callers in this file treat it
        as a directory path that may be ``None`` -- confirm against the
        helper's implementation).
    """
    # Imported at call time: the Kaggle helper is an optional part of the
    # project and is only needed when this download path is actually used.
    from data.datasets.kaggle_data import get_kaggle_data as fetch_kaggle_data

    return fetch_kaggle_data(
        json_path=json_path,
        data_name=dataset_name,
        is_competition=is_competition,
    )
def run_subprocess(script_path, script_args):
    """
    Run a command and summarise its outcome for display.

    Args:
        script_path: Path of the script being launched. It is not used by
            this function; the parameter is kept so existing callers keep
            working.
        script_args: Complete argument list handed to ``subprocess.run``
            (interpreter, script and flags).

    Returns:
        A ``(message, image_path)`` tuple. ``message`` is the log text to
        show; ``image_path`` is the path of a plot mentioned in the
        command's stdout if that file exists, otherwise ``None``.
        Failures (non-zero exit status or an exception while launching the
        command) are reported through ``message`` rather than raised.
    """
    try:
        result = subprocess.run(script_args, capture_output=True, text=True)
    except Exception as e:
        # Launch failures (missing interpreter, bad argument types, ...) are
        # surfaced as text so the caller can show them instead of crashing.
        return f"An error occurred:\n{str(e)}", None

    if result.returncode != 0:
        # A script may report its failure on stdout only; fall back to it so
        # the message is never empty when stderr has nothing to say.
        details = result.stderr if result.stderr.strip() else result.stdout
        return f"Error during training:\n{details}", None

    # Drop matplotlib-style "Figure(WxH)" reprs that leak into stdout.
    output = re.sub(r"Figure\(\d+x\d+\)", "", result.stdout).strip()

    # Prefer an explicit "Plot saved to ..." line; otherwise take the first
    # token that ends in ".png".
    match = re.search(r"Plot saved to (.+)", output) or re.search(r"(\S+\.png)", output)
    image_path = match.group(1).strip() if match else None

    # Only hand back a path that actually exists on disk.
    if not (image_path and os.path.exists(image_path)):
        image_path = None
    return f"Completed successfully.\n\n{output}", image_path
def get_columns_from_data(data_option, data_file, data_path,
                          kaggle_json_file, kaggle_competition_name, kaggle_data_name,
                          is_competition):
    """
    Resolve the selected data source to a CSV file and return its column names.

    Args:
        data_option: One of ``"Upload Data File"``, ``"Provide Data Path"``
            or ``"Download from Kaggle"``.
        data_file: Uploaded CSV (used for the upload option); ``None`` when
            nothing was uploaded.
        data_path: Local CSV path (used for the path option).
        kaggle_json_file: Uploaded ``kaggle.json`` -- either a path string or
            an object exposing the path as ``.name``.
        kaggle_competition_name: Kaggle dataset/competition name to download.
        kaggle_data_name: File name of the CSV inside the downloaded data.
        is_competition: Forwarded to ``download_kaggle_data``.

    Returns:
        List of column names, or ``[]`` when the source cannot be resolved or
        the CSV cannot be read (a diagnostic is printed in those cases).
        Errors raised while installing the Kaggle credentials or downloading
        are not caught here.
    """
    if data_option == "Upload Data File":
        if data_file is None:
            return []
        final_path = data_file
    elif data_option == "Provide Data Path":
        # Guard against None/empty before touching the filesystem:
        # os.path.exists(None) would raise TypeError.
        if not data_path or not os.path.exists(data_path):
            print("Provided path does not exist.")
            return []
        final_path = data_path
    elif data_option == "Download from Kaggle":
        if kaggle_json_file is None:
            print("No kaggle.json uploaded.")
            return []
        import shutil
        # The upload may be a plain path string (the File component in this
        # file uses type="filepath") or a file-like object with a .name
        # attribute; accept both instead of assuming .name exists.
        kaggle_json_src = getattr(kaggle_json_file, "name", kaggle_json_file)
        kaggle_config_dir = os.path.expanduser('~/.kaggle')
        os.makedirs(kaggle_config_dir, exist_ok=True)
        kaggle_json_path = os.path.join(kaggle_config_dir, 'kaggle.json')
        shutil.copy(kaggle_json_src, kaggle_json_path)
        # Restrict the credentials file to owner read/write.
        os.chmod(kaggle_json_path, 0o600)
        data_dir = download_kaggle_data(kaggle_json_path, kaggle_competition_name, is_competition)
        if data_dir is None:
            print("Failed to download from Kaggle.")
            return []
        final_path = os.path.join(data_dir, kaggle_data_name)
        if not os.path.exists(final_path):
            print(f"{kaggle_data_name} not found in Kaggle data.")
            return []
    else:
        print("Invalid data option.")
        return []

    try:
        # Only the header is needed, so skip parsing the data rows.
        header_only = pd.read_csv(final_path, nrows=0)
    except Exception as e:
        print(f"Error reading {final_path}: {e}")
        return []
    return header_only.columns.tolist()
#####################################
# Creating the Gradio Tab
#####################################
def create_task_tab(task_name, model_modules, script_path):
    """
    Build one Gradio tab for an unsupervised task.

    Must be called inside an active ``gr.Blocks`` context. The tab contains
    model selection, data-source inputs, column selection, optional save
    paths, a train button, and log/plot outputs.

    Args:
        task_name: Title used for the tab, its heading and its labels.
        model_modules: Model module names offered in the dropdown
            (e.g. the result of ``get_model_modules``).
        script_path: Training script to execute,
            e.g. ``'scripts/train_clustering_model.py'``.

    Returns:
        None. The components are registered on the enclosing Blocks app.
    """
    data_options = ["Upload Data File", "Provide Data Path", "Download from Kaggle"]

    with gr.Tab(task_name):
        gr.Markdown(f"## {task_name} Task")

        # Model selection
        model_select = gr.Dropdown(choices=model_modules, label=f"{task_name} Model Module")

        # Data input approach
        data_option = gr.Radio(
            choices=data_options,
            label="Data Input Option",
            value="Upload Data File"
        )

        # One column per data source; only the selected one is visible.
        with gr.Column(visible=True) as upload_data_col:
            data_file = gr.File(label="Upload CSV Data File", type="filepath")

        with gr.Column(visible=False) as path_data_col:
            data_path_txt = gr.Textbox(label="Data File Path")

        with gr.Column(visible=False) as kaggle_data_col:
            kaggle_json = gr.File(label="Upload kaggle.json File", type="filepath")
            kaggle_competition_name = gr.Textbox(value='', label="Kaggle Competition/Dataset Name")
            kaggle_data_name = gr.Textbox(value='data.csv', label="Data File Name in Kaggle dataset")
            kaggle_is_competition = gr.Checkbox(label="Is Kaggle Competition?", value=False)

        def toggle_data_input(choice):
            """Show the input column matching *choice* and hide the others."""
            # Outputs are ordered like data_options; an unknown choice hides all.
            return tuple(gr.update(visible=(choice == option)) for option in data_options)

        data_option.change(
            toggle_data_input,
            inputs=[data_option],
            outputs=[upload_data_col, path_data_col, kaggle_data_col]
        )

        # Column selection, populated on demand from the chosen data source.
        update_cols_btn = gr.Button("Update Columns")
        drop_cols_chk = gr.CheckboxGroup(choices=[], label="Columns to Drop")
        select_cols_chk = gr.CheckboxGroup(choices=[], label="Columns to Keep (if empty, keep all)")

        # Visualization param
        visualize_chk = gr.Checkbox(label="Visualize 2D (using PCA if needed)", value=True)

        # Optional save locations; left empty they are not passed to the script.
        model_path_txt = gr.Textbox(label="Model Save Path (optional)", value="")
        results_path_txt = gr.Textbox(label="Results Save Path (optional)", value="")

        # The Train button sits above the logs/plot outputs.
        train_btn = gr.Button(f"Train {task_name}")

        # Logs/Output
        output_box = gr.Textbox(label="Logs / Output")
        image_display = gr.Image(label="Plot Output", visible=True)

        def update_columns_fn(dataopt, f, p, kagfile, kcname, kdname, iscomp):
            """Refresh both column checkbox groups from the selected data."""
            cols = get_columns_from_data(dataopt, f, p, kagfile, kcname, kdname, iscomp)
            # cols is [] when the data could not be read, which clears the choices.
            return gr.update(choices=cols), gr.update(choices=cols)

        update_cols_btn.click(
            fn=update_columns_fn,
            inputs=[
                data_option, data_file, data_path_txt,
                kaggle_json, kaggle_competition_name, kaggle_data_name,
                kaggle_is_competition
            ],
            outputs=[drop_cols_chk, select_cols_chk]
        )

        def run_task(model_mod, dataopt, f, p, kagfile, kcname, kdname, iscomp,
                     drop_cols, select_cols, visualize, mpath, rpath):
            """Assemble the training command from the form and execute it."""
            # Without a model the command would contain None and crash while
            # being joined for logging; report it to the user instead.
            if not model_mod:
                return "Please select a model module before training.", None

            script_cmd = [sys.executable, script_path, "--model_module", model_mod]

            # Resolve the data file for the upload / local-path options.
            final_path = ""
            if dataopt == "Upload Data File" and f is not None:
                final_path = f
            elif dataopt == "Provide Data Path" and p and os.path.exists(p):
                final_path = p
            # NOTE: the Kaggle option is not resolved here (the Kaggle inputs
            # are accepted but unused), so in that case the script is launched
            # without --data_path.
            if final_path:
                script_cmd.extend(["--data_path", final_path])

            if drop_cols:
                script_cmd.extend(["--drop_columns", ",".join(drop_cols)])
            if select_cols:
                script_cmd.extend(["--select_columns", ",".join(select_cols)])
            if visualize:
                script_cmd.append("--visualize")

            # Textbox values may be None; treat that like an empty string.
            model_path = (mpath or "").strip()
            if model_path:
                script_cmd.extend(["--model_path", model_path])
            results_path = (rpath or "").strip()
            if results_path:
                script_cmd.extend(["--results_path", results_path])

            print("Executing command:", " ".join(script_cmd))
            out_text, plot_path = run_subprocess(script_path, script_cmd)
            return out_text, plot_path

        train_btn.click(
            fn=run_task,
            inputs=[
                model_select, data_option, data_file, data_path_txt,
                kaggle_json, kaggle_competition_name, kaggle_data_name, kaggle_is_competition,
                drop_cols_chk, select_cols_chk, visualize_chk,
                model_path_txt, results_path_txt
            ],
            outputs=[output_box, image_display]
        )
#####################################
# Build the Main Gradio App
#####################################
# Look up the model modules available for each task before building the UI.
clustering_modules = get_model_modules("clustering")
dimred_modules = get_model_modules("dimred")
anomaly_modules = get_model_modules("anomaly")

with gr.Blocks() as demo:
    gr.Markdown("# Unsupervised Learning Gradio Interface")

    # One tab per task, in display order: clustering, dimensionality
    # reduction, anomaly detection. Each tab launches its own training script.
    create_task_tab("Clustering", clustering_modules, "scripts/train_clustering_model.py")
    create_task_tab("Dimensionality Reduction", dimred_modules, "scripts/train_dimred_model.py")
    create_task_tab("Anomaly Detection", anomaly_modules, "scripts/train_anomaly_detection.py")

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()