File size: 12,353 Bytes
4c91838
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0922d39
4c91838
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0922d39
4c91838
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324

"""
Gradio Interface for Unsupervised Learning (Clustering, Dimensionality Reduction, Anomaly Detection)

This script provides a single Gradio-based interface to run three unsupervised tasks:
1. Clustering
2. Dimensionality Reduction
3. Anomaly (Outlier) Detection

Each task is placed in its own Gradio Tab. The user can:
- Choose a model from the relevant unsupervised directory (clustering/dimred/anomaly).
- Specify dataset input (upload, local path, or Kaggle).
- Select columns to drop or keep.
- Execute the relevant training script (train_clustering_model.py, train_dimred_model.py, or train_anomaly_detection.py).
- View logs and optional plots.

Project Requirements:
- Python 3.7+.
- Gradio, scikit-learn, pandas, etc. in requirements.txt.
- Properly structured project with:
  - scripts/train_clustering_model.py
  - scripts/train_dimred_model.py
  - scripts/train_anomaly_detection.py
  - models/unsupervised/<task>/<model>.py
  - data/datasets/kaggle_data.py (optional for Kaggle usage).
"""

import gradio as gr
import pandas as pd
import os
import subprocess
import sys
import glob
import re

#####################################
# Helper Functions
#####################################

def get_model_modules(task_type):
    """
    List the model module names available for one unsupervised task.

    Scans models/unsupervised/<task_type> for Python files and returns
    their basenames without the .py extension, skipping __init__.py.
    Returns an empty list (after printing a notice) when the directory
    is missing.
    """
    models_dir = os.path.join('models', 'unsupervised', task_type)
    if not os.path.exists(models_dir):
        print(f"Directory does not exist: {models_dir}")
        return []
    modules = []
    for path in glob.glob(os.path.join(models_dir, '*.py')):
        if path.endswith('__init__.py'):
            continue
        modules.append(os.path.splitext(os.path.basename(path))[0])
    return modules

def download_kaggle_data(json_path, dataset_name, is_competition):
    """
    Fetch a Kaggle dataset or competition via the project helper.

    Delegates to data.datasets.kaggle_data.get_kaggle_data and returns
    its result — presumably a local directory path containing the
    downloaded files (TODO confirm against the helper's contract).
    """
    # Imported lazily so the app still starts when the Kaggle helper
    # (an optional project module) is absent and this path is unused.
    from data.datasets.kaggle_data import get_kaggle_data
    return get_kaggle_data(json_path=json_path, data_name=dataset_name, is_competition=is_competition)

def run_subprocess(script_path, script_args):
    """
    Run a training script as a subprocess and collect its output.

    Parameters
    ----------
    script_path : str
        Kept for interface compatibility with existing callers; the
        script to execute is already embedded in script_args, so this
        value is not consulted here.
    script_args : list[str]
        Full command line: [interpreter, script, --flag, value, ...].

    Returns
    -------
    tuple
        (log text, path to a saved plot image if one was detected in
        stdout and exists on disk, otherwise None).
    """
    try:
        proc = subprocess.run(script_args, capture_output=True, text=True)
        if proc.returncode != 0:
            return f"Error during training:\n{proc.stderr}", None

        # Drop matplotlib's "Figure(WxH)" repr noise before scanning the logs.
        cleaned = re.sub(r"Figure\(\d+x\d+\)", "", proc.stdout).strip()

        # Prefer an explicit "Plot saved to <path>" line; otherwise fall
        # back to the first whitespace-free token ending in ".png".
        image_path = None
        saved_line = re.search(r"Plot saved to (.+)", cleaned)
        if saved_line:
            image_path = saved_line.group(1).strip()
        else:
            png_token = re.search(r"(\S+\.png)", cleaned)
            if png_token:
                image_path = png_token.group(1)

        # Only hand back a path the UI can actually display.
        if image_path and os.path.exists(image_path):
            return f"Completed successfully.\n\n{cleaned}", image_path
        return f"Completed successfully.\n\n{cleaned}", None
    except Exception as e:
        return f"An error occurred:\n{str(e)}", None

def get_columns_from_data(data_option, data_file, data_path,
                          kaggle_json_file, kaggle_competition_name, kaggle_data_name,
                          is_competition):
    """
    Resolve the dataset location for the chosen input option and return
    the CSV's column names.

    data_option selects among "Upload Data File", "Provide Data Path"
    and "Download from Kaggle"; only the arguments relevant to the
    selected option are consulted. Returns an empty list (after printing
    a diagnostic) on any failure so the UI callback can simply clear the
    column choices.
    """
    final_path = None
    if data_option == "Upload Data File":
        if data_file is None:
            return []
        final_path = data_file
    elif data_option == "Provide Data Path":
        # Guard against None: os.path.exists(None) raises TypeError.
        if data_path and os.path.exists(data_path):
            final_path = data_path
        else:
            print("Provided path does not exist.")
            return []
    elif data_option == "Download from Kaggle":
        if kaggle_json_file is None:
            print("No kaggle.json uploaded.")
            return []
        import shutil
        # The Kaggle API client reads credentials from ~/.kaggle/kaggle.json.
        kaggle_config_dir = os.path.expanduser('~/.kaggle')
        os.makedirs(kaggle_config_dir, exist_ok=True)
        kaggle_json_path = os.path.join(kaggle_config_dir, 'kaggle.json')
        # gr.File(type="filepath") hands us a plain path string; older
        # Gradio versions pass a tempfile wrapper exposing .name. Support both
        # (the original `.name` access crashed on plain strings).
        kaggle_json_src = getattr(kaggle_json_file, 'name', kaggle_json_file)
        shutil.copy(kaggle_json_src, kaggle_json_path)
        os.chmod(kaggle_json_path, 0o600)  # Kaggle rejects world-readable credentials

        data_dir = download_kaggle_data(kaggle_json_path, kaggle_competition_name, is_competition)
        if data_dir is None:
            print("Failed to download from Kaggle.")
            return []
        final_path = os.path.join(data_dir, kaggle_data_name)
        if not os.path.exists(final_path):
            print(f"{kaggle_data_name} not found in Kaggle data.")
            return []
    else:
        print("Invalid data option.")
        return []

    try:
        df = pd.read_csv(final_path)
        return df.columns.tolist()
    except Exception as e:
        print(f"Error reading {final_path}: {e}")
        return []

#####################################
# Creating the Gradio Tab
#####################################

def create_task_tab(task_name, model_modules, script_path):
    """
    Creates a Gradio Tab for a specific unsupervised task (Clustering, DimRed, Anomaly).
    - model_modules: list of model modules from get_model_modules(task_type)
    - script_path: e.g. 'scripts/train_clustering_model.py'

    Layout, top to bottom: model dropdown; a data-input radio with three
    mutually exclusive columns (upload / local path / Kaggle); a button that
    refreshes the drop/keep column CheckboxGroups from the chosen CSV;
    visualization and optional save-path fields; the Train button; and the
    log textbox plus plot image that receive the training results.
    Must be called inside an active gr.Blocks() context; returns None.
    """

    with gr.Tab(task_name):
        gr.Markdown(f"## {task_name} Task")

        # Model selection
        model_select = gr.Dropdown(choices=model_modules, label=f"{task_name} Model Module")

        # Data input approach
        data_option = gr.Radio(
            choices=["Upload Data File", "Provide Data Path", "Download from Kaggle"],
            label="Data Input Option",
            value="Upload Data File"
        )

        # Three alternative input columns; only the one matching the radio
        # selection is visible (toggled by toggle_data_input below).
        with gr.Column(visible=True) as upload_data_col:
            data_file = gr.File(label="Upload CSV Data File", type="filepath")

        with gr.Column(visible=False) as path_data_col:
            data_path_txt = gr.Textbox(label="Data File Path")

        with gr.Column(visible=False) as kaggle_data_col:
            kaggle_json = gr.File(label="Upload kaggle.json File", type="filepath")
            kaggle_competition_name = gr.Textbox(value='', label="Kaggle Competition/Dataset Name")
            kaggle_data_name = gr.Textbox(value='data.csv', label="Data File Name in Kaggle dataset")
            kaggle_is_competition = gr.Checkbox(label="Is Kaggle Competition?", value=False)

        # Toggle data input columns
        def toggle_data_input(choice):
            # Returns visibility updates for (upload, path, kaggle) in that order.
            if choice == "Upload Data File":
                return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False)
            elif choice == "Provide Data Path":
                return gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
            elif choice == "Download from Kaggle":
                return gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)
            else:
                return gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)

        data_option.change(
            toggle_data_input,
            inputs=[data_option],
            outputs=[upload_data_col, path_data_col, kaggle_data_col]
        )

        # Update columns button
        update_cols_btn = gr.Button("Update Columns")

        # We remove "Columns in Data (for reference)" as requested
        # Choices start empty and are populated by update_columns_fn.
        drop_cols_chk = gr.CheckboxGroup(choices=[], label="Columns to Drop")
        select_cols_chk = gr.CheckboxGroup(choices=[], label="Columns to Keep (if empty, keep all)")

        # Visualization param
        visualize_chk = gr.Checkbox(label="Visualize 2D (using PCA if needed)", value=True)

        # Model / results path with empty default, and label "(optional)"
        model_path_txt = gr.Textbox(label="Model Save Path (optional)", value="")
        results_path_txt = gr.Textbox(label="Results Save Path (optional)", value="")

        # The Train button
        train_btn = gr.Button(f"Train {task_name}")

        # Logs/Output
        output_box = gr.Textbox(label="Logs / Output")
        image_display = gr.Image(label="Plot Output", visible=True)

        # Function to update columns
        def update_columns_fn(dataopt, f, p, kagfile, kcname, kdname, iscomp):
            # Delegates path resolution and CSV reading to get_columns_from_data.
            cols = get_columns_from_data(dataopt, f, p, kagfile, kcname, kdname, iscomp)
            # Return updated choices for drop_cols_chk, select_cols_chk
            if cols:
                return gr.update(choices=cols), gr.update(choices=cols)
            else:
                return gr.update(choices=[]), gr.update(choices=[])

        update_cols_btn.click(
            fn=update_columns_fn,
            inputs=[
                data_option, data_file, data_path_txt,
                kaggle_json, kaggle_competition_name, kaggle_data_name,
                kaggle_is_competition
            ],
            outputs=[drop_cols_chk, select_cols_chk]
        )

        def run_task(model_mod, dataopt, f, p, kagfile, kcname, kdname, iscomp,
                     drop_cols, select_cols, visualize, mpath, rpath):
            # Build the command for the relevant script
            script_cmd = [sys.executable, os.path.join(script_path)]
            script_cmd.extend(["--model_module", model_mod])

            # Minimal approach for data path logic
            final_path = None
            if dataopt == "Upload Data File" and f is not None:
                final_path = f
            elif dataopt == "Provide Data Path" and os.path.exists(p):
                final_path = p
            else:
                # For Kaggle or other complexities, skipping for brevity.
                # Could handle it similarly to get_columns_from_data approach
                final_path = ""

            if final_path:
                script_cmd.extend(["--data_path", final_path])

            # drop cols
            if drop_cols and len(drop_cols) > 0:
                script_cmd.extend(["--drop_columns", ",".join(drop_cols)])
            # select cols
            if select_cols and len(select_cols) > 0:
                script_cmd.extend(["--select_columns", ",".join(select_cols)])
            # visualize
            if visualize:
                script_cmd.append("--visualize")

            # model_path
            if mpath.strip():
                script_cmd.extend(["--model_path", mpath.strip()])
            # results_path
            if rpath.strip():
                script_cmd.extend(["--results_path", rpath.strip()])

            print("Executing command:", " ".join(script_cmd))
            # run_subprocess ignores its first argument; the command list
            # already contains the interpreter and script path.
            out_text, plot_path = run_subprocess(script_path, script_cmd)
            return out_text, plot_path

        # The Train button is above logs, so let's define the click function
        train_btn.click(
            fn=run_task,
            inputs=[
                model_select, data_option, data_file, data_path_txt,
                kaggle_json, kaggle_competition_name, kaggle_data_name, kaggle_is_competition,
                drop_cols_chk, select_cols_chk, visualize_chk,
                model_path_txt, results_path_txt
            ],
            outputs=[output_box, image_display]
        )

    return  # end create_task_tab


#####################################
# Build the Main Gradio App
#####################################

# Top-level UI: one tab per unsupervised task, each wired to its own
# training script and its model directory under models/unsupervised/<task>/.
with gr.Blocks() as demo:
    gr.Markdown("# Unsupervised Learning Gradio Interface")

    # 1) Clustering Tab
    clustering_modules = get_model_modules("clustering")
    create_task_tab(
        task_name="Clustering",
        model_modules=clustering_modules,
        script_path="scripts/train_clustering_model.py"
    )

    # 2) Dimensionality Reduction Tab
    dimred_modules = get_model_modules("dimred")
    create_task_tab(
        task_name="Dimensionality Reduction",
        model_modules=dimred_modules,
        script_path="scripts/train_dimred_model.py"
    )

    # 3) Anomaly Detection Tab
    anomaly_modules = get_model_modules("anomaly")
    create_task_tab(
        task_name="Anomaly Detection",
        model_modules=anomaly_modules,
        script_path="scripts/train_anomaly_detection.py"
    )

if __name__ == "__main__":
    # Launch with Gradio defaults (local server, default host/port).
    demo.launch()