""" Gradio Interface for Unsupervised Learning (Clustering, Dimensionality Reduction, Anomaly Detection) This script provides a single Gradio-based interface to run three unsupervised tasks: 1. Clustering 2. Dimensionality Reduction 3. Anomaly (Outlier) Detection Each task is placed in its own Gradio Tab. The user can: - Choose a model from the relevant unsupervised directory (clustering/dimred/anomaly). - Specify dataset input (upload, local path, or Kaggle). - Select columns to drop or keep. - Execute the relevant training script (train_clustering_model.py, train_dimred_model.py, or train_anomaly_detection.py). - View logs and optional plots. Project Requirements: - Python 3.7+. - Gradio, scikit-learn, pandas, etc. in requirements.txt. - Properly structured project with: - scripts/train_clustering_model.py - scripts/train_dimred_model.py - scripts/train_anomaly_detection.py - models/unsupervised//.py - data/datasets/kaggle_data.py (optional for Kaggle usage). """ import gradio as gr import pandas as pd import os import subprocess import sys import glob import re ##################################### # Helper Functions ##################################### def get_model_modules(task_type): """ Dynamically fetch model modules from the unsupervised subdirectories: - clustering - dimred - anomaly """ models_dir = os.path.join('models', 'unsupervised', task_type) if not os.path.exists(models_dir): print(f"Directory does not exist: {models_dir}") return [] model_files = glob.glob(os.path.join(models_dir, '*.py')) modules = [ os.path.splitext(os.path.basename(f))[0] for f in model_files if not f.endswith('__init__.py') ] return modules def download_kaggle_data(json_path, dataset_name, is_competition): from data.datasets.kaggle_data import get_kaggle_data data_path = get_kaggle_data(json_path=json_path, data_name=dataset_name, is_competition=is_competition) return data_path def run_subprocess(script_path, script_args): """ Run a subprocess call to the given script with the specified arguments. Returns (output_text, plot_image_path_or_None). """ try: result = subprocess.run(script_args, capture_output=True, text=True) output = result.stdout errors = result.stderr if result.returncode != 0: return f"Error during training:\n{errors}", None else: # Attempt to parse any 'Visualization saved to ...' line for an image path output = re.sub(r"Figure\(\d+x\d+\)", "", output).strip() image_path = None # Look for "Plot saved to ..." or any ".png" reference match_plot = re.search(r"Plot saved to (.+)", output) if match_plot: image_path = match_plot.group(1).strip() else: match_png = re.search(r"(\S+\.png)", output) if match_png: image_path = match_png.group(1) if image_path and os.path.exists(image_path): return f"Completed successfully.\n\n{output}", image_path else: return f"Completed successfully.\n\n{output}", None except Exception as e: return f"An error occurred:\n{str(e)}", None def get_columns_from_data(data_option, data_file, data_path, kaggle_json_file, kaggle_competition_name, kaggle_data_name, is_competition): """ Attempt to load the CSV and return columns. """ final_path = None if data_option == "Upload Data File": if data_file is None: return [] final_path = data_file elif data_option == "Provide Data Path": if os.path.exists(data_path): final_path = data_path else: print("Provided path does not exist.") return [] elif data_option == "Download from Kaggle": if kaggle_json_file is None: print("No kaggle.json uploaded.") return [] import shutil kaggle_config_dir = os.path.expanduser('~/.kaggle') os.makedirs(kaggle_config_dir, exist_ok=True) kaggle_json_path = os.path.join(kaggle_config_dir, 'kaggle.json') shutil.copy(kaggle_json_file.name, kaggle_json_path) os.chmod(kaggle_json_path, 0o600) data_dir = download_kaggle_data(kaggle_json_path, kaggle_competition_name, is_competition) if data_dir is None: print("Failed to download from Kaggle.") return [] final_path = os.path.join(data_dir, kaggle_data_name) if not os.path.exists(final_path): print(f"{kaggle_data_name} not found in Kaggle data.") return [] else: print("Invalid data option.") return [] try: df = pd.read_csv(final_path) return df.columns.tolist() except Exception as e: print(f"Error reading {final_path}: {e}") return [] ##################################### # Creating the Gradio Tab ##################################### def create_task_tab(task_name, model_modules, script_path): """ Creates a Gradio Tab for a specific unsupervised task (Clustering, DimRed, Anomaly). - model_modules: list of model modules from get_model_modules(task_type) - script_path: e.g. 'scripts/train_clustering_model.py' """ with gr.Tab(task_name): gr.Markdown(f"## {task_name} Task") # Model selection model_select = gr.Dropdown(choices=model_modules, label=f"{task_name} Model Module") # Data input approach data_option = gr.Radio( choices=["Upload Data File", "Provide Data Path", "Download from Kaggle"], label="Data Input Option", value="Upload Data File" ) with gr.Column(visible=True) as upload_data_col: data_file = gr.File(label="Upload CSV Data File", type="filepath") with gr.Column(visible=False) as path_data_col: data_path_txt = gr.Textbox(label="Data File Path") with gr.Column(visible=False) as kaggle_data_col: kaggle_json = gr.File(label="Upload kaggle.json File", type="filepath") kaggle_competition_name = gr.Textbox(value='', label="Kaggle Competition/Dataset Name") kaggle_data_name = gr.Textbox(value='data.csv', label="Data File Name in Kaggle dataset") kaggle_is_competition = gr.Checkbox(label="Is Kaggle Competition?", value=False) # Toggle data input columns def toggle_data_input(choice): if choice == "Upload Data File": return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False) elif choice == "Provide Data Path": return gr.update(visible=False), gr.update(visible=True), gr.update(visible=False) elif choice == "Download from Kaggle": return gr.update(visible=False), gr.update(visible=False), gr.update(visible=True) else: return gr.update(visible=False), gr.update(visible=False), gr.update(visible=False) data_option.change( toggle_data_input, inputs=[data_option], outputs=[upload_data_col, path_data_col, kaggle_data_col] ) # Update columns button update_cols_btn = gr.Button("Update Columns") # We remove "Columns in Data (for reference)" as requested drop_cols_chk = gr.CheckboxGroup(choices=[], label="Columns to Drop") select_cols_chk = gr.CheckboxGroup(choices=[], label="Columns to Keep (if empty, keep all)") # Visualization param visualize_chk = gr.Checkbox(label="Visualize 2D (using PCA if needed)", value=True) # Model / results path with empty default, and label "(optional)" model_path_txt = gr.Textbox(label="Model Save Path (optional)", value="") results_path_txt = gr.Textbox(label="Results Save Path (optional)", value="") # The Train button train_btn = gr.Button(f"Train {task_name}") # Logs/Output output_box = gr.Textbox(label="Logs / Output") image_display = gr.Image(label="Plot Output", visible=True) # Function to update columns def update_columns_fn(dataopt, f, p, kagfile, kcname, kdname, iscomp): cols = get_columns_from_data(dataopt, f, p, kagfile, kcname, kdname, iscomp) # Return updated choices for drop_cols_chk, select_cols_chk if cols: return gr.update(choices=cols), gr.update(choices=cols) else: return gr.update(choices=[]), gr.update(choices=[]) update_cols_btn.click( fn=update_columns_fn, inputs=[ data_option, data_file, data_path_txt, kaggle_json, kaggle_competition_name, kaggle_data_name, kaggle_is_competition ], outputs=[drop_cols_chk, select_cols_chk] ) def run_task(model_mod, dataopt, f, p, kagfile, kcname, kdname, iscomp, drop_cols, select_cols, visualize, mpath, rpath): # Build the command for the relevant script script_cmd = [sys.executable, os.path.join(script_path)] script_cmd.extend(["--model_module", model_mod]) # Minimal approach for data path logic final_path = None if dataopt == "Upload Data File" and f is not None: final_path = f elif dataopt == "Provide Data Path" and os.path.exists(p): final_path = p else: # For Kaggle or other complexities, skipping for brevity. # Could handle it similarly to get_columns_from_data approach final_path = "" if final_path: script_cmd.extend(["--data_path", final_path]) # drop cols if drop_cols and len(drop_cols) > 0: script_cmd.extend(["--drop_columns", ",".join(drop_cols)]) # select cols if select_cols and len(select_cols) > 0: script_cmd.extend(["--select_columns", ",".join(select_cols)]) # visualize if visualize: script_cmd.append("--visualize") # model_path if mpath.strip(): script_cmd.extend(["--model_path", mpath.strip()]) # results_path if rpath.strip(): script_cmd.extend(["--results_path", rpath.strip()]) print("Executing command:", " ".join(script_cmd)) out_text, plot_path = run_subprocess(script_path, script_cmd) return out_text, plot_path # The Train button is above logs, so let's define the click function train_btn.click( fn=run_task, inputs=[ model_select, data_option, data_file, data_path_txt, kaggle_json, kaggle_competition_name, kaggle_data_name, kaggle_is_competition, drop_cols_chk, select_cols_chk, visualize_chk, model_path_txt, results_path_txt ], outputs=[output_box, image_display] ) return # end create_task_tab ##################################### # Build the Main Gradio App ##################################### with gr.Blocks() as demo: gr.Markdown("# Unsupervised Learning Gradio Interface") # 1) Clustering Tab clustering_modules = get_model_modules("clustering") create_task_tab( task_name="Clustering", model_modules=clustering_modules, script_path="scripts/train_clustering_model.py" ) # 2) Dimensionality Reduction Tab dimred_modules = get_model_modules("dimred") create_task_tab( task_name="Dimensionality Reduction", model_modules=dimred_modules, script_path="scripts/train_dimred_model.py" ) # 3) Anomaly Detection Tab anomaly_modules = get_model_modules("anomaly") create_task_tab( task_name="Anomaly Detection", model_modules=anomaly_modules, script_path="scripts/train_anomaly_detection.py" ) if __name__ == "__main__": demo.launch()