File size: 12,353 Bytes
4c91838
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0922d39
4c91838
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0922d39
4c91838
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324

"""
Gradio Interface for Unsupervised Learning (Clustering, Dimensionality Reduction, Anomaly Detection)

This script provides a single Gradio-based interface to run three unsupervised tasks:
1. Clustering
2. Dimensionality Reduction
3. Anomaly (Outlier) Detection

Each task is placed in its own Gradio Tab. The user can:
- Choose a model from the relevant unsupervised directory (clustering/dimred/anomaly).
- Specify dataset input (upload, local path, or Kaggle).
- Select columns to drop or keep.
- Execute the relevant training script (train_clustering_model.py, train_dimred_model.py, or train_anomaly_detection.py).
- View logs and optional plots.

Project Requirements:
- Python 3.7+.
- Gradio, scikit-learn, pandas, etc. in requirements.txt.
- Properly structured project with:
  - scripts/train_clustering_model.py
  - scripts/train_dimred_model.py
  - scripts/train_anomaly_detection.py
  - models/unsupervised/<task>/<model>.py
  - data/datasets/kaggle_data.py (optional for Kaggle usage).
"""

import gradio as gr
import pandas as pd
import os
import subprocess
import sys
import glob
import re

#####################################
# Helper Functions
#####################################

def get_model_modules(task_type):
    """
    List the model module names available for one unsupervised task.

    Scans models/unsupervised/<task_type> for Python files and returns
    their basenames without the .py extension, skipping __init__.py.
    Returns an empty list (after printing a notice) when the directory
    is missing.
    """
    models_dir = os.path.join('models', 'unsupervised', task_type)
    if not os.path.exists(models_dir):
        print(f"Directory does not exist: {models_dir}")
        return []
    modules = []
    for path in glob.glob(os.path.join(models_dir, '*.py')):
        if path.endswith('__init__.py'):
            continue
        modules.append(os.path.splitext(os.path.basename(path))[0])
    return modules

def download_kaggle_data(json_path, dataset_name, is_competition):
    """
    Fetch a Kaggle dataset or competition via the project helper.

    Delegates to data.datasets.kaggle_data.get_kaggle_data and returns
    its result — presumably a local directory path containing the
    downloaded files (TODO confirm against the helper's contract).
    """
    # Imported lazily so the app still starts when the Kaggle helper
    # (an optional project module) is absent and this path is unused.
    from data.datasets.kaggle_data import get_kaggle_data
    return get_kaggle_data(json_path=json_path, data_name=dataset_name, is_competition=is_competition)

def run_subprocess(script_path, script_args):
    """
    Run a training script as a subprocess and collect its output.

    Parameters
    ----------
    script_path : str
        Kept for interface compatibility with existing callers; the
        script to execute is already embedded in script_args, so this
        value is not consulted here.
    script_args : list[str]
        Full command line: [interpreter, script, --flag, value, ...].

    Returns
    -------
    tuple
        (log text, path to a saved plot image if one was detected in
        stdout and exists on disk, otherwise None).
    """
    try:
        proc = subprocess.run(script_args, capture_output=True, text=True)
        if proc.returncode != 0:
            return f"Error during training:\n{proc.stderr}", None

        # Drop matplotlib's "Figure(WxH)" repr noise before scanning the logs.
        cleaned = re.sub(r"Figure\(\d+x\d+\)", "", proc.stdout).strip()

        # Prefer an explicit "Plot saved to <path>" line; otherwise fall
        # back to the first whitespace-free token ending in ".png".
        image_path = None
        saved_line = re.search(r"Plot saved to (.+)", cleaned)
        if saved_line:
            image_path = saved_line.group(1).strip()
        else:
            png_token = re.search(r"(\S+\.png)", cleaned)
            if png_token:
                image_path = png_token.group(1)

        # Only hand back a path the UI can actually display.
        if image_path and os.path.exists(image_path):
            return f"Completed successfully.\n\n{cleaned}", image_path
        return f"Completed successfully.\n\n{cleaned}", None
    except Exception as e:
        return f"An error occurred:\n{str(e)}", None

def get_columns_from_data(data_option, data_file, data_path,
                          kaggle_json_file, kaggle_competition_name, kaggle_data_name,
                          is_competition):
    """
    Resolve the dataset location for the chosen input option and return
    the CSV's column names.

    data_option selects among "Upload Data File", "Provide Data Path"
    and "Download from Kaggle"; only the arguments relevant to the
    selected option are consulted. Returns an empty list (after printing
    a diagnostic) on any failure so the UI callback can simply clear the
    column choices.
    """
    final_path = None
    if data_option == "Upload Data File":
        if data_file is None:
            return []
        final_path = data_file
    elif data_option == "Provide Data Path":
        # Guard against None: os.path.exists(None) raises TypeError.
        if data_path and os.path.exists(data_path):
            final_path = data_path
        else:
            print("Provided path does not exist.")
            return []
    elif data_option == "Download from Kaggle":
        if kaggle_json_file is None:
            print("No kaggle.json uploaded.")
            return []
        import shutil
        # The Kaggle API client reads credentials from ~/.kaggle/kaggle.json.
        kaggle_config_dir = os.path.expanduser('~/.kaggle')
        os.makedirs(kaggle_config_dir, exist_ok=True)
        kaggle_json_path = os.path.join(kaggle_config_dir, 'kaggle.json')
        # gr.File(type="filepath") hands us a plain path string; older
        # Gradio versions pass a tempfile wrapper exposing .name. Support both
        # (the original `.name` access crashed on plain strings).
        kaggle_json_src = getattr(kaggle_json_file, 'name', kaggle_json_file)
        shutil.copy(kaggle_json_src, kaggle_json_path)
        os.chmod(kaggle_json_path, 0o600)  # Kaggle rejects world-readable credentials

        data_dir = download_kaggle_data(kaggle_json_path, kaggle_competition_name, is_competition)
        if data_dir is None:
            print("Failed to download from Kaggle.")
            return []
        final_path = os.path.join(data_dir, kaggle_data_name)
        if not os.path.exists(final_path):
            print(f"{kaggle_data_name} not found in Kaggle data.")
            return []
    else:
        print("Invalid data option.")
        return []

    try:
        df = pd.read_csv(final_path)
        return df.columns.tolist()
    except Exception as e:
        print(f"Error reading {final_path}: {e}")
        return []

#####################################
# Creating the Gradio Tab
#####################################

def create_task_tab(task_name, model_modules, script_path):
    """
    Creates a Gradio Tab for a specific unsupervised task (Clustering, DimRed, Anomaly).
    - model_modules: list of model modules from get_model_modules(task_type)
    - script_path: e.g. 'scripts/train_clustering_model.py'

    Layout, top to bottom: model dropdown; a data-input radio with three
    mutually exclusive columns (upload / local path / Kaggle); a button that
    refreshes the drop/keep column CheckboxGroups from the chosen CSV;
    visualization and optional save-path fields; the Train button; and the
    log textbox plus plot image that receive the training results.
    Must be called inside an active gr.Blocks() context; returns None.
    """

    with gr.Tab(task_name):
        gr.Markdown(f"## {task_name} Task")

        # Model selection
        model_select = gr.Dropdown(choices=model_modules, label=f"{task_name} Model Module")

        # Data input approach
        data_option = gr.Radio(
            choices=["Upload Data File", "Provide Data Path", "Download from Kaggle"],
            label="Data Input Option",
            value="Upload Data File"
        )

        # Three alternative input columns; only the one matching the radio
        # selection is visible (toggled by toggle_data_input below).
        with gr.Column(visible=True) as upload_data_col:
            data_file = gr.File(label="Upload CSV Data File", type="filepath")

        with gr.Column(visible=False) as path_data_col:
            data_path_txt = gr.Textbox(label="Data File Path")

        with gr.Column(visible=False) as kaggle_data_col:
            kaggle_json = gr.File(label="Upload kaggle.json File", type="filepath")
            kaggle_competition_name = gr.Textbox(value='', label="Kaggle Competition/Dataset Name")
            kaggle_data_name = gr.Textbox(value='data.csv', label="Data File Name in Kaggle dataset")
            kaggle_is_competition = gr.Checkbox(label="Is Kaggle Competition?", value=False)

        # Toggle data input columns
        def toggle_data_input(choice):
            # Returns visibility updates for (upload, path, kaggle) in that order.
            if choice == "Upload Data File":
                return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False)
            elif choice == "Provide Data Path":
                return gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
            elif choice == "Download from Kaggle":
                return gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)
            else:
                return gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)

        data_option.change(
            toggle_data_input,
            inputs=[data_option],
            outputs=[upload_data_col, path_data_col, kaggle_data_col]
        )

        # Update columns button
        update_cols_btn = gr.Button("Update Columns")

        # We remove "Columns in Data (for reference)" as requested
        # Choices start empty and are populated by update_columns_fn.
        drop_cols_chk = gr.CheckboxGroup(choices=[], label="Columns to Drop")
        select_cols_chk = gr.CheckboxGroup(choices=[], label="Columns to Keep (if empty, keep all)")

        # Visualization param
        visualize_chk = gr.Checkbox(label="Visualize 2D (using PCA if needed)", value=True)

        # Model / results path with empty default, and label "(optional)"
        model_path_txt = gr.Textbox(label="Model Save Path (optional)", value="")
        results_path_txt = gr.Textbox(label="Results Save Path (optional)", value="")

        # The Train button
        train_btn = gr.Button(f"Train {task_name}")

        # Logs/Output
        output_box = gr.Textbox(label="Logs / Output")
        image_display = gr.Image(label="Plot Output", visible=True)

        # Function to update columns
        def update_columns_fn(dataopt, f, p, kagfile, kcname, kdname, iscomp):
            # Delegates path resolution and CSV reading to get_columns_from_data.
            cols = get_columns_from_data(dataopt, f, p, kagfile, kcname, kdname, iscomp)
            # Return updated choices for drop_cols_chk, select_cols_chk
            if cols:
                return gr.update(choices=cols), gr.update(choices=cols)
            else:
                return gr.update(choices=[]), gr.update(choices=[])

        update_cols_btn.click(
            fn=update_columns_fn,
            inputs=[
                data_option, data_file, data_path_txt,
                kaggle_json, kaggle_competition_name, kaggle_data_name,
                kaggle_is_competition
            ],
            outputs=[drop_cols_chk, select_cols_chk]
        )

        def run_task(model_mod, dataopt, f, p, kagfile, kcname, kdname, iscomp,
                     drop_cols, select_cols, visualize, mpath, rpath):
            # Build the command for the relevant script
            script_cmd = [sys.executable, os.path.join(script_path)]
            script_cmd.extend(["--model_module", model_mod])

            # Minimal approach for data path logic
            final_path = None
            if dataopt == "Upload Data File" and f is not None:
                final_path = f
            elif dataopt == "Provide Data Path" and os.path.exists(p):
                final_path = p
            else:
                # For Kaggle or other complexities, skipping for brevity.
                # Could handle it similarly to get_columns_from_data approach
                final_path = ""

            if final_path:
                script_cmd.extend(["--data_path", final_path])

            # drop cols
            if drop_cols and len(drop_cols) > 0:
                script_cmd.extend(["--drop_columns", ",".join(drop_cols)])
            # select cols
            if select_cols and len(select_cols) > 0:
                script_cmd.extend(["--select_columns", ",".join(select_cols)])
            # visualize
            if visualize:
                script_cmd.append("--visualize")

            # model_path
            if mpath.strip():
                script_cmd.extend(["--model_path", mpath.strip()])
            # results_path
            if rpath.strip():
                script_cmd.extend(["--results_path", rpath.strip()])

            print("Executing command:", " ".join(script_cmd))
            # run_subprocess ignores its first argument; the command list
            # already contains the interpreter and script path.
            out_text, plot_path = run_subprocess(script_path, script_cmd)
            return out_text, plot_path

        # The Train button is above logs, so let's define the click function
        train_btn.click(
            fn=run_task,
            inputs=[
                model_select, data_option, data_file, data_path_txt,
                kaggle_json, kaggle_competition_name, kaggle_data_name, kaggle_is_competition,
                drop_cols_chk, select_cols_chk, visualize_chk,
                model_path_txt, results_path_txt
            ],
            outputs=[output_box, image_display]
        )

    return  # end create_task_tab


#####################################
# Build the Main Gradio App
#####################################

# Top-level UI: one tab per unsupervised task, each wired to its own
# training script and its model directory under models/unsupervised/<task>/.
with gr.Blocks() as demo:
    gr.Markdown("# Unsupervised Learning Gradio Interface")

    # 1) Clustering Tab
    clustering_modules = get_model_modules("clustering")
    create_task_tab(
        task_name="Clustering",
        model_modules=clustering_modules,
        script_path="scripts/train_clustering_model.py"
    )

    # 2) Dimensionality Reduction Tab
    dimred_modules = get_model_modules("dimred")
    create_task_tab(
        task_name="Dimensionality Reduction",
        model_modules=dimred_modules,
        script_path="scripts/train_dimred_model.py"
    )

    # 3) Anomaly Detection Tab
    anomaly_modules = get_model_modules("anomaly")
    create_task_tab(
        task_name="Anomaly Detection",
        model_modules=anomaly_modules,
        script_path="scripts/train_anomaly_detection.py"
    )

if __name__ == "__main__":
    # Launch with Gradio defaults (local server, default host/port).
    demo.launch()