File size: 13,833 Bytes
993cfb9
 
 
 
 
 
 
 
 
 
 
2712881
26816ad
 
993cfb9
26816ad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
993cfb9
26816ad
2712881
993cfb9
 
 
 
 
 
 
26816ad
993cfb9
 
 
2712881
 
 
 
 
993cfb9
 
2712881
 
 
26816ad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2712881
 
 
26816ad
 
 
 
 
 
 
2712881
 
 
26816ad
2712881
26816ad
2712881
 
 
 
 
993cfb9
26816ad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
993cfb9
 
 
 
 
 
 
 
 
 
26816ad
 
993cfb9
 
26816ad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
993cfb9
 
 
 
 
 
 
26816ad
 
993cfb9
26816ad
 
993cfb9
 
 
 
 
 
 
26816ad
993cfb9
26816ad
 
 
 
 
993cfb9
26816ad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
993cfb9
 
 
26816ad
 
 
993cfb9
 
2712881
993cfb9
26816ad
993cfb9
26816ad
 
993cfb9
 
26816ad
993cfb9
26816ad
993cfb9
 
26816ad
2712881
26816ad
 
 
 
 
2712881
26816ad
 
2712881
26816ad
2712881
 
 
26816ad
2712881
 
26816ad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2712881
 
26816ad
2712881
26816ad
2712881
 
26816ad
 
2712881
26816ad
2712881
26816ad
 
2712881
26816ad
 
 
2712881
993cfb9
2712881
 
993cfb9
 
 
 
 
 
 
 
 
131a1cc
993cfb9
2712881
993cfb9
2712881
993cfb9
2712881
993cfb9
2712881
993cfb9
 
 
 
 
 
 
 
26816ad
 
 
 
 
 
 
 
 
 
 
 
993cfb9
2712881
 
 
 
 
 
 
 
 
 
 
 
993cfb9
 
 
2712881
26816ad
 
993cfb9
 
 
 
 
2712881
993cfb9
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
import gradio as gr
import pandas as pd
import io
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from lazypredict.Supervised import LazyClassifier, LazyRegressor
from sklearn.model_selection import train_test_split
from ydata_profiling import ProfileReport
import tempfile
import requests
import json
from typing import Optional, Tuple, Any, Union
from openai import OpenAI  # Added for Nebius AI Studio LLM integration

# Constants
# Fallback strings returned by run_pipeline when analysis cannot proceed;
# they surface in the "Detected Task" and "Detected Columns" UI textboxes.
NO_TASK_DETECTED = "No task detected"
NO_COLUMNS_LOADED = "No columns loaded."


def load_data(file_input: Any) -> Tuple[Optional[pd.DataFrame], Optional[str]]:
    """
    Loads CSV data from a local file upload, a plain file path, or a public URL.

    Args:
        file_input: A Gradio file object (anything with a ``.name`` attribute),
            a URL string starting with ``http://``/``https://``, or a local
            file-path string.

    Returns:
        Tuple containing the DataFrame and comma-separated column names,
        or (None, None) if the input is missing or loading fails.
    """
    if file_input is None:
        return None, None

    try:
        if hasattr(file_input, 'name'):
            # Gradio upload object: pandas can read its temp-file path
            # directly; no need to buffer the whole file in memory first.
            df = pd.read_csv(file_input.name)
        elif isinstance(file_input, str) and file_input.startswith(('http://', 'https://')):
            # Bare 'http' prefix would also match strings like "httpfoo";
            # require a full scheme before treating the input as a URL.
            response = requests.get(file_input, timeout=30)
            response.raise_for_status()
            df = pd.read_csv(io.StringIO(response.text))
        elif isinstance(file_input, str) and file_input.strip():
            # Plain string path (e.g. Gradio File component with
            # type="filepath" passes a str, not a file object).
            df = pd.read_csv(file_input)
        else:
            return None, None

        # Extract column names here so callers can display them immediately.
        column_names = ", ".join(df.columns.tolist())
        return df, column_names
    except Exception as e:
        gr.Warning(f"Failed to load or parse data: {e}")
        return None, None


def generate_dataset_summary(df: pd.DataFrame, target_column: str) -> str:
    """
    Builds a short plain-text profile of the dataset for use as LLM context.

    Args:
        df: The dataset to describe.
        target_column: Name of the prediction target column.

    Returns:
        Newline-separated summary lines covering shape, target cardinality,
        feature names, total missing values, and column-type counts.
    """
    row_count, col_count = df.shape
    feature_names = [col for col in df.columns if col != target_column]
    numeric_count = len(df.select_dtypes(include=['number']).columns)
    categorical_count = len(df.select_dtypes(include=['object', 'category']).columns)

    lines = [
        f"Dataset Shape: {row_count} rows, {col_count} columns",
        f"Target Column: {target_column}",
        f"Target Unique Values: {df[target_column].nunique()}",
        f"Features: {', '.join(feature_names)}",
        f"Missing Values: {df.isnull().sum().sum()} total",
        f"Numeric Columns: {numeric_count}",
        f"Categorical Columns: {categorical_count}",
    ]
    return "\n".join(lines)


def update_detected_columns_display(file_data: Any, url_data: Optional[str]) -> str:
    """
    Detects and displays column names from the uploaded file or URL
    as soon as the input changes, before the main analysis button is pressed.

    Args:
        file_data: File object from Gradio file upload component.
        url_data: URL string from Gradio textbox component.

    Returns:
        Comma-separated string of column names, an empty string when no
        input has been provided yet, or an error message on load failure.
    """
    data_source = file_data if file_data is not None else url_data
    # A cleared/untouched URL textbox yields "" (not None); treat blank
    # strings the same as no input so the UI stays empty instead of
    # showing a spurious load error.
    if data_source is None or (isinstance(data_source, str) and not data_source.strip()):
        return ""

    _, column_names = load_data(data_source)
    if column_names:
        return column_names
    return "No columns detected or error loading file. Please check the file format."


def analyze_and_model(
    df: pd.DataFrame, 
    target_column: str
) -> Tuple[ProfileReport, str, str, pd.DataFrame, str, str, str]:
    """
    Internal function to perform EDA, model training, and visualization.
    
    Args:
        df: The pandas DataFrame containing the dataset.
        target_column: The name of the target column for prediction.
        
    Returns:
        Tuple containing: profile report, profile path, task type, 
        models dataframe, plot path, pickle path, and best model name.
    """
    # EDA report written to a temp HTML file for download from the UI.
    profile = ProfileReport(df, title="EDA Report", minimal=True)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".html") as temp_html:
        profile.to_file(temp_html.name)
        profile_path = temp_html.name

    X = df.drop(columns=[target_column])
    y = df[target_column]
    # Heuristic: few unique target values -> classification, else regression.
    task = "classification" if y.nunique() <= 10 else "regression"
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    lazy_model = (
        LazyClassifier(ignore_warnings=True, verbose=0)
        if task == "classification"
        else LazyRegressor(ignore_warnings=True, verbose=0)
    )
    models, _ = lazy_model.fit(X_train, X_test, y_train, y_test)

    sort_metric = "Accuracy" if task == "classification" else "R-Squared"
    sorted_models = models.sort_values(by=sort_metric, ascending=False)
    best_model_name = sorted_models.index[0]

    # Safely access the best model with error handling
    try:
        best_model = lazy_model.models[best_model_name]
    except KeyError:
        # Fallback: lazypredict may pad names; retry with stripped keys.
        matching_key = next(
            (k for k in lazy_model.models if k.strip() == best_model_name.strip()),
            None,
        )
        if matching_key:
            best_model = lazy_model.models[matching_key]
        else:
            # Use the first available model as fallback
            best_model = next(iter(lazy_model.models.values()))
            gr.Warning(f"Could not find exact model '{best_model_name}', using first available model.")

    # Serialize the winning model for download.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pkl") as temp_pkl:
        pickle.dump(best_model, temp_pkl)
        pickle_path = temp_pkl.name

    # Bar chart of the top 10 models. Use the explicitly sorted frame so the
    # chart matches its "Top 10" title regardless of lazypredict's default
    # ordering; close the figure even if plotting fails.
    plt.figure(figsize=(10, 6))
    try:
        top_models = sorted_models.head(10)
        sns.barplot(x=top_models[sort_metric].values, y=top_models.index.tolist())
        plt.title(f"Top 10 Models by {sort_metric}")
        plt.xlabel(sort_metric)
        plt.ylabel("Model")
        plt.tight_layout()
        with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_png:
            plt.savefig(temp_png.name)
            plot_path = temp_png.name
    finally:
        plt.close()

    # Move the model names out of the index so the Gradio Dataframe shows them.
    models_reset = models.reset_index().rename(columns={'index': 'Model'})
    return profile, profile_path, task, models_reset, plot_path, pickle_path, best_model_name

def run_pipeline(
    data_source: Union[Any, str], 
    target_column: str, 
    nebius_api_key: Optional[str] = None
) -> Tuple[Optional[str], str, Optional[pd.DataFrame], Optional[str], Optional[str], str, str]:
    """
    Run the complete AutoML pipeline including data loading, EDA, model training, and AI explanation.
    
    This is the primary MCP tool function that orchestrates the entire AutoML workflow.
    
    Args:
        data_source: Either a file path/object from local upload or a URL string pointing to a CSV file.
        target_column: The name of the column to predict (target variable).
        nebius_api_key: Optional API key for Nebius AI Studio to enable AI-powered explanations.
        
    Returns:
        Tuple containing:
        - eda_report_path: Path to the generated HTML EDA report file.
        - task_type: Either "classification" or "regression" based on target variable.
        - models_dataframe: DataFrame with performance metrics of all trained models.
        - visualization_path: Path to the model comparison chart image.
        - model_pickle_path: Path to the serialized best model (.pkl file).
        - llm_explanation: AI-generated explanation of results (or fallback message).
        - column_names: Comma-separated list of detected column names.
    """
    # --- 1. Input Validation ---
    # Rejects None and empty-string inputs alike (both are falsy).
    if not data_source or not target_column:
        error_msg = "Please provide both a data source and target column name."
        gr.Warning("Error: Data source and target column must be provided.")
        return None, NO_TASK_DETECTED, None, None, None, error_msg, NO_COLUMNS_LOADED

    gr.Info("Starting analysis...")

    # --- 2. Data Loading ---
    df, column_names = load_data(data_source)
    if df is None:
        error_msg = "Could not load data. Please check the file format or URL."
        return None, NO_TASK_DETECTED, None, None, None, error_msg, NO_COLUMNS_LOADED

    if target_column not in df.columns:
        # Surface the detected columns in the error so the user can fix a typo.
        error_msg = f"Target column '{target_column}' not found. Available columns: {column_names}"
        gr.Warning(error_msg)
        return None, NO_TASK_DETECTED, None, None, None, error_msg, column_names

    # --- 3. Analysis and Modeling ---
    # Returns temp-file paths for the EDA report, comparison plot, and pickled model.
    _, profile_path, task, models_df, plot_path, pickle_path, best_model_name = analyze_and_model(df, target_column)

    # --- 4. Generate Dataset Summary for LLM Context ---
    dataset_summary = generate_dataset_summary(df, target_column)
    
    # Get top 5 model performance summary
    top_models_summary = models_df.head(5).to_string(index=False)

    # --- 5. Explanation with Nebius AI Studio LLM ---
    # Default message used when no API key was supplied or the call fails.
    llm_explanation = "AI explanation is unavailable. Please provide a Nebius AI Studio API key to enable this feature."

    if nebius_api_key and nebius_api_key.strip():
        try:
            # Nebius AI Studio exposes an OpenAI-compatible endpoint, so the
            # standard OpenAI client is reused with a custom base_url.
            client = OpenAI(
                base_url="https://api.studio.nebius.com/v1/",
                api_key=nebius_api_key.strip()
            )

            # Craft an improved prompt with actual data context
            prompt_text = f"""Analyze this AutoML result and provide a concise, professional explanation:

**Dataset Overview:**
{dataset_summary}

**Task Type:** {task}

**Top 5 Performing Models:**
{top_models_summary}

**Best Model:** {best_model_name}

Please explain:
1. Why '{best_model_name}' performed best for this {task} task
2. Key insights about the dataset characteristics
3. Recommendations for model deployment or further improvement

Keep the explanation concise (3-4 paragraphs) and accessible to both technical and non-technical stakeholders."""

            response = client.chat.completions.create(
                model="meta-llama/Llama-3.3-70B-Instruct",
                messages=[
                    {"role": "system", "content": "You are an expert data scientist assistant that explains machine learning results clearly and professionally."},
                    {"role": "user", "content": prompt_text}
                ],
                temperature=0.6,
                max_tokens=512,
                top_p=0.9,
                # top_k is Nebius-specific; passed via extra_body since the
                # OpenAI client does not expose it as a named parameter.
                extra_body={"top_k": 50}
            )
            # Simplified response access (no need for json.loads)
            llm_explanation = response.choices[0].message.content

        except Exception as e:
            # Best-effort: the pipeline result is still returned even if the
            # LLM call fails; only the explanation degrades to a fallback.
            gr.Warning(f"Failed to get AI explanation: {e}")
            llm_explanation = f"AI explanation unavailable due to an error. The best performing model is **{best_model_name}** for your {task} task."

    gr.Info("Analysis complete!")
    gr.Info(f'Profile report saved to: {profile_path}')
    return profile_path, task, models_df, plot_path, pickle_path, llm_explanation, column_names 

# --- Gradio UI ---
# Layout: input controls (file/URL/target/API key) on the left, results
# (detected columns, task, AI explanation, metrics) on the right, and the
# comparison chart plus downloadable artifacts in a bottom row.
with gr.Blocks(title="AutoML Trainer", theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 🤖 AutoML Trainer")

    with gr.Row():
        with gr.Column(scale=1):
            file_input = gr.File(label="Upload Local CSV File")
            url_input = gr.Textbox(label="Or Enter Public CSV URL", placeholder="e.g., https://.../data.csv")
            # Read-only convenience: a sample dataset URL the user can copy.
            gr.Textbox(label="Sample CSV", value="https://raw.githubusercontent.com/daniel-was-taken/MCP_Project/refs/heads/master/collegePlace.csv")
            target_column_input = gr.Textbox(label="Enter Target Column Name", placeholder="e.g., approved")
            nebius_api_key_input = gr.Textbox(label="Nebius AI Studio API Key (Optional)", type="password", placeholder="Enter your API key for AI explanations")
            run_button = gr.Button("Run Analysis & AutoML", variant="primary")

        with gr.Column(scale=2):
            column_names_output = gr.Textbox(label="Detected Columns", interactive=False, lines=2) # New Textbox for column names
            task_output = gr.Textbox(label="Detected Task", interactive=False)
            llm_output = gr.Markdown(label="AI Explanation")
            metrics_output = gr.Dataframe(label="Model Performance Metrics")

    with gr.Row():
        vis_output = gr.Image(label="Top Models Comparison")
        with gr.Column():
            eda_output = gr.File(label="Download Full EDA Report")
            model_output = gr.File(label="Download Best Model (.pkl)")

    def process_inputs(
        file_data: Any, 
        url_data: Optional[str], 
        target: str, 
        api_key: Optional[str]
    ) -> Tuple[Optional[str], str, Optional[pd.DataFrame], Optional[str], Optional[str], str, str]:
        """
        Process inputs and run the AutoML pipeline.
        
        This wrapper function handles input selection between file upload and URL,
        then delegates to the main run_pipeline function.
        """
        # File upload takes precedence over the URL textbox when both are set.
        data_source = file_data if file_data is not None else url_data
        return run_pipeline(data_source, target, api_key)

    # Live column preview: refresh whenever either data input changes,
    # before the main analysis button is pressed.
    file_input.change(
        fn=update_detected_columns_display,
        inputs=[file_input, url_input],
        outputs=column_names_output
    )
    url_input.change(
        fn=update_detected_columns_display,
        inputs=[file_input, url_input],
        outputs=column_names_output
    )

    run_button.click(
        fn=process_inputs,
        inputs=[file_input, url_input, target_column_input, nebius_api_key_input],
        outputs=[eda_output, task_output, metrics_output, vis_output, model_output, llm_output, column_names_output],
        api_name="run_automl_pipeline"  # Explicit API name for MCP
    )

if __name__ == "__main__":
    # Guard the launch so importing this module (e.g. from tests or other
    # tools) does not start a web server; running as a script is unchanged.
    demo.launch(
        server_name="0.0.0.0",  # listen on all interfaces (container/Spaces friendly)
        server_port=7860,
        share=False,
        inbrowser=True,
        mcp_server=True  # expose the pipeline as an MCP tool server
    )