File size: 13,833 Bytes
993cfb9
 
 
 
 
 
 
 
 
 
 
2712881
26816ad
 
993cfb9
26816ad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
993cfb9
26816ad
2712881
993cfb9
 
 
 
 
 
 
26816ad
993cfb9
 
 
2712881
 
 
 
 
993cfb9
 
2712881
 
 
26816ad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2712881
 
 
26816ad
 
 
 
 
 
 
2712881
 
 
26816ad
2712881
26816ad
2712881
 
 
 
 
993cfb9
26816ad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
993cfb9
 
 
 
 
 
 
 
 
 
26816ad
 
993cfb9
 
26816ad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
993cfb9
 
 
 
 
 
 
26816ad
 
993cfb9
26816ad
 
993cfb9
 
 
 
 
 
 
26816ad
993cfb9
26816ad
 
 
 
 
993cfb9
26816ad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
993cfb9
 
 
26816ad
 
 
993cfb9
 
2712881
993cfb9
26816ad
993cfb9
26816ad
 
993cfb9
 
26816ad
993cfb9
26816ad
993cfb9
 
26816ad
2712881
26816ad
 
 
 
 
2712881
26816ad
 
2712881
26816ad
2712881
 
 
26816ad
2712881
 
26816ad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2712881
 
26816ad
2712881
26816ad
2712881
 
26816ad
 
2712881
26816ad
2712881
26816ad
 
2712881
26816ad
 
 
2712881
993cfb9
2712881
 
993cfb9
 
 
 
 
 
 
 
 
131a1cc
993cfb9
2712881
993cfb9
2712881
993cfb9
2712881
993cfb9
2712881
993cfb9
 
 
 
 
 
 
 
26816ad
 
 
 
 
 
 
 
 
 
 
 
993cfb9
2712881
 
 
 
 
 
 
 
 
 
 
 
993cfb9
 
 
2712881
26816ad
 
993cfb9
 
 
 
 
2712881
993cfb9
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
import gradio as gr
import pandas as pd
import io
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from lazypredict.Supervised import LazyClassifier, LazyRegressor
from sklearn.model_selection import train_test_split
from ydata_profiling import ProfileReport
import tempfile
import requests
import json
from typing import Optional, Tuple, Any, Union
from openai import OpenAI  # Added for Nebius AI Studio LLM integration

# Constants
# Fallback strings returned by run_pipeline when analysis cannot proceed;
# they surface in the "Detected Task" and "Detected Columns" UI textboxes.
NO_TASK_DETECTED = "No task detected"
NO_COLUMNS_LOADED = "No columns loaded."


def load_data(file_input: Any) -> Tuple[Optional[pd.DataFrame], Optional[str]]:
    """
    Loads CSV data from a local file upload, a plain file path, or a public URL.

    Args:
        file_input: A Gradio file object (anything with a ``.name`` attribute),
            a URL string starting with ``http://``/``https://``, or a local
            file-path string.

    Returns:
        Tuple containing the DataFrame and comma-separated column names,
        or (None, None) if the input is missing or loading fails.
    """
    if file_input is None:
        return None, None

    try:
        if hasattr(file_input, 'name'):
            # Gradio upload object: pandas can read its temp-file path
            # directly; no need to buffer the whole file in memory first.
            df = pd.read_csv(file_input.name)
        elif isinstance(file_input, str) and file_input.startswith(('http://', 'https://')):
            # Bare 'http' prefix would also match strings like "httpfoo";
            # require a full scheme before treating the input as a URL.
            response = requests.get(file_input, timeout=30)
            response.raise_for_status()
            df = pd.read_csv(io.StringIO(response.text))
        elif isinstance(file_input, str) and file_input.strip():
            # Plain string path (e.g. Gradio File component with
            # type="filepath" passes a str, not a file object).
            df = pd.read_csv(file_input)
        else:
            return None, None

        # Extract column names here so callers can display them immediately.
        column_names = ", ".join(df.columns.tolist())
        return df, column_names
    except Exception as e:
        gr.Warning(f"Failed to load or parse data: {e}")
        return None, None


def generate_dataset_summary(df: pd.DataFrame, target_column: str) -> str:
    """
    Builds a short plain-text profile of the dataset for use as LLM context.

    Args:
        df: The dataset to describe.
        target_column: Name of the prediction target column.

    Returns:
        Newline-separated summary lines covering shape, target cardinality,
        feature names, total missing values, and column-type counts.
    """
    row_count, col_count = df.shape
    feature_names = [col for col in df.columns if col != target_column]
    numeric_count = len(df.select_dtypes(include=['number']).columns)
    categorical_count = len(df.select_dtypes(include=['object', 'category']).columns)

    lines = [
        f"Dataset Shape: {row_count} rows, {col_count} columns",
        f"Target Column: {target_column}",
        f"Target Unique Values: {df[target_column].nunique()}",
        f"Features: {', '.join(feature_names)}",
        f"Missing Values: {df.isnull().sum().sum()} total",
        f"Numeric Columns: {numeric_count}",
        f"Categorical Columns: {categorical_count}",
    ]
    return "\n".join(lines)


def update_detected_columns_display(file_data: Any, url_data: Optional[str]) -> str:
    """
    Detects and displays column names from the uploaded file or URL
    as soon as the input changes, before the main analysis button is pressed.

    Args:
        file_data: File object from Gradio file upload component.
        url_data: URL string from Gradio textbox component.

    Returns:
        Comma-separated string of column names, an empty string when no
        input has been provided yet, or an error message on load failure.
    """
    data_source = file_data if file_data is not None else url_data
    # A cleared/untouched URL textbox yields "" (not None); treat blank
    # strings the same as no input so the UI stays empty instead of
    # showing a spurious load error.
    if data_source is None or (isinstance(data_source, str) and not data_source.strip()):
        return ""

    _, column_names = load_data(data_source)
    if column_names:
        return column_names
    return "No columns detected or error loading file. Please check the file format."


def analyze_and_model(
    df: pd.DataFrame, 
    target_column: str
) -> Tuple[ProfileReport, str, str, pd.DataFrame, str, str, str]:
    """
    Internal function to perform EDA, model training, and visualization.
    
    Args:
        df: The pandas DataFrame containing the dataset.
        target_column: The name of the target column for prediction.
        
    Returns:
        Tuple containing: profile report, profile path, task type, 
        models dataframe, plot path, pickle path, and best model name.
    """
    # EDA report written to a temp HTML file for download from the UI.
    profile = ProfileReport(df, title="EDA Report", minimal=True)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".html") as temp_html:
        profile.to_file(temp_html.name)
        profile_path = temp_html.name

    X = df.drop(columns=[target_column])
    y = df[target_column]
    # Heuristic: few unique target values -> classification, else regression.
    task = "classification" if y.nunique() <= 10 else "regression"
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    lazy_model = (
        LazyClassifier(ignore_warnings=True, verbose=0)
        if task == "classification"
        else LazyRegressor(ignore_warnings=True, verbose=0)
    )
    models, _ = lazy_model.fit(X_train, X_test, y_train, y_test)

    sort_metric = "Accuracy" if task == "classification" else "R-Squared"
    sorted_models = models.sort_values(by=sort_metric, ascending=False)
    best_model_name = sorted_models.index[0]

    # Safely access the best model with error handling
    try:
        best_model = lazy_model.models[best_model_name]
    except KeyError:
        # Fallback: lazypredict may pad names; retry with stripped keys.
        matching_key = next(
            (k for k in lazy_model.models if k.strip() == best_model_name.strip()),
            None,
        )
        if matching_key:
            best_model = lazy_model.models[matching_key]
        else:
            # Use the first available model as fallback
            best_model = next(iter(lazy_model.models.values()))
            gr.Warning(f"Could not find exact model '{best_model_name}', using first available model.")

    # Serialize the winning model for download.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pkl") as temp_pkl:
        pickle.dump(best_model, temp_pkl)
        pickle_path = temp_pkl.name

    # Bar chart of the top 10 models. Use the explicitly sorted frame so the
    # chart matches its "Top 10" title regardless of lazypredict's default
    # ordering; close the figure even if plotting fails.
    plt.figure(figsize=(10, 6))
    try:
        top_models = sorted_models.head(10)
        sns.barplot(x=top_models[sort_metric].values, y=top_models.index.tolist())
        plt.title(f"Top 10 Models by {sort_metric}")
        plt.xlabel(sort_metric)
        plt.ylabel("Model")
        plt.tight_layout()
        with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_png:
            plt.savefig(temp_png.name)
            plot_path = temp_png.name
    finally:
        plt.close()

    # Move the model names out of the index so the Gradio Dataframe shows them.
    models_reset = models.reset_index().rename(columns={'index': 'Model'})
    return profile, profile_path, task, models_reset, plot_path, pickle_path, best_model_name

def run_pipeline(
    data_source: Union[Any, str], 
    target_column: str, 
    nebius_api_key: Optional[str] = None
) -> Tuple[Optional[str], str, Optional[pd.DataFrame], Optional[str], Optional[str], str, str]:
    """
    Run the complete AutoML pipeline including data loading, EDA, model training, and AI explanation.
    
    This is the primary MCP tool function that orchestrates the entire AutoML workflow.
    
    Args:
        data_source: Either a file path/object from local upload or a URL string pointing to a CSV file.
        target_column: The name of the column to predict (target variable).
        nebius_api_key: Optional API key for Nebius AI Studio to enable AI-powered explanations.
        
    Returns:
        Tuple containing:
        - eda_report_path: Path to the generated HTML EDA report file.
        - task_type: Either "classification" or "regression" based on target variable.
        - models_dataframe: DataFrame with performance metrics of all trained models.
        - visualization_path: Path to the model comparison chart image.
        - model_pickle_path: Path to the serialized best model (.pkl file).
        - llm_explanation: AI-generated explanation of results (or fallback message).
        - column_names: Comma-separated list of detected column names.
    """
    # --- 1. Input Validation ---
    # Rejects None and empty-string inputs alike (both are falsy).
    if not data_source or not target_column:
        error_msg = "Please provide both a data source and target column name."
        gr.Warning("Error: Data source and target column must be provided.")
        return None, NO_TASK_DETECTED, None, None, None, error_msg, NO_COLUMNS_LOADED

    gr.Info("Starting analysis...")

    # --- 2. Data Loading ---
    df, column_names = load_data(data_source)
    if df is None:
        error_msg = "Could not load data. Please check the file format or URL."
        return None, NO_TASK_DETECTED, None, None, None, error_msg, NO_COLUMNS_LOADED

    if target_column not in df.columns:
        # Surface the detected columns in the error so the user can fix a typo.
        error_msg = f"Target column '{target_column}' not found. Available columns: {column_names}"
        gr.Warning(error_msg)
        return None, NO_TASK_DETECTED, None, None, None, error_msg, column_names

    # --- 3. Analysis and Modeling ---
    # Returns temp-file paths for the EDA report, comparison plot, and pickled model.
    _, profile_path, task, models_df, plot_path, pickle_path, best_model_name = analyze_and_model(df, target_column)

    # --- 4. Generate Dataset Summary for LLM Context ---
    dataset_summary = generate_dataset_summary(df, target_column)
    
    # Get top 5 model performance summary
    top_models_summary = models_df.head(5).to_string(index=False)

    # --- 5. Explanation with Nebius AI Studio LLM ---
    # Default message used when no API key was supplied or the call fails.
    llm_explanation = "AI explanation is unavailable. Please provide a Nebius AI Studio API key to enable this feature."

    if nebius_api_key and nebius_api_key.strip():
        try:
            # Nebius AI Studio exposes an OpenAI-compatible endpoint, so the
            # standard OpenAI client is reused with a custom base_url.
            client = OpenAI(
                base_url="https://api.studio.nebius.com/v1/",
                api_key=nebius_api_key.strip()
            )

            # Craft an improved prompt with actual data context
            prompt_text = f"""Analyze this AutoML result and provide a concise, professional explanation:

**Dataset Overview:**
{dataset_summary}

**Task Type:** {task}

**Top 5 Performing Models:**
{top_models_summary}

**Best Model:** {best_model_name}

Please explain:
1. Why '{best_model_name}' performed best for this {task} task
2. Key insights about the dataset characteristics
3. Recommendations for model deployment or further improvement

Keep the explanation concise (3-4 paragraphs) and accessible to both technical and non-technical stakeholders."""

            response = client.chat.completions.create(
                model="meta-llama/Llama-3.3-70B-Instruct",
                messages=[
                    {"role": "system", "content": "You are an expert data scientist assistant that explains machine learning results clearly and professionally."},
                    {"role": "user", "content": prompt_text}
                ],
                temperature=0.6,
                max_tokens=512,
                top_p=0.9,
                # top_k is Nebius-specific; passed via extra_body since the
                # OpenAI client does not expose it as a named parameter.
                extra_body={"top_k": 50}
            )
            # Simplified response access (no need for json.loads)
            llm_explanation = response.choices[0].message.content

        except Exception as e:
            # Best-effort: the pipeline result is still returned even if the
            # LLM call fails; only the explanation degrades to a fallback.
            gr.Warning(f"Failed to get AI explanation: {e}")
            llm_explanation = f"AI explanation unavailable due to an error. The best performing model is **{best_model_name}** for your {task} task."

    gr.Info("Analysis complete!")
    gr.Info(f'Profile report saved to: {profile_path}')
    return profile_path, task, models_df, plot_path, pickle_path, llm_explanation, column_names 

# --- Gradio UI ---
# Layout: input controls (file/URL/target/API key) on the left, results
# (detected columns, task, AI explanation, metrics) on the right, and the
# comparison chart plus downloadable artifacts in a bottom row.
with gr.Blocks(title="AutoML Trainer", theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 🤖 AutoML Trainer")

    with gr.Row():
        with gr.Column(scale=1):
            file_input = gr.File(label="Upload Local CSV File")
            url_input = gr.Textbox(label="Or Enter Public CSV URL", placeholder="e.g., https://.../data.csv")
            # Read-only convenience: a sample dataset URL the user can copy.
            gr.Textbox(label="Sample CSV", value="https://raw.githubusercontent.com/daniel-was-taken/MCP_Project/refs/heads/master/collegePlace.csv")
            target_column_input = gr.Textbox(label="Enter Target Column Name", placeholder="e.g., approved")
            nebius_api_key_input = gr.Textbox(label="Nebius AI Studio API Key (Optional)", type="password", placeholder="Enter your API key for AI explanations")
            run_button = gr.Button("Run Analysis & AutoML", variant="primary")

        with gr.Column(scale=2):
            column_names_output = gr.Textbox(label="Detected Columns", interactive=False, lines=2) # New Textbox for column names
            task_output = gr.Textbox(label="Detected Task", interactive=False)
            llm_output = gr.Markdown(label="AI Explanation")
            metrics_output = gr.Dataframe(label="Model Performance Metrics")

    with gr.Row():
        vis_output = gr.Image(label="Top Models Comparison")
        with gr.Column():
            eda_output = gr.File(label="Download Full EDA Report")
            model_output = gr.File(label="Download Best Model (.pkl)")

    def process_inputs(
        file_data: Any, 
        url_data: Optional[str], 
        target: str, 
        api_key: Optional[str]
    ) -> Tuple[Optional[str], str, Optional[pd.DataFrame], Optional[str], Optional[str], str, str]:
        """
        Process inputs and run the AutoML pipeline.
        
        This wrapper function handles input selection between file upload and URL,
        then delegates to the main run_pipeline function.
        """
        # File upload takes precedence over the URL textbox when both are set.
        data_source = file_data if file_data is not None else url_data
        return run_pipeline(data_source, target, api_key)

    # Live column preview: refresh whenever either data input changes,
    # before the main analysis button is pressed.
    file_input.change(
        fn=update_detected_columns_display,
        inputs=[file_input, url_input],
        outputs=column_names_output
    )
    url_input.change(
        fn=update_detected_columns_display,
        inputs=[file_input, url_input],
        outputs=column_names_output
    )

    run_button.click(
        fn=process_inputs,
        inputs=[file_input, url_input, target_column_input, nebius_api_key_input],
        outputs=[eda_output, task_output, metrics_output, vis_output, model_output, llm_output, column_names_output],
        api_name="run_automl_pipeline"  # Explicit API name for MCP
    )

if __name__ == "__main__":
    # Guard the launch so importing this module (e.g. from tests or other
    # tools) does not start a web server; running as a script is unchanged.
    demo.launch(
        server_name="0.0.0.0",  # listen on all interfaces (container/Spaces friendly)
        server_port=7860,
        share=False,
        inbrowser=True,
        mcp_server=True  # expose the pipeline as an MCP tool server
    )