Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import pandas as pd | |
| import io | |
| import pickle | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| from lazypredict.Supervised import LazyClassifier, LazyRegressor | |
| from sklearn.model_selection import train_test_split | |
| from ydata_profiling import ProfileReport | |
| import tempfile | |
| import requests | |
| import json | |
| from typing import Optional, Tuple, Any, Union | |
| from openai import OpenAI # Added for Nebius AI Studio LLM integration | |
# Constants
# Placeholder values that run_pipeline puts in the task-type and column-names
# slots of its result tuple when it returns early because of an input error.
NO_TASK_DETECTED = "No task detected"
NO_COLUMNS_LOADED = "No columns loaded."
def load_data(file_input: Any) -> Tuple[Optional[pd.DataFrame], Optional[str]]:
    """
    Load CSV data from either a local file upload or a public HTTP(S) URL.

    Args:
        file_input: An uploaded-file object exposing a ``name`` attribute that
            holds the file path, or a URL string.

    Returns:
        Tuple containing the DataFrame and comma-separated column names,
        or (None, None) if the input is missing, is not a recognised
        source, or cannot be fetched/parsed.
    """
    if file_input is None:
        return None, None
    try:
        if hasattr(file_input, 'name'):
            # pandas opens and closes the path itself, so there is no need to
            # read the whole file into memory and re-wrap it in a buffer.
            df = pd.read_csv(file_input.name)
        elif isinstance(file_input, str):
            # URLs are often pasted with stray whitespace or an upper-case
            # scheme; normalise before deciding whether this is a URL at all.
            url = file_input.strip()
            if not url.lower().startswith(('http://', 'https://')):
                return None, None
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            df = pd.read_csv(io.StringIO(response.text))
        else:
            return None, None
        # str() keeps the join safe even if a column label is not a string.
        column_names = ", ".join(str(col) for col in df.columns)
        return df, column_names
    except Exception as e:
        # Best-effort loader: report the problem to the UI and signal failure
        # through the return value instead of raising.
        gr.Warning(f"Failed to load or parse data: {e}")
        return None, None
def generate_dataset_summary(df: pd.DataFrame, target_column: str) -> str:
    """
    Build a short, newline-separated description of the dataset for LLM context.

    Args:
        df: The pandas DataFrame to describe.
        target_column: The name of the target column.

    Returns:
        One line each for shape, target name, number of distinct target
        values, feature names, total missing cells, and the counts of
        numeric and object/category columns.
    """
    row_count, column_count = df.shape
    distinct_targets = df[target_column].nunique()
    feature_list = ', '.join([name for name in df.columns if name != target_column])
    missing_cells = df.isnull().sum().sum()
    # Both counts are taken over the whole frame, target column included.
    numeric_count = len(df.select_dtypes(include=['number']).columns)
    categorical_count = len(df.select_dtypes(include=['object', 'category']).columns)

    lines = (
        f"Dataset Shape: {row_count} rows, {column_count} columns",
        f"Target Column: {target_column}",
        f"Target Unique Values: {distinct_targets}",
        f"Features: {feature_list}",
        f"Missing Values: {missing_cells} total",
        f"Numeric Columns: {numeric_count}",
        f"Categorical Columns: {categorical_count}",
    )
    return "\n".join(lines)
def update_detected_columns_display(file_data: Any, url_data: Optional[str]) -> str:
    """
    Detects and displays column names from the uploaded file or URL
    as soon as the input changes, before the main analysis button is pressed.

    Args:
        file_data: File object from Gradio file upload component.
        url_data: URL string from Gradio textbox component.

    Returns:
        Comma-separated string of column names, an empty string when no
        input has been provided, or an error message when loading fails.
    """
    data_source = file_data if file_data is not None else url_data
    # An empty textbox arrives as "" rather than None; treat a blank string
    # as "nothing entered" so clearing the inputs blanks the display instead
    # of showing a load error.
    if data_source is None or (isinstance(data_source, str) and not data_source.strip()):
        return ""
    _, column_names = load_data(data_source)
    if column_names:
        return column_names
    return "No columns detected or error loading file. Please check the file format."
def analyze_and_model(
    df: pd.DataFrame,
    target_column: str
) -> Tuple[ProfileReport, str, str, pd.DataFrame, str, str, str]:
    """
    Internal function to perform EDA, model training, and visualization.

    Args:
        df: The pandas DataFrame containing the dataset.
        target_column: The name of the target column for prediction.

    Returns:
        Tuple containing: profile report, profile path, task type,
        models dataframe (best model first, ranked by Accuracy for
        classification or R-Squared for regression), plot path, pickle path,
        and best model name.

    Raises:
        gr.Error: If no model could be trained on the data.
    """
    profile = ProfileReport(df, title="EDA Report", minimal=True)
    # Reserve a path, close the handle, then let the report writer open the
    # file itself so nothing else holds it open while it is written.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".html") as temp_html:
        profile_path = temp_html.name
    profile.to_file(profile_path)

    # Rows without a label cannot be used for supervised training.
    labelled = df.dropna(subset=[target_column])
    X = labelled.drop(columns=[target_column])
    y = labelled[target_column]

    # A non-numeric target can only be classified. A numeric target is treated
    # as class labels when it takes at most 10 distinct values.
    is_numeric_target = pd.api.types.is_numeric_dtype(y)
    if not is_numeric_target or y.nunique() <= 10:
        task = "classification"
        sort_metric = "Accuracy"
        lazy_model = LazyClassifier(ignore_warnings=True, verbose=0)
    else:
        task = "regression"
        sort_metric = "R-Squared"
        lazy_model = LazyRegressor(ignore_warnings=True, verbose=0)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    models, _ = lazy_model.fit(X_train, X_test, y_train, y_test)
    if models.empty:
        raise gr.Error("No model could be trained on this dataset. Please check the data and target column.")

    # Rank once and reuse the ranking for the best model, the chart, and the
    # returned table so all three agree. A stable sort keeps the library's
    # order between ties.
    sorted_models = models.sort_values(by=sort_metric, ascending=False, kind="stable")
    best_model_name = sorted_models.index[0]

    # Safely access the best model with error handling
    try:
        best_model = lazy_model.models[best_model_name]
    except KeyError:
        # Fallback: try to find the model with stripped whitespace
        wanted = str(best_model_name).strip()
        matching_key = next((k for k in lazy_model.models if str(k).strip() == wanted), None)
        if matching_key is not None:
            best_model = lazy_model.models[matching_key]
        elif lazy_model.models:
            # Use the first available model as fallback
            best_model = next(iter(lazy_model.models.values()))
            gr.Warning(f"Could not find exact model '{best_model_name}', using first available model.")
        else:
            raise gr.Error("Model training finished but no fitted model is available to export.")

    with tempfile.NamedTemporaryFile(delete=False, suffix=".pkl") as temp_pkl:
        pickle.dump(best_model, temp_pkl)
        pickle_path = temp_pkl.name

    top_models = sorted_models.head(10)
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        sns.barplot(x=top_models[sort_metric].values, y=top_models.index.tolist(), ax=ax)
        ax.set_title(f"Top 10 Models by {sort_metric}")
        ax.set_xlabel(sort_metric)
        ax.set_ylabel("Model")
        fig.tight_layout()
        with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_png:
            plot_path = temp_png.name
        fig.savefig(plot_path)
    finally:
        # Always release the figure, even if plotting or saving fails.
        plt.close(fig)

    models_reset = sorted_models.reset_index().rename(columns={'index': 'Model'})
    return profile, profile_path, task, models_reset, plot_path, pickle_path, best_model_name
def run_pipeline(
    data_source: Union[Any, str],
    target_column: str,
    nebius_api_key: Optional[str] = None
) -> Tuple[Optional[str], str, Optional[pd.DataFrame], Optional[str], Optional[str], str, str]:
    """
    Run the complete AutoML pipeline including data loading, EDA, model training, and AI explanation.

    This is the primary MCP tool function that orchestrates the entire AutoML workflow.
    Every failure (missing input, unreadable data, unknown target column, or an
    error during analysis) is reported through the returned tuple rather than
    by raising.

    Args:
        data_source: Either a file path/object from local upload or a URL string pointing to a CSV file.
        target_column: The name of the column to predict (target variable).
        nebius_api_key: Optional API key for Nebius AI Studio to enable AI-powered explanations.

    Returns:
        Tuple containing:
        - eda_report_path: Path to the generated HTML EDA report file.
        - task_type: Either "classification" or "regression" based on target variable.
        - models_dataframe: DataFrame with performance metrics of all trained models.
        - visualization_path: Path to the model comparison chart image.
        - model_pickle_path: Path to the serialized best model (.pkl file).
        - llm_explanation: AI-generated explanation of results (or fallback/error message).
        - column_names: Comma-separated list of detected column names.
    """
    # --- 1. Input Validation ---
    if not data_source or not (target_column and target_column.strip()):
        error_msg = "Please provide both a data source and target column name."
        gr.Warning("Error: Data source and target column must be provided.")
        return None, NO_TASK_DETECTED, None, None, None, error_msg, NO_COLUMNS_LOADED
    gr.Info("Starting analysis...")

    # --- 2. Data Loading ---
    df, column_names = load_data(data_source)
    if df is None:
        error_msg = "Could not load data. Please check the file format or URL."
        return None, NO_TASK_DETECTED, None, None, None, error_msg, NO_COLUMNS_LOADED

    if target_column not in df.columns:
        # Tolerate stray whitespace on either side (typed name or CSV header),
        # but only when exactly one column matches.
        wanted = target_column.strip()
        candidates = [col for col in df.columns if str(col).strip() == wanted]
        if len(candidates) == 1:
            target_column = candidates[0]
        else:
            error_msg = f"Target column '{target_column}' not found. Available columns: {column_names}"
            gr.Warning(error_msg)
            return None, NO_TASK_DETECTED, None, None, None, error_msg, column_names

    # --- 3. Analysis and Modeling ---
    try:
        _, profile_path, task, models_df, plot_path, pickle_path, best_model_name = analyze_and_model(df, target_column)
    except Exception as e:
        # Report analysis failures in the same tuple shape as the other
        # error paths instead of letting them escape to the caller.
        error_msg = f"Analysis failed: {e}"
        gr.Warning(error_msg)
        return None, NO_TASK_DETECTED, None, None, None, error_msg, column_names

    # --- 4. Generate Dataset Summary for LLM Context ---
    dataset_summary = generate_dataset_summary(df, target_column)
    # Get top 5 model performance summary, ranked by the same metric that
    # selects the best model so the prompt is self-consistent.
    rank_metric = "Accuracy" if task == "classification" else "R-Squared"
    if rank_metric in models_df.columns:
        ranked_models = models_df.sort_values(by=rank_metric, ascending=False, kind="stable")
    else:
        ranked_models = models_df
    top_models_summary = ranked_models.head(5).to_string(index=False)

    # --- 5. Explanation with Nebius AI Studio LLM ---
    llm_explanation = "AI explanation is unavailable. Please provide a Nebius AI Studio API key to enable this feature."
    if nebius_api_key and nebius_api_key.strip():
        try:
            client = OpenAI(
                base_url="https://api.studio.nebius.com/v1/",
                api_key=nebius_api_key.strip()
            )
            # Craft an improved prompt with actual data context
            prompt_text = f"""Analyze this AutoML result and provide a concise, professional explanation:
**Dataset Overview:**
{dataset_summary}
**Task Type:** {task}
**Top 5 Performing Models:**
{top_models_summary}
**Best Model:** {best_model_name}
Please explain:
1. Why '{best_model_name}' performed best for this {task} task
2. Key insights about the dataset characteristics
3. Recommendations for model deployment or further improvement
Keep the explanation concise (3-4 paragraphs) and accessible to both technical and non-technical stakeholders."""
            response = client.chat.completions.create(
                model="meta-llama/Llama-3.3-70B-Instruct",
                messages=[
                    {"role": "system", "content": "You are an expert data scientist assistant that explains machine learning results clearly and professionally."},
                    {"role": "user", "content": prompt_text}
                ],
                temperature=0.6,
                max_tokens=512,
                top_p=0.9,
                extra_body={"top_k": 50}
            )
            # The message content may be None; never hand None to the
            # Markdown output.
            llm_explanation = response.choices[0].message.content or (
                f"The AI service returned an empty explanation. The best performing model is **{best_model_name}** for your {task} task."
            )
        except Exception as e:
            gr.Warning(f"Failed to get AI explanation: {e}")
            llm_explanation = f"AI explanation unavailable due to an error. The best performing model is **{best_model_name}** for your {task} task."

    gr.Info("Analysis complete!")
    gr.Info(f'Profile report saved to: {profile_path}')
    return profile_path, task, models_df, plot_path, pickle_path, llm_explanation, column_names
# --- Gradio UI ---
# NOTE(review): the nesting of the layout containers below was reconstructed
# from a flattened copy of this file; confirm it against the running app.
with gr.Blocks(title="AutoML Trainer", theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 🤖 AutoML Trainer")

    with gr.Row():
        # Left column: data source, target column, API key, and the run button.
        with gr.Column(scale=1):
            file_input = gr.File(label="Upload Local CSV File")
            url_input = gr.Textbox(label="Or Enter Public CSV URL", placeholder="e.g., https://.../data.csv")
            gr.Textbox(label="Sample CSV", value="https://raw.githubusercontent.com/daniel-was-taken/MCP_Project/refs/heads/master/collegePlace.csv")
            target_column_input = gr.Textbox(label="Enter Target Column Name", placeholder="e.g., approved")
            nebius_api_key_input = gr.Textbox(label="Nebius AI Studio API Key (Optional)", type="password", placeholder="Enter your API key for AI explanations")
            run_button = gr.Button("Run Analysis & AutoML", variant="primary")

        # Right column: everything the pipeline produces.
        with gr.Column(scale=2):
            column_names_output = gr.Textbox(label="Detected Columns", interactive=False, lines=2)
            task_output = gr.Textbox(label="Detected Task", interactive=False)
            llm_output = gr.Markdown(label="AI Explanation")
            metrics_output = gr.Dataframe(label="Model Performance Metrics")
            with gr.Row():
                vis_output = gr.Image(label="Top Models Comparison")
                with gr.Column():
                    eda_output = gr.File(label="Download Full EDA Report")
                    model_output = gr.File(label="Download Best Model (.pkl)")

    def process_inputs(
        file_data: Any,
        url_data: Optional[str],
        target: str,
        api_key: Optional[str]
    ) -> Tuple[Optional[str], str, Optional[pd.DataFrame], Optional[str], Optional[str], str, str]:
        """
        Process inputs and run the AutoML pipeline.
        This wrapper function handles input selection between file upload and URL,
        then delegates to the main run_pipeline function.
        """
        # An uploaded file takes precedence over the URL field.
        if file_data is not None:
            return run_pipeline(file_data, target, api_key)
        return run_pipeline(url_data, target, api_key)

    # Refresh the detected-columns preview whenever either data source changes.
    for source_component in (file_input, url_input):
        source_component.change(
            fn=update_detected_columns_display,
            inputs=[file_input, url_input],
            outputs=column_names_output
        )

    pipeline_inputs = [file_input, url_input, target_column_input, nebius_api_key_input]
    pipeline_outputs = [
        eda_output,
        task_output,
        metrics_output,
        vis_output,
        model_output,
        llm_output,
        column_names_output,
    ]
    run_button.click(
        fn=process_inputs,
        inputs=pipeline_inputs,
        outputs=pipeline_outputs,
        api_name="run_automl_pipeline"  # Explicit API name for MCP
    )

demo.launch(
    server_name="0.0.0.0",
    server_port=7860,
    share=False,
    inbrowser=True,
    mcp_server=True
)