# AutoML Trainer: Gradio app that profiles a CSV, benchmarks models with
# lazypredict, and optionally explains the results via Nebius AI Studio.
import io
import json
import os
import pickle
import tempfile
from typing import Any, Optional, Tuple, Union

import gradio as gr
import matplotlib.pyplot as plt
import pandas as pd
import requests
import seaborn as sns
from lazypredict.Supervised import LazyClassifier, LazyRegressor
from openai import OpenAI  # Nebius AI Studio exposes an OpenAI-compatible API
from sklearn.model_selection import train_test_split
from ydata_profiling import ProfileReport
# Constants
# Placeholder strings returned by run_pipeline when analysis cannot proceed,
# so the UI outputs are never left empty on error paths.
NO_TASK_DETECTED = "No task detected"
NO_COLUMNS_LOADED = "No columns loaded."
def load_data(file_input: Any) -> Tuple[Optional[pd.DataFrame], Optional[str]]:
"""
Loads CSV data from either a local file upload or a public URL.
Args:
file_input: A file object from Gradio upload or a URL string.
Returns:
Tuple containing the DataFrame and comma-separated column names,
or (None, None) if loading fails.
"""
if file_input is None:
return None, None
try:
if hasattr(file_input, 'name'):
file_path = file_input.name
with open(file_path, 'rb') as f:
file_bytes = f.read()
df = pd.read_csv(io.BytesIO(file_bytes))
elif isinstance(file_input, str) and file_input.startswith('http'):
response = requests.get(file_input, timeout=30)
response.raise_for_status()
df = pd.read_csv(io.StringIO(response.text))
else:
return None, None
# Extract column names here
column_names = ", ".join(df.columns.tolist())
return df, column_names
except Exception as e:
gr.Warning(f"Failed to load or parse data: {e}")
return None, None
def generate_dataset_summary(df: pd.DataFrame, target_column: str) -> str:
    """
    Build a short textual profile of *df* to give an LLM grounding context.

    Args:
        df: Dataset to describe.
        target_column: Name of the prediction target column.

    Returns:
        A newline-separated summary string covering shape, target,
        features, missing values, and column-type counts.
    """
    feature_names = [col for col in df.columns if col != target_column]
    numeric_count = len(df.select_dtypes(include=['number']).columns)
    categorical_count = len(df.select_dtypes(include=['object', 'category']).columns)

    lines = []
    lines.append(f"Dataset Shape: {df.shape[0]} rows, {df.shape[1]} columns")
    lines.append(f"Target Column: {target_column}")
    lines.append(f"Target Unique Values: {df[target_column].nunique()}")
    lines.append(f"Features: {', '.join(feature_names)}")
    lines.append(f"Missing Values: {df.isnull().sum().sum()} total")
    lines.append(f"Numeric Columns: {numeric_count}")
    lines.append(f"Categorical Columns: {categorical_count}")
    return "\n".join(lines)
def update_detected_columns_display(file_data: Any, url_data: Optional[str]) -> str:
    """
    Preview column names as soon as a file or URL input changes,
    before the main analysis button is pressed.

    Args:
        file_data: File object from the Gradio file-upload component.
        url_data: URL string from the Gradio textbox component.

    Returns:
        Comma-separated column names, an error message, or "" when
        neither input has been provided yet.
    """
    # The file upload takes precedence over the URL textbox.
    source = url_data if file_data is None else file_data
    if source is None:
        return ""
    _, detected = load_data(source)
    if not detected:
        return "No columns detected or error loading file. Please check the file format."
    return detected
def analyze_and_model(
    df: pd.DataFrame,
    target_column: str
) -> Tuple[ProfileReport, str, str, pd.DataFrame, str, str, str]:
    """
    Perform EDA, train a suite of baseline models, and plot a comparison.

    Args:
        df: Dataset containing features and the target column.
        target_column: The name of the target column for prediction.

    Returns:
        Tuple of (profile report object, profile HTML path, task type,
        models DataFrame, plot PNG path, best-model pickle path,
        best model name).
    """
    # --- EDA report (minimal mode keeps generation fast on large frames) ---
    profile = ProfileReport(df, title="EDA Report", minimal=True)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".html") as temp_html:
        profile.to_file(temp_html.name)
        profile_path = temp_html.name

    # --- Task detection and model sweep ---
    X = df.drop(columns=[target_column])
    y = df[target_column]
    # Heuristic: few unique target values => classification. NOTE(review):
    # a numeric target with <= 10 distinct values is treated as classification.
    task = "classification" if y.nunique() <= 10 else "regression"
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    lazy_model = (LazyClassifier(ignore_warnings=True, verbose=0)
                  if task == "classification"
                  else LazyRegressor(ignore_warnings=True, verbose=0))
    models, _ = lazy_model.fit(X_train, X_test, y_train, y_test)

    sort_metric = "Accuracy" if task == "classification" else "R-Squared"
    sorted_models = models.sort_values(by=sort_metric, ascending=False)
    best_model_name = sorted_models.index[0]

    # Safely access the best model: lazypredict keys occasionally differ
    # by whitespace, so fall back to a stripped-name match, then to any model.
    try:
        best_model = lazy_model.models[best_model_name]
    except KeyError:
        matching_key = next(
            (k for k in lazy_model.models if k.strip() == best_model_name.strip()),
            None
        )
        if matching_key:
            best_model = lazy_model.models[matching_key]
        else:
            best_model = list(lazy_model.models.values())[0]
            gr.Warning(f"Could not find exact model '{best_model_name}', using first available model.")

    with tempfile.NamedTemporaryFile(delete=False, suffix=".pkl") as temp_pkl:
        pickle.dump(best_model, temp_pkl)
        pickle_path = temp_pkl.name

    # --- Comparison plot ---
    plot_column = sort_metric
    plt.figure(figsize=(10, 6))
    try:
        # Bug fix: take the top 10 from the explicitly sorted frame so the
        # chart is correct regardless of the order lazypredict returns.
        top_models = sorted_models.head(10)
        sns.barplot(x=top_models[plot_column].values, y=top_models.index.tolist())
        plt.title(f"Top 10 Models by {plot_column}")
        plt.xlabel(plot_column)
        plt.ylabel("Model")
        plt.tight_layout()
        with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_png:
            plt.savefig(temp_png.name)
            plot_path = temp_png.name
    finally:
        # Close even if plotting fails, so figures don't leak across runs.
        plt.close()

    models_reset = models.reset_index().rename(columns={'index': 'Model'})
    return profile, profile_path, task, models_reset, plot_path, pickle_path, best_model_name
def _generate_llm_explanation(
    api_key: str,
    dataset_summary: str,
    task: str,
    top_models_summary: str,
    best_model_name: str
) -> str:
    """
    Ask Nebius AI Studio (OpenAI-compatible API) to explain the AutoML results.

    Raises:
        Exception: any client/network error propagates to the caller, which
        falls back to a static explanation.
    """
    client = OpenAI(
        base_url="https://api.studio.nebius.com/v1/",
        api_key=api_key.strip()
    )
    # The prompt carries real data context so the explanation is grounded.
    prompt_text = f"""Analyze this AutoML result and provide a concise, professional explanation:
**Dataset Overview:**
{dataset_summary}
**Task Type:** {task}
**Top 5 Performing Models:**
{top_models_summary}
**Best Model:** {best_model_name}
Please explain:
1. Why '{best_model_name}' performed best for this {task} task
2. Key insights about the dataset characteristics
3. Recommendations for model deployment or further improvement
Keep the explanation concise (3-4 paragraphs) and accessible to both technical and non-technical stakeholders."""
    response = client.chat.completions.create(
        model="meta-llama/Llama-3.3-70B-Instruct",
        messages=[
            {"role": "system", "content": "You are an expert data scientist assistant that explains machine learning results clearly and professionally."},
            {"role": "user", "content": prompt_text}
        ],
        temperature=0.6,
        max_tokens=512,
        top_p=0.9,
        extra_body={"top_k": 50}  # Nebius-specific sampling parameter
    )
    return response.choices[0].message.content


def run_pipeline(
    data_source: Union[Any, str],
    target_column: str,
    nebius_api_key: Optional[str] = None
) -> Tuple[Optional[str], str, Optional[pd.DataFrame], Optional[str], Optional[str], str, str]:
    """
    Run the complete AutoML pipeline: data loading, EDA, model training, and AI explanation.

    This is the primary MCP tool function that orchestrates the entire AutoML workflow.

    Args:
        data_source: Either a file path/object from local upload or a URL string pointing to a CSV file.
        target_column: The name of the column to predict (target variable).
        nebius_api_key: Optional API key for Nebius AI Studio to enable AI-powered explanations.

    Returns:
        Tuple containing:
            - eda_report_path: Path to the generated HTML EDA report file.
            - task_type: Either "classification" or "regression" based on target variable.
            - models_dataframe: DataFrame with performance metrics of all trained models.
            - visualization_path: Path to the model comparison chart image.
            - model_pickle_path: Path to the serialized best model (.pkl file).
            - llm_explanation: AI-generated explanation of results (or fallback message).
            - column_names: Comma-separated list of detected column names.
    """
    # --- 1. Input Validation ---
    if not data_source or not target_column:
        error_msg = "Please provide both a data source and target column name."
        gr.Warning("Error: Data source and target column must be provided.")
        return None, NO_TASK_DETECTED, None, None, None, error_msg, NO_COLUMNS_LOADED
    # Robustness: tolerate stray whitespace around a pasted column name.
    target_column = target_column.strip()
    gr.Info("Starting analysis...")

    # --- 2. Data Loading ---
    df, column_names = load_data(data_source)
    if df is None:
        error_msg = "Could not load data. Please check the file format or URL."
        return None, NO_TASK_DETECTED, None, None, None, error_msg, NO_COLUMNS_LOADED
    if target_column not in df.columns:
        error_msg = f"Target column '{target_column}' not found. Available columns: {column_names}"
        gr.Warning(error_msg)
        return None, NO_TASK_DETECTED, None, None, None, error_msg, column_names

    # --- 3. Analysis and Modeling ---
    _, profile_path, task, models_df, plot_path, pickle_path, best_model_name = analyze_and_model(df, target_column)

    # --- 4. Generate Dataset Summary for LLM Context ---
    dataset_summary = generate_dataset_summary(df, target_column)
    top_models_summary = models_df.head(5).to_string(index=False)

    # --- 5. Explanation with Nebius AI Studio LLM ---
    llm_explanation = "AI explanation is unavailable. Please provide a Nebius AI Studio API key to enable this feature."
    if nebius_api_key and nebius_api_key.strip():
        try:
            llm_explanation = _generate_llm_explanation(
                nebius_api_key, dataset_summary, task, top_models_summary, best_model_name
            )
        except Exception as e:
            # Best-effort: fall back to a static summary instead of failing the run.
            gr.Warning(f"Failed to get AI explanation: {e}")
            llm_explanation = f"AI explanation unavailable due to an error. The best performing model is **{best_model_name}** for your {task} task."

    gr.Info("Analysis complete!")
    gr.Info(f'Profile report saved to: {profile_path}')
    return profile_path, task, models_df, plot_path, pickle_path, llm_explanation, column_names
# --- Gradio UI ---
with gr.Blocks(title="AutoML Trainer", theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 🤖 AutoML Trainer")

    with gr.Row():
        # Left column: data inputs and controls.
        with gr.Column(scale=1):
            file_input = gr.File(label="Upload Local CSV File")
            url_input = gr.Textbox(
                label="Or Enter Public CSV URL",
                placeholder="e.g., https://.../data.csv",
            )
            gr.Textbox(
                label="Sample CSV",
                value="https://raw.githubusercontent.com/daniel-was-taken/MCP_Project/refs/heads/master/collegePlace.csv",
            )
            target_column_input = gr.Textbox(
                label="Enter Target Column Name",
                placeholder="e.g., approved",
            )
            nebius_api_key_input = gr.Textbox(
                label="Nebius AI Studio API Key (Optional)",
                type="password",
                placeholder="Enter your API key for AI explanations",
            )
            run_button = gr.Button("Run Analysis & AutoML", variant="primary")

        # Right column: textual results.
        with gr.Column(scale=2):
            # Filled live as soon as either data input changes.
            column_names_output = gr.Textbox(label="Detected Columns", interactive=False, lines=2)
            task_output = gr.Textbox(label="Detected Task", interactive=False)
            llm_output = gr.Markdown(label="AI Explanation")
            metrics_output = gr.Dataframe(label="Model Performance Metrics")

    with gr.Row():
        vis_output = gr.Image(label="Top Models Comparison")
        with gr.Column():
            eda_output = gr.File(label="Download Full EDA Report")
            model_output = gr.File(label="Download Best Model (.pkl)")

    def process_inputs(
        file_data: Any,
        url_data: Optional[str],
        target: str,
        api_key: Optional[str]
    ) -> Tuple[Optional[str], str, Optional[pd.DataFrame], Optional[str], Optional[str], str, str]:
        """Pick the active data source (upload wins over URL) and run the pipeline."""
        chosen_source = url_data if file_data is None else file_data
        return run_pipeline(chosen_source, target, api_key)

    # Wire live column detection to both data inputs.
    for trigger in (file_input, url_input):
        trigger.change(
            fn=update_detected_columns_display,
            inputs=[file_input, url_input],
            outputs=column_names_output,
        )

    run_button.click(
        fn=process_inputs,
        inputs=[file_input, url_input, target_column_input, nebius_api_key_input],
        outputs=[eda_output, task_output, metrics_output, vis_output, model_output, llm_output, column_names_output],
        api_name="run_automl_pipeline",  # explicit API name for MCP
    )

demo.launch(
    server_name="0.0.0.0",
    server_port=7860,
    share=False,
    inbrowser=True,
    mcp_server=True
)