daniel-was-taken committed on
Commit 26816ad · 1 Parent(s): 053b42f

Add .gitignore and enhance app.py with detailed docstrings and error handling

Files changed (3)
  1. .gitignore +222 -0
  2. app.py +175 -57
  3. requirements.txt +2 -1
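
The app.py changes below center on defensive CSV loading: input may arrive as a Gradio file object or as a URL string, and any failure should come back as (None, None) instead of raising into the UI. A condensed sketch of that pattern follows; the with open(...) call and the comma-joined column string are assumptions filled in from the docstring, since the diff elides those exact lines:

import io
from typing import Any, Optional, Tuple

import pandas as pd
import requests


def load_data(file_input: Any) -> Tuple[Optional[pd.DataFrame], Optional[str]]:
    """Load a CSV from a Gradio upload or a public URL; return (None, None) on failure."""
    if file_input is None:
        return None, None
    try:
        if hasattr(file_input, "name"):
            # Gradio uploads expose a temporary path via .name (open/read assumed, as implied by the diff)
            with open(file_input.name, "rb") as f:
                df = pd.read_csv(io.BytesIO(f.read()))
        elif isinstance(file_input, str) and file_input.startswith("http"):
            response = requests.get(file_input, timeout=30)
            response.raise_for_status()
            df = pd.read_csv(io.StringIO(response.text))
        else:
            return None, None
        return df, ", ".join(df.columns)  # comma-separated column names (assumed format)
    except Exception:
        return None, None
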
.gitignore ADDED
@@ -0,0 +1,222 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[codz]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py.cover
+ .hypothesis/
+ .pytest_cache/
+ cover/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ .pybuilder/
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ # For a library or package, you might want to ignore these files since the code is
+ # intended to run in multiple environments; otherwise, check them in:
+ # .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ # Pipfile.lock
+
+ # UV
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # uv.lock
+
+ # poetry
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
+ # commonly ignored for libraries.
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+ # poetry.lock
+ # poetry.toml
+
+ # pdm
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
+ # pdm.lock
+ # pdm.toml
+ .pdm-python
+ .pdm-build/
+
+ # pixi
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
+ # pixi.lock
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
+ # in the .venv directory. It is recommended not to include this directory in version control.
+ .pixi
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # Redis
+ *.rdb
+ *.aof
+ *.pid
+
+ # RabbitMQ
+ mnesia/
+ rabbitmq/
+ rabbitmq-data/
+
+ # ActiveMQ
+ activemq-data/
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .envrc
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
+
+ # pytype static type analyzer
+ .pytype/
+
+ # Cython debug symbols
+ cython_debug/
+
+ # PyCharm
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
+ # .idea/
+
+ # Abstra
+ # Abstra is an AI-powered process automation framework.
+ # Ignore directories containing user credentials, local state, and settings.
+ # Learn more at https://abstra.io/docs
+ .abstra/
+
+ # Visual Studio Code
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
+ # you could uncomment the following to ignore the entire vscode folder
+ # .vscode/
+
+ # Ruff stuff:
+ .ruff_cache/
+
+ # PyPI configuration file
+ .pypirc
+
+ # Marimo
+ marimo/_static/
+ marimo/_lsp/
+ __marimo__/
+
+ # Streamlit
+ .streamlit/secrets.toml
+
+ # Hackathon
+ trials/
+
+ # Gradio
+ .gradio/
app.py CHANGED
@@ -10,12 +10,27 @@ from ydata_profiling import ProfileReport
  import tempfile
  import requests
  import json
- from openai import OpenAI # Added for Nebius AI Studio LLM integration
+ from typing import Optional, Tuple, Any, Union
+ from openai import OpenAI # Added for Nebius AI Studio LLM integration

- def load_data(file_input):
- """Loads CSV data from either a local file upload or a public URL."""
+ # Constants
+ NO_TASK_DETECTED = "No task detected"
+ NO_COLUMNS_LOADED = "No columns loaded."
+
+
+ def load_data(file_input: Any) -> Tuple[Optional[pd.DataFrame], Optional[str]]:
+ """
+ Loads CSV data from either a local file upload or a public URL.
+
+ Args:
+ file_input: A file object from Gradio upload or a URL string.
+
+ Returns:
+ Tuple containing the DataFrame and comma-separated column names,
+ or (None, None) if loading fails.
+ """
  if file_input is None:
- return None, None
+ return None, None

  try:
  if hasattr(file_input, 'name'):
@@ -24,7 +39,7 @@ def load_data(file_input):
  file_bytes = f.read()
  df = pd.read_csv(io.BytesIO(file_bytes))
  elif isinstance(file_input, str) and file_input.startswith('http'):
- response = requests.get(file_input)
+ response = requests.get(file_input, timeout=30)
  response.raise_for_status()
  df = pd.read_csv(io.StringIO(response.text))
  else:
@@ -38,24 +53,67 @@ def load_data(file_input):
  return None, None


- def update_detected_columns_display(file_data, url_data):
+ def generate_dataset_summary(df: pd.DataFrame, target_column: str) -> str:
+ """
+ Generates a concise summary of the dataset for LLM context.
+
+ Args:
+ df: The pandas DataFrame to summarize.
+ target_column: The name of the target column.
+
+ Returns:
+ A formatted string summary of the dataset.
+ """
+ summary_parts = [
+ f"Dataset Shape: {df.shape[0]} rows, {df.shape[1]} columns",
+ f"Target Column: {target_column}",
+ f"Target Unique Values: {df[target_column].nunique()}",
+ f"Features: {', '.join([col for col in df.columns if col != target_column])}",
+ f"Missing Values: {df.isnull().sum().sum()} total",
+ f"Numeric Columns: {len(df.select_dtypes(include=['number']).columns)}",
+ f"Categorical Columns: {len(df.select_dtypes(include=['object', 'category']).columns)}"
+ ]
+ return "\n".join(summary_parts)
+
+
+ def update_detected_columns_display(file_data: Any, url_data: Optional[str]) -> str:
  """
  Detects and displays column names from the uploaded file or URL
  as soon as the input changes, before the main analysis button is pressed.
+
+ Args:
+ file_data: File object from Gradio file upload component.
+ url_data: URL string from Gradio textbox component.
+
+ Returns:
+ Comma-separated string of column names or error message.
  """
  data_source = file_data if file_data is not None else url_data
  if data_source is None:
- return ""
+ return ""

- df, column_names = load_data(data_source)
+ _, column_names = load_data(data_source)
  if column_names:
  return column_names
  else:
  return "No columns detected or error loading file. Please check the file format."


- def analyze_and_model(df, target_column):
- """Internal function to perform EDA, model training, and visualization."""
+ def analyze_and_model(
+ df: pd.DataFrame,
+ target_column: str
+ ) -> Tuple[ProfileReport, str, str, pd.DataFrame, str, str, str]:
+ """
+ Internal function to perform EDA, model training, and visualization.
+
+ Args:
+ df: The pandas DataFrame containing the dataset.
+ target_column: The name of the target column for prediction.
+
+ Returns:
+ Tuple containing: profile report, profile path, task type,
+ models dataframe, plot path, pickle path, and best model name.
+ """
  profile = ProfileReport(df, title="EDA Report", minimal=True)
  with tempfile.NamedTemporaryFile(delete=False, suffix=".html") as temp_html:
  profile.to_file(temp_html.name)
@@ -66,12 +124,26 @@ def analyze_and_model(df, target_column):
  task = "classification" if y.nunique() <= 10 else "regression"
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

- model = LazyClassifier(ignore_warnings=True, verbose=0) if task == "classification" else LazyRegressor(ignore_warnings=True, verbose=0)
- models, _ = model.fit(X_train, X_test, y_train, y_test)
+ lazy_model = LazyClassifier(ignore_warnings=True, verbose=0) if task == "classification" else LazyRegressor(ignore_warnings=True, verbose=0)
+ models, _ = lazy_model.fit(X_train, X_test, y_train, y_test)

  sort_metric = "Accuracy" if task == "classification" else "R-Squared"
- best_model_name = models.sort_values(by=sort_metric, ascending=False).index[0] # Corrected indexing
- best_model = model.models[best_model_name]
+ sorted_models = models.sort_values(by=sort_metric, ascending=False)
+ best_model_name = sorted_models.index[0]
+
+ # Safely access the best model with error handling
+ try:
+ best_model = lazy_model.models[best_model_name]
+ except KeyError:
+ # Fallback: try to find the model with stripped whitespace
+ model_keys = list(lazy_model.models.keys())
+ matching_key = next((k for k in model_keys if k.strip() == best_model_name.strip()), None)
+ if matching_key:
+ best_model = lazy_model.models[matching_key]
+ else:
+ # Use the first available model as fallback
+ best_model = list(lazy_model.models.values())[0]
+ gr.Warning(f"Could not find exact model '{best_model_name}', using first available model.")

  with tempfile.NamedTemporaryFile(delete=False, suffix=".pkl") as temp_pkl:
  pickle.dump(best_model, temp_pkl)
@@ -79,8 +151,11 @@ def analyze_and_model(df, target_column):

  plt.figure(figsize=(10, 6))
  plot_column = "Accuracy" if task == "classification" else "R-Squared"
- sns.barplot(x=models[plot_column].head(10), y=models.head(10).index)
+ top_models = models.head(10)
+ sns.barplot(x=top_models[plot_column].values, y=top_models.index.tolist())
  plt.title(f"Top 10 Models by {plot_column}")
+ plt.xlabel(plot_column)
+ plt.ylabel("Model")
  plt.tight_layout()
  with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_png:
  plt.savefig(temp_png.name)
@@ -88,75 +163,108 @@ def analyze_and_model(df, target_column):
  plt.close()

  models_reset = models.reset_index().rename(columns={'index': 'Model'})
- return profile, profile_path, task, models_reset, plot_path, pickle_path
+ return profile, profile_path, task, models_reset, plot_path, pickle_path, best_model_name

- def run_pipeline(data_source, target_column, nebius_api_key):
+ def run_pipeline(
+ data_source: Union[Any, str],
+ target_column: str,
+ nebius_api_key: Optional[str] = None
+ ) -> Tuple[Optional[str], str, Optional[pd.DataFrame], Optional[str], Optional[str], str, str]:
  """
- This single function drives the entire application.
- It's exposed as the primary tool for the MCP server.
-
- :param data_source: A local file path (from gr.File) or a URL (from gr.Textbox).
- :param target_column: The name of the target column for prediction.
- :param nebius_api_key: The API key for Nebius AI Studio.
+ Run the complete AutoML pipeline including data loading, EDA, model training, and AI explanation.
+
+ This is the primary MCP tool function that orchestrates the entire AutoML workflow.
+
+ Args:
+ data_source: Either a file path/object from local upload or a URL string pointing to a CSV file.
+ target_column: The name of the column to predict (target variable).
+ nebius_api_key: Optional API key for Nebius AI Studio to enable AI-powered explanations.
+
+ Returns:
+ Tuple containing:
+ - eda_report_path: Path to the generated HTML EDA report file.
+ - task_type: Either "classification" or "regression" based on target variable.
+ - models_dataframe: DataFrame with performance metrics of all trained models.
+ - visualization_path: Path to the model comparison chart image.
+ - model_pickle_path: Path to the serialized best model (.pkl file).
+ - llm_explanation: AI-generated explanation of results (or fallback message).
+ - column_names: Comma-separated list of detected column names.
  """
  # --- 1. Input Validation ---
  if not data_source or not target_column:
- error_msg = "Error: Data source and target column must be provided."
- gr.Warning(error_msg)
- return None, error_msg, None, None, None, "Please provide all inputs.", "No columns loaded."
+ error_msg = "Please provide both a data source and target column name."
+ gr.Warning("Error: Data source and target column must be provided.")
+ return None, NO_TASK_DETECTED, None, None, None, error_msg, NO_COLUMNS_LOADED

  gr.Info("Starting analysis...")

  # --- 2. Data Loading ---
- df, column_names = load_data(data_source)
+ df, column_names = load_data(data_source)
  if df is None:
- return None, "Error: Could not load data.", None, None, None, None, "No columns loaded."
+ error_msg = "Could not load data. Please check the file format or URL."
+ return None, NO_TASK_DETECTED, None, None, None, error_msg, NO_COLUMNS_LOADED

  if target_column not in df.columns:
- error_msg = f"Error: Target column '{target_column}' not found in the dataset. Available: {column_names}"
+ error_msg = f"Target column '{target_column}' not found. Available columns: {column_names}"
  gr.Warning(error_msg)
- return None, error_msg, None, None, None, None, column_names
+ return None, NO_TASK_DETECTED, None, None, None, error_msg, column_names

  # --- 3. Analysis and Modeling ---
- profile, profile_path, task, models_df, plot_path, pickle_path = analyze_and_model(df, target_column)
+ _, profile_path, task, models_df, plot_path, pickle_path, best_model_name = analyze_and_model(df, target_column)

- # --- 4. Explanation with Nebius AI Studio LLM ---
- best_model_name = models_df.iloc[0]['Model'] # Corrected indexing
+ # --- 4. Generate Dataset Summary for LLM Context ---
+ dataset_summary = generate_dataset_summary(df, target_column)
+
+ # Get top 5 model performance summary
+ top_models_summary = models_df.head(5).to_string(index=False)

- llm_explanation = "AI explanation is unavailable. Please provide a Nebius AI Studio API key to enable this feature." # Generic fallback [1]
+ # --- 5. Explanation with Nebius AI Studio LLM ---
+ llm_explanation = "AI explanation is unavailable. Please provide a Nebius AI Studio API key to enable this feature."

- if nebius_api_key:
+ if nebius_api_key and nebius_api_key.strip():
  try:
  client = OpenAI(
  base_url="https://api.studio.nebius.com/v1/",
-
- api_key=nebius_api_key
+ api_key=nebius_api_key.strip()
  )

- # Craft a prompt for the LLM [2]
- prompt_text = f"Explain and Summarize the significance of the top performing model, '{best_model_name}', for a {task} task in a data analysis context. Keep the explanation concise and professional. Analyse the report: {profile}."
+ # Craft an improved prompt with actual data context
+ prompt_text = f"""Analyze this AutoML result and provide a concise, professional explanation:
+
+ **Dataset Overview:**
+ {dataset_summary}
+
+ **Task Type:** {task}
+
+ **Top 5 Performing Models:**
+ {top_models_summary}
+
+ **Best Model:** {best_model_name}
+
+ Please explain:
+ 1. Why '{best_model_name}' performed best for this {task} task
+ 2. Key insights about the dataset characteristics
+ 3. Recommendations for model deployment or further improvement
+
+ Keep the explanation concise (3-4 paragraphs) and accessible to both technical and non-technical stakeholders."""

- # Make the LLM call [2, 3]
  response = client.chat.completions.create(
- model="meta-llama/Llama-3.3-70B-Instruct",
+ model="meta-llama/Llama-3.3-70B-Instruct",
  messages=[
- {"role": "system", "content": "You are a helpful AI assistant that explains data science concepts. "},
+ {"role": "system", "content": "You are an expert data scientist assistant that explains machine learning results clearly and professionally."},
  {"role": "user", "content": prompt_text}
  ],
- temperature=0.6,
- max_tokens=512,
+ temperature=0.6,
+ max_tokens=512,
  top_p=0.9,
- extra_body={
- "top_k": 50
- }
+ extra_body={"top_k": 50}
  )
- message_content = response.to_json()
- data = json.loads(message_content)
- llm_explanation = data['choices'][0]['message']['content']
+ # Simplified response access (no need for json.loads)
+ llm_explanation = response.choices[0].message.content

- except Exception as e:
- gr.Warning(f"Failed to get AI explanation: {e}. Please check your API key or try again later.")
- llm_explanation = "An error occurred while fetching AI explanation. Please check your API key or try again later."
+ except Exception as e:
+ gr.Warning(f"Failed to get AI explanation: {e}")
+ llm_explanation = f"AI explanation unavailable due to an error. The best performing model is **{best_model_name}** for your {task} task."

  gr.Info("Analysis complete!")
  gr.Info(f'Profile report saved to: {profile_path}')
@@ -187,11 +295,21 @@ with gr.Blocks(title="AutoML Trainer", theme=gr.themes.Soft()) as demo:
  eda_output = gr.File(label="Download Full EDA Report")
  model_output = gr.File(label="Download Best Model (.pkl)")

- def process_inputs(file_data, url_data, target, api_key):
+ def process_inputs(
+ file_data: Any,
+ url_data: Optional[str],
+ target: str,
+ api_key: Optional[str]
+ ) -> Tuple[Optional[str], str, Optional[pd.DataFrame], Optional[str], Optional[str], str, str]:
+ """
+ Process inputs and run the AutoML pipeline.
+
+ This wrapper function handles input selection between file upload and URL,
+ then delegates to the main run_pipeline function.
+ """
  data_source = file_data if file_data is not None else url_data
  return run_pipeline(data_source, target, api_key)

-
  file_input.change(
  fn=update_detected_columns_display,
  inputs=[file_input, url_input],
@@ -206,14 +324,14 @@ with gr.Blocks(title="AutoML Trainer", theme=gr.themes.Soft()) as demo:
  run_button.click(
  fn=process_inputs,
  inputs=[file_input, url_input, target_column_input, nebius_api_key_input],
- outputs=[eda_output, task_output, metrics_output, vis_output, model_output, llm_output, column_names_output]
+ outputs=[eda_output, task_output, metrics_output, vis_output, model_output, llm_output, column_names_output],
+ api_name="run_automl_pipeline" # Explicit API name for MCP
  )

  demo.launch(
  server_name="0.0.0.0",
  server_port=7860,
  share=False,
- show_api=True,
  inbrowser=True,
  mcp_server=True
  )
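
With outputs now wired through api_name="run_automl_pipeline" and demo.launch(..., mcp_server=True), the pipeline is reachable programmatically as well as from the UI. A minimal sketch of a remote call using gradio_client follows, assuming the app is running locally on port 7860 and that the four inputs map positionally (file upload, CSV URL, target column, Nebius API key); the sample URL and target name are placeholders, not part of the commit:

from gradio_client import Client

client = Client("http://localhost:7860/")  # assumed local deployment

# Positional inputs follow run_button.click: file upload (omitted here),
# CSV URL, target column name, optional Nebius AI Studio API key.
result = client.predict(
    None,
    "https://example.com/data.csv",
    "target",
    "",
    api_name="/run_automl_pipeline",
)

# Returned tuple mirrors the outputs list: EDA report path, task type,
# metrics table, plot path, model pickle path, LLM explanation, column names.
print(result[1], result[5])
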
requirements.txt CHANGED
@@ -5,7 +5,7 @@ gradio>=4.0.0
  Pillow>=10.0.0
  scikit-learn>=1.3.0
  pandas>=2.0.0
- numpy>=1.24.0
+ numpy>=2.1.0
  matplotlib>=3.7.0
  seaborn>=0.12.0
  plotly>=5.0.0
@@ -14,3 +14,4 @@ lightgbm>=3.3.0
  shap>=0.42.0
  lazypredict>=0.2.12
  ydata-profiling>=4.0.0
+ setuptools >= 80.10.2