Commit 26816ad
Parent(s): 053b42f
Add .gitignore and enhance app.py with detailed docstrings and error handling
Files changed:
- .gitignore (+222, -0)
- app.py (+175, -57)
- requirements.txt (+2, -1)
.gitignore  ADDED
@@ -0,0 +1,222 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[codz]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py.cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+# Pipfile.lock
+
+# UV
+# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# uv.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+# poetry.lock
+# poetry.toml
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
+# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
+# pdm.lock
+# pdm.toml
+.pdm-python
+.pdm-build/
+
+# pixi
+# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
+# pixi.lock
+# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
+# in the .venv directory. It is recommended not to include this directory in version control.
+.pixi
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# Redis
+*.rdb
+*.aof
+*.pid
+
+# RabbitMQ
+mnesia/
+rabbitmq/
+rabbitmq-data/
+
+# ActiveMQ
+activemq-data/
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.envrc
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+# .idea/
+
+# Abstra
+# Abstra is an AI-powered process automation framework.
+# Ignore directories containing user credentials, local state, and settings.
+# Learn more at https://abstra.io/docs
+.abstra/
+
+# Visual Studio Code
+# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
+# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
+# and can be added to the global gitignore or merged into this file. However, if you prefer,
+# you could uncomment the following to ignore the entire vscode folder
+# .vscode/
+
+# Ruff stuff:
+.ruff_cache/
+
+# PyPI configuration file
+.pypirc
+
+# Marimo
+marimo/_static/
+marimo/_lsp/
+__marimo__/
+
+# Streamlit
+.streamlit/secrets.toml
+
+# Hackathon
+trials/
+
+# Gradio
+.gradio/
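Note: as a quick sanity check (not part of this commit), git's check-ignore reports which of the new rules matches a given path. A minimal Python sketch; the example paths are hypothetical:

import subprocess

def explain_ignore(path: str) -> str:
    """Return the matching rule as 'source:line:pattern<TAB>path', or '' if not ignored."""
    result = subprocess.run(
        ["git", "check-ignore", "-v", path],
        capture_output=True, text=True,
    )
    return result.stdout.strip()

print(explain_ignore(".env"))            # should match the '# Environments' section
print(explain_ignore("trials/run.csv"))  # should match the 'trials/' hackathon rule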
app.py  CHANGED
@@ -10,12 +10,27 @@ from ydata_profiling import ProfileReport
 import tempfile
 import requests
 import json
+from typing import Optional, Tuple, Any, Union
+from openai import OpenAI  # Added for Nebius AI Studio LLM integration
 
+# Constants
+NO_TASK_DETECTED = "No task detected"
+NO_COLUMNS_LOADED = "No columns loaded."
+
+
+def load_data(file_input: Any) -> Tuple[Optional[pd.DataFrame], Optional[str]]:
+    """
+    Loads CSV data from either a local file upload or a public URL.
+
+    Args:
+        file_input: A file object from Gradio upload or a URL string.
+
+    Returns:
+        Tuple containing the DataFrame and comma-separated column names,
+        or (None, None) if loading fails.
+    """
     if file_input is None:
         return None, None
 
     try:
         if hasattr(file_input, 'name'):
@@ -24,7 +39,7 @@ def load_data(file_input):
             file_bytes = f.read()
             df = pd.read_csv(io.BytesIO(file_bytes))
         elif isinstance(file_input, str) and file_input.startswith('http'):
-            response = requests.get(file_input)
+            response = requests.get(file_input, timeout=30)
             response.raise_for_status()
             df = pd.read_csv(io.StringIO(response.text))
         else:
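Note: the added timeout makes a dead URL fail fast instead of hanging the UI. A minimal usage sketch, assuming app.py is importable as a module and using a placeholder URL:

from app import load_data  # hypothetical import of this Space's module

df, column_names = load_data("https://example.com/data.csv")
if df is None:
    print("Load failed: bad URL, request timeout, or unparseable CSV")
else:
    print(f"Loaded {len(df)} rows with columns: {column_names}")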
@@ -38,24 +53,67 @@ def load_data(file_input):
         return None, None
 
 
+def generate_dataset_summary(df: pd.DataFrame, target_column: str) -> str:
+    """
+    Generates a concise summary of the dataset for LLM context.
+
+    Args:
+        df: The pandas DataFrame to summarize.
+        target_column: The name of the target column.
+
+    Returns:
+        A formatted string summary of the dataset.
+    """
+    summary_parts = [
+        f"Dataset Shape: {df.shape[0]} rows, {df.shape[1]} columns",
+        f"Target Column: {target_column}",
+        f"Target Unique Values: {df[target_column].nunique()}",
+        f"Features: {', '.join([col for col in df.columns if col != target_column])}",
+        f"Missing Values: {df.isnull().sum().sum()} total",
+        f"Numeric Columns: {len(df.select_dtypes(include=['number']).columns)}",
+        f"Categorical Columns: {len(df.select_dtypes(include=['object', 'category']).columns)}"
+    ]
+    return "\n".join(summary_parts)
+
+
+def update_detected_columns_display(file_data: Any, url_data: Optional[str]) -> str:
     """
     Detects and displays column names from the uploaded file or URL
     as soon as the input changes, before the main analysis button is pressed.
+
+    Args:
+        file_data: File object from Gradio file upload component.
+        url_data: URL string from Gradio textbox component.
+
+    Returns:
+        Comma-separated string of column names or error message.
     """
     data_source = file_data if file_data is not None else url_data
     if data_source is None:
         return ""
 
+    _, column_names = load_data(data_source)
     if column_names:
         return column_names
     else:
         return "No columns detected or error loading file. Please check the file format."
 
 
+def analyze_and_model(
+    df: pd.DataFrame,
+    target_column: str
+) -> Tuple[ProfileReport, str, str, pd.DataFrame, str, str, str]:
+    """
+    Internal function to perform EDA, model training, and visualization.
+
+    Args:
+        df: The pandas DataFrame containing the dataset.
+        target_column: The name of the target column for prediction.
+
+    Returns:
+        Tuple containing: profile report, profile path, task type,
+        models dataframe, plot path, pickle path, and best model name.
+    """
     profile = ProfileReport(df, title="EDA Report", minimal=True)
     with tempfile.NamedTemporaryFile(delete=False, suffix=".html") as temp_html:
         profile.to_file(temp_html.name)
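Note: for reviewers, this is roughly what generate_dataset_summary emits; an illustrative sketch on a toy frame, assuming app.py is importable:

import pandas as pd
from app import generate_dataset_summary  # hypothetical import

toy = pd.DataFrame({
    "age": [22, 35, 58, 41],
    "city": ["Oslo", "Lima", "Oslo", "Pune"],
    "churn": [0, 1, 0, 1],
})
print(generate_dataset_summary(toy, "churn"))
# Dataset Shape: 4 rows, 3 columns
# Target Column: churn
# Target Unique Values: 2
# Features: age, city
# Missing Values: 0 total
# Numeric Columns: 2
# Categorical Columns: 1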
@@ -66,12 +124,26 @@ def analyze_and_model(df, target_column):
     task = "classification" if y.nunique() <= 10 else "regression"
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
 
+    lazy_model = LazyClassifier(ignore_warnings=True, verbose=0) if task == "classification" else LazyRegressor(ignore_warnings=True, verbose=0)
+    models, _ = lazy_model.fit(X_train, X_test, y_train, y_test)
 
     sort_metric = "Accuracy" if task == "classification" else "R-Squared"
+    sorted_models = models.sort_values(by=sort_metric, ascending=False)
+    best_model_name = sorted_models.index[0]
+
+    # Safely access the best model with error handling
+    try:
+        best_model = lazy_model.models[best_model_name]
+    except KeyError:
+        # Fallback: try to find the model with stripped whitespace
+        model_keys = list(lazy_model.models.keys())
+        matching_key = next((k for k in model_keys if k.strip() == best_model_name.strip()), None)
+        if matching_key:
+            best_model = lazy_model.models[matching_key]
+        else:
+            # Use the first available model as fallback
+            best_model = list(lazy_model.models.values())[0]
+            gr.Warning(f"Could not find exact model '{best_model_name}', using first available model.")
 
     with tempfile.NamedTemporaryFile(delete=False, suffix=".pkl") as temp_pkl:
         pickle.dump(best_model, temp_pkl)
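Note: the KeyError fallback above exists because the leaderboard index and the fitted-model dict keys can disagree (e.g. stray whitespace). A minimal sketch of the lazypredict pattern in isolation, using iris as stand-in data:

from lazypredict.Supervised import LazyClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = LazyClassifier(ignore_warnings=True, verbose=0)
models, _ = clf.fit(X_train, X_test, y_train, y_test)  # leaderboard DataFrame

best_name = models.sort_values(by="Accuracy", ascending=False).index[0]
best_model = clf.models[best_name]  # fitted pipeline, keyed by leaderboard name
print(best_name, type(best_model))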
@@ -79,8 +151,11 @@ def analyze_and_model(df, target_column):
 
     plt.figure(figsize=(10, 6))
     plot_column = "Accuracy" if task == "classification" else "R-Squared"
+    top_models = models.head(10)
+    sns.barplot(x=top_models[plot_column].values, y=top_models.index.tolist())
     plt.title(f"Top 10 Models by {plot_column}")
+    plt.xlabel(plot_column)
+    plt.ylabel("Model")
     plt.tight_layout()
     with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as temp_png:
         plt.savefig(temp_png.name)
@@ -88,75 +163,108 @@ def analyze_and_model(df, target_column):
     plt.close()
 
     models_reset = models.reset_index().rename(columns={'index': 'Model'})
-    return profile, profile_path, task, models_reset, plot_path, pickle_path
+    return profile, profile_path, task, models_reset, plot_path, pickle_path, best_model_name
 
+def run_pipeline(
+    data_source: Union[Any, str],
+    target_column: str,
+    nebius_api_key: Optional[str] = None
+) -> Tuple[Optional[str], str, Optional[pd.DataFrame], Optional[str], Optional[str], str, str]:
     """
+    Run the complete AutoML pipeline including data loading, EDA, model training, and AI explanation.
+
+    This is the primary MCP tool function that orchestrates the entire AutoML workflow.
+
+    Args:
+        data_source: Either a file path/object from local upload or a URL string pointing to a CSV file.
+        target_column: The name of the column to predict (target variable).
+        nebius_api_key: Optional API key for Nebius AI Studio to enable AI-powered explanations.
+
+    Returns:
+        Tuple containing:
+        - eda_report_path: Path to the generated HTML EDA report file.
+        - task_type: Either "classification" or "regression" based on target variable.
+        - models_dataframe: DataFrame with performance metrics of all trained models.
+        - visualization_path: Path to the model comparison chart image.
+        - model_pickle_path: Path to the serialized best model (.pkl file).
+        - llm_explanation: AI-generated explanation of results (or fallback message).
+        - column_names: Comma-separated list of detected column names.
     """
     # --- 1. Input Validation ---
     if not data_source or not target_column:
+        error_msg = "Please provide both a data source and target column name."
+        gr.Warning("Error: Data source and target column must be provided.")
+        return None, NO_TASK_DETECTED, None, None, None, error_msg, NO_COLUMNS_LOADED
 
     gr.Info("Starting analysis...")
 
     # --- 2. Data Loading ---
     df, column_names = load_data(data_source)
     if df is None:
+        error_msg = "Could not load data. Please check the file format or URL."
+        return None, NO_TASK_DETECTED, None, None, None, error_msg, NO_COLUMNS_LOADED
 
     if target_column not in df.columns:
+        error_msg = f"Target column '{target_column}' not found. Available columns: {column_names}"
         gr.Warning(error_msg)
+        return None, NO_TASK_DETECTED, None, None, None, error_msg, column_names
 
     # --- 3. Analysis and Modeling ---
+    _, profile_path, task, models_df, plot_path, pickle_path, best_model_name = analyze_and_model(df, target_column)
 
+    # --- 4. Generate Dataset Summary for LLM Context ---
+    dataset_summary = generate_dataset_summary(df, target_column)
+
+    # Get top 5 model performance summary
+    top_models_summary = models_df.head(5).to_string(index=False)
 
+    # --- 5. Explanation with Nebius AI Studio LLM ---
+    llm_explanation = "AI explanation is unavailable. Please provide a Nebius AI Studio API key to enable this feature."
 
-    if nebius_api_key:
+    if nebius_api_key and nebius_api_key.strip():
         try:
             client = OpenAI(
                 base_url="https://api.studio.nebius.com/v1/",
-                api_key=nebius_api_key
+                api_key=nebius_api_key.strip()
             )
 
+            # Craft an improved prompt with actual data context
+            prompt_text = f"""Analyze this AutoML result and provide a concise, professional explanation:
+
+**Dataset Overview:**
+{dataset_summary}
+
+**Task Type:** {task}
+
+**Top 5 Performing Models:**
+{top_models_summary}
+
+**Best Model:** {best_model_name}
+
+Please explain:
+1. Why '{best_model_name}' performed best for this {task} task
+2. Key insights about the dataset characteristics
+3. Recommendations for model deployment or further improvement
+
+Keep the explanation concise (3-4 paragraphs) and accessible to both technical and non-technical stakeholders."""
 
-            # Make the LLM call [2, 3]
             response = client.chat.completions.create(
                 model="meta-llama/Llama-3.3-70B-Instruct",
                 messages=[
+                    {"role": "system", "content": "You are an expert data scientist assistant that explains machine learning results clearly and professionally."},
                     {"role": "user", "content": prompt_text}
                 ],
                 temperature=0.6,
                 max_tokens=512,
                 top_p=0.9,
-                extra_body={
-                    "top_k": 50
-                }
+                extra_body={"top_k": 50}
            )
-            llm_explanation = data['choices'][0]['message']['content']
+            # Simplified response access (no need for json.loads)
+            llm_explanation = response.choices[0].message.content
 
         except Exception as e:
+            gr.Warning(f"Failed to get AI explanation: {e}")
+            llm_explanation = f"AI explanation unavailable due to an error. The best performing model is **{best_model_name}** for your {task} task."
 
     gr.Info("Analysis complete!")
     gr.Info(f'Profile report saved to: {profile_path}')
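Note: the old code indexed the response like a dict (data['choices'][0]['message']['content']), which fails with the current openai SDK, whose responses are typed objects accessed by attribute. A minimal sketch of the Nebius call in isolation; NEBIUS_API_KEY is a placeholder environment variable:

import os
from openai import OpenAI

client = OpenAI(
    base_url="https://api.studio.nebius.com/v1/",
    api_key=os.environ["NEBIUS_API_KEY"],  # placeholder; supply your own key
)
response = client.chat.completions.create(
    model="meta-llama/Llama-3.3-70B-Instruct",
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
    max_tokens=64,
)
print(response.choices[0].message.content)  # attribute access, no json.loads needed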
@@ -187,11 +295,21 @@ with gr.Blocks(title="AutoML Trainer", theme=gr.themes.Soft()) as demo:
     eda_output = gr.File(label="Download Full EDA Report")
     model_output = gr.File(label="Download Best Model (.pkl)")
 
+    def process_inputs(
+        file_data: Any,
+        url_data: Optional[str],
+        target: str,
+        api_key: Optional[str]
+    ) -> Tuple[Optional[str], str, Optional[pd.DataFrame], Optional[str], Optional[str], str, str]:
+        """
+        Process inputs and run the AutoML pipeline.
+
+        This wrapper function handles input selection between file upload and URL,
+        then delegates to the main run_pipeline function.
+        """
         data_source = file_data if file_data is not None else url_data
         return run_pipeline(data_source, target, api_key)
 
     file_input.change(
         fn=update_detected_columns_display,
         inputs=[file_input, url_input],
@@ -206,14 +324,14 @@ with gr.Blocks(title="AutoML Trainer", theme=gr.themes.Soft()) as demo:
     run_button.click(
         fn=process_inputs,
         inputs=[file_input, url_input, target_column_input, nebius_api_key_input],
-        outputs=[eda_output, task_output, metrics_output, vis_output, model_output, llm_output, column_names_output]
+        outputs=[eda_output, task_output, metrics_output, vis_output, model_output, llm_output, column_names_output],
+        api_name="run_automl_pipeline"  # Explicit API name for MCP
     )
 
 demo.launch(
     server_name="0.0.0.0",
     server_port=7860,
     share=False,
-    show_api=True,
     inbrowser=True,
     mcp_server=True
 )
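Note: with api_name="run_automl_pipeline" set, the endpoint can be driven programmatically (and, with mcp_server=True, exposed as an MCP tool). A sketch using gradio_client against a locally running copy; the URL, CSV link, target column, and positional argument order are assumptions based on the inputs list above:

from gradio_client import Client

client = Client("http://localhost:7860")  # placeholder local URL
result = client.predict(
    None,                            # file upload (unused when a URL is given)
    "https://example.com/data.csv",  # placeholder CSV URL
    "target",                        # placeholder target column name
    "",                              # Nebius API key (optional)
    api_name="/run_automl_pipeline",
)
print(result)  # tuple: report path, task type, metrics, plot, model, explanation, columns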
|
requirements.txt  CHANGED
@@ -5,7 +5,7 @@ gradio>=4.0.0
 Pillow>=10.0.0
 scikit-learn>=1.3.0
 pandas>=2.0.0
-numpy>=1.
+numpy>=2.1.0
 matplotlib>=3.7.0
 seaborn>=0.12.0
 plotly>=5.0.0
@@ -14,3 +14,4 @@ lightgbm>=3.3.0
 shap>=0.42.0
 lazypredict>=0.2.12
 ydata-profiling>=4.0.0
+setuptools>=80.10.2
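Note: to confirm an existing environment already satisfies the bumped minimums, a small illustrative check:

from importlib.metadata import version

for pkg, minimum in [("numpy", "2.1.0"), ("setuptools", "80.10.2")]:
    print(f"{pkg}: installed {version(pkg)}, required >= {minimum}")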