# Spaces: Pulastya0 / Data-Science-Agent (Running)
# File size: 82,569 Bytes

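"""
Complete Tools Registry for Groq Function Calling - All 67 Tools

Defines all available tools in Groq's function calling format. Entries in
TOOLS follow the shape {"type": "function", "function": {...}}, where the
inner dict carries the tool's "name", a natural-language "description", and
a JSON-Schema-style "parameters" object listing its "properties" and the
"required" argument names.
"""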

TOOLS = [
    # ============================================
    # BASIC TOOLS (16)
    # ============================================
    
    # Data Profiling Tools (3)
    {
        "type": "function",
        "function": {
            "name": "profile_dataset",
            "description": "Get comprehensive statistics about a dataset including shape, data types, memory usage, null counts, and unique values. Use this as the first step to understand any new dataset.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {
                        "type": "string",
                        "description": "Absolute or relative path to the CSV or Parquet file"
                    }
                },
                "required": ["file_path"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "detect_data_quality_issues",
            "description": "Detect data quality issues including outliers (using IQR method), duplicate rows, inconsistent formats, and data anomalies. Returns a prioritized list of issues with severity levels.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {
                        "type": "string",
                        "description": "Path to the dataset file"
                    }
                },
                "required": ["file_path"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "analyze_correlations",
            "description": "Compute correlation matrix and identify top correlations. If a target column is specified, shows features most correlated with the target. Useful for feature selection and understanding relationships.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {
                        "type": "string",
                        "description": "Path to the dataset file"
                    },
                    "target": {
                        "type": "string",
                        "description": "Optional target column name to analyze correlations with"
                    }
                },
                "required": ["file_path"]
            }
        }
    },
    
    # Data Cleaning Tools (3)
    {
        "type": "function",
        "function": {
            "name": "clean_missing_values",
            "description": "Handle missing values using appropriate strategies based on column type. Strategies include median/mean for numeric, mode for categorical, forward_fill for time series, or drop. In 'auto' mode, first drops columns with >threshold missing (default 40%), then imputes remaining columns. Will not impute ID columns.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {
                        "type": "string",
                        "description": "Path to the dataset file"
                    },
                    "strategy": {
                        "oneOf": [
                            {
                                "type": "string",
                                "enum": ["auto"],
                                "description": "Use 'auto' to automatically decide strategies for all columns based on data type. First drops columns with >threshold missing, then imputes remaining columns."
                            },
                            {
                                "type": "object",
                                "description": "Dictionary mapping column names to strategies ('median', 'mean', 'mode', 'forward_fill', 'drop')",
                                "additionalProperties": {"type": "string"}
                            }
                        ],
                        "description": "Either 'auto' (string) to automatically handle all missing values, or a dictionary mapping specific columns to strategies"
                    },
                    "output_path": {
                        "type": "string",
                        "description": "Path to save cleaned dataset"
                    },
                    "threshold": {
                        "type": "number",
                        "description": "For 'auto' mode: drop columns with missing percentage above this threshold (default: 0.4 = 40%). Range: 0.0 to 1.0. For example, 0.7 means drop columns with >70% missing values."
                    }
                },
                "required": ["file_path", "strategy", "output_path"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "handle_outliers",
            "description": "Detect and handle outliers in numeric columns using IQR method. Methods: 'clip' (cap at boundaries), 'winsorize' (cap at percentiles), or 'remove' (delete rows).",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {
                        "type": "string",
                        "description": "Path to the dataset file"
                    },
                    "method": {
                        "type": "string",
                        "enum": ["clip", "winsorize", "remove"],
                        "description": "Method to handle outliers"
                    },
                    "columns": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "List of column names to check for outliers. Use 'all' to check all numeric columns."
                    },
                    "output_path": {
                        "type": "string",
                        "description": "Path to save cleaned dataset"
                    }
                },
                "required": ["file_path", "method", "columns", "output_path"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "fix_data_types",
            "description": "Auto-detect and fix incorrect data types. Handles dates, booleans, categoricals, and numeric columns. Fixes common issues like 'null' strings and mixed types.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {
                        "type": "string",
                        "description": "Path to the dataset file"
                    },
                    "type_mapping": {
                        "type": "object",
                        "description": "Optional dictionary mapping column names to target types ('int', 'float', 'string', 'date', 'bool', 'category'). Use 'auto' for automatic detection.",
                        "additionalProperties": {"type": "string"}
                    },
                    "output_path": {
                        "type": "string",
                        "description": "Path to save dataset with fixed types"
                    }
                },
                "required": ["file_path", "output_path"]
            }
        }
    },
    
    # Data Type Conversion Tools (2)
    {
        "type": "function",
        "function": {
            "name": "force_numeric_conversion",
            "description": "CRITICAL TOOL: Force convert columns to numeric type even if detected as strings/objects. Essential for datasets with numeric columns stored as strings (with commas, spaces, currency symbols). Use this BEFORE encoding when you see 'no numeric features' errors.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {
                        "type": "string",
                        "description": "Path to the dataset file"
                    },
                    "columns": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "List of column names to force convert to numeric. Use ['all'] to auto-detect and convert all non-ID columns that look numeric."
                    },
                    "output_path": {
                        "type": "string",
                        "description": "Path to save dataset with converted types"
                    },
                    "errors": {
                        "type": "string",
                        "enum": ["coerce", "raise"],
                        "description": "How to handle conversion errors. 'coerce' makes invalid values null (recommended), 'raise' throws error."
                    }
                },
                "required": ["file_path", "columns", "output_path"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "smart_type_inference",
            "description": "Intelligently infer and fix data types for all columns by analyzing patterns. Goes beyond basic type detection to understand semantic meaning. Use when dataset has widespread type issues.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {
                        "type": "string",
                        "description": "Path to the dataset file"
                    },
                    "output_path": {
                        "type": "string",
                        "description": "Path to save dataset with inferred types"
                    },
                    "aggressive": {
                        "type": "boolean",
                        "description": "If true, attempts aggressive conversion on ambiguous columns. Recommended for messy datasets."
                    }
                },
                "required": ["file_path", "output_path"]
            }
        }
    },
    
    # Feature Engineering Tools (2)
    {
        "type": "function",
        "function": {
            "name": "create_time_features",
            "description": "Extract comprehensive time-based features from datetime columns including year, month, day, day_of_week, quarter, is_weekend, and cyclical encodings (sin/cos for month and hour).",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {
                        "type": "string",
                        "description": "Path to the dataset file"
                    },
                    "date_col": {
                        "type": "string",
                        "description": "Name of the datetime column to extract features from"
                    },
                    "output_path": {
                        "type": "string",
                        "description": "Path to save dataset with new features"
                    }
                },
                "required": ["file_path", "date_col", "output_path"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "encode_categorical",
            "description": "Encode categorical variables using one-hot encoding, target encoding, or frequency encoding. Handles high-cardinality columns intelligently. Use method='auto' to automatically choose the best encoding.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {
                        "type": "string",
                        "description": "Path to the dataset file"
                    },
                    "method": {
                        "type": "string",
                        "enum": ["one_hot", "target", "frequency", "auto"],
                        "description": "Encoding method to use. 'auto' automatically selects the best method."
                    },
                    "columns": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "List of categorical columns to encode. Use ['all'] to encode all categorical columns. If not specified, defaults to all categorical columns."
                    },
                    "target_col": {
                        "type": "string",
                        "description": "Required for target encoding: name of the target column"
                    },
                    "output_path": {
                        "type": "string",
                        "description": "Path to save dataset with encoded features"
                    }
                },
                "required": ["file_path", "output_path"]
            }
        }
    },
    
    # Model Training Tools (2)
    {
        "type": "function",
        "function": {
            "name": "train_baseline_models",
            "description": "Train multiple baseline models (Logistic Regression, Random Forest, XGBoost) and compare their performance. Automatically detects task type (classification/regression) and returns the best model with metrics.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {
                        "type": "string",
                        "description": "Path to the prepared dataset file"
                    },
                    "target_col": {
                        "type": "string",
                        "description": "Name of the target column to predict"
                    },
                    "task_type": {
                        "type": "string",
                        "enum": ["classification", "regression", "auto"],
                        "description": "Type of ML task. Use 'auto' to detect automatically."
                    },
                    "test_size": {
                        "type": "number",
                        "description": "Proportion of data to use for testing (default: 0.2)"
                    },
                    "random_state": {
                        "type": "integer",
                        "description": "Random seed for reproducibility (default: 42)"
                    }
                },
                "required": ["file_path", "target_col"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "generate_model_report",
            "description": "Generate comprehensive model evaluation report including metrics, confusion matrix (for classification), feature importance, and SHAP values for top features. Saves report as JSON.",
            "parameters": {
                "type": "object",
                "properties": {
                    "model_path": {
                        "type": "string",
                        "description": "Path to saved model file (.pkl or .joblib)"
                    },
                    "test_data_path": {
                        "type": "string",
                        "description": "Path to test dataset file"
                    },
                    "target_col": {
                        "type": "string",
                        "description": "Name of the target column"
                    },
                    "output_path": {
                        "type": "string",
                        "description": "Path to save the report JSON file"
                    }
                },
                "required": ["model_path", "test_data_path", "target_col", "output_path"]
            }
        }
    },
    
    # New Data Wrangling Tools (4)
    {
        "type": "function",
        "function": {
            "name": "get_smart_summary",
            "description": "Generate an LLM-friendly smart summary of a dataset with per-column missing value percentages (sorted by severity), unique value counts, sample data, and numeric statistics. Much more detailed than profile_dataset for decision-making.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {
                        "type": "string",
                        "description": "Path to the CSV or Parquet file to summarize"
                    },
                    "n_samples": {
                        "type": "integer",
                        "description": "Number of sample rows to include in the summary (default: 5)"
                    }
                },
                "required": ["file_path"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "merge_datasets",
            "description": "Merge two datasets using SQL-like join operations (inner, left, right, outer, cross). Supports joining on single or multiple columns with same or different names. Automatically handles duplicate columns with suffixes.",
            "parameters": {
                "type": "object",
                "properties": {
                    "left_path": {
                        "type": "string",
                        "description": "Path to the left (first) dataset file"
                    },
                    "right_path": {
                        "type": "string",
                        "description": "Path to the right (second) dataset file"
                    },
                    "output_path": {
                        "type": "string",
                        "description": "Path to save the merged dataset"
                    },
                    "how": {
                        "type": "string",
                        "enum": ["inner", "left", "right", "outer", "cross"],
                        "description": "Join type: 'inner' (only matching rows), 'left' (all left + matching right), 'right' (all right + matching left), 'outer' (all rows from both), 'cross' (cartesian product)"
                    },
                    "on": {
                        "type": ["string", "array"],
                        "items": {"type": "string"},
                        "description": "Column name(s) to join on (must exist in both datasets). Can be a single column name or list of columns. Use this when join columns have the same name in both datasets."
                    },
                    "left_on": {
                        "type": ["string", "array"],
                        "items": {"type": "string"},
                        "description": "Column name(s) in left dataset to join on. Use with right_on when join columns have different names."
                    },
                    "right_on": {
                        "type": ["string", "array"],
                        "items": {"type": "string"},
                        "description": "Column name(s) in right dataset to join on. Use with left_on when join columns have different names."
                    }
                },
                "required": ["left_path", "right_path", "output_path"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "concat_datasets",
            "description": "Concatenate multiple datasets either vertically (stacking rows, useful for monthly data) or horizontally (adding columns side-by-side). Validates schema compatibility for vertical concat.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_paths": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "List of paths to dataset files to concatenate (minimum 2 files)"
                    },
                    "output_path": {
                        "type": "string",
                        "description": "Path to save the concatenated dataset"
                    },
                    "axis": {
                        "type": "string",
                        "enum": ["vertical", "horizontal"],
                        "description": "'vertical' to stack rows (union, for monthly data), 'horizontal' to add columns side-by-side (default: 'vertical')"
                    }
                },
                "required": ["file_paths", "output_path"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "reshape_dataset",
            "description": "Transform dataset structure using pivot (long→wide format), melt (wide→long format), or transpose (swap rows and columns) operations.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {
                        "type": "string",
                        "description": "Path to the dataset file to reshape"
                    },
                    "output_path": {
                        "type": "string",
                        "description": "Path to save the reshaped dataset"
                    },
                    "operation": {
                        "type": "string",
                        "enum": ["pivot", "melt", "transpose"],
                        "description": "Reshape operation: 'pivot' (long→wide, requires index/columns/values), 'melt' (wide→long, requires id_vars/value_vars), 'transpose' (swap rows/columns)"
                    },
                    "index": {
                        "type": "string",
                        "description": "Column to use as row index (for pivot operation)"
                    },
                    "columns": {
                        "type": "string",
                        "description": "Column whose values become new column names (for pivot operation)"
                    },
                    "values": {
                        "type": "string",
                        "description": "Column whose values populate the pivoted table (for pivot operation)"
                    },
                    "id_vars": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "Columns to keep as identifiers (for melt operation)"
                    },
                    "value_vars": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "Columns to unpivot (for melt operation). If not specified, uses all columns except id_vars."
                    }
                },
                "required": ["file_path", "output_path", "operation"]
            }
        }
    },
    
    # ============================================
    # ADVANCED ANALYSIS TOOLS (5)
    # ============================================
    {
        "type": "function",
        "function": {
            "name": "perform_eda_analysis",
            "description": "Comprehensive Exploratory Data Analysis with visualizations, distribution analysis, and automated insights. Generates HTML report with plots.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {"type": "string", "description": "Path to dataset"},
                    "target_col": {"type": "string", "description": "Optional target column for supervised analysis"},
                    "output_dir": {"type": "string", "description": "Directory to save EDA report and plots"}
                },
                "required": ["file_path", "output_dir"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "detect_model_issues",
            "description": "Detect overfitting, underfitting, class imbalance, and other model performance issues. Provides diagnostic recommendations.",
            "parameters": {
                "type": "object",
                "properties": {
                    "model_path": {"type": "string", "description": "Path to trained model"},
                    "train_data_path": {"type": "string", "description": "Path to training data"},
                    "test_data_path": {"type": "string", "description": "Path to test data"},
                    "target_col": {"type": "string", "description": "Target column name"}
                },
                "required": ["model_path", "train_data_path", "test_data_path", "target_col"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "detect_anomalies",
            "description": "Detect anomalies using Isolation Forest, LOF, or statistical methods. Returns anomaly scores and flags.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {"type": "string", "description": "Path to dataset"},
                    "method": {"type": "string", "enum": ["isolation_forest", "lof", "statistical"], "description": "Anomaly detection method"},
                    "contamination": {"type": "number", "description": "Expected proportion of anomalies (default: 0.1)"},
                    "output_path": {"type": "string", "description": "Path to save dataset with anomaly scores"}
                },
                "required": ["file_path", "method", "output_path"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "detect_and_handle_multicollinearity",
            "description": "Detect and handle multicollinearity using VIF (Variance Inflation Factor). Removes highly correlated features automatically.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {"type": "string", "description": "Path to dataset"},
                    "threshold": {"type": "number", "description": "VIF threshold (default: 10)"},
                    "method": {"type": "string", "enum": ["drop", "combine"], "description": "How to handle correlated features"},
                    "output_path": {"type": "string", "description": "Path to save cleaned dataset"}
                },
                "required": ["file_path", "output_path"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "perform_statistical_tests",
            "description": "Perform statistical hypothesis tests (t-test, chi-square, ANOVA) to analyze relationships between features and target.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {"type": "string", "description": "Path to dataset"},
                    "target_col": {"type": "string", "description": "Target column name"},
                    "test_type": {"type": "string", "enum": ["auto", "ttest", "chi2", "anova"], "description": "Type of statistical test"}
                },
                "required": ["file_path", "target_col"]
            }
        }
    },
    
    # ============================================
    # ADVANCED FEATURE ENGINEERING (4)
    # ============================================
    {
        "type": "function",
        "function": {
            "name": "create_interaction_features",
            "description": "Create polynomial, PCA, or cross-product interaction features to capture non-linear relationships.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {"type": "string", "description": "Path to dataset"},
                    "method": {"type": "string", "enum": ["polynomial", "pca", "cross"], "description": "Interaction method"},
                    "degree": {"type": "integer", "description": "Polynomial degree (default: 2)"},
                    "max_features": {"type": "integer", "description": "Maximum new features to create (default: 50)"},
                    "output_path": {"type": "string", "description": "Path to save enhanced dataset"}
                },
                "required": ["file_path", "method", "output_path"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "create_aggregation_features",
            "description": "Create aggregation features (mean, sum, count, etc.) grouped by categorical columns. Useful for customer/transaction data.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {"type": "string", "description": "Path to dataset"},
                    "group_cols": {"type": "array", "items": {"type": "string"}, "description": "Columns to group by"},
                    "agg_cols": {"type": "array", "items": {"type": "string"}, "description": "Columns to aggregate"},
                    "agg_functions": {"type": "array", "items": {"type": "string"}, "description": "Aggregation functions (mean, sum, count, etc.)"},
                    "output_path": {"type": "string", "description": "Path to save dataset with aggregations"}
                },
                "required": ["file_path", "group_cols", "agg_cols", "output_path"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "engineer_text_features",
            "description": "Extract features from text columns: TF-IDF, word counts, sentiment, readability scores, and embeddings.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {"type": "string", "description": "Path to dataset"},
                    "text_col": {"type": "string", "description": "Text column name"},
                    "methods": {"type": "array", "items": {"type": "string"}, "description": "Feature extraction methods (tfidf, count, sentiment, readability)"},
                    "max_features": {"type": "integer", "description": "Max TF-IDF features (default: 100)"},
                    "output_path": {"type": "string", "description": "Path to save dataset with text features"}
                },
                "required": ["file_path", "text_col", "methods", "output_path"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "auto_feature_engineering",
            "description": "Use LLM (Gemini/Groq) to automatically generate creative feature engineering ideas and implement them. Works without API key if environment variables are set.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {"type": "string", "description": "Path to dataset"},
                    "target_col": {"type": "string", "description": "Target column name"},
                    "groq_api_key": {"type": "string", "description": "Groq API key (optional - uses environment variable if not provided)"},
                    "max_suggestions": {"type": "integer", "description": "Maximum feature suggestions to generate (default: 10)"},
                    "implement_top_k": {"type": "integer", "description": "Number of top suggestions to implement (default: 5)"},
                    "output_path": {"type": "string", "description": "Path to save dataset with new features"}
                },
                "required": ["file_path", "target_col", "output_path"]
            }
        }
    },
    
    # ============================================
    # ADVANCED PREPROCESSING (3)
    # ============================================
    # Tool schema: resample a dataset to correct class imbalance.
    # The description is what the model reads when choosing a tool and a
    # "method" value, so it must advertise exactly the options the enum offers.
    {
        "type": "function",
        "function": {
            "name": "handle_imbalanced_data",
            "description": "Handle class imbalance using SMOTE, ADASYN, random oversampling, or random undersampling. Critical for classification tasks.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {"type": "string", "description": "Path to dataset"},
                    "target_col": {"type": "string", "description": "Target column name"},
                    "method": {"type": "string", "enum": ["smote", "adasyn", "random_oversample", "random_undersample"], "description": "Balancing method"},
                    # Optional: not listed in "required" below.
                    "sampling_strategy": {"type": "string", "description": "Sampling ratio (auto, minority, majority)"},
                    "output_path": {"type": "string", "description": "Path to save balanced dataset"}
                },
                "required": ["file_path", "target_col", "method", "output_path"]
            }
        }
    },
    # Tool schema: scale numeric features with one of three scalers.
    # Required arguments: file_path, method, output_path.
    {
        "type": "function",
        "function": {
            "name": "perform_feature_scaling",
            "description": "Scale features using StandardScaler, MinMaxScaler, or RobustScaler. Essential for distance-based algorithms.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {"type": "string", "description": "Path to dataset"},
                    "method": {"type": "string", "enum": ["standard", "minmax", "robust"], "description": "Scaling method"},
                    # Tool arguments arrive as JSON, which has no `None`; tell the
                    # model to omit the argument instead of passing a Python literal.
                    # Assumes the implementation's default for `columns` is None —
                    # TODO confirm against the tool function.
                    "columns": {"type": "array", "items": {"type": "string"}, "description": "Columns to scale (omit to scale all numeric columns)"},
                    "output_path": {"type": "string", "description": "Path to save scaled dataset"}
                },
                "required": ["file_path", "method", "output_path"]
            }
        }
    },
    # Tool schema: train/test splitting with three strategies.
    # Only file_path and method are required by the schema.
    # NOTE(review): the descriptions tie target_col to stratification, time_col
    # to the time-based split and group_col to the group-based split, but none
    # of them is in "required" — presumably the implementation validates the
    # column matching the chosen method; confirm.
    # NOTE(review): no output path/directory parameter is declared here, so
    # where the resulting splits are written cannot be told from this schema.
    {
        "type": "function",
        "function": {
            "name": "split_data_strategically",
            "description": "Split data with stratification, time-based splitting, or group-based splitting for better validation.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {"type": "string", "description": "Path to dataset"},
                    "target_col": {"type": "string", "description": "Target column for stratification"},
                    "method": {"type": "string", "enum": ["stratified", "time_based", "group_based"], "description": "Split method"},
                    "test_size": {"type": "number", "description": "Test set proportion (default: 0.2)"},
                    "time_col": {"type": "string", "description": "Time column for time-based split"},
                    "group_col": {"type": "string", "description": "Group column for group-based split"}
                },
                "required": ["file_path", "method"]
            }
        }
    },
    
    # ============================================
    # ADVANCED TRAINING (3)
    # ============================================
    {
        "type": "function",
        "function": {
            "name": "hyperparameter_tuning",
            "description": "Optimize model hyperparameters using Optuna (Bayesian optimization). Finds best model configuration automatically.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {"type": "string", "description": "Path to prepared dataset"},
                    "target_col": {"type": "string", "description": "Target column name"},
                    "model_type": {"type": "string", "enum": ["random_forest", "xgboost", "lightgbm"], "description": "Model to tune"},
                    "n_trials": {"type": "integer", "description": "Number of tuning trials (default: 100)"},
                    "task_type": {"type": "string", "enum": ["classification", "regression", "auto"], "description": "ML task type"},
                    "output_path": {"type": "string", "description": "Path to save tuned model"}
                },
                "required": ["file_path", "target_col", "model_type", "output_path"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "train_ensemble_models",
            "description": "Train ensemble models using stacking, voting, or blending. Combines multiple models for better performance.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {"type": "string", "description": "Path to prepared dataset"},
                    "target_col": {"type": "string", "description": "Target column name"},
                    "ensemble_method": {"type": "string", "enum": ["stacking", "voting", "blending"], "description": "Ensemble technique"},
                    "base_models": {"type": "array", "items": {"type": "string"}, "description": "Base model types to ensemble"},
                    "task_type": {"type": "string", "enum": ["classification", "regression", "auto"], "description": "ML task type"},
                    "output_path": {"type": "string", "description": "Path to save ensemble model"}
                },
                "required": ["file_path", "target_col", "ensemble_method", "output_path"]
            }
        }
    },
    # Tool schema: k-fold cross-validation of a single model type.
    # Required arguments: file_path, target_col, model_type.
    # NOTE(review): model_type lists its accepted values only in the description
    # text and has no "enum", unlike model_type in hyperparameter_tuning — the
    # schema therefore does not stop the model from sending other strings.
    # Confirm the full set of supported models before adding an enum.
    {
        "type": "function",
        "function": {
            "name": "perform_cross_validation",
            "description": "Perform k-fold cross-validation to get robust model performance estimates. Returns mean and std of metrics.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {"type": "string", "description": "Path to dataset"},
                    "target_col": {"type": "string", "description": "Target column name"},
                    "model_type": {"type": "string", "description": "Model type (random_forest, xgboost, logistic, ridge)"},
                    "n_splits": {"type": "integer", "description": "Number of CV folds/splits (default: 5)"},
                    "task_type": {"type": "string", "enum": ["classification", "regression", "auto"], "description": "ML task type"},
                    "cv_strategy": {"type": "string", "enum": ["kfold", "stratified", "timeseries"], "description": "Cross-validation strategy (default: kfold)"},
                    "save_oof": {"type": "boolean", "description": "Whether to save out-of-fold predictions (default: false)"}
                },
                "required": ["file_path", "target_col", "model_type"]
            }
        }
    },
    
    # ============================================
    # BUSINESS INTELLIGENCE (4)
    # ============================================
    {
        "type": "function",
        "function": {
            "name": "perform_cohort_analysis",
            "description": "Analyze user cohorts over time (retention, revenue, engagement). Essential for SaaS and e-commerce businesses.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {"type": "string", "description": "Path to transaction/event data"},
                    "user_col": {"type": "string", "description": "User ID column"},
                    "date_col": {"type": "string", "description": "Date/timestamp column"},
                    "metric_col": {"type": "string", "description": "Metric to analyze (revenue, events, etc.)"},
                    "cohort_period": {"type": "string", "enum": ["daily", "weekly", "monthly"], "description": "Cohort grouping period"},
                    "output_path": {"type": "string", "description": "Path to save cohort analysis results"}
                },
                "required": ["file_path", "user_col", "date_col", "metric_col", "output_path"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "perform_rfm_analysis",
            "description": "RFM (Recency, Frequency, Monetary) analysis for customer segmentation. Identifies best/worst customers.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {"type": "string", "description": "Path to transaction data"},
                    "customer_col": {"type": "string", "description": "Customer ID column"},
                    "date_col": {"type": "string", "description": "Transaction date column"},
                    "amount_col": {"type": "string", "description": "Transaction amount column"},
                    "output_path": {"type": "string", "description": "Path to save RFM segments"}
                },
                "required": ["file_path", "customer_col", "date_col", "amount_col", "output_path"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "detect_causal_relationships",
            "description": "Detect potential causal relationships between features using Granger causality and correlation analysis.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {"type": "string", "description": "Path to dataset"},
                    "target_col": {"type": "string", "description": "Target/effect column"},
                    "feature_cols": {"type": "array", "items": {"type": "string"}, "description": "Potential cause columns"},
                    "method": {"type": "string", "enum": ["granger", "correlation"], "description": "Causality detection method"}
                },
                "required": ["file_path", "target_col"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "generate_business_insights",
            "description": "Generate automated business insights using descriptive statistics, trends, and anomaly detection. Creates executive summary.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {"type": "string", "description": "Path to business data"},
                    "metric_cols": {"type": "array", "items": {"type": "string"}, "description": "Key business metrics to analyze"},
                    "date_col": {"type": "string", "description": "Date column for trend analysis"},
                    "output_path": {"type": "string", "description": "Path to save insights report"}
                },
                "required": ["file_path", "metric_cols", "output_path"]
            }
        }
    },
    
    # ============================================
    # COMPUTER VISION (3)
    # ============================================
    # Tool schema: turn a directory of images into feature vectors.
    # All three arguments are required. The description names every backbone
    # that the "model" enum offers so the advertised and accepted options agree.
    {
        "type": "function",
        "function": {
            "name": "extract_image_features",
            "description": "Extract features from images using pre-trained CNNs (ResNet, VGG, MobileNet). Converts images to feature vectors for ML.",
            "parameters": {
                "type": "object",
                "properties": {
                    "image_dir": {"type": "string", "description": "Directory containing images"},
                    "model": {"type": "string", "enum": ["resnet", "vgg", "mobilenet"], "description": "Pre-trained model to use"},
                    "output_path": {"type": "string", "description": "Path to save feature vectors CSV"}
                },
                "required": ["image_dir", "model", "output_path"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "perform_image_clustering",
            "description": "Cluster images based on visual similarity using K-means or DBSCAN on extracted features.",
            "parameters": {
                "type": "object",
                "properties": {
                    "image_dir": {"type": "string", "description": "Directory containing images"},
                    "n_clusters": {"type": "integer", "description": "Number of clusters (default: auto-detect)"},
                    "method": {"type": "string", "enum": ["kmeans", "dbscan"], "description": "Clustering method"},
                    "output_path": {"type": "string", "description": "Path to save clustering results"}
                },
                "required": ["image_dir", "method", "output_path"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "analyze_tabular_image_hybrid",
            "description": "Combine tabular data with image features for hybrid ML models. Useful for e-commerce/medical data.",
            "parameters": {
                "type": "object",
                "properties": {
                    "tabular_path": {"type": "string", "description": "Path to tabular data CSV"},
                    "image_dir": {"type": "string", "description": "Directory with images"},
                    "image_id_col": {"type": "string", "description": "Column linking tabular data to images"},
                    "output_path": {"type": "string", "description": "Path to save combined features"}
                },
                "required": ["tabular_path", "image_dir", "image_id_col", "output_path"]
            }
        }
    },
    
    # ============================================
    # NLP/TEXT ANALYTICS (4)
    # ============================================
    {
        "type": "function",
        "function": {
            "name": "perform_topic_modeling",
            "description": "Discover topics in text documents using LDA or NMF. Extract themes from customer reviews, articles, etc.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {"type": "string", "description": "Path to dataset with text"},
                    "text_col": {"type": "string", "description": "Text column name"},
                    "n_topics": {"type": "integer", "description": "Number of topics to extract (default: 5)"},
                    "method": {"type": "string", "enum": ["lda", "nmf"], "description": "Topic modeling method"},
                    "output_path": {"type": "string", "description": "Path to save topics and document-topic matrix"}
                },
                "required": ["file_path", "text_col", "method", "output_path"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "perform_named_entity_recognition",
            "description": "Extract named entities (person, organization, location) from text using NER models.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {"type": "string", "description": "Path to dataset with text"},
                    "text_col": {"type": "string", "description": "Text column name"},
                    "output_path": {"type": "string", "description": "Path to save dataset with extracted entities"}
                },
                "required": ["file_path", "text_col", "output_path"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "analyze_sentiment_advanced",
            "description": "Perform advanced sentiment analysis with aspect-based sentiment (what features customers like/dislike).",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {"type": "string", "description": "Path to dataset with text"},
                    "text_col": {"type": "string", "description": "Text column name"},
                    "aspects": {"type": "array", "items": {"type": "string"}, "description": "Aspects to analyze sentiment for (e.g., 'price', 'quality')"},
                    "output_path": {"type": "string", "description": "Path to save sentiment scores"}
                },
                "required": ["file_path", "text_col", "output_path"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "perform_text_similarity",
            "description": "Calculate text similarity using cosine similarity, Jaccard, or semantic embeddings. Find duplicate/similar documents.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {"type": "string", "description": "Path to dataset with text"},
                    "text_col": {"type": "string", "description": "Text column name"},
                    "method": {"type": "string", "enum": ["cosine", "jaccard", "semantic"], "description": "Similarity method"},
                    "threshold": {"type": "number", "description": "Similarity threshold (0-1)"},
                    "output_path": {"type": "string", "description": "Path to save similarity matrix"}
                },
                "required": ["file_path", "text_col", "method", "output_path"]
            }
        }
    },
    
    # ============================================
    # PRODUCTION/MLOPS (5)
    # ============================================
    {
        "type": "function",
        "function": {
            "name": "monitor_model_drift",
            "description": "Detect data drift and concept drift in production models. Compare training vs production data distributions.",
            "parameters": {
                "type": "object",
                "properties": {
                    "train_data_path": {"type": "string", "description": "Path to original training data"},
                    "production_data_path": {"type": "string", "description": "Path to recent production data"},
                    "features": {"type": "array", "items": {"type": "string"}, "description": "Features to monitor for drift"},
                    "output_path": {"type": "string", "description": "Path to save drift report"}
                },
                "required": ["train_data_path", "production_data_path", "output_path"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "explain_predictions",
            "description": "Explain model predictions using SHAP or LIME. Generate feature importance explanations for individual predictions.",
            "parameters": {
                "type": "object",
                "properties": {
                    "model_path": {"type": "string", "description": "Path to trained model"},
                    "data_path": {"type": "string", "description": "Path to data to explain"},
                    "method": {"type": "string", "enum": ["shap", "lime"], "description": "Explanation method"},
                    "n_samples": {"type": "integer", "description": "Number of samples to explain (default: 10)"},
                    "output_path": {"type": "string", "description": "Path to save explanations"}
                },
                "required": ["model_path", "data_path", "method", "output_path"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "generate_model_card",
            "description": "Generate model card documentation with model details, performance metrics, bias analysis, and usage guidelines.",
            "parameters": {
                "type": "object",
                "properties": {
                    "model_path": {"type": "string", "description": "Path to trained model"},
                    "train_data_path": {"type": "string", "description": "Path to training data"},
                    "test_data_path": {"type": "string", "description": "Path to test data"},
                    "target_col": {"type": "string", "description": "Target column name"},
                    "output_path": {"type": "string", "description": "Path to save model card JSON"}
                },
                "required": ["model_path", "train_data_path", "test_data_path", "target_col", "output_path"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "perform_ab_test_analysis",
            "description": "Analyze A/B test results with statistical significance testing. Determine if variant B is better than control A.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {"type": "string", "description": "Path to A/B test data"},
                    "variant_col": {"type": "string", "description": "Column indicating variant (A/B)"},
                    "metric_col": {"type": "string", "description": "Success metric column"},
                    "confidence_level": {"type": "number", "description": "Confidence level for significance (default: 0.95)"}
                },
                "required": ["file_path", "variant_col", "metric_col"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "detect_feature_leakage",
            "description": "Detect potential feature leakage by analyzing feature importance and temporal relationships. Prevents data leakage bugs.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {"type": "string", "description": "Path to dataset"},
                    "target_col": {"type": "string", "description": "Target column name"},
                    "date_col": {"type": "string", "description": "Optional date column for temporal analysis"}
                },
                "required": ["file_path", "target_col"]
            }
        }
    },
    
    # ============================================
    # TIME SERIES (3)
    # ============================================
    {
        "type": "function",
        "function": {
            "name": "forecast_time_series",
            "description": "Forecast future values using ARIMA, Prophet, or LSTM models. Handles seasonal and trend components.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {"type": "string", "description": "Path to time series data"},
                    "date_col": {"type": "string", "description": "Date/timestamp column"},
                    "value_col": {"type": "string", "description": "Value column to forecast"},
                    "forecast_periods": {"type": "integer", "description": "Number of periods to forecast"},
                    "method": {"type": "string", "enum": ["arima", "prophet", "lstm"], "description": "Forecasting method"},
                    "output_path": {"type": "string", "description": "Path to save forecast results"}
                },
                "required": ["file_path", "date_col", "value_col", "forecast_periods", "method", "output_path"]
            }
        }
    },
    # Tool schema: seasonality/trend detection for a time series.
    # Required arguments: file_path, date_col, value_col; period is optional.
    # NOTE(review): this registry also defines detect_trends_and_seasonality,
    # whose description covers the same task but whose parameters are named
    # time_col / seasonal_period instead of date_col / period. Two near-identical
    # tools with different argument names make it easy for the model to mix
    # them up — confirm whether both are needed.
    {
        "type": "function",
        "function": {
            "name": "detect_seasonality_trends",
            "description": "Detect seasonality patterns and trends in time series data using STL decomposition and statistical tests.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {"type": "string", "description": "Path to time series data"},
                    "date_col": {"type": "string", "description": "Date/timestamp column"},
                    "value_col": {"type": "string", "description": "Value column to analyze"},
                    "period": {"type": "integer", "description": "Expected seasonal period (e.g., 12 for monthly)"}
                },
                "required": ["file_path", "date_col", "value_col"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "create_time_series_features",
            "description": "Create comprehensive time series features: lags, rolling stats, exponential moving averages, and Fourier features.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {"type": "string", "description": "Path to time series data"},
                    "date_col": {"type": "string", "description": "Date/timestamp column"},
                    "value_col": {"type": "string", "description": "Value column"},
                    "lags": {"type": "array", "items": {"type": "integer"}, "description": "Lag periods to create (e.g., [1, 7, 30])"},
                    "windows": {"type": "array", "items": {"type": "integer"}, "description": "Rolling window sizes (e.g., [7, 30])"},
                    "output_path": {"type": "string", "description": "Path to save dataset with time series features"}
                },
                "required": ["file_path", "date_col", "value_col", "output_path"]
            }
        }
    },
    
    # ============================================
    # ADVANCED INSIGHTS TOOLS (6) - NEW
    # ============================================
    
    # Tool schema: root cause analysis for a change in one metric column.
    # Required arguments: file_path, target_col; time_col and threshold_drop
    # are optional.
    # NOTE(review): threshold_drop is described as a "Percentage" but its stated
    # default is 0.15 — presumably a fraction (i.e. 15%) rather than 0.15%.
    # Confirm against the implementation and reword the description so the
    # model does not send 15 when 0.15 is meant.
    {
        "type": "function",
        "function": {
            "name": "analyze_root_cause",
            "description": "Perform root cause analysis to identify why a metric dropped or changed. Analyzes correlations, temporal patterns, and identifies top influencing factors.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {"type": "string", "description": "Path to dataset"},
                    "target_col": {"type": "string", "description": "Column to analyze (e.g., 'sales')"},
                    "time_col": {"type": "string", "description": "Optional time column for trend analysis"},
                    "threshold_drop": {"type": "number", "description": "Percentage drop to flag as significant (default 0.15)"}
                },
                "required": ["file_path", "target_col"]
            }
        }
    },
    # Tool schema: trend and seasonal-pattern detection for a time series.
    # Required arguments: file_path, value_col, time_col.
    # NOTE(review): this registry also defines detect_seasonality_trends, which
    # covers the same task with parameters named date_col / period — confirm
    # whether both tools are needed.
    {
        "type": "function",
        "function": {
            "name": "detect_trends_and_seasonality",
            "description": "Detect trends and seasonal patterns in time series data using statistical methods and autocorrelation.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {"type": "string", "description": "Path to dataset"},
                    "value_col": {"type": "string", "description": "Column with values to analyze"},
                    "time_col": {"type": "string", "description": "Column with timestamps"},
                    # Tool arguments arrive as JSON, which has no `None`; tell the
                    # model to omit the argument to get auto-detection.
                    # Assumes the implementation's default is None — TODO confirm.
                    "seasonal_period": {"type": "integer", "description": "Expected seasonal period (omit to auto-detect)"}
                },
                "required": ["file_path", "value_col", "time_col"]
            }
        }
    },
    # Tool schema: anomaly detection with confidence scores.
    # Only file_path is required; columns, contamination and method are optional.
    {
        "type": "function",
        "function": {
            "name": "detect_anomalies_advanced",
            "description": "Detect anomalies with confidence scores using Isolation Forest or statistical methods.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {"type": "string", "description": "Path to dataset"},
                    # Tool arguments arrive as JSON, which has no `None`; tell the
                    # model to omit the argument instead.
                    # Assumes the implementation's default for `columns` is None —
                    # TODO confirm against the tool function.
                    "columns": {"type": "array", "items": {"type": "string"}, "description": "Columns to analyze (omit to use all numeric columns)"},
                    "contamination": {"type": "number", "description": "Expected proportion of outliers (default 0.1)"},
                    "method": {"type": "string", "enum": ["isolation_forest", "statistical"], "description": "Detection method"}
                },
                "required": ["file_path"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "perform_hypothesis_testing",
            "description": "Perform statistical hypothesis testing (t-test, ANOVA, chi-square) to compare groups.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {"type": "string", "description": "Path to dataset"},
                    "group_col": {"type": "string", "description": "Column defining groups"},
                    "value_col": {"type": "string", "description": "Column with values to compare"},
                    "test_type": {"type": "string", "enum": ["t-test", "anova", "chi-square", "auto"], "description": "Test type (auto-detected if 'auto')"}
                },
                "required": ["file_path", "group_col", "value_col"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "analyze_distribution",
            "description": "Analyze distribution of a column including normality tests, skewness, and kurtosis.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {"type": "string", "description": "Path to dataset"},
                    "column": {"type": "string", "description": "Column to analyze"},
                    "tests": {"type": "array", "items": {"type": "string"}, "description": "Tests to perform (normality, skewness)"}
                },
                "required": ["file_path", "column"]
            }
        }
    },
    # Tool schema: K-means segmentation with per-segment profiling.
    # Only file_path is required; n_segments and features are optional.
    {
        "type": "function",
        "function": {
            "name": "perform_segment_analysis",
            "description": "Perform cluster-based customer/data segmentation using K-means and profile each segment.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {"type": "string", "description": "Path to dataset"},
                    "n_segments": {"type": "integer", "description": "Number of segments to create (default 5)"},
                    # Tool arguments arrive as JSON, which has no `None`; tell the
                    # model to omit the argument instead.
                    # Assumes the implementation's default for `features` is None —
                    # TODO confirm against the tool function.
                    "features": {"type": "array", "items": {"type": "string"}, "description": "Features for clustering (omit to use all numeric columns)"}
                },
                "required": ["file_path"]
            }
        }
    },
    
    # ============================================
    # AUTOMATED PIPELINE TOOLS (2) - NEW
    # ============================================
    
    {
        "type": "function",
        "function": {
            "name": "auto_ml_pipeline",
            "description": "Fully automated ML pipeline: auto-detect types, clean missing values, handle outliers, encode categorical, engineer features, and select best features. Zero configuration required!",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {"type": "string", "description": "Path to input dataset"},
                    "target_col": {"type": "string", "description": "Target column name"},
                    "task_type": {"type": "string", "enum": ["classification", "regression", "auto"], "description": "Task type (auto-detected if 'auto')"},
                    "output_path": {"type": "string", "description": "Where to save processed data"},
                    "feature_engineering_level": {"type": "string", "enum": ["basic", "intermediate", "advanced"], "description": "Feature engineering depth"}
                },
                "required": ["file_path", "target_col"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "auto_feature_selection",
            "description": "Automatically select the best features for modeling using mutual information or F-statistics.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {"type": "string", "description": "Path to dataset"},
                    "target_col": {"type": "string", "description": "Target column"},
                    "task_type": {"type": "string", "enum": ["classification", "regression", "auto"], "description": "Task type"},
                    "max_features": {"type": "integer", "description": "Maximum features to keep (default 50)"},
                    "method": {"type": "string", "enum": ["mutual_info", "f_test", "auto"], "description": "Selection method"},
                    "output_path": {"type": "string", "description": "Where to save selected features"}
                },
                "required": ["file_path", "target_col"]
            }
        }
    },
    
    # ============================================
    # VISUALIZATION TOOLS (3) - NEW
    # ============================================
    
    {
        "type": "function",
        "function": {
            "name": "generate_all_plots",
            "description": "Generate ALL plots for a dataset automatically: data quality, EDA, distributions, and correlations. Creates interactive HTML plots.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {"type": "string", "description": "Path to dataset"},
                    "target_col": {"type": "string", "description": "Optional target column"},
                    "output_dir": {"type": "string", "description": "Directory to save plots (default ./outputs/plots)"}
                },
                "required": ["file_path"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "generate_data_quality_plots",
            "description": "Generate data quality visualizations: missing values, data types, and outlier detection plots.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {"type": "string", "description": "Path to dataset"},
                    "output_dir": {"type": "string", "description": "Directory to save plots"}
                },
                "required": ["file_path", "output_dir"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "generate_eda_plots",
            "description": "Generate exploratory data analysis plots: correlation heatmap, feature relationships, and pairplots.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {"type": "string", "description": "Path to dataset"},
                    "target_col": {"type": "string", "description": "Optional target column"},
                    "output_dir": {"type": "string", "description": "Directory to save plots"}
                },
                "required": ["file_path", "output_dir"]
            }
        }
    },
    
    # ============================================
    # INTERACTIVE PLOTLY VISUALIZATIONS (6)
    # ============================================
    {
        "type": "function",
        "function": {
            "name": "generate_interactive_scatter",
            "description": "Create interactive scatter plot with zoom, pan, and hover capabilities. Great for exploring relationships between variables.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {"type": "string", "description": "Path to dataset"},
                    "x_col": {"type": "string", "description": "Column for X-axis"},
                    "y_col": {"type": "string", "description": "Column for Y-axis"},
                    "color_col": {"type": "string", "description": "Optional column for color coding points"},
                    "size_col": {"type": "string", "description": "Optional column for bubble size"},
                    "output_path": {"type": "string", "description": "Path to save HTML file (default: ./outputs/plots/interactive/scatter.html)"}
                },
                "required": ["file_path", "x_col", "y_col"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "generate_interactive_histogram",
            "description": "Create interactive histogram with box plot overlay. Users can explore distribution interactively.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {"type": "string", "description": "Path to dataset"},
                    "column": {"type": "string", "description": "Column to plot distribution"},
                    "bins": {"type": "integer", "description": "Number of bins (default: 30)"},
                    "color_col": {"type": "string", "description": "Optional column for grouped histograms"},
                    "output_path": {"type": "string", "description": "Path to save HTML file"}
                },
                "required": ["file_path", "column"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "generate_interactive_correlation_heatmap",
            "description": "Create interactive correlation heatmap with hover values. Better than static matplotlib version.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {"type": "string", "description": "Path to dataset"},
                    "output_path": {"type": "string", "description": "Path to save HTML file"}
                },
                "required": ["file_path"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "generate_interactive_box_plots",
            "description": "Create interactive box plots for outlier detection. Supports grouping by categorical variable.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {"type": "string", "description": "Path to dataset"},
                    "columns": {"type": "array", "items": {"type": "string"}, "description": "Columns to plot (all numeric if not specified)"},
                    "group_by": {"type": "string", "description": "Optional categorical column for grouping"},
                    "output_path": {"type": "string", "description": "Path to save HTML file"}
                },
                "required": ["file_path"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "generate_interactive_time_series",
            "description": "Create interactive time series plot with range slider and zoom. Perfect for temporal data analysis.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {"type": "string", "description": "Path to dataset"},
                    "time_col": {"type": "string", "description": "Column with datetime values"},
                    "value_cols": {"type": "array", "items": {"type": "string"}, "description": "Columns to plot over time"},
                    "output_path": {"type": "string", "description": "Path to save HTML file"}
                },
                "required": ["file_path", "time_col", "value_cols"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "generate_plotly_dashboard",
            "description": "Generate complete interactive dashboard with multiple visualizations: correlation heatmap, box plots, scatter plots, histograms. One-stop visualization solution.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {"type": "string", "description": "Path to dataset"},
                    "target_col": {"type": "string", "description": "Optional target column for supervised analysis"},
                    "output_dir": {"type": "string", "description": "Directory to save all plots (default: ./outputs/plots/interactive)"}
                },
                "required": ["file_path"]
            }
        }
    },
    # EDA Report Generation (1) - NEW PHASE 2
    {
        "type": "function",
        "function": {
            "name": "generate_ydata_profiling_report",
            "description": "Generate comprehensive HTML report using ydata-profiling (formerly pandas-profiling). Provides extensive analysis: overview, variable statistics, interactions, correlations (Pearson, Spearman, Cramér's V), missing values matrix, duplicate analysis, and more. Most detailed and comprehensive profiling tool with automated insights and data quality warnings.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {"type": "string", "description": "Path to the dataset CSV/Parquet file"},
                    "output_path": {"type": "string", "description": "Where to save HTML report (default: ./outputs/reports/ydata_profile.html)"},
                    "minimal": {"type": "boolean", "description": "If true, generates faster minimal report (useful for large datasets, default: false)"},
                    "title": {"type": "string", "description": "Report title (default: 'Data Profiling Report')"}
                },
                "required": ["file_path"]
            }
        }
    },
    # ========================================
    # CODE INTERPRETER - THE GAME CHANGER 🚀
    # ========================================
    {
        "type": "function",
        "function": {
            "name": "execute_python_code",
            "description": "⭐ CRITICAL TOOL - Execute custom Python code for ANY data science task not covered by existing tools. This is what makes you a TRUE AI AGENT, not just a function-calling bot. Use this when user requests: 1) Custom visualizations (specific Plotly plots, interactive dashboards, unique chart types) 2) Domain-specific calculations 3) Custom data transformations 4) Specific export formats 5) Interactive widgets/filters. Code has access to pandas, polars, numpy, matplotlib, seaborn, plotly. ALWAYS save outputs to files and return file paths.",
            "parameters": {
                "type": "object",
                "properties": {
                    "code": {
                        "type": "string",
                        "description": "Python code to execute. Auto-imported: pandas as pd, polars as pl, numpy as np, matplotlib.pyplot as plt, seaborn as sns, plotly.express as px, plotly.graph_objects as go. Code should save outputs to files in working_directory. Example: fig.write_html('./outputs/code/plot.html')"
                    },
                    "working_directory": {
                        "type": "string",
                        "description": "Directory to run code in (default: ./outputs/code). Code can read from ./temp/ and write to this directory."
                    },
                    "timeout": {
                        "type": "integer",
                        "description": "Maximum execution time in seconds (default: 60)"
                    }
                },
                "required": ["code"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "execute_code_from_file",
            "description": "Execute Python code from an existing .py file. Useful when code is too long to pass as string, or when running pre-written scripts. Same capabilities as execute_python_code.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {
                        "type": "string",
                        "description": "Path to .py file to execute"
                    },
                    "working_directory": {
                        "type": "string",
                        "description": "Directory to run code in (default: ./outputs/code)"
                    },
                    "timeout": {
                        "type": "integer",
                        "description": "Maximum execution time in seconds (default: 60)"
                    }
                },
                "required": ["file_path"]
            }
        }
    },
    
    # ============================================
    # CLOUD DATA SOURCES (4) - NEW
    # ============================================
    
    {
        "type": "function",
        "function": {
            "name": "load_bigquery_table",
            "description": "Load data from Google BigQuery table into a Polars DataFrame. Supports sampling via LIMIT and column selection. Returns CSV path for downstream tools. Use profile_bigquery_table first for large tables.",
            "parameters": {
                "type": "object",
                "properties": {
                    "project_id": {
                        "type": "string",
                        "description": "Google Cloud project ID"
                    },
                    "dataset": {
                        "type": "string",
                        "description": "BigQuery dataset name"
                    },
                    "table": {
                        "type": "string",
                        "description": "BigQuery table name"
                    },
                    "limit": {
                        "type": "integer",
                        "description": "Optional row limit for sampling (e.g., 10000 for large tables)"
                    },
                    "columns": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "Optional list of column names to load"
                    },
                    "where_clause": {
                        "type": "string",
                        "description": "Optional SQL WHERE clause for filtering (without WHERE keyword)"
                    }
                },
                "required": ["project_id", "dataset", "table"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "write_bigquery_table",
            "description": "Write predictions or processed data from CSV/Parquet file to BigQuery table. Supports append, overwrite, or fail modes.",
            "parameters": {
                "type": "object",
                "properties": {
                    "file_path": {
                        "type": "string",
                        "description": "Path to CSV or Parquet file to write"
                    },
                    "project_id": {
                        "type": "string",
                        "description": "Google Cloud project ID"
                    },
                    "dataset": {
                        "type": "string",
                        "description": "BigQuery dataset name"
                    },
                    "table": {
                        "type": "string",
                        "description": "BigQuery table name"
                    },
                    "mode": {
                        "type": "string",
                        "enum": ["append", "overwrite", "fail"],
                        "description": "Write mode: append (add rows), overwrite (replace), fail (error if exists)"
                    }
                },
                "required": ["file_path", "project_id", "dataset", "table"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "profile_bigquery_table",
            "description": "Profile a BigQuery table without loading all data. Returns row count, column types, null counts (sampled), table size, and load recommendations. Use this BEFORE load_bigquery_table for large tables.",
            "parameters": {
                "type": "object",
                "properties": {
                    "project_id": {
                        "type": "string",
                        "description": "Google Cloud project ID"
                    },
                    "dataset": {
                        "type": "string",
                        "description": "BigQuery dataset name"
                    },
                    "table": {
                        "type": "string",
                        "description": "BigQuery table name"
                    }
                },
                "required": ["project_id", "dataset", "table"]
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "query_bigquery",
            "description": "Execute custom BigQuery SQL query and return results as DataFrame. Useful for complex aggregations, joins, or transformations before analysis.",
            "parameters": {
                "type": "object",
                "properties": {
                    "project_id": {
                        "type": "string",
                        "description": "Google Cloud project ID"
                    },
                    "query": {
                        "type": "string",
                        "description": "SQL query to execute"
                    },
                    "output_path": {
                        "type": "string",
                        "description": "Optional path to save results (default: auto-generated)"
                    },
                    "limit": {
                        "type": "integer",
                        "description": "Optional row limit to append to query"
                    }
                },
                "required": ["project_id", "query"]
            }
        }
    }
]


def get_tool_by_name(tool_name: str) -> dict:
    """Look up a tool definition in the registry by its function name.

    Args:
        tool_name: Value compared against each entry's
            ``["function"]["name"]`` field.

    Returns:
        The first entry of ``TOOLS`` whose function name equals ``tool_name``.

    Raises:
        ValueError: If no entry in ``TOOLS`` carries that name.
    """
    candidates = (
        entry for entry in TOOLS if entry["function"]["name"] == tool_name
    )
    # Registry entries are dicts, so None is a safe "no match" sentinel.
    found = next(candidates, None)
    if found is None:
        raise ValueError(f"Tool '{tool_name}' not found in registry")
    return found


def get_all_tool_names() -> list:
    """Collect the function name of every registered tool.

    Returns:
        A new list holding each entry's ``["function"]["name"]`` value, in
        the same order as the entries appear in ``TOOLS``.
    """
    names = []
    for entry in TOOLS:
        names.append(entry["function"]["name"])
    return names


def get_tools_by_category() -> dict:
    """Get tool names organized by category.

    Returns:
        A dict mapping each category key to a list of tool names, each list
        ordered as the tools appear in ``TOOLS``.

    NOTE(review): every category except ``cloud_data_sources`` is a
    hard-coded positional slice of ``TOOLS`` covering indices 0-49. Tools
    registered outside those ranges are not reported under any category, and
    the boundaries silently go stale whenever an entry is inserted or moved.
    Confirm them against the current list order.
    """
    names = [tool["function"]["name"] for tool in TOOLS]

    # The cloud tools sit at the very end of TOOLS, so a fixed [50:54] slice
    # is only right while the list has exactly 54 entries. Matching by name
    # keeps this category correct regardless of how many tools precede it.
    cloud_tool_names = frozenset((
        "load_bigquery_table",
        "write_bigquery_table",
        "profile_bigquery_table",
        "query_bigquery",
    ))

    return {
        "basic": names[:16],
        "advanced_analysis": names[16:21],
        "advanced_feature_engineering": names[21:25],
        "advanced_preprocessing": names[25:28],
        "advanced_training": names[28:31],
        "business_intelligence": names[31:35],
        "computer_vision": names[35:38],
        "nlp_text_analytics": names[38:42],
        "production_mlops": names[42:47],
        "time_series": names[47:50],
        "cloud_data_sources": [n for n in names if n in cloud_tool_names],
    }