"""
Complete Tools Registry for Groq Function Calling - All 67 Tools

Defines all available tools in Groq's function calling format.
"""
| TOOLS = [ | |
| # ============================================ | |
| # BASIC TOOLS (16) | |
| # ============================================ | |
| # Data Profiling Tools (3) | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "profile_dataset", | |
| "description": "Get comprehensive statistics about a dataset including shape, data types, memory usage, null counts, and unique values. Use this as the first step to understand any new dataset.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": { | |
| "type": "string", | |
| "description": "Absolute or relative path to the CSV or Parquet file" | |
| } | |
| }, | |
| "required": ["file_path"] | |
| } | |
| } | |
| }, | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "detect_data_quality_issues", | |
| "description": "Detect data quality issues including outliers (using IQR method), duplicate rows, inconsistent formats, and data anomalies. Returns a prioritized list of issues with severity levels.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": { | |
| "type": "string", | |
| "description": "Path to the dataset file" | |
| } | |
| }, | |
| "required": ["file_path"] | |
| } | |
| } | |
| }, | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "analyze_correlations", | |
| "description": "Compute correlation matrix and identify top correlations. If a target column is specified, shows features most correlated with the target. Useful for feature selection and understanding relationships.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": { | |
| "type": "string", | |
| "description": "Path to the dataset file" | |
| }, | |
| "target": { | |
| "type": "string", | |
| "description": "Optional target column name to analyze correlations with" | |
| } | |
| }, | |
| "required": ["file_path"] | |
| } | |
| } | |
| }, | |
| # Data Cleaning Tools (3) | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "clean_missing_values", | |
| "description": "Handle missing values using appropriate strategies based on column type. Strategies include median/mean for numeric, mode for categorical, forward_fill for time series, or drop. In 'auto' mode, first drops columns with >threshold missing (default 40%), then imputes remaining columns. Will not impute ID columns.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": { | |
| "type": "string", | |
| "description": "Path to the dataset file" | |
| }, | |
| "strategy": { | |
| "oneOf": [ | |
| { | |
| "type": "string", | |
| "enum": ["auto"], | |
| "description": "Use 'auto' to automatically decide strategies for all columns based on data type. First drops columns with >threshold missing, then imputes remaining columns." | |
| }, | |
| { | |
| "type": "object", | |
| "description": "Dictionary mapping column names to strategies ('median', 'mean', 'mode', 'forward_fill', 'drop')", | |
| "additionalProperties": {"type": "string"} | |
| } | |
| ], | |
| "description": "Either 'auto' (string) to automatically handle all missing values, or a dictionary mapping specific columns to strategies" | |
| }, | |
| "output_path": { | |
| "type": "string", | |
| "description": "Path to save cleaned dataset" | |
| }, | |
| "threshold": { | |
| "type": "number", | |
| "description": "For 'auto' mode: drop columns with missing percentage above this threshold (default: 0.4 = 40%). Range: 0.0 to 1.0. For example, 0.7 means drop columns with >70% missing values." | |
| } | |
| }, | |
| "required": ["file_path", "strategy", "output_path"] | |
| } | |
| } | |
| }, | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "handle_outliers", | |
| "description": "Detect and handle outliers in numeric columns using IQR method. Methods: 'clip' (cap at boundaries), 'winsorize' (cap at percentiles), or 'remove' (delete rows).", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": { | |
| "type": "string", | |
| "description": "Path to the dataset file" | |
| }, | |
| "method": { | |
| "type": "string", | |
| "enum": ["clip", "winsorize", "remove"], | |
| "description": "Method to handle outliers" | |
| }, | |
| "columns": { | |
| "type": "array", | |
| "items": {"type": "string"}, | |
| "description": "List of column names to check for outliers. Use 'all' to check all numeric columns." | |
| }, | |
| "output_path": { | |
| "type": "string", | |
| "description": "Path to save cleaned dataset" | |
| } | |
| }, | |
| "required": ["file_path", "method", "columns", "output_path"] | |
| } | |
| } | |
| }, | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "fix_data_types", | |
| "description": "Auto-detect and fix incorrect data types. Handles dates, booleans, categoricals, and numeric columns. Fixes common issues like 'null' strings and mixed types.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": { | |
| "type": "string", | |
| "description": "Path to the dataset file" | |
| }, | |
| "type_mapping": { | |
| "type": "object", | |
| "description": "Optional dictionary mapping column names to target types ('int', 'float', 'string', 'date', 'bool', 'category'). Use 'auto' for automatic detection.", | |
| "additionalProperties": {"type": "string"} | |
| }, | |
| "output_path": { | |
| "type": "string", | |
| "description": "Path to save dataset with fixed types" | |
| } | |
| }, | |
| "required": ["file_path", "output_path"] | |
| } | |
| } | |
| }, | |
| # Data Type Conversion Tools (2) | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "force_numeric_conversion", | |
| "description": "CRITICAL TOOL: Force convert columns to numeric type even if detected as strings/objects. Essential for datasets with numeric columns stored as strings (with commas, spaces, currency symbols). Use this BEFORE encoding when you see 'no numeric features' errors.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": { | |
| "type": "string", | |
| "description": "Path to the dataset file" | |
| }, | |
| "columns": { | |
| "type": "array", | |
| "items": {"type": "string"}, | |
| "description": "List of column names to force convert to numeric. Use ['all'] to auto-detect and convert all non-ID columns that look numeric." | |
| }, | |
| "output_path": { | |
| "type": "string", | |
| "description": "Path to save dataset with converted types" | |
| }, | |
| "errors": { | |
| "type": "string", | |
| "enum": ["coerce", "raise"], | |
| "description": "How to handle conversion errors. 'coerce' makes invalid values null (recommended), 'raise' throws error." | |
| } | |
| }, | |
| "required": ["file_path", "columns", "output_path"] | |
| } | |
| } | |
| }, | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "smart_type_inference", | |
| "description": "Intelligently infer and fix data types for all columns by analyzing patterns. Goes beyond basic type detection to understand semantic meaning. Use when dataset has widespread type issues.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": { | |
| "type": "string", | |
| "description": "Path to the dataset file" | |
| }, | |
| "output_path": { | |
| "type": "string", | |
| "description": "Path to save dataset with inferred types" | |
| }, | |
| "aggressive": { | |
| "type": "boolean", | |
| "description": "If true, attempts aggressive conversion on ambiguous columns. Recommended for messy datasets." | |
| } | |
| }, | |
| "required": ["file_path", "output_path"] | |
| } | |
| } | |
| }, | |
| # Feature Engineering Tools (2) | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "create_time_features", | |
| "description": "Extract comprehensive time-based features from datetime columns including year, month, day, day_of_week, quarter, is_weekend, and cyclical encodings (sin/cos for month and hour).", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": { | |
| "type": "string", | |
| "description": "Path to the dataset file" | |
| }, | |
| "date_col": { | |
| "type": "string", | |
| "description": "Name of the datetime column to extract features from" | |
| }, | |
| "output_path": { | |
| "type": "string", | |
| "description": "Path to save dataset with new features" | |
| } | |
| }, | |
| "required": ["file_path", "date_col", "output_path"] | |
| } | |
| } | |
| }, | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "encode_categorical", | |
| "description": "Encode categorical variables using one-hot encoding, target encoding, or frequency encoding. Handles high-cardinality columns intelligently. Use method='auto' to automatically choose the best encoding.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": { | |
| "type": "string", | |
| "description": "Path to the dataset file" | |
| }, | |
| "method": { | |
| "type": "string", | |
| "enum": ["one_hot", "target", "frequency", "auto"], | |
| "description": "Encoding method to use. 'auto' automatically selects the best method." | |
| }, | |
| "columns": { | |
| "type": "array", | |
| "items": {"type": "string"}, | |
| "description": "List of categorical columns to encode. Use ['all'] to encode all categorical columns. If not specified, defaults to all categorical columns." | |
| }, | |
| "target_col": { | |
| "type": "string", | |
| "description": "Required for target encoding: name of the target column" | |
| }, | |
| "output_path": { | |
| "type": "string", | |
| "description": "Path to save dataset with encoded features" | |
| } | |
| }, | |
| "required": ["file_path", "output_path"] | |
| } | |
| } | |
| }, | |
| # Model Training Tools (2) | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "train_baseline_models", | |
| "description": "Train multiple baseline models (Logistic Regression, Random Forest, XGBoost) and compare their performance. Automatically detects task type (classification/regression) and returns the best model with metrics.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": { | |
| "type": "string", | |
| "description": "Path to the prepared dataset file" | |
| }, | |
| "target_col": { | |
| "type": "string", | |
| "description": "Name of the target column to predict" | |
| }, | |
| "task_type": { | |
| "type": "string", | |
| "enum": ["classification", "regression", "auto"], | |
| "description": "Type of ML task. Use 'auto' to detect automatically." | |
| }, | |
| "test_size": { | |
| "type": "number", | |
| "description": "Proportion of data to use for testing (default: 0.2)" | |
| }, | |
| "random_state": { | |
| "type": "integer", | |
| "description": "Random seed for reproducibility (default: 42)" | |
| } | |
| }, | |
| "required": ["file_path", "target_col"] | |
| } | |
| } | |
| }, | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "generate_model_report", | |
| "description": "Generate comprehensive model evaluation report including metrics, confusion matrix (for classification), feature importance, and SHAP values for top features. Saves report as JSON.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "model_path": { | |
| "type": "string", | |
| "description": "Path to saved model file (.pkl or .joblib)" | |
| }, | |
| "test_data_path": { | |
| "type": "string", | |
| "description": "Path to test dataset file" | |
| }, | |
| "target_col": { | |
| "type": "string", | |
| "description": "Name of the target column" | |
| }, | |
| "output_path": { | |
| "type": "string", | |
| "description": "Path to save the report JSON file" | |
| } | |
| }, | |
| "required": ["model_path", "test_data_path", "target_col", "output_path"] | |
| } | |
| } | |
| }, | |
| # New Data Wrangling Tools (3) | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "get_smart_summary", | |
| "description": "Generate an LLM-friendly smart summary of a dataset with per-column missing value percentages (sorted by severity), unique value counts, sample data, and numeric statistics. Much more detailed than profile_dataset for decision-making.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": { | |
| "type": "string", | |
| "description": "Path to the CSV or Parquet file to summarize" | |
| }, | |
| "n_samples": { | |
| "type": "integer", | |
| "description": "Number of sample rows to include in the summary (default: 5)" | |
| } | |
| }, | |
| "required": ["file_path"] | |
| } | |
| } | |
| }, | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "merge_datasets", | |
| "description": "Merge two datasets using SQL-like join operations (inner, left, right, outer, cross). Supports joining on single or multiple columns with same or different names. Automatically handles duplicate columns with suffixes.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "left_path": { | |
| "type": "string", | |
| "description": "Path to the left (first) dataset file" | |
| }, | |
| "right_path": { | |
| "type": "string", | |
| "description": "Path to the right (second) dataset file" | |
| }, | |
| "output_path": { | |
| "type": "string", | |
| "description": "Path to save the merged dataset" | |
| }, | |
| "how": { | |
| "type": "string", | |
| "enum": ["inner", "left", "right", "outer", "cross"], | |
| "description": "Join type: 'inner' (only matching rows), 'left' (all left + matching right), 'right' (all right + matching left), 'outer' (all rows from both), 'cross' (cartesian product)" | |
| }, | |
| "on": { | |
| "type": ["string", "array"], | |
| "items": {"type": "string"}, | |
| "description": "Column name(s) to join on (must exist in both datasets). Can be a single column name or list of columns. Use this when join columns have the same name in both datasets." | |
| }, | |
| "left_on": { | |
| "type": ["string", "array"], | |
| "items": {"type": "string"}, | |
| "description": "Column name(s) in left dataset to join on. Use with right_on when join columns have different names." | |
| }, | |
| "right_on": { | |
| "type": ["string", "array"], | |
| "items": {"type": "string"}, | |
| "description": "Column name(s) in right dataset to join on. Use with left_on when join columns have different names." | |
| } | |
| }, | |
| "required": ["left_path", "right_path", "output_path"] | |
| } | |
| } | |
| }, | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "concat_datasets", | |
| "description": "Concatenate multiple datasets either vertically (stacking rows, useful for monthly data) or horizontally (adding columns side-by-side). Validates schema compatibility for vertical concat.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_paths": { | |
| "type": "array", | |
| "items": {"type": "string"}, | |
| "description": "List of paths to dataset files to concatenate (minimum 2 files)" | |
| }, | |
| "output_path": { | |
| "type": "string", | |
| "description": "Path to save the concatenated dataset" | |
| }, | |
| "axis": { | |
| "type": "string", | |
| "enum": ["vertical", "horizontal"], | |
| "description": "'vertical' to stack rows (union, for monthly data), 'horizontal' to add columns side-by-side (default: 'vertical')" | |
| } | |
| }, | |
| "required": ["file_paths", "output_path"] | |
| } | |
| } | |
| }, | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "reshape_dataset", | |
| "description": "Transform dataset structure using pivot (long→wide format), melt (wide→long format), or transpose (swap rows and columns) operations.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": { | |
| "type": "string", | |
| "description": "Path to the dataset file to reshape" | |
| }, | |
| "output_path": { | |
| "type": "string", | |
| "description": "Path to save the reshaped dataset" | |
| }, | |
| "operation": { | |
| "type": "string", | |
| "enum": ["pivot", "melt", "transpose"], | |
| "description": "Reshape operation: 'pivot' (long→wide, requires index/columns/values), 'melt' (wide→long, requires id_vars/value_vars), 'transpose' (swap rows/columns)" | |
| }, | |
| "index": { | |
| "type": "string", | |
| "description": "Column to use as row index (for pivot operation)" | |
| }, | |
| "columns": { | |
| "type": "string", | |
| "description": "Column whose values become new column names (for pivot operation)" | |
| }, | |
| "values": { | |
| "type": "string", | |
| "description": "Column whose values populate the pivoted table (for pivot operation)" | |
| }, | |
| "id_vars": { | |
| "type": "array", | |
| "items": {"type": "string"}, | |
| "description": "Columns to keep as identifiers (for melt operation)" | |
| }, | |
| "value_vars": { | |
| "type": "array", | |
| "items": {"type": "string"}, | |
| "description": "Columns to unpivot (for melt operation). If not specified, uses all columns except id_vars." | |
| } | |
| }, | |
| "required": ["file_path", "output_path", "operation"] | |
| } | |
| } | |
| }, | |
| # ============================================ | |
| # ADVANCED ANALYSIS TOOLS (5) | |
| # ============================================ | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "perform_eda_analysis", | |
| "description": "Comprehensive Exploratory Data Analysis with visualizations, distribution analysis, and automated insights. Generates HTML report with plots.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": {"type": "string", "description": "Path to dataset"}, | |
| "target_col": {"type": "string", "description": "Optional target column for supervised analysis"}, | |
| "output_dir": {"type": "string", "description": "Directory to save EDA report and plots"} | |
| }, | |
| "required": ["file_path", "output_dir"] | |
| } | |
| } | |
| }, | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "detect_model_issues", | |
| "description": "Detect overfitting, underfitting, class imbalance, and other model performance issues. Provides diagnostic recommendations.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "model_path": {"type": "string", "description": "Path to trained model"}, | |
| "train_data_path": {"type": "string", "description": "Path to training data"}, | |
| "test_data_path": {"type": "string", "description": "Path to test data"}, | |
| "target_col": {"type": "string", "description": "Target column name"} | |
| }, | |
| "required": ["model_path", "train_data_path", "test_data_path", "target_col"] | |
| } | |
| } | |
| }, | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "detect_anomalies", | |
| "description": "Detect anomalies using Isolation Forest, LOF, or statistical methods. Returns anomaly scores and flags.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": {"type": "string", "description": "Path to dataset"}, | |
| "method": {"type": "string", "enum": ["isolation_forest", "lof", "statistical"], "description": "Anomaly detection method"}, | |
| "contamination": {"type": "number", "description": "Expected proportion of anomalies (default: 0.1)"}, | |
| "output_path": {"type": "string", "description": "Path to save dataset with anomaly scores"} | |
| }, | |
| "required": ["file_path", "method", "output_path"] | |
| } | |
| } | |
| }, | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "detect_and_handle_multicollinearity", | |
| "description": "Detect and handle multicollinearity using VIF (Variance Inflation Factor). Removes highly correlated features automatically.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": {"type": "string", "description": "Path to dataset"}, | |
| "threshold": {"type": "number", "description": "VIF threshold (default: 10)"}, | |
| "method": {"type": "string", "enum": ["drop", "combine"], "description": "How to handle correlated features"}, | |
| "output_path": {"type": "string", "description": "Path to save cleaned dataset"} | |
| }, | |
| "required": ["file_path", "output_path"] | |
| } | |
| } | |
| }, | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "perform_statistical_tests", | |
| "description": "Perform statistical hypothesis tests (t-test, chi-square, ANOVA) to analyze relationships between features and target.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": {"type": "string", "description": "Path to dataset"}, | |
| "target_col": {"type": "string", "description": "Target column name"}, | |
| "test_type": {"type": "string", "enum": ["auto", "ttest", "chi2", "anova"], "description": "Type of statistical test"} | |
| }, | |
| "required": ["file_path", "target_col"] | |
| } | |
| } | |
| }, | |
| # ============================================ | |
| # ADVANCED FEATURE ENGINEERING (4) | |
| # ============================================ | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "create_interaction_features", | |
| "description": "Create polynomial, PCA, or cross-product interaction features to capture non-linear relationships.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": {"type": "string", "description": "Path to dataset"}, | |
| "method": {"type": "string", "enum": ["polynomial", "pca", "cross"], "description": "Interaction method"}, | |
| "degree": {"type": "integer", "description": "Polynomial degree (default: 2)"}, | |
| "max_features": {"type": "integer", "description": "Maximum new features to create (default: 50)"}, | |
| "output_path": {"type": "string", "description": "Path to save enhanced dataset"} | |
| }, | |
| "required": ["file_path", "method", "output_path"] | |
| } | |
| } | |
| }, | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "create_aggregation_features", | |
| "description": "Create aggregation features (mean, sum, count, etc.) grouped by categorical columns. Useful for customer/transaction data.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": {"type": "string", "description": "Path to dataset"}, | |
| "group_cols": {"type": "array", "items": {"type": "string"}, "description": "Columns to group by"}, | |
| "agg_cols": {"type": "array", "items": {"type": "string"}, "description": "Columns to aggregate"}, | |
| "agg_functions": {"type": "array", "items": {"type": "string"}, "description": "Aggregation functions (mean, sum, count, etc.)"}, | |
| "output_path": {"type": "string", "description": "Path to save dataset with aggregations"} | |
| }, | |
| "required": ["file_path", "group_cols", "agg_cols", "output_path"] | |
| } | |
| } | |
| }, | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "engineer_text_features", | |
| "description": "Extract features from text columns: TF-IDF, word counts, sentiment, readability scores, and embeddings.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": {"type": "string", "description": "Path to dataset"}, | |
| "text_col": {"type": "string", "description": "Text column name"}, | |
| "methods": {"type": "array", "items": {"type": "string"}, "description": "Feature extraction methods (tfidf, count, sentiment, readability)"}, | |
| "max_features": {"type": "integer", "description": "Max TF-IDF features (default: 100)"}, | |
| "output_path": {"type": "string", "description": "Path to save dataset with text features"} | |
| }, | |
| "required": ["file_path", "text_col", "methods", "output_path"] | |
| } | |
| } | |
| }, | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "auto_feature_engineering", | |
| "description": "Use LLM (Gemini/Groq) to automatically generate creative feature engineering ideas and implement them. Works without API key if environment variables are set.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": {"type": "string", "description": "Path to dataset"}, | |
| "target_col": {"type": "string", "description": "Target column name"}, | |
| "groq_api_key": {"type": "string", "description": "Groq API key (optional - uses environment variable if not provided)"}, | |
| "max_suggestions": {"type": "integer", "description": "Maximum feature suggestions to generate (default: 10)"}, | |
| "implement_top_k": {"type": "integer", "description": "Number of top suggestions to implement (default: 5)"}, | |
| "output_path": {"type": "string", "description": "Path to save dataset with new features"} | |
| }, | |
| "required": ["file_path", "target_col", "output_path"] | |
| } | |
| } | |
| }, | |
| # ============================================ | |
| # ADVANCED PREPROCESSING (3) | |
| # ============================================ | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "handle_imbalanced_data", | |
| "description": "Handle class imbalance using SMOTE, ADASYN, or class weights. Critical for classification tasks.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": {"type": "string", "description": "Path to dataset"}, | |
| "target_col": {"type": "string", "description": "Target column name"}, | |
| "method": {"type": "string", "enum": ["smote", "adasyn", "random_oversample", "random_undersample"], "description": "Balancing method"}, | |
| "sampling_strategy": {"type": "string", "description": "Sampling ratio (auto, minority, majority)"}, | |
| "output_path": {"type": "string", "description": "Path to save balanced dataset"} | |
| }, | |
| "required": ["file_path", "target_col", "method", "output_path"] | |
| } | |
| } | |
| }, | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "perform_feature_scaling", | |
| "description": "Scale features using StandardScaler, MinMaxScaler, or RobustScaler. Essential for distance-based algorithms.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": {"type": "string", "description": "Path to dataset"}, | |
| "method": {"type": "string", "enum": ["standard", "minmax", "robust"], "description": "Scaling method"}, | |
| "columns": {"type": "array", "items": {"type": "string"}, "description": "Columns to scale (None = all numeric)"}, | |
| "output_path": {"type": "string", "description": "Path to save scaled dataset"} | |
| }, | |
| "required": ["file_path", "method", "output_path"] | |
| } | |
| } | |
| }, | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "split_data_strategically", | |
| "description": "Split data with stratification, time-based splitting, or group-based splitting for better validation.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": {"type": "string", "description": "Path to dataset"}, | |
| "target_col": {"type": "string", "description": "Target column for stratification"}, | |
| "method": {"type": "string", "enum": ["stratified", "time_based", "group_based"], "description": "Split method"}, | |
| "test_size": {"type": "number", "description": "Test set proportion (default: 0.2)"}, | |
| "time_col": {"type": "string", "description": "Time column for time-based split"}, | |
| "group_col": {"type": "string", "description": "Group column for group-based split"} | |
| }, | |
| "required": ["file_path", "method"] | |
| } | |
| } | |
| }, | |
| # ============================================ | |
| # ADVANCED TRAINING (3) | |
| # ============================================ | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "hyperparameter_tuning", | |
| "description": "Optimize model hyperparameters using Optuna (Bayesian optimization). Finds best model configuration automatically.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": {"type": "string", "description": "Path to prepared dataset"}, | |
| "target_col": {"type": "string", "description": "Target column name"}, | |
| "model_type": {"type": "string", "enum": ["random_forest", "xgboost", "lightgbm"], "description": "Model to tune"}, | |
| "n_trials": {"type": "integer", "description": "Number of tuning trials (default: 100)"}, | |
| "task_type": {"type": "string", "enum": ["classification", "regression", "auto"], "description": "ML task type"}, | |
| "output_path": {"type": "string", "description": "Path to save tuned model"} | |
| }, | |
| "required": ["file_path", "target_col", "model_type", "output_path"] | |
| } | |
| } | |
| }, | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "train_ensemble_models", | |
| "description": "Train ensemble models using stacking, voting, or blending. Combines multiple models for better performance.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": {"type": "string", "description": "Path to prepared dataset"}, | |
| "target_col": {"type": "string", "description": "Target column name"}, | |
| "ensemble_method": {"type": "string", "enum": ["stacking", "voting", "blending"], "description": "Ensemble technique"}, | |
| "base_models": {"type": "array", "items": {"type": "string"}, "description": "Base model types to ensemble"}, | |
| "task_type": {"type": "string", "enum": ["classification", "regression", "auto"], "description": "ML task type"}, | |
| "output_path": {"type": "string", "description": "Path to save ensemble model"} | |
| }, | |
| "required": ["file_path", "target_col", "ensemble_method", "output_path"] | |
| } | |
| } | |
| }, | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "perform_cross_validation", | |
| "description": "Perform k-fold cross-validation to get robust model performance estimates. Returns mean and std of metrics.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": {"type": "string", "description": "Path to dataset"}, | |
| "target_col": {"type": "string", "description": "Target column name"}, | |
| "model_type": {"type": "string", "description": "Model type (random_forest, xgboost, logistic, ridge)"}, | |
| "n_splits": {"type": "integer", "description": "Number of CV folds/splits (default: 5)"}, | |
| "task_type": {"type": "string", "enum": ["classification", "regression", "auto"], "description": "ML task type"}, | |
| "cv_strategy": {"type": "string", "enum": ["kfold", "stratified", "timeseries"], "description": "Cross-validation strategy (default: kfold)"}, | |
| "save_oof": {"type": "boolean", "description": "Whether to save out-of-fold predictions (default: false)"} | |
| }, | |
| "required": ["file_path", "target_col", "model_type"] | |
| } | |
| } | |
| }, | |
| # ============================================ | |
| # BUSINESS INTELLIGENCE (4) | |
| # ============================================ | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "perform_cohort_analysis", | |
| "description": "Analyze user cohorts over time (retention, revenue, engagement). Essential for SaaS and e-commerce businesses.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": {"type": "string", "description": "Path to transaction/event data"}, | |
| "user_col": {"type": "string", "description": "User ID column"}, | |
| "date_col": {"type": "string", "description": "Date/timestamp column"}, | |
| "metric_col": {"type": "string", "description": "Metric to analyze (revenue, events, etc.)"}, | |
| "cohort_period": {"type": "string", "enum": ["daily", "weekly", "monthly"], "description": "Cohort grouping period"}, | |
| "output_path": {"type": "string", "description": "Path to save cohort analysis results"} | |
| }, | |
| "required": ["file_path", "user_col", "date_col", "metric_col", "output_path"] | |
| } | |
| } | |
| }, | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "perform_rfm_analysis", | |
| "description": "RFM (Recency, Frequency, Monetary) analysis for customer segmentation. Identifies best/worst customers.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": {"type": "string", "description": "Path to transaction data"}, | |
| "customer_col": {"type": "string", "description": "Customer ID column"}, | |
| "date_col": {"type": "string", "description": "Transaction date column"}, | |
| "amount_col": {"type": "string", "description": "Transaction amount column"}, | |
| "output_path": {"type": "string", "description": "Path to save RFM segments"} | |
| }, | |
| "required": ["file_path", "customer_col", "date_col", "amount_col", "output_path"] | |
| } | |
| } | |
| }, | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "detect_causal_relationships", | |
| "description": "Detect potential causal relationships between features using Granger causality and correlation analysis.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": {"type": "string", "description": "Path to dataset"}, | |
| "target_col": {"type": "string", "description": "Target/effect column"}, | |
| "feature_cols": {"type": "array", "items": {"type": "string"}, "description": "Potential cause columns"}, | |
| "method": {"type": "string", "enum": ["granger", "correlation"], "description": "Causality detection method"} | |
| }, | |
| "required": ["file_path", "target_col"] | |
| } | |
| } | |
| }, | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "generate_business_insights", | |
| "description": "Generate automated business insights using descriptive statistics, trends, and anomaly detection. Creates executive summary.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": {"type": "string", "description": "Path to business data"}, | |
| "metric_cols": {"type": "array", "items": {"type": "string"}, "description": "Key business metrics to analyze"}, | |
| "date_col": {"type": "string", "description": "Date column for trend analysis"}, | |
| "output_path": {"type": "string", "description": "Path to save insights report"} | |
| }, | |
| "required": ["file_path", "metric_cols", "output_path"] | |
| } | |
| } | |
| }, | |
| # ============================================ | |
| # COMPUTER VISION (3) | |
| # ============================================ | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "extract_image_features", | |
| "description": "Extract features from images using pre-trained CNNs (ResNet, VGG). Converts images to feature vectors for ML.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "image_dir": {"type": "string", "description": "Directory containing images"}, | |
| "model": {"type": "string", "enum": ["resnet", "vgg", "mobilenet"], "description": "Pre-trained model to use"}, | |
| "output_path": {"type": "string", "description": "Path to save feature vectors CSV"} | |
| }, | |
| "required": ["image_dir", "model", "output_path"] | |
| } | |
| } | |
| }, | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "perform_image_clustering", | |
| "description": "Cluster images based on visual similarity using K-means or DBSCAN on extracted features.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "image_dir": {"type": "string", "description": "Directory containing images"}, | |
| "n_clusters": {"type": "integer", "description": "Number of clusters (default: auto-detect)"}, | |
| "method": {"type": "string", "enum": ["kmeans", "dbscan"], "description": "Clustering method"}, | |
| "output_path": {"type": "string", "description": "Path to save clustering results"} | |
| }, | |
| "required": ["image_dir", "method", "output_path"] | |
| } | |
| } | |
| }, | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "analyze_tabular_image_hybrid", | |
| "description": "Combine tabular data with image features for hybrid ML models. Useful for e-commerce/medical data.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "tabular_path": {"type": "string", "description": "Path to tabular data CSV"}, | |
| "image_dir": {"type": "string", "description": "Directory with images"}, | |
| "image_id_col": {"type": "string", "description": "Column linking tabular data to images"}, | |
| "output_path": {"type": "string", "description": "Path to save combined features"} | |
| }, | |
| "required": ["tabular_path", "image_dir", "image_id_col", "output_path"] | |
| } | |
| } | |
| }, | |
| # ============================================ | |
| # NLP/TEXT ANALYTICS (4) | |
| # ============================================ | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "perform_topic_modeling", | |
| "description": "Discover topics in text documents using LDA or NMF. Extract themes from customer reviews, articles, etc.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": {"type": "string", "description": "Path to dataset with text"}, | |
| "text_col": {"type": "string", "description": "Text column name"}, | |
| "n_topics": {"type": "integer", "description": "Number of topics to extract (default: 5)"}, | |
| "method": {"type": "string", "enum": ["lda", "nmf"], "description": "Topic modeling method"}, | |
| "output_path": {"type": "string", "description": "Path to save topics and document-topic matrix"} | |
| }, | |
| "required": ["file_path", "text_col", "method", "output_path"] | |
| } | |
| } | |
| }, | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "perform_named_entity_recognition", | |
| "description": "Extract named entities (person, organization, location) from text using NER models.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": {"type": "string", "description": "Path to dataset with text"}, | |
| "text_col": {"type": "string", "description": "Text column name"}, | |
| "output_path": {"type": "string", "description": "Path to save dataset with extracted entities"} | |
| }, | |
| "required": ["file_path", "text_col", "output_path"] | |
| } | |
| } | |
| }, | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "analyze_sentiment_advanced", | |
| "description": "Perform advanced sentiment analysis with aspect-based sentiment (what features customers like/dislike).", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": {"type": "string", "description": "Path to dataset with text"}, | |
| "text_col": {"type": "string", "description": "Text column name"}, | |
| "aspects": {"type": "array", "items": {"type": "string"}, "description": "Aspects to analyze sentiment for (e.g., 'price', 'quality')"}, | |
| "output_path": {"type": "string", "description": "Path to save sentiment scores"} | |
| }, | |
| "required": ["file_path", "text_col", "output_path"] | |
| } | |
| } | |
| }, | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "perform_text_similarity", | |
| "description": "Calculate text similarity using cosine similarity, Jaccard, or semantic embeddings. Find duplicate/similar documents.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": {"type": "string", "description": "Path to dataset with text"}, | |
| "text_col": {"type": "string", "description": "Text column name"}, | |
| "method": {"type": "string", "enum": ["cosine", "jaccard", "semantic"], "description": "Similarity method"}, | |
| "threshold": {"type": "number", "description": "Similarity threshold (0-1)"}, | |
| "output_path": {"type": "string", "description": "Path to save similarity matrix"} | |
| }, | |
| "required": ["file_path", "text_col", "method", "output_path"] | |
| } | |
| } | |
| }, | |
| # ============================================ | |
| # PRODUCTION/MLOPS (5) | |
| # ============================================ | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "monitor_model_drift", | |
| "description": "Detect data drift and concept drift in production models. Compare training vs production data distributions.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "train_data_path": {"type": "string", "description": "Path to original training data"}, | |
| "production_data_path": {"type": "string", "description": "Path to recent production data"}, | |
| "features": {"type": "array", "items": {"type": "string"}, "description": "Features to monitor for drift"}, | |
| "output_path": {"type": "string", "description": "Path to save drift report"} | |
| }, | |
| "required": ["train_data_path", "production_data_path", "output_path"] | |
| } | |
| } | |
| }, | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "explain_predictions", | |
| "description": "Explain model predictions using SHAP or LIME. Generate feature importance explanations for individual predictions.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "model_path": {"type": "string", "description": "Path to trained model"}, | |
| "data_path": {"type": "string", "description": "Path to data to explain"}, | |
| "method": {"type": "string", "enum": ["shap", "lime"], "description": "Explanation method"}, | |
| "n_samples": {"type": "integer", "description": "Number of samples to explain (default: 10)"}, | |
| "output_path": {"type": "string", "description": "Path to save explanations"} | |
| }, | |
| "required": ["model_path", "data_path", "method", "output_path"] | |
| } | |
| } | |
| }, | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "generate_model_card", | |
| "description": "Generate model card documentation with model details, performance metrics, bias analysis, and usage guidelines.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "model_path": {"type": "string", "description": "Path to trained model"}, | |
| "train_data_path": {"type": "string", "description": "Path to training data"}, | |
| "test_data_path": {"type": "string", "description": "Path to test data"}, | |
| "target_col": {"type": "string", "description": "Target column name"}, | |
| "output_path": {"type": "string", "description": "Path to save model card JSON"} | |
| }, | |
| "required": ["model_path", "train_data_path", "test_data_path", "target_col", "output_path"] | |
| } | |
| } | |
| }, | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "perform_ab_test_analysis", | |
| "description": "Analyze A/B test results with statistical significance testing. Determine if variant B is better than control A.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": {"type": "string", "description": "Path to A/B test data"}, | |
| "variant_col": {"type": "string", "description": "Column indicating variant (A/B)"}, | |
| "metric_col": {"type": "string", "description": "Success metric column"}, | |
| "confidence_level": {"type": "number", "description": "Confidence level for significance (default: 0.95)"} | |
| }, | |
| "required": ["file_path", "variant_col", "metric_col"] | |
| } | |
| } | |
| }, | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "detect_feature_leakage", | |
| "description": "Detect potential feature leakage by analyzing feature importance and temporal relationships. Prevents data leakage bugs.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": {"type": "string", "description": "Path to dataset"}, | |
| "target_col": {"type": "string", "description": "Target column name"}, | |
| "date_col": {"type": "string", "description": "Optional date column for temporal analysis"} | |
| }, | |
| "required": ["file_path", "target_col"] | |
| } | |
| } | |
| }, | |
| # ============================================ | |
| # TIME SERIES (3) | |
| # ============================================ | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "forecast_time_series", | |
| "description": "Forecast future values using ARIMA, Prophet, or LSTM models. Handles seasonal and trend components.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": {"type": "string", "description": "Path to time series data"}, | |
| "date_col": {"type": "string", "description": "Date/timestamp column"}, | |
| "value_col": {"type": "string", "description": "Value column to forecast"}, | |
| "forecast_periods": {"type": "integer", "description": "Number of periods to forecast"}, | |
| "method": {"type": "string", "enum": ["arima", "prophet", "lstm"], "description": "Forecasting method"}, | |
| "output_path": {"type": "string", "description": "Path to save forecast results"} | |
| }, | |
| "required": ["file_path", "date_col", "value_col", "forecast_periods", "method", "output_path"] | |
| } | |
| } | |
| }, | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "detect_seasonality_trends", | |
| "description": "Detect seasonality patterns and trends in time series data using STL decomposition and statistical tests.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": {"type": "string", "description": "Path to time series data"}, | |
| "date_col": {"type": "string", "description": "Date/timestamp column"}, | |
| "value_col": {"type": "string", "description": "Value column to analyze"}, | |
| "period": {"type": "integer", "description": "Expected seasonal period (e.g., 12 for monthly)"} | |
| }, | |
| "required": ["file_path", "date_col", "value_col"] | |
| } | |
| } | |
| }, | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "create_time_series_features", | |
| "description": "Create comprehensive time series features: lags, rolling stats, exponential moving averages, and Fourier features.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": {"type": "string", "description": "Path to time series data"}, | |
| "date_col": {"type": "string", "description": "Date/timestamp column"}, | |
| "value_col": {"type": "string", "description": "Value column"}, | |
| "lags": {"type": "array", "items": {"type": "integer"}, "description": "Lag periods to create (e.g., [1, 7, 30])"}, | |
| "windows": {"type": "array", "items": {"type": "integer"}, "description": "Rolling window sizes (e.g., [7, 30])"}, | |
| "output_path": {"type": "string", "description": "Path to save dataset with time series features"} | |
| }, | |
| "required": ["file_path", "date_col", "value_col", "output_path"] | |
| } | |
| } | |
| }, | |
| # ============================================ | |
| # ADVANCED INSIGHTS TOOLS (6) - NEW | |
| # ============================================ | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "analyze_root_cause", | |
| "description": "Perform root cause analysis to identify why a metric dropped or changed. Analyzes correlations, temporal patterns, and identifies top influencing factors.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": {"type": "string", "description": "Path to dataset"}, | |
| "target_col": {"type": "string", "description": "Column to analyze (e.g., 'sales')"}, | |
| "time_col": {"type": "string", "description": "Optional time column for trend analysis"}, | |
| "threshold_drop": {"type": "number", "description": "Percentage drop to flag as significant (default 0.15)"} | |
| }, | |
| "required": ["file_path", "target_col"] | |
| } | |
| } | |
| }, | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "detect_trends_and_seasonality", | |
| "description": "Detect trends and seasonal patterns in time series data using statistical methods and autocorrelation.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": {"type": "string", "description": "Path to dataset"}, | |
| "value_col": {"type": "string", "description": "Column with values to analyze"}, | |
| "time_col": {"type": "string", "description": "Column with timestamps"}, | |
| "seasonal_period": {"type": "integer", "description": "Expected seasonal period (auto-detected if None)"} | |
| }, | |
| "required": ["file_path", "value_col", "time_col"] | |
| } | |
| } | |
| }, | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "detect_anomalies_advanced", | |
| "description": "Detect anomalies with confidence scores using Isolation Forest or statistical methods.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": {"type": "string", "description": "Path to dataset"}, | |
| "columns": {"type": "array", "items": {"type": "string"}, "description": "Columns to analyze (all numeric if None)"}, | |
| "contamination": {"type": "number", "description": "Expected proportion of outliers (default 0.1)"}, | |
| "method": {"type": "string", "enum": ["isolation_forest", "statistical"], "description": "Detection method"} | |
| }, | |
| "required": ["file_path"] | |
| } | |
| } | |
| }, | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "perform_hypothesis_testing", | |
| "description": "Perform statistical hypothesis testing (t-test, ANOVA, chi-square) to compare groups.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": {"type": "string", "description": "Path to dataset"}, | |
| "group_col": {"type": "string", "description": "Column defining groups"}, | |
| "value_col": {"type": "string", "description": "Column with values to compare"}, | |
| "test_type": {"type": "string", "enum": ["t-test", "anova", "chi-square", "auto"], "description": "Test type (auto-detected if 'auto')"} | |
| }, | |
| "required": ["file_path", "group_col", "value_col"] | |
| } | |
| } | |
| }, | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "analyze_distribution", | |
| "description": "Analyze distribution of a column including normality tests, skewness, and kurtosis.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": {"type": "string", "description": "Path to dataset"}, | |
| "column": {"type": "string", "description": "Column to analyze"}, | |
| "tests": {"type": "array", "items": {"type": "string"}, "description": "Tests to perform (normality, skewness)"} | |
| }, | |
| "required": ["file_path", "column"] | |
| } | |
| } | |
| }, | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "perform_segment_analysis", | |
| "description": "Perform cluster-based customer/data segmentation using K-means and profile each segment.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": {"type": "string", "description": "Path to dataset"}, | |
| "n_segments": {"type": "integer", "description": "Number of segments to create (default 5)"}, | |
| "features": {"type": "array", "items": {"type": "string"}, "description": "Features for clustering (all numeric if None)"} | |
| }, | |
| "required": ["file_path"] | |
| } | |
| } | |
| }, | |
| # ============================================ | |
| # AUTOMATED PIPELINE TOOLS (2) - NEW | |
| # ============================================ | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "auto_ml_pipeline", | |
| "description": "Fully automated ML pipeline: auto-detect types, clean missing values, handle outliers, encode categorical, engineer features, and select best features. Zero configuration required!", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": {"type": "string", "description": "Path to input dataset"}, | |
| "target_col": {"type": "string", "description": "Target column name"}, | |
| "task_type": {"type": "string", "enum": ["classification", "regression", "auto"], "description": "Task type (auto-detected if 'auto')"}, | |
| "output_path": {"type": "string", "description": "Where to save processed data"}, | |
| "feature_engineering_level": {"type": "string", "enum": ["basic", "intermediate", "advanced"], "description": "Feature engineering depth"} | |
| }, | |
| "required": ["file_path", "target_col"] | |
| } | |
| } | |
| }, | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "auto_feature_selection", | |
| "description": "Automatically select the best features for modeling using mutual information or F-statistics.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": {"type": "string", "description": "Path to dataset"}, | |
| "target_col": {"type": "string", "description": "Target column"}, | |
| "task_type": {"type": "string", "enum": ["classification", "regression", "auto"], "description": "Task type"}, | |
| "max_features": {"type": "integer", "description": "Maximum features to keep (default 50)"}, | |
| "method": {"type": "string", "enum": ["mutual_info", "f_test", "auto"], "description": "Selection method"}, | |
| "output_path": {"type": "string", "description": "Where to save selected features"} | |
| }, | |
| "required": ["file_path", "target_col"] | |
| } | |
| } | |
| }, | |
| # ============================================ | |
| # VISUALIZATION TOOLS (3) - NEW | |
| # ============================================ | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "generate_all_plots", | |
| "description": "Generate ALL plots for a dataset automatically: data quality, EDA, distributions, and correlations. Creates interactive HTML plots.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": {"type": "string", "description": "Path to dataset"}, | |
| "target_col": {"type": "string", "description": "Optional target column"}, | |
| "output_dir": {"type": "string", "description": "Directory to save plots (default ./outputs/plots)"} | |
| }, | |
| "required": ["file_path"] | |
| } | |
| } | |
| }, | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "generate_data_quality_plots", | |
| "description": "Generate data quality visualizations: missing values, data types, and outlier detection plots.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": {"type": "string", "description": "Path to dataset"}, | |
| "output_dir": {"type": "string", "description": "Directory to save plots"} | |
| }, | |
| "required": ["file_path", "output_dir"] | |
| } | |
| } | |
| }, | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "generate_eda_plots", | |
| "description": "Generate exploratory data analysis plots: correlation heatmap, feature relationships, and pairplots.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": {"type": "string", "description": "Path to dataset"}, | |
| "target_col": {"type": "string", "description": "Optional target column"}, | |
| "output_dir": {"type": "string", "description": "Directory to save plots"} | |
| }, | |
| "required": ["file_path", "output_dir"] | |
| } | |
| } | |
| }, | |
| # ============================================ | |
| # INTERACTIVE PLOTLY VISUALIZATIONS (6) | |
| # ============================================ | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "generate_interactive_scatter", | |
| "description": "Create interactive scatter plot with zoom, pan, and hover capabilities. Great for exploring relationships between variables.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": {"type": "string", "description": "Path to dataset"}, | |
| "x_col": {"type": "string", "description": "Column for X-axis"}, | |
| "y_col": {"type": "string", "description": "Column for Y-axis"}, | |
| "color_col": {"type": "string", "description": "Optional column for color coding points"}, | |
| "size_col": {"type": "string", "description": "Optional column for bubble size"}, | |
| "output_path": {"type": "string", "description": "Path to save HTML file (default: ./outputs/plots/interactive/scatter.html)"} | |
| }, | |
| "required": ["file_path", "x_col", "y_col"] | |
| } | |
| } | |
| }, | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "generate_interactive_histogram", | |
| "description": "Create interactive histogram with box plot overlay. Users can explore distribution interactively.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": {"type": "string", "description": "Path to dataset"}, | |
| "column": {"type": "string", "description": "Column to plot distribution"}, | |
| "bins": {"type": "integer", "description": "Number of bins (default: 30)"}, | |
| "color_col": {"type": "string", "description": "Optional column for grouped histograms"}, | |
| "output_path": {"type": "string", "description": "Path to save HTML file"} | |
| }, | |
| "required": ["file_path", "column"] | |
| } | |
| } | |
| }, | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "generate_interactive_correlation_heatmap", | |
| "description": "Create interactive correlation heatmap with hover values. Better than static matplotlib version.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": {"type": "string", "description": "Path to dataset"}, | |
| "output_path": {"type": "string", "description": "Path to save HTML file"} | |
| }, | |
| "required": ["file_path"] | |
| } | |
| } | |
| }, | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "generate_interactive_box_plots", | |
| "description": "Create interactive box plots for outlier detection. Supports grouping by categorical variable.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": {"type": "string", "description": "Path to dataset"}, | |
| "columns": {"type": "array", "items": {"type": "string"}, "description": "Columns to plot (all numeric if not specified)"}, | |
| "group_by": {"type": "string", "description": "Optional categorical column for grouping"}, | |
| "output_path": {"type": "string", "description": "Path to save HTML file"} | |
| }, | |
| "required": ["file_path"] | |
| } | |
| } | |
| }, | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "generate_interactive_time_series", | |
| "description": "Create interactive time series plot with range slider and zoom. Perfect for temporal data analysis.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": {"type": "string", "description": "Path to dataset"}, | |
| "time_col": {"type": "string", "description": "Column with datetime values"}, | |
| "value_cols": {"type": "array", "items": {"type": "string"}, "description": "Columns to plot over time"}, | |
| "output_path": {"type": "string", "description": "Path to save HTML file"} | |
| }, | |
| "required": ["file_path", "time_col", "value_cols"] | |
| } | |
| } | |
| }, | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "generate_plotly_dashboard", | |
| "description": "Generate complete interactive dashboard with multiple visualizations: correlation heatmap, box plots, scatter plots, histograms. One-stop visualization solution.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": {"type": "string", "description": "Path to dataset"}, | |
| "target_col": {"type": "string", "description": "Optional target column for supervised analysis"}, | |
| "output_dir": {"type": "string", "description": "Directory to save all plots (default: ./outputs/plots/interactive)"} | |
| }, | |
| "required": ["file_path"] | |
| } | |
| } | |
| }, | |
| # ============================================ | |
| # EDA REPORT GENERATION (1) - NEW PHASE 2 | |
| # ============================================ | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "generate_ydata_profiling_report", | |
| "description": "Generate comprehensive HTML report using ydata-profiling (formerly pandas-profiling). Provides extensive analysis: overview, variable statistics, interactions, correlations (Pearson, Spearman, Cramér's V), missing values matrix, duplicate analysis, and more. Most detailed and comprehensive profiling tool with automated insights and data quality warnings.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": {"type": "string", "description": "Path to the dataset CSV/Parquet file"}, | |
| "output_path": {"type": "string", "description": "Where to save HTML report (default: ./outputs/reports/ydata_profile.html)"}, | |
| "minimal": {"type": "boolean", "description": "If true, generates faster minimal report (useful for large datasets, default: false)"}, | |
| "title": {"type": "string", "description": "Report title (default: 'Data Profiling Report')"} | |
| }, | |
| "required": ["file_path"] | |
| } | |
| } | |
| }, | |
| # ============================================ | |
| # CODE INTERPRETER (2) - THE GAME CHANGER | |
| # ============================================ | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "execute_python_code", | |
| "description": "⭐ CRITICAL TOOL - Execute custom Python code for ANY data science task not covered by existing tools. This is what makes you a TRUE AI AGENT, not just a function-calling bot. Use this when user requests: 1) Custom visualizations (specific Plotly plots, interactive dashboards, unique chart types) 2) Domain-specific calculations 3) Custom data transformations 4) Specific export formats 5) Interactive widgets/filters. Code has access to pandas, polars, numpy, matplotlib, seaborn, plotly. ALWAYS save outputs to files and return file paths.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "code": { | |
| "type": "string", | |
| "description": "Python code to execute. Auto-imported: pandas as pd, polars as pl, numpy as np, matplotlib.pyplot as plt, seaborn as sns, plotly.express as px, plotly.graph_objects as go. Code should save outputs to files in working_directory. Example: fig.write_html('./outputs/code/plot.html')" | |
| }, | |
| "working_directory": { | |
| "type": "string", | |
| "description": "Directory to run code in (default: ./outputs/code). Code can read from ./temp/ and write to this directory." | |
| }, | |
| "timeout": { | |
| "type": "integer", | |
| "description": "Maximum execution time in seconds (default: 60)" | |
| } | |
| }, | |
| "required": ["code"] | |
| } | |
| } | |
| }, | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "execute_code_from_file", | |
| "description": "Execute Python code from an existing .py file. Useful when code is too long to pass as string, or when running pre-written scripts. Same capabilities as execute_python_code.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": { | |
| "type": "string", | |
| "description": "Path to .py file to execute" | |
| }, | |
| "working_directory": { | |
| "type": "string", | |
| "description": "Directory to run code in (default: ./outputs/code)" | |
| }, | |
| "timeout": { | |
| "type": "integer", | |
| "description": "Maximum execution time in seconds (default: 60)" | |
| } | |
| }, | |
| "required": ["file_path"] | |
| } | |
| } | |
| }, | |
| # ============================================ | |
| # CLOUD DATA SOURCES (4) - NEW | |
| # ============================================ | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "load_bigquery_table", | |
| "description": "Load data from Google BigQuery table into a Polars DataFrame. Supports sampling via LIMIT and column selection. Returns CSV path for downstream tools. Use profile_bigquery_table first for large tables.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "project_id": { | |
| "type": "string", | |
| "description": "Google Cloud project ID" | |
| }, | |
| "dataset": { | |
| "type": "string", | |
| "description": "BigQuery dataset name" | |
| }, | |
| "table": { | |
| "type": "string", | |
| "description": "BigQuery table name" | |
| }, | |
| "limit": { | |
| "type": "integer", | |
| "description": "Optional row limit for sampling (e.g., 10000 for large tables)" | |
| }, | |
| "columns": { | |
| "type": "array", | |
| "items": {"type": "string"}, | |
| "description": "Optional list of column names to load" | |
| }, | |
| "where_clause": { | |
| "type": "string", | |
| "description": "Optional SQL WHERE clause for filtering (without WHERE keyword)" | |
| } | |
| }, | |
| "required": ["project_id", "dataset", "table"] | |
| } | |
| } | |
| }, | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "write_bigquery_table", | |
| "description": "Write predictions or processed data from CSV/Parquet file to BigQuery table. Supports append, overwrite, or fail modes.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "file_path": { | |
| "type": "string", | |
| "description": "Path to CSV or Parquet file to write" | |
| }, | |
| "project_id": { | |
| "type": "string", | |
| "description": "Google Cloud project ID" | |
| }, | |
| "dataset": { | |
| "type": "string", | |
| "description": "BigQuery dataset name" | |
| }, | |
| "table": { | |
| "type": "string", | |
| "description": "BigQuery table name" | |
| }, | |
| "mode": { | |
| "type": "string", | |
| "enum": ["append", "overwrite", "fail"], | |
| "description": "Write mode: append (add rows), overwrite (replace), fail (error if exists)" | |
| } | |
| }, | |
| "required": ["file_path", "project_id", "dataset", "table"] | |
| } | |
| } | |
| }, | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "profile_bigquery_table", | |
| "description": "Profile a BigQuery table without loading all data. Returns row count, column types, null counts (sampled), table size, and load recommendations. Use this BEFORE load_bigquery_table for large tables.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "project_id": { | |
| "type": "string", | |
| "description": "Google Cloud project ID" | |
| }, | |
| "dataset": { | |
| "type": "string", | |
| "description": "BigQuery dataset name" | |
| }, | |
| "table": { | |
| "type": "string", | |
| "description": "BigQuery table name" | |
| } | |
| }, | |
| "required": ["project_id", "dataset", "table"] | |
| } | |
| } | |
| }, | |
| { | |
| "type": "function", | |
| "function": { | |
| "name": "query_bigquery", | |
| "description": "Execute custom BigQuery SQL query and return results as DataFrame. Useful for complex aggregations, joins, or transformations before analysis.", | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "project_id": { | |
| "type": "string", | |
| "description": "Google Cloud project ID" | |
| }, | |
| "query": { | |
| "type": "string", | |
| "description": "SQL query to execute" | |
| }, | |
| "output_path": { | |
| "type": "string", | |
| "description": "Optional path to save results (default: auto-generated)" | |
| }, | |
| "limit": { | |
| "type": "integer", | |
| "description": "Optional row limit to append to query" | |
| } | |
| }, | |
| "required": ["project_id", "query"] | |
| } | |
| } | |
| } | |
| ] | |
def get_tool_by_name(tool_name: str) -> dict:
    """Return the registry entry whose function name matches *tool_name*.

    Args:
        tool_name: Value of ``entry["function"]["name"]`` to look up.

    Returns:
        The full tool definition dict from ``TOOLS``.

    Raises:
        ValueError: If no registered tool has that name.
    """
    entry = next(
        (item for item in TOOLS if item["function"]["name"] == tool_name),
        None,
    )
    if entry is None:
        raise ValueError(f"Tool '{tool_name}' not found in registry")
    return entry
def get_all_tool_names() -> list:
    """Return the names of every registered tool, in registry order."""
    names = []
    for entry in TOOLS:
        names.append(entry["function"]["name"])
    return names
def get_tools_by_category() -> dict:
    """Return tool names grouped by category.

    Categories correspond to contiguous runs of entries in ``TOOLS``, so the
    registry order must match the layout declared below. Keeping the layout as
    a single (name, count) table avoids the drift-prone hand-maintained pairs
    of absolute slice indices it replaces.

    NOTE(review): the per-category counts sum to 54, but the module docstring
    claims 67 tools — confirm the registry and this mapping agree.

    Returns:
        Mapping of category name -> list of tool names in that category.
    """
    # (category, number of consecutive TOOLS entries in that category).
    # These reproduce the original slices exactly: basic = TOOLS[:16],
    # advanced_analysis = TOOLS[16:21], ..., cloud_data_sources = TOOLS[50:54].
    category_counts = [
        ("basic", 16),
        ("advanced_analysis", 5),
        ("advanced_feature_engineering", 4),
        ("advanced_preprocessing", 3),
        ("advanced_training", 3),
        ("business_intelligence", 4),
        ("computer_vision", 3),
        ("nlp_text_analytics", 4),
        ("production_mlops", 5),
        ("time_series", 3),
        ("cloud_data_sources", 4),
    ]
    categories = {}
    start = 0
    for name, count in category_counts:
        stop = start + count
        categories[name] = [t["function"]["name"] for t in TOOLS[start:stop]]
        start = stop
    return categories