Spaces:

Pulastya0
/

Data-Science-Agent

Running

File size: 12,391 Bytes

226ac39

"""
Tools Registry for Groq Function Calling
Defines all available tools in Groq's function calling format.
"""

def _param(kind: str, description: str) -> dict:
    """Build a simple parameter schema with a type and a description."""
    return {"type": kind, "description": description}


def _choice(options: list, description: str) -> dict:
    """Build a string parameter schema restricted to the given enum values."""
    return {"type": "string", "enum": list(options), "description": description}


def _string_list(description: str) -> dict:
    """Build an array-of-strings parameter schema."""
    return {"type": "array", "items": {"type": "string"}, "description": description}


def _string_map(description: str) -> dict:
    """Build an object parameter schema whose values are all strings."""
    return {
        "type": "object",
        "description": description,
        "additionalProperties": {"type": "string"},
    }


def _tool(name: str, description: str, properties: dict, required: list) -> dict:
    """Wrap a name, description and parameter schema into one tool entry.

    Args:
        name: Function name exposed to the model.
        description: Human-readable description of what the tool does.
        properties: Mapping of parameter name to its schema dictionary.
        required: Names of the parameters that must be supplied.

    Returns:
        Tool definition dictionary with "type" and "function" keys.
    """
    return {
        "type": "function",
        "function": {
            "name": name,
            "description": description,
            "parameters": {
                "type": "object",
                "properties": properties,
                "required": list(required),
            },
        },
    }


# Registry of every tool definition, grouped by purpose. Each helper call
# returns a fresh dictionary, so no schema fragment is shared between entries.
TOOLS = [
    # Data Profiling Tools
    _tool(
        "profile_dataset",
        "Get comprehensive statistics about a dataset including shape, data types, memory usage, null counts, and unique values. Use this as the first step to understand any new dataset.",
        {
            "file_path": _param("string", "Absolute or relative path to the CSV or Parquet file"),
        },
        ["file_path"],
    ),
    _tool(
        "detect_data_quality_issues",
        "Detect data quality issues including outliers (using IQR method), duplicate rows, inconsistent formats, and data anomalies. Returns a prioritized list of issues with severity levels.",
        {
            "file_path": _param("string", "Path to the dataset file"),
        },
        ["file_path"],
    ),
    _tool(
        "analyze_correlations",
        "Compute correlation matrix and identify top correlations. If a target column is specified, shows features most correlated with the target. Useful for feature selection and understanding relationships.",
        {
            "file_path": _param("string", "Path to the dataset file"),
            "target": _param("string", "Optional target column name to analyze correlations with"),
        },
        ["file_path"],
    ),

    # Data Cleaning Tools
    _tool(
        "clean_missing_values",
        "Handle missing values using appropriate strategies based on column type. Strategies include median/mean for numeric, mode for categorical, forward_fill for time series, or drop. Will not impute ID columns.",
        {
            "file_path": _param("string", "Path to the dataset file"),
            "strategy": _string_map(
                "Dictionary mapping column names to strategies ('median', 'mean', 'mode', 'forward_fill', 'drop'). Use 'auto' to let the tool decide based on data type."
            ),
            "output_path": _param("string", "Path to save cleaned dataset"),
        },
        ["file_path", "strategy", "output_path"],
    ),
    _tool(
        "handle_outliers",
        "Detect and handle outliers in numeric columns using IQR method. Methods: 'clip' (cap at boundaries), 'winsorize' (cap at percentiles), or 'remove' (delete rows).",
        {
            "file_path": _param("string", "Path to the dataset file"),
            "method": _choice(["clip", "winsorize", "remove"], "Method to handle outliers"),
            "columns": _string_list(
                "List of column names to check for outliers. Use 'all' to check all numeric columns."
            ),
            "output_path": _param("string", "Path to save cleaned dataset"),
        },
        ["file_path", "method", "columns", "output_path"],
    ),
    _tool(
        "fix_data_types",
        "Auto-detect and fix incorrect data types. Handles dates, booleans, categoricals, and numeric columns. Fixes common issues like 'null' strings and mixed types.",
        {
            "file_path": _param("string", "Path to the dataset file"),
            "type_mapping": _string_map(
                "Optional dictionary mapping column names to target types ('int', 'float', 'string', 'date', 'bool', 'category'). Use 'auto' for automatic detection."
            ),
            "output_path": _param("string", "Path to save dataset with fixed types"),
        },
        ["file_path", "output_path"],
    ),

    # Feature Engineering Tools
    _tool(
        "create_time_features",
        "Extract comprehensive time-based features from datetime columns including year, month, day, day_of_week, quarter, is_weekend, and cyclical encodings (sin/cos for month and hour).",
        {
            "file_path": _param("string", "Path to the dataset file"),
            "date_col": _param("string", "Name of the datetime column to extract features from"),
            "output_path": _param("string", "Path to save dataset with new features"),
        },
        ["file_path", "date_col", "output_path"],
    ),
    _tool(
        "encode_categorical",
        "Encode categorical variables using one-hot encoding, target encoding, or frequency encoding. Handles high-cardinality columns intelligently.",
        {
            "file_path": _param("string", "Path to the dataset file"),
            "method": _choice(["one_hot", "target", "frequency"], "Encoding method to use"),
            "columns": _string_list(
                "List of categorical columns to encode. Use 'all' to encode all categorical columns."
            ),
            "target_col": _param("string", "Required for target encoding: name of the target column"),
            "output_path": _param("string", "Path to save dataset with encoded features"),
        },
        ["file_path", "method", "columns", "output_path"],
    ),

    # Model Training Tools
    _tool(
        "train_baseline_models",
        "Train multiple baseline models (Logistic Regression, Random Forest, XGBoost) and compare their performance. Automatically detects task type (classification/regression) and returns the best model with metrics.",
        {
            "file_path": _param("string", "Path to the prepared dataset file"),
            "target_col": _param("string", "Name of the target column to predict"),
            "task_type": _choice(
                ["classification", "regression", "auto"],
                "Type of ML task. Use 'auto' to detect automatically.",
            ),
            "test_size": _param("number", "Proportion of data to use for testing (default: 0.2)"),
            "random_state": _param("integer", "Random seed for reproducibility (default: 42)"),
        },
        ["file_path", "target_col"],
    ),
    _tool(
        "generate_model_report",
        "Generate comprehensive model evaluation report including metrics, confusion matrix (for classification), feature importance, and SHAP values for top features. Saves report as JSON.",
        {
            "model_path": _param("string", "Path to saved model file (.pkl or .joblib)"),
            "test_data_path": _param("string", "Path to test dataset file"),
            "target_col": _param("string", "Name of the target column"),
            "output_path": _param("string", "Path to save the report JSON file"),
        },
        ["model_path", "test_data_path", "target_col", "output_path"],
    ),
]


def get_tool_by_name(tool_name: str) -> dict:
    """
    Look up a single tool definition in the registry.

    Args:
        tool_name: Value to match against each entry's function name

    Returns:
        The first entry in TOOLS whose function name equals tool_name

    Raises:
        ValueError: If no entry in TOOLS has that name
    """
    # Entries are dictionaries, so None is a safe "no match" marker.
    found = next(
        (entry for entry in TOOLS if entry["function"]["name"] == tool_name),
        None,
    )
    if found is None:
        raise ValueError(f"Tool '{tool_name}' not found in registry")
    return found


def get_all_tool_names() -> list:
    """
    Collect the function name of every registered tool.

    Returns:
        Names in the same order as the entries appear in TOOLS
    """
    function_specs = (entry["function"] for entry in TOOLS)
    return [spec["name"] for spec in function_specs]