# Data-Science-Agent / src/tools/tools_registry_old.py
# Author: Pulastya B — commit 226ac39
# feat: Initial commit - Data Science Agent with React frontend and FastAPI backend
"""
Tools Registry for Groq Function Calling
Defines all available tools in Groq's function calling format.
"""
def _str_param(description: str) -> dict:
    """Return a JSON-schema string property with the given description."""
    return {"type": "string", "description": description}


def _enum_param(options: list, description: str) -> dict:
    """Return a JSON-schema string property restricted to *options*."""
    return {"type": "string", "enum": options, "description": description}


def _tool(name: str, description: str, properties: dict, required: list) -> dict:
    """Wrap a parameter schema in Groq's function-calling envelope."""
    return {
        "type": "function",
        "function": {
            "name": name,
            "description": description,
            "parameters": {
                "type": "object",
                "properties": properties,
                "required": required,
            },
        },
    }


# Registry of every tool exposed to the model, in Groq function-calling
# format. Built through the helpers above so each entry only states what
# is unique to it: name, description, properties, and required params.
TOOLS = [
    # --- Data profiling ---
    _tool(
        "profile_dataset",
        "Get comprehensive statistics about a dataset including shape, data types, memory usage, null counts, and unique values. Use this as the first step to understand any new dataset.",
        {
            "file_path": _str_param("Absolute or relative path to the CSV or Parquet file"),
        },
        ["file_path"],
    ),
    _tool(
        "detect_data_quality_issues",
        "Detect data quality issues including outliers (using IQR method), duplicate rows, inconsistent formats, and data anomalies. Returns a prioritized list of issues with severity levels.",
        {
            "file_path": _str_param("Path to the dataset file"),
        },
        ["file_path"],
    ),
    _tool(
        "analyze_correlations",
        "Compute correlation matrix and identify top correlations. If a target column is specified, shows features most correlated with the target. Useful for feature selection and understanding relationships.",
        {
            "file_path": _str_param("Path to the dataset file"),
            "target": _str_param("Optional target column name to analyze correlations with"),
        },
        ["file_path"],
    ),
    # --- Data cleaning ---
    _tool(
        "clean_missing_values",
        "Handle missing values using appropriate strategies based on column type. Strategies include median/mean for numeric, mode for categorical, forward_fill for time series, or drop. Will not impute ID columns.",
        {
            "file_path": _str_param("Path to the dataset file"),
            "strategy": {
                "type": "object",
                "description": "Dictionary mapping column names to strategies ('median', 'mean', 'mode', 'forward_fill', 'drop'). Use 'auto' to let the tool decide based on data type.",
                "additionalProperties": {"type": "string"},
            },
            "output_path": _str_param("Path to save cleaned dataset"),
        },
        ["file_path", "strategy", "output_path"],
    ),
    _tool(
        "handle_outliers",
        "Detect and handle outliers in numeric columns using IQR method. Methods: 'clip' (cap at boundaries), 'winsorize' (cap at percentiles), or 'remove' (delete rows).",
        {
            "file_path": _str_param("Path to the dataset file"),
            "method": _enum_param(["clip", "winsorize", "remove"], "Method to handle outliers"),
            "columns": {
                "type": "array",
                "items": {"type": "string"},
                "description": "List of column names to check for outliers. Use 'all' to check all numeric columns.",
            },
            "output_path": _str_param("Path to save cleaned dataset"),
        },
        ["file_path", "method", "columns", "output_path"],
    ),
    _tool(
        "fix_data_types",
        "Auto-detect and fix incorrect data types. Handles dates, booleans, categoricals, and numeric columns. Fixes common issues like 'null' strings and mixed types.",
        {
            "file_path": _str_param("Path to the dataset file"),
            "type_mapping": {
                "type": "object",
                "description": "Optional dictionary mapping column names to target types ('int', 'float', 'string', 'date', 'bool', 'category'). Use 'auto' for automatic detection.",
                "additionalProperties": {"type": "string"},
            },
            "output_path": _str_param("Path to save dataset with fixed types"),
        },
        # type_mapping is optional (auto-detection), so it is not required.
        ["file_path", "output_path"],
    ),
    # --- Feature engineering ---
    _tool(
        "create_time_features",
        "Extract comprehensive time-based features from datetime columns including year, month, day, day_of_week, quarter, is_weekend, and cyclical encodings (sin/cos for month and hour).",
        {
            "file_path": _str_param("Path to the dataset file"),
            "date_col": _str_param("Name of the datetime column to extract features from"),
            "output_path": _str_param("Path to save dataset with new features"),
        },
        ["file_path", "date_col", "output_path"],
    ),
    _tool(
        "encode_categorical",
        "Encode categorical variables using one-hot encoding, target encoding, or frequency encoding. Handles high-cardinality columns intelligently.",
        {
            "file_path": _str_param("Path to the dataset file"),
            "method": _enum_param(["one_hot", "target", "frequency"], "Encoding method to use"),
            "columns": {
                "type": "array",
                "items": {"type": "string"},
                "description": "List of categorical columns to encode. Use 'all' to encode all categorical columns.",
            },
            "target_col": _str_param("Required for target encoding: name of the target column"),
            "output_path": _str_param("Path to save dataset with encoded features"),
        },
        # target_col is only needed when method == "target".
        ["file_path", "method", "columns", "output_path"],
    ),
    # --- Model training ---
    _tool(
        "train_baseline_models",
        "Train multiple baseline models (Logistic Regression, Random Forest, XGBoost) and compare their performance. Automatically detects task type (classification/regression) and returns the best model with metrics.",
        {
            "file_path": _str_param("Path to the prepared dataset file"),
            "target_col": _str_param("Name of the target column to predict"),
            "task_type": _enum_param(
                ["classification", "regression", "auto"],
                "Type of ML task. Use 'auto' to detect automatically.",
            ),
            "test_size": {
                "type": "number",
                "description": "Proportion of data to use for testing (default: 0.2)",
            },
            "random_state": {
                "type": "integer",
                "description": "Random seed for reproducibility (default: 42)",
            },
        },
        ["file_path", "target_col"],
    ),
    _tool(
        "generate_model_report",
        "Generate comprehensive model evaluation report including metrics, confusion matrix (for classification), feature importance, and SHAP values for top features. Saves report as JSON.",
        {
            "model_path": _str_param("Path to saved model file (.pkl or .joblib)"),
            "test_data_path": _str_param("Path to test dataset file"),
            "target_col": _str_param("Name of the target column"),
            "output_path": _str_param("Path to save the report JSON file"),
        },
        ["model_path", "test_data_path", "target_col", "output_path"],
    ),
]
def get_tool_by_name(tool_name: str) -> dict:
    """
    Look up a tool definition by its function name.

    Args:
        tool_name: Name of the tool to retrieve

    Returns:
        The matching tool definition dictionary

    Raises:
        ValueError: If no tool with that name exists in the registry
    """
    match = next(
        (entry for entry in TOOLS if entry["function"]["name"] == tool_name),
        None,
    )
    if match is None:
        raise ValueError(f"Tool '{tool_name}' not found in registry")
    return match
def get_all_tool_names() -> list:
    """
    Return the function name of every registered tool.

    Returns:
        List of tool name strings, in registry order
    """
    return [entry["function"]["name"] for entry in TOOLS]