# Source: Data-Science-Agent / src/tools/tools_registry_old.py
# Author: Pulastya B
# Commit 226ac39 — feat: Initial commit - Data Science Agent with React frontend and FastAPI backend
# (GitHub blob-view chrome "raw / history / blame / 12.4 kB" removed so the file parses as Python.)
"""
Tools Registry for Groq Function Calling
Defines all available tools in Groq's function calling format.
"""
TOOLS = [
# Data Profiling Tools
{
"type": "function",
"function": {
"name": "profile_dataset",
"description": "Get comprehensive statistics about a dataset including shape, data types, memory usage, null counts, and unique values. Use this as the first step to understand any new dataset.",
"parameters": {
"type": "object",
"properties": {
"file_path": {
"type": "string",
"description": "Absolute or relative path to the CSV or Parquet file"
}
},
"required": ["file_path"]
}
}
},
{
"type": "function",
"function": {
"name": "detect_data_quality_issues",
"description": "Detect data quality issues including outliers (using IQR method), duplicate rows, inconsistent formats, and data anomalies. Returns a prioritized list of issues with severity levels.",
"parameters": {
"type": "object",
"properties": {
"file_path": {
"type": "string",
"description": "Path to the dataset file"
}
},
"required": ["file_path"]
}
}
},
{
"type": "function",
"function": {
"name": "analyze_correlations",
"description": "Compute correlation matrix and identify top correlations. If a target column is specified, shows features most correlated with the target. Useful for feature selection and understanding relationships.",
"parameters": {
"type": "object",
"properties": {
"file_path": {
"type": "string",
"description": "Path to the dataset file"
},
"target": {
"type": "string",
"description": "Optional target column name to analyze correlations with"
}
},
"required": ["file_path"]
}
}
},
# Data Cleaning Tools
{
"type": "function",
"function": {
"name": "clean_missing_values",
"description": "Handle missing values using appropriate strategies based on column type. Strategies include median/mean for numeric, mode for categorical, forward_fill for time series, or drop. Will not impute ID columns.",
"parameters": {
"type": "object",
"properties": {
"file_path": {
"type": "string",
"description": "Path to the dataset file"
},
"strategy": {
"type": "object",
"description": "Dictionary mapping column names to strategies ('median', 'mean', 'mode', 'forward_fill', 'drop'). Use 'auto' to let the tool decide based on data type.",
"additionalProperties": {
"type": "string"
}
},
"output_path": {
"type": "string",
"description": "Path to save cleaned dataset"
}
},
"required": ["file_path", "strategy", "output_path"]
}
}
},
{
"type": "function",
"function": {
"name": "handle_outliers",
"description": "Detect and handle outliers in numeric columns using IQR method. Methods: 'clip' (cap at boundaries), 'winsorize' (cap at percentiles), or 'remove' (delete rows).",
"parameters": {
"type": "object",
"properties": {
"file_path": {
"type": "string",
"description": "Path to the dataset file"
},
"method": {
"type": "string",
"enum": ["clip", "winsorize", "remove"],
"description": "Method to handle outliers"
},
"columns": {
"type": "array",
"items": {"type": "string"},
"description": "List of column names to check for outliers. Use 'all' to check all numeric columns."
},
"output_path": {
"type": "string",
"description": "Path to save cleaned dataset"
}
},
"required": ["file_path", "method", "columns", "output_path"]
}
}
},
{
"type": "function",
"function": {
"name": "fix_data_types",
"description": "Auto-detect and fix incorrect data types. Handles dates, booleans, categoricals, and numeric columns. Fixes common issues like 'null' strings and mixed types.",
"parameters": {
"type": "object",
"properties": {
"file_path": {
"type": "string",
"description": "Path to the dataset file"
},
"type_mapping": {
"type": "object",
"description": "Optional dictionary mapping column names to target types ('int', 'float', 'string', 'date', 'bool', 'category'). Use 'auto' for automatic detection.",
"additionalProperties": {
"type": "string"
}
},
"output_path": {
"type": "string",
"description": "Path to save dataset with fixed types"
}
},
"required": ["file_path", "output_path"]
}
}
},
# Feature Engineering Tools
{
"type": "function",
"function": {
"name": "create_time_features",
"description": "Extract comprehensive time-based features from datetime columns including year, month, day, day_of_week, quarter, is_weekend, and cyclical encodings (sin/cos for month and hour).",
"parameters": {
"type": "object",
"properties": {
"file_path": {
"type": "string",
"description": "Path to the dataset file"
},
"date_col": {
"type": "string",
"description": "Name of the datetime column to extract features from"
},
"output_path": {
"type": "string",
"description": "Path to save dataset with new features"
}
},
"required": ["file_path", "date_col", "output_path"]
}
}
},
{
"type": "function",
"function": {
"name": "encode_categorical",
"description": "Encode categorical variables using one-hot encoding, target encoding, or frequency encoding. Handles high-cardinality columns intelligently.",
"parameters": {
"type": "object",
"properties": {
"file_path": {
"type": "string",
"description": "Path to the dataset file"
},
"method": {
"type": "string",
"enum": ["one_hot", "target", "frequency"],
"description": "Encoding method to use"
},
"columns": {
"type": "array",
"items": {"type": "string"},
"description": "List of categorical columns to encode. Use 'all' to encode all categorical columns."
},
"target_col": {
"type": "string",
"description": "Required for target encoding: name of the target column"
},
"output_path": {
"type": "string",
"description": "Path to save dataset with encoded features"
}
},
"required": ["file_path", "method", "columns", "output_path"]
}
}
},
# Model Training Tools
{
"type": "function",
"function": {
"name": "train_baseline_models",
"description": "Train multiple baseline models (Logistic Regression, Random Forest, XGBoost) and compare their performance. Automatically detects task type (classification/regression) and returns the best model with metrics.",
"parameters": {
"type": "object",
"properties": {
"file_path": {
"type": "string",
"description": "Path to the prepared dataset file"
},
"target_col": {
"type": "string",
"description": "Name of the target column to predict"
},
"task_type": {
"type": "string",
"enum": ["classification", "regression", "auto"],
"description": "Type of ML task. Use 'auto' to detect automatically."
},
"test_size": {
"type": "number",
"description": "Proportion of data to use for testing (default: 0.2)"
},
"random_state": {
"type": "integer",
"description": "Random seed for reproducibility (default: 42)"
}
},
"required": ["file_path", "target_col"]
}
}
},
{
"type": "function",
"function": {
"name": "generate_model_report",
"description": "Generate comprehensive model evaluation report including metrics, confusion matrix (for classification), feature importance, and SHAP values for top features. Saves report as JSON.",
"parameters": {
"type": "object",
"properties": {
"model_path": {
"type": "string",
"description": "Path to saved model file (.pkl or .joblib)"
},
"test_data_path": {
"type": "string",
"description": "Path to test dataset file"
},
"target_col": {
"type": "string",
"description": "Name of the target column"
},
"output_path": {
"type": "string",
"description": "Path to save the report JSON file"
}
},
"required": ["model_path", "test_data_path", "target_col", "output_path"]
}
}
}
]
def get_tool_by_name(tool_name: str) -> dict:
    """Look up a tool definition in the registry by its function name.

    Args:
        tool_name: Name of the tool to find.

    Returns:
        The matching tool definition dictionary.

    Raises:
        ValueError: If no tool with that name is registered.
    """
    # Lazily scan the registry; stop at the first match.
    found = next(
        (entry for entry in TOOLS if entry["function"]["name"] == tool_name),
        None,
    )
    if found is None:
        raise ValueError(f"Tool '{tool_name}' not found in registry")
    return found
def get_all_tool_names() -> list:
    """Return the names of all registered tools.

    Returns:
        List of tool name strings, in registry order.
    """
    names = []
    for entry in TOOLS:
        names.append(entry["function"]["name"])
    return names