# Data-Science-Agent / src/tools/tools_registry.py
# Author: Pulastya B
# refactor: Remove Sweetviz and use YData Profiling as primary EDA tool
# commit: d92d2aa
"""
Complete Tools Registry for Groq Function Calling - All 67 Tools
Defines all available tools in Groq's function calling format.
"""
TOOLS = [
# ============================================
# BASIC TOOLS (16)
# ============================================
# Data Profiling Tools (3)
{
"type": "function",
"function": {
"name": "profile_dataset",
"description": "Get comprehensive statistics about a dataset including shape, data types, memory usage, null counts, and unique values. Use this as the first step to understand any new dataset.",
"parameters": {
"type": "object",
"properties": {
"file_path": {
"type": "string",
"description": "Absolute or relative path to the CSV or Parquet file"
}
},
"required": ["file_path"]
}
}
},
{
"type": "function",
"function": {
"name": "detect_data_quality_issues",
"description": "Detect data quality issues including outliers (using IQR method), duplicate rows, inconsistent formats, and data anomalies. Returns a prioritized list of issues with severity levels.",
"parameters": {
"type": "object",
"properties": {
"file_path": {
"type": "string",
"description": "Path to the dataset file"
}
},
"required": ["file_path"]
}
}
},
{
"type": "function",
"function": {
"name": "analyze_correlations",
"description": "Compute correlation matrix and identify top correlations. If a target column is specified, shows features most correlated with the target. Useful for feature selection and understanding relationships.",
"parameters": {
"type": "object",
"properties": {
"file_path": {
"type": "string",
"description": "Path to the dataset file"
},
"target": {
"type": "string",
"description": "Optional target column name to analyze correlations with"
}
},
"required": ["file_path"]
}
}
},
# Data Cleaning Tools (3)
{
"type": "function",
"function": {
"name": "clean_missing_values",
"description": "Handle missing values using appropriate strategies based on column type. Strategies include median/mean for numeric, mode for categorical, forward_fill for time series, or drop. In 'auto' mode, first drops columns with >threshold missing (default 40%), then imputes remaining columns. Will not impute ID columns.",
"parameters": {
"type": "object",
"properties": {
"file_path": {
"type": "string",
"description": "Path to the dataset file"
},
"strategy": {
"oneOf": [
{
"type": "string",
"enum": ["auto"],
"description": "Use 'auto' to automatically decide strategies for all columns based on data type. First drops columns with >threshold missing, then imputes remaining columns."
},
{
"type": "object",
"description": "Dictionary mapping column names to strategies ('median', 'mean', 'mode', 'forward_fill', 'drop')",
"additionalProperties": {"type": "string"}
}
],
"description": "Either 'auto' (string) to automatically handle all missing values, or a dictionary mapping specific columns to strategies"
},
"output_path": {
"type": "string",
"description": "Path to save cleaned dataset"
},
"threshold": {
"type": "number",
"description": "For 'auto' mode: drop columns with missing percentage above this threshold (default: 0.4 = 40%). Range: 0.0 to 1.0. For example, 0.7 means drop columns with >70% missing values."
}
},
"required": ["file_path", "strategy", "output_path"]
}
}
},
{
"type": "function",
"function": {
"name": "handle_outliers",
"description": "Detect and handle outliers in numeric columns using IQR method. Methods: 'clip' (cap at boundaries), 'winsorize' (cap at percentiles), or 'remove' (delete rows).",
"parameters": {
"type": "object",
"properties": {
"file_path": {
"type": "string",
"description": "Path to the dataset file"
},
"method": {
"type": "string",
"enum": ["clip", "winsorize", "remove"],
"description": "Method to handle outliers"
},
"columns": {
"type": "array",
"items": {"type": "string"},
"description": "List of column names to check for outliers. Use 'all' to check all numeric columns."
},
"output_path": {
"type": "string",
"description": "Path to save cleaned dataset"
}
},
"required": ["file_path", "method", "columns", "output_path"]
}
}
},
{
"type": "function",
"function": {
"name": "fix_data_types",
"description": "Auto-detect and fix incorrect data types. Handles dates, booleans, categoricals, and numeric columns. Fixes common issues like 'null' strings and mixed types.",
"parameters": {
"type": "object",
"properties": {
"file_path": {
"type": "string",
"description": "Path to the dataset file"
},
"type_mapping": {
"type": "object",
"description": "Optional dictionary mapping column names to target types ('int', 'float', 'string', 'date', 'bool', 'category'). Use 'auto' for automatic detection.",
"additionalProperties": {"type": "string"}
},
"output_path": {
"type": "string",
"description": "Path to save dataset with fixed types"
}
},
"required": ["file_path", "output_path"]
}
}
},
# Data Type Conversion Tools (2)
{
"type": "function",
"function": {
"name": "force_numeric_conversion",
"description": "CRITICAL TOOL: Force convert columns to numeric type even if detected as strings/objects. Essential for datasets with numeric columns stored as strings (with commas, spaces, currency symbols). Use this BEFORE encoding when you see 'no numeric features' errors.",
"parameters": {
"type": "object",
"properties": {
"file_path": {
"type": "string",
"description": "Path to the dataset file"
},
"columns": {
"type": "array",
"items": {"type": "string"},
"description": "List of column names to force convert to numeric. Use ['all'] to auto-detect and convert all non-ID columns that look numeric."
},
"output_path": {
"type": "string",
"description": "Path to save dataset with converted types"
},
"errors": {
"type": "string",
"enum": ["coerce", "raise"],
"description": "How to handle conversion errors. 'coerce' makes invalid values null (recommended), 'raise' throws error."
}
},
"required": ["file_path", "columns", "output_path"]
}
}
},
{
"type": "function",
"function": {
"name": "smart_type_inference",
"description": "Intelligently infer and fix data types for all columns by analyzing patterns. Goes beyond basic type detection to understand semantic meaning. Use when dataset has widespread type issues.",
"parameters": {
"type": "object",
"properties": {
"file_path": {
"type": "string",
"description": "Path to the dataset file"
},
"output_path": {
"type": "string",
"description": "Path to save dataset with inferred types"
},
"aggressive": {
"type": "boolean",
"description": "If true, attempts aggressive conversion on ambiguous columns. Recommended for messy datasets."
}
},
"required": ["file_path", "output_path"]
}
}
},
# Feature Engineering Tools (2)
{
"type": "function",
"function": {
"name": "create_time_features",
"description": "Extract comprehensive time-based features from datetime columns including year, month, day, day_of_week, quarter, is_weekend, and cyclical encodings (sin/cos for month and hour).",
"parameters": {
"type": "object",
"properties": {
"file_path": {
"type": "string",
"description": "Path to the dataset file"
},
"date_col": {
"type": "string",
"description": "Name of the datetime column to extract features from"
},
"output_path": {
"type": "string",
"description": "Path to save dataset with new features"
}
},
"required": ["file_path", "date_col", "output_path"]
}
}
},
{
"type": "function",
"function": {
"name": "encode_categorical",
"description": "Encode categorical variables using one-hot encoding, target encoding, or frequency encoding. Handles high-cardinality columns intelligently. Use method='auto' to automatically choose the best encoding.",
"parameters": {
"type": "object",
"properties": {
"file_path": {
"type": "string",
"description": "Path to the dataset file"
},
"method": {
"type": "string",
"enum": ["one_hot", "target", "frequency", "auto"],
"description": "Encoding method to use. 'auto' automatically selects the best method."
},
"columns": {
"type": "array",
"items": {"type": "string"},
"description": "List of categorical columns to encode. Use ['all'] to encode all categorical columns. If not specified, defaults to all categorical columns."
},
"target_col": {
"type": "string",
"description": "Required for target encoding: name of the target column"
},
"output_path": {
"type": "string",
"description": "Path to save dataset with encoded features"
}
},
"required": ["file_path", "output_path"]
}
}
},
# Model Training Tools (2)
{
"type": "function",
"function": {
"name": "train_baseline_models",
"description": "Train multiple baseline models (Logistic Regression, Random Forest, XGBoost) and compare their performance. Automatically detects task type (classification/regression) and returns the best model with metrics.",
"parameters": {
"type": "object",
"properties": {
"file_path": {
"type": "string",
"description": "Path to the prepared dataset file"
},
"target_col": {
"type": "string",
"description": "Name of the target column to predict"
},
"task_type": {
"type": "string",
"enum": ["classification", "regression", "auto"],
"description": "Type of ML task. Use 'auto' to detect automatically."
},
"test_size": {
"type": "number",
"description": "Proportion of data to use for testing (default: 0.2)"
},
"random_state": {
"type": "integer",
"description": "Random seed for reproducibility (default: 42)"
}
},
"required": ["file_path", "target_col"]
}
}
},
{
"type": "function",
"function": {
"name": "generate_model_report",
"description": "Generate comprehensive model evaluation report including metrics, confusion matrix (for classification), feature importance, and SHAP values for top features. Saves report as JSON.",
"parameters": {
"type": "object",
"properties": {
"model_path": {
"type": "string",
"description": "Path to saved model file (.pkl or .joblib)"
},
"test_data_path": {
"type": "string",
"description": "Path to test dataset file"
},
"target_col": {
"type": "string",
"description": "Name of the target column"
},
"output_path": {
"type": "string",
"description": "Path to save the report JSON file"
}
},
"required": ["model_path", "test_data_path", "target_col", "output_path"]
}
}
},
# New Data Wrangling Tools (4)
{
"type": "function",
"function": {
"name": "get_smart_summary",
"description": "Generate an LLM-friendly smart summary of a dataset with per-column missing value percentages (sorted by severity), unique value counts, sample data, and numeric statistics. Much more detailed than profile_dataset for decision-making.",
"parameters": {
"type": "object",
"properties": {
"file_path": {
"type": "string",
"description": "Path to the CSV or Parquet file to summarize"
},
"n_samples": {
"type": "integer",
"description": "Number of sample rows to include in the summary (default: 5)"
}
},
"required": ["file_path"]
}
}
},
{
"type": "function",
"function": {
"name": "merge_datasets",
"description": "Merge two datasets using SQL-like join operations (inner, left, right, outer, cross). Supports joining on single or multiple columns with same or different names. Automatically handles duplicate columns with suffixes.",
"parameters": {
"type": "object",
"properties": {
"left_path": {
"type": "string",
"description": "Path to the left (first) dataset file"
},
"right_path": {
"type": "string",
"description": "Path to the right (second) dataset file"
},
"output_path": {
"type": "string",
"description": "Path to save the merged dataset"
},
"how": {
"type": "string",
"enum": ["inner", "left", "right", "outer", "cross"],
"description": "Join type: 'inner' (only matching rows), 'left' (all left + matching right), 'right' (all right + matching left), 'outer' (all rows from both), 'cross' (cartesian product)"
},
"on": {
"type": ["string", "array"],
"items": {"type": "string"},
"description": "Column name(s) to join on (must exist in both datasets). Can be a single column name or list of columns. Use this when join columns have the same name in both datasets."
},
"left_on": {
"type": ["string", "array"],
"items": {"type": "string"},
"description": "Column name(s) in left dataset to join on. Use with right_on when join columns have different names."
},
"right_on": {
"type": ["string", "array"],
"items": {"type": "string"},
"description": "Column name(s) in right dataset to join on. Use with left_on when join columns have different names."
}
},
"required": ["left_path", "right_path", "output_path"]
}
}
},
{
"type": "function",
"function": {
"name": "concat_datasets",
"description": "Concatenate multiple datasets either vertically (stacking rows, useful for monthly data) or horizontally (adding columns side-by-side). Validates schema compatibility for vertical concat.",
"parameters": {
"type": "object",
"properties": {
"file_paths": {
"type": "array",
"items": {"type": "string"},
"description": "List of paths to dataset files to concatenate (minimum 2 files)"
},
"output_path": {
"type": "string",
"description": "Path to save the concatenated dataset"
},
"axis": {
"type": "string",
"enum": ["vertical", "horizontal"],
"description": "'vertical' to stack rows (union, for monthly data), 'horizontal' to add columns side-by-side (default: 'vertical')"
}
},
"required": ["file_paths", "output_path"]
}
}
},
{
"type": "function",
"function": {
"name": "reshape_dataset",
"description": "Transform dataset structure using pivot (long→wide format), melt (wide→long format), or transpose (swap rows and columns) operations.",
"parameters": {
"type": "object",
"properties": {
"file_path": {
"type": "string",
"description": "Path to the dataset file to reshape"
},
"output_path": {
"type": "string",
"description": "Path to save the reshaped dataset"
},
"operation": {
"type": "string",
"enum": ["pivot", "melt", "transpose"],
"description": "Reshape operation: 'pivot' (long→wide, requires index/columns/values), 'melt' (wide→long, requires id_vars/value_vars), 'transpose' (swap rows/columns)"
},
"index": {
"type": "string",
"description": "Column to use as row index (for pivot operation)"
},
"columns": {
"type": "string",
"description": "Column whose values become new column names (for pivot operation)"
},
"values": {
"type": "string",
"description": "Column whose values populate the pivoted table (for pivot operation)"
},
"id_vars": {
"type": "array",
"items": {"type": "string"},
"description": "Columns to keep as identifiers (for melt operation)"
},
"value_vars": {
"type": "array",
"items": {"type": "string"},
"description": "Columns to unpivot (for melt operation). If not specified, uses all columns except id_vars."
}
},
"required": ["file_path", "output_path", "operation"]
}
}
},
# ============================================
# ADVANCED ANALYSIS TOOLS (5)
# ============================================
{
"type": "function",
"function": {
"name": "perform_eda_analysis",
"description": "Comprehensive Exploratory Data Analysis with visualizations, distribution analysis, and automated insights. Generates HTML report with plots.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"target_col": {"type": "string", "description": "Optional target column for supervised analysis"},
"output_dir": {"type": "string", "description": "Directory to save EDA report and plots"}
},
"required": ["file_path", "output_dir"]
}
}
},
{
"type": "function",
"function": {
"name": "detect_model_issues",
"description": "Detect overfitting, underfitting, class imbalance, and other model performance issues. Provides diagnostic recommendations.",
"parameters": {
"type": "object",
"properties": {
"model_path": {"type": "string", "description": "Path to trained model"},
"train_data_path": {"type": "string", "description": "Path to training data"},
"test_data_path": {"type": "string", "description": "Path to test data"},
"target_col": {"type": "string", "description": "Target column name"}
},
"required": ["model_path", "train_data_path", "test_data_path", "target_col"]
}
}
},
{
"type": "function",
"function": {
"name": "detect_anomalies",
"description": "Detect anomalies using Isolation Forest, LOF, or statistical methods. Returns anomaly scores and flags.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"method": {"type": "string", "enum": ["isolation_forest", "lof", "statistical"], "description": "Anomaly detection method"},
"contamination": {"type": "number", "description": "Expected proportion of anomalies (default: 0.1)"},
"output_path": {"type": "string", "description": "Path to save dataset with anomaly scores"}
},
"required": ["file_path", "method", "output_path"]
}
}
},
{
"type": "function",
"function": {
"name": "detect_and_handle_multicollinearity",
"description": "Detect and handle multicollinearity using VIF (Variance Inflation Factor). Removes highly correlated features automatically.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"threshold": {"type": "number", "description": "VIF threshold (default: 10)"},
"method": {"type": "string", "enum": ["drop", "combine"], "description": "How to handle correlated features"},
"output_path": {"type": "string", "description": "Path to save cleaned dataset"}
},
"required": ["file_path", "output_path"]
}
}
},
{
"type": "function",
"function": {
"name": "perform_statistical_tests",
"description": "Perform statistical hypothesis tests (t-test, chi-square, ANOVA) to analyze relationships between features and target.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"target_col": {"type": "string", "description": "Target column name"},
"test_type": {"type": "string", "enum": ["auto", "ttest", "chi2", "anova"], "description": "Type of statistical test"}
},
"required": ["file_path", "target_col"]
}
}
},
# ============================================
# ADVANCED FEATURE ENGINEERING (4)
# ============================================
{
"type": "function",
"function": {
"name": "create_interaction_features",
"description": "Create polynomial, PCA, or cross-product interaction features to capture non-linear relationships.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"method": {"type": "string", "enum": ["polynomial", "pca", "cross"], "description": "Interaction method"},
"degree": {"type": "integer", "description": "Polynomial degree (default: 2)"},
"max_features": {"type": "integer", "description": "Maximum new features to create (default: 50)"},
"output_path": {"type": "string", "description": "Path to save enhanced dataset"}
},
"required": ["file_path", "method", "output_path"]
}
}
},
{
"type": "function",
"function": {
"name": "create_aggregation_features",
"description": "Create aggregation features (mean, sum, count, etc.) grouped by categorical columns. Useful for customer/transaction data.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"group_cols": {"type": "array", "items": {"type": "string"}, "description": "Columns to group by"},
"agg_cols": {"type": "array", "items": {"type": "string"}, "description": "Columns to aggregate"},
"agg_functions": {"type": "array", "items": {"type": "string"}, "description": "Aggregation functions (mean, sum, count, etc.)"},
"output_path": {"type": "string", "description": "Path to save dataset with aggregations"}
},
"required": ["file_path", "group_cols", "agg_cols", "output_path"]
}
}
},
{
"type": "function",
"function": {
"name": "engineer_text_features",
"description": "Extract features from text columns: TF-IDF, word counts, sentiment, readability scores, and embeddings.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"text_col": {"type": "string", "description": "Text column name"},
"methods": {"type": "array", "items": {"type": "string"}, "description": "Feature extraction methods (tfidf, count, sentiment, readability)"},
"max_features": {"type": "integer", "description": "Max TF-IDF features (default: 100)"},
"output_path": {"type": "string", "description": "Path to save dataset with text features"}
},
"required": ["file_path", "text_col", "methods", "output_path"]
}
}
},
{
"type": "function",
"function": {
"name": "auto_feature_engineering",
"description": "Use LLM (Gemini/Groq) to automatically generate creative feature engineering ideas and implement them. Works without API key if environment variables are set.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"target_col": {"type": "string", "description": "Target column name"},
"groq_api_key": {"type": "string", "description": "Groq API key (optional - uses environment variable if not provided)"},
"max_suggestions": {"type": "integer", "description": "Maximum feature suggestions to generate (default: 10)"},
"implement_top_k": {"type": "integer", "description": "Number of top suggestions to implement (default: 5)"},
"output_path": {"type": "string", "description": "Path to save dataset with new features"}
},
"required": ["file_path", "target_col", "output_path"]
}
}
},
# ============================================
# ADVANCED PREPROCESSING (3)
# ============================================
{
"type": "function",
"function": {
"name": "handle_imbalanced_data",
"description": "Handle class imbalance using SMOTE, ADASYN, or class weights. Critical for classification tasks.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"target_col": {"type": "string", "description": "Target column name"},
"method": {"type": "string", "enum": ["smote", "adasyn", "random_oversample", "random_undersample"], "description": "Balancing method"},
"sampling_strategy": {"type": "string", "description": "Sampling ratio (auto, minority, majority)"},
"output_path": {"type": "string", "description": "Path to save balanced dataset"}
},
"required": ["file_path", "target_col", "method", "output_path"]
}
}
},
{
"type": "function",
"function": {
"name": "perform_feature_scaling",
"description": "Scale features using StandardScaler, MinMaxScaler, or RobustScaler. Essential for distance-based algorithms.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"method": {"type": "string", "enum": ["standard", "minmax", "robust"], "description": "Scaling method"},
"columns": {"type": "array", "items": {"type": "string"}, "description": "Columns to scale (None = all numeric)"},
"output_path": {"type": "string", "description": "Path to save scaled dataset"}
},
"required": ["file_path", "method", "output_path"]
}
}
},
{
"type": "function",
"function": {
"name": "split_data_strategically",
"description": "Split data with stratification, time-based splitting, or group-based splitting for better validation.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"target_col": {"type": "string", "description": "Target column for stratification"},
"method": {"type": "string", "enum": ["stratified", "time_based", "group_based"], "description": "Split method"},
"test_size": {"type": "number", "description": "Test set proportion (default: 0.2)"},
"time_col": {"type": "string", "description": "Time column for time-based split"},
"group_col": {"type": "string", "description": "Group column for group-based split"}
},
"required": ["file_path", "method"]
}
}
},
# ============================================
# ADVANCED TRAINING (3)
# ============================================
{
"type": "function",
"function": {
"name": "hyperparameter_tuning",
"description": "Optimize model hyperparameters using Optuna (Bayesian optimization). Finds best model configuration automatically.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to prepared dataset"},
"target_col": {"type": "string", "description": "Target column name"},
"model_type": {"type": "string", "enum": ["random_forest", "xgboost", "lightgbm"], "description": "Model to tune"},
"n_trials": {"type": "integer", "description": "Number of tuning trials (default: 100)"},
"task_type": {"type": "string", "enum": ["classification", "regression", "auto"], "description": "ML task type"},
"output_path": {"type": "string", "description": "Path to save tuned model"}
},
"required": ["file_path", "target_col", "model_type", "output_path"]
}
}
},
{
"type": "function",
"function": {
"name": "train_ensemble_models",
"description": "Train ensemble models using stacking, voting, or blending. Combines multiple models for better performance.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to prepared dataset"},
"target_col": {"type": "string", "description": "Target column name"},
"ensemble_method": {"type": "string", "enum": ["stacking", "voting", "blending"], "description": "Ensemble technique"},
"base_models": {"type": "array", "items": {"type": "string"}, "description": "Base model types to ensemble"},
"task_type": {"type": "string", "enum": ["classification", "regression", "auto"], "description": "ML task type"},
"output_path": {"type": "string", "description": "Path to save ensemble model"}
},
"required": ["file_path", "target_col", "ensemble_method", "output_path"]
}
}
},
{
"type": "function",
"function": {
"name": "perform_cross_validation",
"description": "Perform k-fold cross-validation to get robust model performance estimates. Returns mean and std of metrics.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"target_col": {"type": "string", "description": "Target column name"},
"model_type": {"type": "string", "description": "Model type (random_forest, xgboost, logistic, ridge)"},
"n_splits": {"type": "integer", "description": "Number of CV folds/splits (default: 5)"},
"task_type": {"type": "string", "enum": ["classification", "regression", "auto"], "description": "ML task type"},
"cv_strategy": {"type": "string", "enum": ["kfold", "stratified", "timeseries"], "description": "Cross-validation strategy (default: kfold)"},
"save_oof": {"type": "boolean", "description": "Whether to save out-of-fold predictions (default: false)"}
},
"required": ["file_path", "target_col", "model_type"]
}
}
},
# ============================================
# BUSINESS INTELLIGENCE (4)
# ============================================
{
"type": "function",
"function": {
"name": "perform_cohort_analysis",
"description": "Analyze user cohorts over time (retention, revenue, engagement). Essential for SaaS and e-commerce businesses.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to transaction/event data"},
"user_col": {"type": "string", "description": "User ID column"},
"date_col": {"type": "string", "description": "Date/timestamp column"},
"metric_col": {"type": "string", "description": "Metric to analyze (revenue, events, etc.)"},
"cohort_period": {"type": "string", "enum": ["daily", "weekly", "monthly"], "description": "Cohort grouping period"},
"output_path": {"type": "string", "description": "Path to save cohort analysis results"}
},
"required": ["file_path", "user_col", "date_col", "metric_col", "output_path"]
}
}
},
{
"type": "function",
"function": {
"name": "perform_rfm_analysis",
"description": "RFM (Recency, Frequency, Monetary) analysis for customer segmentation. Identifies best/worst customers.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to transaction data"},
"customer_col": {"type": "string", "description": "Customer ID column"},
"date_col": {"type": "string", "description": "Transaction date column"},
"amount_col": {"type": "string", "description": "Transaction amount column"},
"output_path": {"type": "string", "description": "Path to save RFM segments"}
},
"required": ["file_path", "customer_col", "date_col", "amount_col", "output_path"]
}
}
},
{
"type": "function",
"function": {
"name": "detect_causal_relationships",
"description": "Detect potential causal relationships between features using Granger causality and correlation analysis.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"target_col": {"type": "string", "description": "Target/effect column"},
"feature_cols": {"type": "array", "items": {"type": "string"}, "description": "Potential cause columns"},
"method": {"type": "string", "enum": ["granger", "correlation"], "description": "Causality detection method"}
},
"required": ["file_path", "target_col"]
}
}
},
{
"type": "function",
"function": {
"name": "generate_business_insights",
"description": "Generate automated business insights using descriptive statistics, trends, and anomaly detection. Creates executive summary.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to business data"},
"metric_cols": {"type": "array", "items": {"type": "string"}, "description": "Key business metrics to analyze"},
"date_col": {"type": "string", "description": "Date column for trend analysis"},
"output_path": {"type": "string", "description": "Path to save insights report"}
},
"required": ["file_path", "metric_cols", "output_path"]
}
}
},
# ============================================
# COMPUTER VISION (3)
# ============================================
{
"type": "function",
"function": {
"name": "extract_image_features",
"description": "Extract features from images using pre-trained CNNs (ResNet, VGG). Converts images to feature vectors for ML.",
"parameters": {
"type": "object",
"properties": {
"image_dir": {"type": "string", "description": "Directory containing images"},
"model": {"type": "string", "enum": ["resnet", "vgg", "mobilenet"], "description": "Pre-trained model to use"},
"output_path": {"type": "string", "description": "Path to save feature vectors CSV"}
},
"required": ["image_dir", "model", "output_path"]
}
}
},
{
"type": "function",
"function": {
"name": "perform_image_clustering",
"description": "Cluster images based on visual similarity using K-means or DBSCAN on extracted features.",
"parameters": {
"type": "object",
"properties": {
"image_dir": {"type": "string", "description": "Directory containing images"},
"n_clusters": {"type": "integer", "description": "Number of clusters (default: auto-detect)"},
"method": {"type": "string", "enum": ["kmeans", "dbscan"], "description": "Clustering method"},
"output_path": {"type": "string", "description": "Path to save clustering results"}
},
"required": ["image_dir", "method", "output_path"]
}
}
},
{
"type": "function",
"function": {
"name": "analyze_tabular_image_hybrid",
"description": "Combine tabular data with image features for hybrid ML models. Useful for e-commerce/medical data.",
"parameters": {
"type": "object",
"properties": {
"tabular_path": {"type": "string", "description": "Path to tabular data CSV"},
"image_dir": {"type": "string", "description": "Directory with images"},
"image_id_col": {"type": "string", "description": "Column linking tabular data to images"},
"output_path": {"type": "string", "description": "Path to save combined features"}
},
"required": ["tabular_path", "image_dir", "image_id_col", "output_path"]
}
}
},
# ============================================
# NLP/TEXT ANALYTICS (4)
# ============================================
{
"type": "function",
"function": {
"name": "perform_topic_modeling",
"description": "Discover topics in text documents using LDA or NMF. Extract themes from customer reviews, articles, etc.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset with text"},
"text_col": {"type": "string", "description": "Text column name"},
"n_topics": {"type": "integer", "description": "Number of topics to extract (default: 5)"},
"method": {"type": "string", "enum": ["lda", "nmf"], "description": "Topic modeling method"},
"output_path": {"type": "string", "description": "Path to save topics and document-topic matrix"}
},
"required": ["file_path", "text_col", "method", "output_path"]
}
}
},
{
"type": "function",
"function": {
"name": "perform_named_entity_recognition",
"description": "Extract named entities (person, organization, location) from text using NER models.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset with text"},
"text_col": {"type": "string", "description": "Text column name"},
"output_path": {"type": "string", "description": "Path to save dataset with extracted entities"}
},
"required": ["file_path", "text_col", "output_path"]
}
}
},
{
"type": "function",
"function": {
"name": "analyze_sentiment_advanced",
"description": "Perform advanced sentiment analysis with aspect-based sentiment (what features customers like/dislike).",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset with text"},
"text_col": {"type": "string", "description": "Text column name"},
"aspects": {"type": "array", "items": {"type": "string"}, "description": "Aspects to analyze sentiment for (e.g., 'price', 'quality')"},
"output_path": {"type": "string", "description": "Path to save sentiment scores"}
},
"required": ["file_path", "text_col", "output_path"]
}
}
},
{
"type": "function",
"function": {
"name": "perform_text_similarity",
"description": "Calculate text similarity using cosine similarity, Jaccard, or semantic embeddings. Find duplicate/similar documents.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset with text"},
"text_col": {"type": "string", "description": "Text column name"},
"method": {"type": "string", "enum": ["cosine", "jaccard", "semantic"], "description": "Similarity method"},
"threshold": {"type": "number", "description": "Similarity threshold (0-1)"},
"output_path": {"type": "string", "description": "Path to save similarity matrix"}
},
"required": ["file_path", "text_col", "method", "output_path"]
}
}
},
# ============================================
# PRODUCTION/MLOPS (5)
# ============================================
{
"type": "function",
"function": {
"name": "monitor_model_drift",
"description": "Detect data drift and concept drift in production models. Compare training vs production data distributions.",
"parameters": {
"type": "object",
"properties": {
"train_data_path": {"type": "string", "description": "Path to original training data"},
"production_data_path": {"type": "string", "description": "Path to recent production data"},
"features": {"type": "array", "items": {"type": "string"}, "description": "Features to monitor for drift"},
"output_path": {"type": "string", "description": "Path to save drift report"}
},
"required": ["train_data_path", "production_data_path", "output_path"]
}
}
},
{
"type": "function",
"function": {
"name": "explain_predictions",
"description": "Explain model predictions using SHAP or LIME. Generate feature importance explanations for individual predictions.",
"parameters": {
"type": "object",
"properties": {
"model_path": {"type": "string", "description": "Path to trained model"},
"data_path": {"type": "string", "description": "Path to data to explain"},
"method": {"type": "string", "enum": ["shap", "lime"], "description": "Explanation method"},
"n_samples": {"type": "integer", "description": "Number of samples to explain (default: 10)"},
"output_path": {"type": "string", "description": "Path to save explanations"}
},
"required": ["model_path", "data_path", "method", "output_path"]
}
}
},
{
"type": "function",
"function": {
"name": "generate_model_card",
"description": "Generate model card documentation with model details, performance metrics, bias analysis, and usage guidelines.",
"parameters": {
"type": "object",
"properties": {
"model_path": {"type": "string", "description": "Path to trained model"},
"train_data_path": {"type": "string", "description": "Path to training data"},
"test_data_path": {"type": "string", "description": "Path to test data"},
"target_col": {"type": "string", "description": "Target column name"},
"output_path": {"type": "string", "description": "Path to save model card JSON"}
},
"required": ["model_path", "train_data_path", "test_data_path", "target_col", "output_path"]
}
}
},
{
"type": "function",
"function": {
"name": "perform_ab_test_analysis",
"description": "Analyze A/B test results with statistical significance testing. Determine if variant B is better than control A.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to A/B test data"},
"variant_col": {"type": "string", "description": "Column indicating variant (A/B)"},
"metric_col": {"type": "string", "description": "Success metric column"},
"confidence_level": {"type": "number", "description": "Confidence level for significance (default: 0.95)"}
},
"required": ["file_path", "variant_col", "metric_col"]
}
}
},
{
"type": "function",
"function": {
"name": "detect_feature_leakage",
"description": "Detect potential feature leakage by analyzing feature importance and temporal relationships. Prevents data leakage bugs.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"target_col": {"type": "string", "description": "Target column name"},
"date_col": {"type": "string", "description": "Optional date column for temporal analysis"}
},
"required": ["file_path", "target_col"]
}
}
},
# ============================================
# TIME SERIES (3)
# ============================================
{
"type": "function",
"function": {
"name": "forecast_time_series",
"description": "Forecast future values using ARIMA, Prophet, or LSTM models. Handles seasonal and trend components.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to time series data"},
"date_col": {"type": "string", "description": "Date/timestamp column"},
"value_col": {"type": "string", "description": "Value column to forecast"},
"forecast_periods": {"type": "integer", "description": "Number of periods to forecast"},
"method": {"type": "string", "enum": ["arima", "prophet", "lstm"], "description": "Forecasting method"},
"output_path": {"type": "string", "description": "Path to save forecast results"}
},
"required": ["file_path", "date_col", "value_col", "forecast_periods", "method", "output_path"]
}
}
},
{
"type": "function",
"function": {
"name": "detect_seasonality_trends",
"description": "Detect seasonality patterns and trends in time series data using STL decomposition and statistical tests.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to time series data"},
"date_col": {"type": "string", "description": "Date/timestamp column"},
"value_col": {"type": "string", "description": "Value column to analyze"},
"period": {"type": "integer", "description": "Expected seasonal period (e.g., 12 for monthly)"}
},
"required": ["file_path", "date_col", "value_col"]
}
}
},
{
"type": "function",
"function": {
"name": "create_time_series_features",
"description": "Create comprehensive time series features: lags, rolling stats, exponential moving averages, and Fourier features.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to time series data"},
"date_col": {"type": "string", "description": "Date/timestamp column"},
"value_col": {"type": "string", "description": "Value column"},
"lags": {"type": "array", "items": {"type": "integer"}, "description": "Lag periods to create (e.g., [1, 7, 30])"},
"windows": {"type": "array", "items": {"type": "integer"}, "description": "Rolling window sizes (e.g., [7, 30])"},
"output_path": {"type": "string", "description": "Path to save dataset with time series features"}
},
"required": ["file_path", "date_col", "value_col", "output_path"]
}
}
},
# ============================================
# ADVANCED INSIGHTS TOOLS (6) - NEW
# ============================================
{
"type": "function",
"function": {
"name": "analyze_root_cause",
"description": "Perform root cause analysis to identify why a metric dropped or changed. Analyzes correlations, temporal patterns, and identifies top influencing factors.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"target_col": {"type": "string", "description": "Column to analyze (e.g., 'sales')"},
"time_col": {"type": "string", "description": "Optional time column for trend analysis"},
"threshold_drop": {"type": "number", "description": "Percentage drop to flag as significant (default 0.15)"}
},
"required": ["file_path", "target_col"]
}
}
},
{
"type": "function",
"function": {
"name": "detect_trends_and_seasonality",
"description": "Detect trends and seasonal patterns in time series data using statistical methods and autocorrelation.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"value_col": {"type": "string", "description": "Column with values to analyze"},
"time_col": {"type": "string", "description": "Column with timestamps"},
"seasonal_period": {"type": "integer", "description": "Expected seasonal period (auto-detected if None)"}
},
"required": ["file_path", "value_col", "time_col"]
}
}
},
{
"type": "function",
"function": {
"name": "detect_anomalies_advanced",
"description": "Detect anomalies with confidence scores using Isolation Forest or statistical methods.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"columns": {"type": "array", "items": {"type": "string"}, "description": "Columns to analyze (all numeric if None)"},
"contamination": {"type": "number", "description": "Expected proportion of outliers (default 0.1)"},
"method": {"type": "string", "enum": ["isolation_forest", "statistical"], "description": "Detection method"}
},
"required": ["file_path"]
}
}
},
{
"type": "function",
"function": {
"name": "perform_hypothesis_testing",
"description": "Perform statistical hypothesis testing (t-test, ANOVA, chi-square) to compare groups.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"group_col": {"type": "string", "description": "Column defining groups"},
"value_col": {"type": "string", "description": "Column with values to compare"},
"test_type": {"type": "string", "enum": ["t-test", "anova", "chi-square", "auto"], "description": "Test type (auto-detected if 'auto')"}
},
"required": ["file_path", "group_col", "value_col"]
}
}
},
{
"type": "function",
"function": {
"name": "analyze_distribution",
"description": "Analyze distribution of a column including normality tests, skewness, and kurtosis.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"column": {"type": "string", "description": "Column to analyze"},
"tests": {"type": "array", "items": {"type": "string"}, "description": "Tests to perform (normality, skewness)"}
},
"required": ["file_path", "column"]
}
}
},
{
"type": "function",
"function": {
"name": "perform_segment_analysis",
"description": "Perform cluster-based customer/data segmentation using K-means and profile each segment.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"n_segments": {"type": "integer", "description": "Number of segments to create (default 5)"},
"features": {"type": "array", "items": {"type": "string"}, "description": "Features for clustering (all numeric if None)"}
},
"required": ["file_path"]
}
}
},
# ============================================
# AUTOMATED PIPELINE TOOLS (2) - NEW
# ============================================
{
"type": "function",
"function": {
"name": "auto_ml_pipeline",
"description": "Fully automated ML pipeline: auto-detect types, clean missing values, handle outliers, encode categorical, engineer features, and select best features. Zero configuration required!",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to input dataset"},
"target_col": {"type": "string", "description": "Target column name"},
"task_type": {"type": "string", "enum": ["classification", "regression", "auto"], "description": "Task type (auto-detected if 'auto')"},
"output_path": {"type": "string", "description": "Where to save processed data"},
"feature_engineering_level": {"type": "string", "enum": ["basic", "intermediate", "advanced"], "description": "Feature engineering depth"}
},
"required": ["file_path", "target_col"]
}
}
},
{
"type": "function",
"function": {
"name": "auto_feature_selection",
"description": "Automatically select the best features for modeling using mutual information or F-statistics.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"target_col": {"type": "string", "description": "Target column"},
"task_type": {"type": "string", "enum": ["classification", "regression", "auto"], "description": "Task type"},
"max_features": {"type": "integer", "description": "Maximum features to keep (default 50)"},
"method": {"type": "string", "enum": ["mutual_info", "f_test", "auto"], "description": "Selection method"},
"output_path": {"type": "string", "description": "Where to save selected features"}
},
"required": ["file_path", "target_col"]
}
}
},
# ============================================
# VISUALIZATION TOOLS (3) - NEW
# ============================================
{
"type": "function",
"function": {
"name": "generate_all_plots",
"description": "Generate ALL plots for a dataset automatically: data quality, EDA, distributions, and correlations. Creates interactive HTML plots.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"target_col": {"type": "string", "description": "Optional target column"},
"output_dir": {"type": "string", "description": "Directory to save plots (default ./outputs/plots)"}
},
"required": ["file_path"]
}
}
},
{
"type": "function",
"function": {
"name": "generate_data_quality_plots",
"description": "Generate data quality visualizations: missing values, data types, and outlier detection plots.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"output_dir": {"type": "string", "description": "Directory to save plots"}
},
"required": ["file_path", "output_dir"]
}
}
},
{
"type": "function",
"function": {
"name": "generate_eda_plots",
"description": "Generate exploratory data analysis plots: correlation heatmap, feature relationships, and pairplots.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"target_col": {"type": "string", "description": "Optional target column"},
"output_dir": {"type": "string", "description": "Directory to save plots"}
},
"required": ["file_path", "output_dir"]
}
}
},
# ============================================
# INTERACTIVE PLOTLY VISUALIZATIONS (6)
# ============================================
{
"type": "function",
"function": {
"name": "generate_interactive_scatter",
"description": "Create interactive scatter plot with zoom, pan, and hover capabilities. Great for exploring relationships between variables.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"x_col": {"type": "string", "description": "Column for X-axis"},
"y_col": {"type": "string", "description": "Column for Y-axis"},
"color_col": {"type": "string", "description": "Optional column for color coding points"},
"size_col": {"type": "string", "description": "Optional column for bubble size"},
"output_path": {"type": "string", "description": "Path to save HTML file (default: ./outputs/plots/interactive/scatter.html)"}
},
"required": ["file_path", "x_col", "y_col"]
}
}
},
{
"type": "function",
"function": {
"name": "generate_interactive_histogram",
"description": "Create interactive histogram with box plot overlay. Users can explore distribution interactively.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"column": {"type": "string", "description": "Column to plot distribution"},
"bins": {"type": "integer", "description": "Number of bins (default: 30)"},
"color_col": {"type": "string", "description": "Optional column for grouped histograms"},
"output_path": {"type": "string", "description": "Path to save HTML file"}
},
"required": ["file_path", "column"]
}
}
},
{
"type": "function",
"function": {
"name": "generate_interactive_correlation_heatmap",
"description": "Create interactive correlation heatmap with hover values. Better than static matplotlib version.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"output_path": {"type": "string", "description": "Path to save HTML file"}
},
"required": ["file_path"]
}
}
},
{
"type": "function",
"function": {
"name": "generate_interactive_box_plots",
"description": "Create interactive box plots for outlier detection. Supports grouping by categorical variable.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"columns": {"type": "array", "items": {"type": "string"}, "description": "Columns to plot (all numeric if not specified)"},
"group_by": {"type": "string", "description": "Optional categorical column for grouping"},
"output_path": {"type": "string", "description": "Path to save HTML file"}
},
"required": ["file_path"]
}
}
},
{
"type": "function",
"function": {
"name": "generate_interactive_time_series",
"description": "Create interactive time series plot with range slider and zoom. Perfect for temporal data analysis.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"time_col": {"type": "string", "description": "Column with datetime values"},
"value_cols": {"type": "array", "items": {"type": "string"}, "description": "Columns to plot over time"},
"output_path": {"type": "string", "description": "Path to save HTML file"}
},
"required": ["file_path", "time_col", "value_cols"]
}
}
},
{
"type": "function",
"function": {
"name": "generate_plotly_dashboard",
"description": "Generate complete interactive dashboard with multiple visualizations: correlation heatmap, box plots, scatter plots, histograms. One-stop visualization solution.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to dataset"},
"target_col": {"type": "string", "description": "Optional target column for supervised analysis"},
"output_dir": {"type": "string", "description": "Directory to save all plots (default: ./outputs/plots/interactive)"}
},
"required": ["file_path"]
}
}
},
# EDA Report Generation (1) - NEW PHASE 2
{
"type": "function",
"function": {
"name": "generate_ydata_profiling_report",
"description": "Generate comprehensive HTML report using ydata-profiling (formerly pandas-profiling). Provides extensive analysis: overview, variable statistics, interactions, correlations (Pearson, Spearman, Cramér's V), missing values matrix, duplicate analysis, and more. Most detailed and comprehensive profiling tool with automated insights and data quality warnings.",
"parameters": {
"type": "object",
"properties": {
"file_path": {"type": "string", "description": "Path to the dataset CSV/Parquet file"},
"output_path": {"type": "string", "description": "Where to save HTML report (default: ./outputs/reports/ydata_profile.html)"},
"minimal": {"type": "boolean", "description": "If true, generates faster minimal report (useful for large datasets, default: false)"},
"title": {"type": "string", "description": "Report title (default: 'Data Profiling Report')"}
},
"required": ["file_path"]
}
}
},
# ========================================
# CODE INTERPRETER - THE GAME CHANGER 🚀
# ========================================
{
"type": "function",
"function": {
"name": "execute_python_code",
"description": "⭐ CRITICAL TOOL - Execute custom Python code for ANY data science task not covered by existing tools. This is what makes you a TRUE AI AGENT, not just a function-calling bot. Use this when user requests: 1) Custom visualizations (specific Plotly plots, interactive dashboards, unique chart types) 2) Domain-specific calculations 3) Custom data transformations 4) Specific export formats 5) Interactive widgets/filters. Code has access to pandas, polars, numpy, matplotlib, seaborn, plotly. ALWAYS save outputs to files and return file paths.",
"parameters": {
"type": "object",
"properties": {
"code": {
"type": "string",
"description": "Python code to execute. Auto-imported: pandas as pd, polars as pl, numpy as np, matplotlib.pyplot as plt, seaborn as sns, plotly.express as px, plotly.graph_objects as go. Code should save outputs to files in working_directory. Example: fig.write_html('./outputs/code/plot.html')"
},
"working_directory": {
"type": "string",
"description": "Directory to run code in (default: ./outputs/code). Code can read from ./temp/ and write to this directory."
},
"timeout": {
"type": "integer",
"description": "Maximum execution time in seconds (default: 60)"
}
},
"required": ["code"]
}
}
},
{
"type": "function",
"function": {
"name": "execute_code_from_file",
"description": "Execute Python code from an existing .py file. Useful when code is too long to pass as string, or when running pre-written scripts. Same capabilities as execute_python_code.",
"parameters": {
"type": "object",
"properties": {
"file_path": {
"type": "string",
"description": "Path to .py file to execute"
},
"working_directory": {
"type": "string",
"description": "Directory to run code in (default: ./outputs/code)"
},
"timeout": {
"type": "integer",
"description": "Maximum execution time in seconds (default: 60)"
}
},
"required": ["file_path"]
}
}
},
# ============================================
# CLOUD DATA SOURCES (4) - NEW
# ============================================
{
"type": "function",
"function": {
"name": "load_bigquery_table",
"description": "Load data from Google BigQuery table into a Polars DataFrame. Supports sampling via LIMIT and column selection. Returns CSV path for downstream tools. Use profile_bigquery_table first for large tables.",
"parameters": {
"type": "object",
"properties": {
"project_id": {
"type": "string",
"description": "Google Cloud project ID"
},
"dataset": {
"type": "string",
"description": "BigQuery dataset name"
},
"table": {
"type": "string",
"description": "BigQuery table name"
},
"limit": {
"type": "integer",
"description": "Optional row limit for sampling (e.g., 10000 for large tables)"
},
"columns": {
"type": "array",
"items": {"type": "string"},
"description": "Optional list of column names to load"
},
"where_clause": {
"type": "string",
"description": "Optional SQL WHERE clause for filtering (without WHERE keyword)"
}
},
"required": ["project_id", "dataset", "table"]
}
}
},
{
"type": "function",
"function": {
"name": "write_bigquery_table",
"description": "Write predictions or processed data from CSV/Parquet file to BigQuery table. Supports append, overwrite, or fail modes.",
"parameters": {
"type": "object",
"properties": {
"file_path": {
"type": "string",
"description": "Path to CSV or Parquet file to write"
},
"project_id": {
"type": "string",
"description": "Google Cloud project ID"
},
"dataset": {
"type": "string",
"description": "BigQuery dataset name"
},
"table": {
"type": "string",
"description": "BigQuery table name"
},
"mode": {
"type": "string",
"enum": ["append", "overwrite", "fail"],
"description": "Write mode: append (add rows), overwrite (replace), fail (error if exists)"
}
},
"required": ["file_path", "project_id", "dataset", "table"]
}
}
},
{
"type": "function",
"function": {
"name": "profile_bigquery_table",
"description": "Profile a BigQuery table without loading all data. Returns row count, column types, null counts (sampled), table size, and load recommendations. Use this BEFORE load_bigquery_table for large tables.",
"parameters": {
"type": "object",
"properties": {
"project_id": {
"type": "string",
"description": "Google Cloud project ID"
},
"dataset": {
"type": "string",
"description": "BigQuery dataset name"
},
"table": {
"type": "string",
"description": "BigQuery table name"
}
},
"required": ["project_id", "dataset", "table"]
}
}
},
{
"type": "function",
"function": {
"name": "query_bigquery",
"description": "Execute custom BigQuery SQL query and return results as DataFrame. Useful for complex aggregations, joins, or transformations before analysis.",
"parameters": {
"type": "object",
"properties": {
"project_id": {
"type": "string",
"description": "Google Cloud project ID"
},
"query": {
"type": "string",
"description": "SQL query to execute"
},
"output_path": {
"type": "string",
"description": "Optional path to save results (default: auto-generated)"
},
"limit": {
"type": "integer",
"description": "Optional row limit to append to query"
}
},
"required": ["project_id", "query"]
}
}
}
]
def get_tool_by_name(tool_name: str) -> dict:
    """Return the registry entry for *tool_name*.

    Args:
        tool_name: Value of ``function.name`` to look up in ``TOOLS``.

    Returns:
        dict: The full tool definition (Groq function-calling schema).

    Raises:
        ValueError: If no registered tool has that name.
    """
    # Lazily scan the registry; stop at the first matching name.
    match = next(
        (entry for entry in TOOLS if entry["function"]["name"] == tool_name),
        None,
    )
    if match is None:
        raise ValueError(f"Tool '{tool_name}' not found in registry")
    return match
def get_all_tool_names() -> list:
    """Return the names of every registered tool, in registry order."""
    names = []
    for entry in TOOLS:
        names.append(entry["function"]["name"])
    return names
def get_tools_by_category() -> dict:
    """Get tools organized by category.

    The slice boundaries below mirror the section order and section sizes of
    the ``TOOLS`` list above and must be kept in sync whenever tools are
    added, removed, or reordered.

    Returns:
        dict: Mapping of category name to the list of tool names in that
        category, in registry order.
    """
    return {
        "basic": [t["function"]["name"] for t in TOOLS[:16]],
        "advanced_analysis": [t["function"]["name"] for t in TOOLS[16:21]],
        "advanced_feature_engineering": [t["function"]["name"] for t in TOOLS[21:25]],
        "advanced_preprocessing": [t["function"]["name"] for t in TOOLS[25:28]],
        "advanced_training": [t["function"]["name"] for t in TOOLS[28:31]],
        "business_intelligence": [t["function"]["name"] for t in TOOLS[31:35]],
        "computer_vision": [t["function"]["name"] for t in TOOLS[35:38]],
        "nlp_text_analytics": [t["function"]["name"] for t in TOOLS[38:42]],
        "production_mlops": [t["function"]["name"] for t in TOOLS[42:47]],
        "time_series": [t["function"]["name"] for t in TOOLS[47:50]],
        # Sections added after the original registry. The previous version of
        # this function stopped at index 54 and mislabeled TOOLS[50:54]
        # (the first four Advanced-Insights tools) as "cloud_data_sources".
        "advanced_insights": [t["function"]["name"] for t in TOOLS[50:56]],
        "automated_pipeline": [t["function"]["name"] for t in TOOLS[56:58]],
        "visualization": [t["function"]["name"] for t in TOOLS[58:61]],
        "interactive_plotly": [t["function"]["name"] for t in TOOLS[61:67]],
        "eda_report": [t["function"]["name"] for t in TOOLS[67:68]],
        "code_interpreter": [t["function"]["name"] for t in TOOLS[68:70]],
        "cloud_data_sources": [t["function"]["name"] for t in TOOLS[70:74]],
    }