Spaces:
Running
Running
Pulastya B
feat: Initial commit - Data Science Agent with React frontend and FastAPI backend
226ac39
"""
Tools Registry for Groq Function Calling

Defines all available tools in Groq's function calling format.
"""
# Small builders for the repeated JSON-schema fragments used by the registry.
# Each call returns a fresh dict (no sharing between tools), and keys are
# inserted in the same order the hand-written literals used.


def _string_param(description, choices=None):
    """Schema for a string parameter, optionally restricted to `choices`."""
    schema = {"type": "string"}
    if choices is not None:
        schema["enum"] = list(choices)
    schema["description"] = description
    return schema


def _string_list_param(description):
    """Schema for a parameter holding an array of strings."""
    return {
        "type": "array",
        "items": {"type": "string"},
        "description": description,
    }


def _string_map_param(description):
    """Schema for an object parameter whose values are all strings."""
    return {
        "type": "object",
        "description": description,
        "additionalProperties": {"type": "string"},
    }


def _typed_param(json_type, description):
    """Schema for a plain parameter of the given JSON type (e.g. number)."""
    return {"type": json_type, "description": description}


def _function_tool(name, description, properties, required):
    """Wrap a parameter schema in the function-calling tool envelope."""
    return {
        "type": "function",
        "function": {
            "name": name,
            "description": description,
            "parameters": {
                "type": "object",
                "properties": properties,
                "required": list(required),
            },
        },
    }


# Registry of every tool exposed to the model, in presentation order.
TOOLS = [
    # Data Profiling Tools
    _function_tool(
        "profile_dataset",
        "Get comprehensive statistics about a dataset including shape, data types, memory usage, null counts, and unique values. Use this as the first step to understand any new dataset.",
        {
            "file_path": _string_param("Absolute or relative path to the CSV or Parquet file"),
        },
        ["file_path"],
    ),
    _function_tool(
        "detect_data_quality_issues",
        "Detect data quality issues including outliers (using IQR method), duplicate rows, inconsistent formats, and data anomalies. Returns a prioritized list of issues with severity levels.",
        {
            "file_path": _string_param("Path to the dataset file"),
        },
        ["file_path"],
    ),
    _function_tool(
        "analyze_correlations",
        "Compute correlation matrix and identify top correlations. If a target column is specified, shows features most correlated with the target. Useful for feature selection and understanding relationships.",
        {
            "file_path": _string_param("Path to the dataset file"),
            "target": _string_param("Optional target column name to analyze correlations with"),
        },
        ["file_path"],
    ),
    # Data Cleaning Tools
    _function_tool(
        "clean_missing_values",
        "Handle missing values using appropriate strategies based on column type. Strategies include median/mean for numeric, mode for categorical, forward_fill for time series, or drop. Will not impute ID columns.",
        {
            "file_path": _string_param("Path to the dataset file"),
            "strategy": _string_map_param(
                "Dictionary mapping column names to strategies ('median', 'mean', 'mode', 'forward_fill', 'drop'). Use 'auto' to let the tool decide based on data type."
            ),
            "output_path": _string_param("Path to save cleaned dataset"),
        },
        ["file_path", "strategy", "output_path"],
    ),
    _function_tool(
        "handle_outliers",
        "Detect and handle outliers in numeric columns using IQR method. Methods: 'clip' (cap at boundaries), 'winsorize' (cap at percentiles), or 'remove' (delete rows).",
        {
            "file_path": _string_param("Path to the dataset file"),
            "method": _string_param(
                "Method to handle outliers",
                choices=["clip", "winsorize", "remove"],
            ),
            "columns": _string_list_param(
                "List of column names to check for outliers. Use 'all' to check all numeric columns."
            ),
            "output_path": _string_param("Path to save cleaned dataset"),
        },
        ["file_path", "method", "columns", "output_path"],
    ),
    _function_tool(
        "fix_data_types",
        "Auto-detect and fix incorrect data types. Handles dates, booleans, categoricals, and numeric columns. Fixes common issues like 'null' strings and mixed types.",
        {
            "file_path": _string_param("Path to the dataset file"),
            "type_mapping": _string_map_param(
                "Optional dictionary mapping column names to target types ('int', 'float', 'string', 'date', 'bool', 'category'). Use 'auto' for automatic detection."
            ),
            "output_path": _string_param("Path to save dataset with fixed types"),
        },
        ["file_path", "output_path"],
    ),
    # Feature Engineering Tools
    _function_tool(
        "create_time_features",
        "Extract comprehensive time-based features from datetime columns including year, month, day, day_of_week, quarter, is_weekend, and cyclical encodings (sin/cos for month and hour).",
        {
            "file_path": _string_param("Path to the dataset file"),
            "date_col": _string_param("Name of the datetime column to extract features from"),
            "output_path": _string_param("Path to save dataset with new features"),
        },
        ["file_path", "date_col", "output_path"],
    ),
    _function_tool(
        "encode_categorical",
        "Encode categorical variables using one-hot encoding, target encoding, or frequency encoding. Handles high-cardinality columns intelligently.",
        {
            "file_path": _string_param("Path to the dataset file"),
            "method": _string_param(
                "Encoding method to use",
                choices=["one_hot", "target", "frequency"],
            ),
            "columns": _string_list_param(
                "List of categorical columns to encode. Use 'all' to encode all categorical columns."
            ),
            "target_col": _string_param("Required for target encoding: name of the target column"),
            "output_path": _string_param("Path to save dataset with encoded features"),
        },
        ["file_path", "method", "columns", "output_path"],
    ),
    # Model Training Tools
    _function_tool(
        "train_baseline_models",
        "Train multiple baseline models (Logistic Regression, Random Forest, XGBoost) and compare their performance. Automatically detects task type (classification/regression) and returns the best model with metrics.",
        {
            "file_path": _string_param("Path to the prepared dataset file"),
            "target_col": _string_param("Name of the target column to predict"),
            "task_type": _string_param(
                "Type of ML task. Use 'auto' to detect automatically.",
                choices=["classification", "regression", "auto"],
            ),
            "test_size": _typed_param(
                "number", "Proportion of data to use for testing (default: 0.2)"
            ),
            "random_state": _typed_param(
                "integer", "Random seed for reproducibility (default: 42)"
            ),
        },
        ["file_path", "target_col"],
    ),
    _function_tool(
        "generate_model_report",
        "Generate comprehensive model evaluation report including metrics, confusion matrix (for classification), feature importance, and SHAP values for top features. Saves report as JSON.",
        {
            "model_path": _string_param("Path to saved model file (.pkl or .joblib)"),
            "test_data_path": _string_param("Path to test dataset file"),
            "target_col": _string_param("Name of the target column"),
            "output_path": _string_param("Path to save the report JSON file"),
        },
        ["model_path", "test_data_path", "target_col", "output_path"],
    ),
]
def get_tool_by_name(tool_name: str) -> dict:
    """
    Look up a registered tool definition by its function name.

    Args:
        tool_name: Value compared against each entry's ["function"]["name"].

    Returns:
        The first entry of TOOLS whose name matches (the registry's own
        dict object, not a copy).

    Raises:
        ValueError: If no entry in TOOLS carries that name.
    """
    # Registry entries are dicts, so None is a safe "no match" sentinel.
    found = next(
        (entry for entry in TOOLS if entry["function"]["name"] == tool_name),
        None,
    )
    if found is None:
        raise ValueError(f"Tool '{tool_name}' not found in registry")
    return found
def get_all_tool_names() -> list:
    """
    Collect the function name of every registered tool.

    Returns:
        A new list of names, in the same order as the entries of TOOLS.
    """
    names = []
    for entry in TOOLS:
        names.append(entry["function"]["name"])
    return names