""" Dynamic prompt generation for small context window models. Loads only relevant tools based on user intent to reduce token usage. """ from typing import List, Dict, Set import re # Intent categories and their keywords INTENT_KEYWORDS = { "data_quality": ["clean", "missing", "outlier", "quality", "duplicates", "null", "na", "impute"], "visualization": ["plot", "chart", "graph", "visualize", "dashboard", "scatter", "histogram", "heatmap"], "feature_engineering": ["feature", "encode", "transform", "scale", "normalize", "binning", "interaction"], "model_training": ["train", "model", "predict", "classify", "regression", "forecast", "xgboost", "accuracy"], "eda": ["profile", "describe", "summary", "statistics", "distribution", "correlation", "eda"], "time_series": ["time", "date", "datetime", "temporal", "trend", "seasonality", "forecast"], "optimization": ["tune", "optimize", "hyperparameter", "improve", "best parameters"], "code_execution": ["execute", "run code", "calculate", "custom", "python"], } # Tool categories mapping TOOL_CATEGORIES = { "data_quality": [ "detect_data_quality_issues", "clean_missing_values", "handle_outliers", "detect_and_remove_duplicates", "force_numeric_conversion", ], "visualization": [ "generate_interactive_scatter", "generate_interactive_histogram", "generate_interactive_correlation_heatmap", "generate_interactive_box_plots", "generate_interactive_time_series", "generate_plotly_dashboard", "generate_all_plots", "generate_data_quality_plots", "generate_eda_plots", ], "feature_engineering": [ "encode_categorical", "perform_feature_scaling", "create_time_features", "create_ratio_features", "create_statistical_features", "create_log_features", "create_binned_features", "auto_feature_engineering", ], "model_training": [ "train_baseline_models", "hyperparameter_tuning", "train_ensemble_models", "perform_cross_validation", "handle_imbalanced_data", "auto_ml_pipeline", ], "eda": [ "profile_dataset", "generate_ydata_profiling_report", "analyze_distribution", "detect_trends_and_seasonality", "perform_hypothesis_testing", ], "time_series": [ "create_time_features", "forecast_time_series", "detect_trends_and_seasonality", "generate_interactive_time_series", ], "optimization": [ "hyperparameter_tuning", "auto_feature_selection", "detect_and_handle_multicollinearity", ], "code_execution": [ "execute_python_code", "execute_code_from_file", ], } # Core tools always included (used in all workflows) CORE_TOOLS = [ "profile_dataset", "detect_data_quality_issues", "clean_missing_values", "encode_categorical", ] def detect_intent(query: str) -> Set[str]: """ Detect user intent from query using keyword matching. Args: query: User's natural language query Returns: Set of intent categories detected """ query_lower = query.lower() detected_intents = set() for intent, keywords in INTENT_KEYWORDS.items(): for keyword in keywords: if keyword in query_lower: detected_intents.add(intent) break # Default to EDA if no specific intent detected if not detected_intents: detected_intents.add("eda") return detected_intents def get_relevant_tools(intents: Set[str]) -> List[str]: """ Get list of relevant tools based on detected intents. 


def get_relevant_tools(intents: Set[str]) -> List[str]:
    """
    Get the list of relevant tools based on detected intents.

    Args:
        intents: Set of detected intent categories

    Returns:
        List of tool names to include in the prompt
    """
    tools = set(CORE_TOOLS)  # Always include core tools

    for intent in intents:
        if intent in TOOL_CATEGORIES:
            tools.update(TOOL_CATEGORIES[intent])

    return sorted(tools)


def build_compact_system_prompt(
    user_query: Optional[str] = None,
    detected_intents: Optional[Set[str]] = None,
) -> str:
    """
    Build a compact system prompt containing only relevant tools.

    Args:
        user_query: Optional user query to detect intent from
        detected_intents: Optional pre-detected intents

    Returns:
        Compact system prompt string
    """
    # Detect intents if not provided
    if detected_intents is None and user_query:
        detected_intents = detect_intent(user_query)
    elif detected_intents is None:
        detected_intents = {"eda"}  # Default

    # Get relevant tools
    relevant_tools = get_relevant_tools(detected_intents)

    # Build the tool list string
    tool_list = "\n".join([f"- {tool}" for tool in relevant_tools])

    prompt = f"""You are an autonomous Data Science Agent. You EXECUTE tasks, not advise.

**TOOL CALLING FORMAT:**
When you need to use a tool, respond with JSON:
```json
{{
  "tool": "tool_name",
  "arguments": {{"param1": "value1"}}
}}
```

**RELEVANT TOOLS FOR THIS TASK:**
{tool_list}

**WORKFLOW RULES:**
1. **Execute tools sequentially** - ONE tool per response
2. **Use tool outputs** as inputs to next tool
3. **Save outputs** to ./outputs/data/ or ./outputs/plots/
4. **Error recovery**: If tool fails, retry with corrected parameters OR skip to next step
5. **Never repeat** successful tools
6. **Stop when done** - Don't continue after fulfilling user request

**COMMON WORKFLOWS:**

**Visualization Only:**
- User wants plots/charts/dashboard
- generate_plotly_dashboard OR generate_interactive_scatter → STOP

**Data Profiling:**
- User wants "detailed report"
- generate_ydata_profiling_report → STOP

**Full ML Pipeline:**
- User wants model training
- profile_dataset → detect_data_quality_issues → clean_missing_values → encode_categorical → train_baseline_models → generate_plotly_dashboard

**PARAMETER CORRECTIONS:**
- Use exact column names from error messages
- If "Did you mean X?" → retry with X
- output_path (not output or output_dir)
- file_path for data files

**ERROR RECOVERY:**
- Column not found? Use suggested column from error
- File not found? Use last successful file
- Missing param? Add the required parameter
- Tool failed? Skip to next step (don't get stuck)

Execute the user's task efficiently with relevant tools."""

    return prompt
""" # Import the original prompt from orchestrator from src.orchestrator import DataScienceCopilot copilot = DataScienceCopilot.__new__(DataScienceCopilot) return copilot._build_system_prompt() # Quick stats def get_prompt_stats(prompt: str) -> Dict[str, int]: """Get token count estimate and character count for prompt.""" chars = len(prompt) # Rough estimate: 1 token ≈ 4 characters tokens = chars // 4 lines = len(prompt.split('\n')) return { "characters": chars, "estimated_tokens": tokens, "lines": lines, } if __name__ == "__main__": # Demo: Compare full vs compact prompts print("=" * 80) print("DYNAMIC PROMPT SYSTEM DEMO") print("=" * 80) # Example 1: Visualization request query1 = "Generate interactive plots for magnitude and latitude" intents1 = detect_intent(query1) prompt1 = build_compact_system_prompt(user_query=query1) stats1 = get_prompt_stats(prompt1) print(f"\n📊 Example 1: '{query1}'") print(f"Detected intents: {intents1}") print(f"Tools loaded: {len(get_relevant_tools(intents1))}") print(f"Prompt stats: {stats1['estimated_tokens']} tokens, {stats1['lines']} lines") # Example 2: Full ML pipeline query2 = "Train a model to predict earthquake magnitude" intents2 = detect_intent(query2) prompt2 = build_compact_system_prompt(user_query=query2) stats2 = get_prompt_stats(prompt2) print(f"\n🤖 Example 2: '{query2}'") print(f"Detected intents: {intents2}") print(f"Tools loaded: {len(get_relevant_tools(intents2))}") print(f"Prompt stats: {stats2['estimated_tokens']} tokens, {stats2['lines']} lines") # Example 3: Data profiling query3 = "Generate a detailed profiling report" intents3 = detect_intent(query3) prompt3 = build_compact_system_prompt(user_query=query3) stats3 = get_prompt_stats(prompt3) print(f"\n📈 Example 3: '{query3}'") print(f"Detected intents: {intents3}") print(f"Tools loaded: {len(get_relevant_tools(intents3))}") print(f"Prompt stats: {stats3['estimated_tokens']} tokens, {stats3['lines']} lines") print("\n" + "=" * 80) print("SUMMARY: Compact prompts reduce tokens by 80-90% for small context models!") print("=" * 80)