""" DataFrame Analysis Agent This module implements the pandas DataFrame analysis agent that processes CSV files and answers natural language questions about the data. The agent uses LangChain's create_pandas_dataframe_agent to enable natural language interaction with pandas DataFrames. Example: >>> from src.agents import ask_agent >>> result = ask_agent(files, "What is the average revenue by region?") """ import io import contextlib from typing import List, Optional, Any import pandas as pd from langchain.agents import AgentType from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent from ..prompts import get_analysis_prompt class DataFrameAgent: """ Agent for analyzing pandas DataFrames using natural language queries. This agent wraps LangChain's pandas DataFrame agent and provides a simplified interface for data analysis tasks. Attributes: model: The LLM model to use for inference (e.g., AzureChatOpenAI). verbose: Whether to enable verbose logging for debugging. Example: >>> agent = DataFrameAgent(model=azure_llm) >>> df = pd.read_csv("sales.csv") >>> result = agent.analyze(df, "What are the top 5 products by revenue?") """ def __init__(self, model: Any, verbose: bool = True): """ Initialize the DataFrame agent. Args: model: The LLM model instance to use for inference. Must be a LangChain-compatible chat model. verbose: Enable verbose output for debugging (default: True). """ self.model = model self.verbose = verbose def analyze(self, df: pd.DataFrame, question: str) -> str: """ Analyze a DataFrame and answer a natural language question. This method creates a LangChain pandas agent, constructs the full prompt, and invokes the agent to generate insights. Args: df: The pandas DataFrame to analyze. question: The natural language question about the data. Returns: str: The agent's analysis and answer in Markdown format. Raises: Exception: If the agent encounters an error during analysis. Example: >>> result = agent.analyze(sales_df, "Show monthly revenue trends") >>> print(result) # Markdown formatted analysis """ try: # Create the pandas DataFrame agent with ZERO_SHOT_REACT approach # This agent type can handle tasks without needing few-shot examples pandas_agent = create_pandas_dataframe_agent( llm=self.model, df=df, verbose=self.verbose, agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION, allow_dangerous_code=True, # Required for code execution handle_parsing_errors=True, # Gracefully handle LLM parsing issues ) # Construct the full prompt with prefix and suffix full_prompt = get_analysis_prompt(question) # Capture stdout to get the agent's reasoning trace buffer = io.StringIO() with contextlib.redirect_stdout(buffer): result = pandas_agent.invoke(full_prompt) # Extract the final output from the agent response return result.get("output", str(result)) except Exception as e: return f"Analysis error: {e}" def ask_agent( files: List[Any], question: str, model: Optional[Any] = None ) -> str: """ Analyze uploaded CSV files and answer a question about the data. This is a convenience function that handles file loading, DataFrame concatenation, and agent invocation in one call. Args: files: List of file objects with a .name attribute pointing to CSV paths. Typically comes from Gradio's file upload component. question: The natural language question to answer about the data. model: Optional LLM model to use. If None, uses the global model. Returns: str: The analysis result in Markdown format, or an error message. Note: Multiple CSV files are concatenated into a single DataFrame before analysis. Ensure files have compatible schemas for meaningful results. Example: >>> # With Gradio file input >>> result = ask_agent(uploaded_files, "What is the total revenue?") """ # Step 1: Load and concatenate all uploaded CSV files try: dataframes = [pd.read_csv(f.name) for f in files] combined_df = pd.concat(dataframes, ignore_index=True) except Exception as e: return f"Could not read CSV files: {e}" # Step 2: Create agent and perform analysis if model is None: return "Error: No LLM model provided. Please configure the model first." agent = DataFrameAgent(model=model) return agent.analyze(combined_df, question)