Spaces:

cryogenic22
/

sales_analytics

Runtime error

App Files Files Community

cryogenic22 committed on Mar 15, 2025

Commit

30ba4e2

verified ·

1 Parent(s): 4db5880

Create analytics_agent.py

Browse files

Files changed (1) hide show

agents/analytics_agent.py +292 -0

agents/analytics_agent.py ADDED Viewed

	@@ -0,0 +1,292 @@

+import os
+import json
+import pandas as pd
+import numpy as np
+from typing import Dict, List, Any, Tuple, Optional
+from pydantic import BaseModel, Field
+from langchain_anthropic import ChatAnthropic
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.output_parsers import StrOutputParser
+import re
+import matplotlib.pyplot as plt
+import seaborn as sns
+from io import StringIO
+class AnalysisRequest(BaseModel):
+    """Structure for an analysis request"""
+    request_id: str
+    description: str
+    data_sources: List[str]
+    analysis_type: str
+    parameters: Dict[str, Any] = None
+    purpose: str
+class AnalysisResult(BaseModel):
+    """Structure for analysis results"""
+    result_id: str
+    name: str
+    description: str
+    analysis_type: str
+    code: str
+    visualizations: List[str] = None
+    insights: List[Dict[str, Any]] = None
+    metrics: Dict[str, float] = None
+    model_details: Dict[str, Any] = None
+    attribution: Dict[str, float] = None
+    confidence: float = None
+class AnalyticsAgent:
+    """Agent responsible for data analysis and modeling"""
+    def __init__(self):
+        """Initialize the analytics agent"""
+        # Set up Claude API client
+        api_key = os.getenv("ANTHROPIC_API_KEY")
+        if not api_key:
+            raise ValueError("ANTHROPIC_API_KEY not found in environment variables")
+        self.llm = ChatAnthropic(
+            model="claude-3-haiku-20240307",
+            anthropic_api_key=api_key,
+            temperature=0.1
+        )
+        # Create analysis code generation prompt
+        self.analysis_prompt = ChatPromptTemplate.from_messages([
+            ("system", """You are an expert data scientist specializing in pharmaceutical sales analysis.
+Your task is to generate Python code to analyze data based on specific requirements.
+For each analysis request:
+1. Generate clear, efficient pandas and numpy code
+2. Include appropriate data visualization with matplotlib/seaborn
+3. Apply statistical methods relevant to the analysis type
+4. Add detailed comments explaining your approach
+5. Extract and highlight key insights from the analysis
+The analysis should be thorough and focused on addressing the specific business question.
+Make sure to handle potential data issues and explain your assumptions.
+Format your response with a code block:
+```python
+# Analysis code
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+import seaborn as sns
+def run_analysis(data_sources):
+    # Your analysis code here
+    # Return results as a dictionary
+    return {
+        "insights": [
+            {"finding": "Key finding 1", "details": "Explanation", "impact": "Business impact"},
+            # More insights...
+        ],
+        "metrics": {
+            "metric1": value1,
+            "metric2": value2,
+            # More metrics...
+        },
+        "visualizations": ["fig1", "fig2"],  # References to generated figures
+        "attribution": {
+            "factor1": 0.65,  # 65% attribution to factor1
+            "factor2": 0.25,  # 25% attribution to factor2
+            "factor3": 0.10   # 10% attribution to factor3
+        },
+        "confidence": 0.95  # 95% confidence in the analysis
+    }
+```
+After the code block, explain your analytical approach and any assumptions.
+"""),
+            ("human", """
+Analysis Request: {description}
+Available data sources:
+{data_sources}
+Analysis type: {analysis_type}
+Parameters: {parameters}
+Purpose: {purpose}
+Please generate Python code to perform this analysis.
+""")
+        ])
+        # Set up the analysis chain
+        self.analysis_chain = (
+            self.analysis_prompt
+            | self.llm
+            | StrOutputParser()
+        )
+        # In-memory storage for analysis artifacts
+        self.analysis_artifacts = {}
+    def extract_python_from_response(self, response: str) -> str:
+        """Extract Python code from LLM response"""
+        # Extract Python between ```python and ``` markers
+        python_match = re.search(r'```python\s*(.*?)\s*```', response, re.DOTALL)
+        if python_match:
+            return python_match.group(1).strip()
+        # If not found with python tag, try generic code block
+        python_match = re.search(r'```\s*(.*?)\s*```', response, re.DOTALL)
+        if python_match:
+            return python_match.group(1).strip()
+        # If all else fails, return empty string
+        return ""
+    def extract_insights_from_code_output(self, output: Dict[str, Any]) -> Tuple[List[Dict[str, Any]], Dict[str, float], float]:
+        """Extract insights, attribution, and confidence from code output"""
+        insights = output.get("insights", [])
+        attribution = output.get("attribution", {})
+        confidence = output.get("confidence", 0.0)
+        return insights, attribution, confidence
+    def perform_analysis(self, request: AnalysisRequest, data_sources: Dict[str, Any]) -> AnalysisResult:
+        """Perform analysis based on request and return results"""
+        print(f"Analytics Agent: Performing {request.analysis_type} analysis - {request.description}")
+        # Format data sources description for the prompt
+        data_sources_desc = ""
+        for source_id, source in data_sources.items():
+            df = source.content
+            data_sources_desc += f"Data source '{source_id}' ({source.name}):\n"
+            data_sources_desc += f"- Shape: {df.shape[0]} rows, {df.shape[1]} columns\n"
+            data_sources_desc += f"- Columns: {', '.join(df.columns)}\n"
+            data_sources_desc += f"- Sample data:\n{df.head(3).to_string()}\n\n"
+        # Format the request for the prompt
+        request_data = {
+            "description": request.description,
+            "data_sources": data_sources_desc,
+            "analysis_type": request.analysis_type,
+            "parameters": json.dumps(request.parameters, indent=2) if request.parameters else "None",
+            "purpose": request.purpose
+        }
+        # Generate analysis code
+        response = self.analysis_chain.invoke(request_data)
+        # Extract Python code
+        python_code = self.extract_python_from_response(response)
+        # Execute analysis (with safety checks)
+        insights = []
+        attribution = {}
+        confidence = 0.0
+        visualizations = []
+        metrics = {}
+        if not python_code:
+            print("Warning: No analysis code generated.")
+        else:
+            try:
+                # Prepare data sources for the analysis
+                analysis_data_sources = {src_id: src.content for src_id, src in data_sources.items()}
+                # Create a local namespace with access to pandas, numpy, etc.
+                local_namespace = {
+                    "pd": pd,
+                    "np": np,
+                    "plt": plt,
+                    "sns": sns,
+                    "data_sources": analysis_data_sources
+                }
+                # Capture print outputs
+                original_stdout = sys.stdout
+                sys.stdout = mystdout = StringIO()
+                # Execute the code
+                exec(python_code, local_namespace)
+                # Restore stdout
+                sys.stdout = original_stdout
+                print_output = mystdout.getvalue()
+                # Look for a run_analysis function and execute it
+                if "run_analysis" in local_namespace:
+                    analysis_output = local_namespace["run_analysis"](analysis_data_sources)
+                    if isinstance(analysis_output, dict):
+                        insights = analysis_output.get("insights", [])
+                        attribution = analysis_output.get("attribution", {})
+                        confidence = analysis_output.get("confidence", 0.0)
+                        metrics = analysis_output.get("metrics", {})
+                        visualizations = analysis_output.get("visualizations", [])
+                        # Store any figures in the local namespace as base64 encoded images
+                        for var_name, var_value in local_namespace.items():
+                            if isinstance(var_value, plt.Figure):
+                                fig_filename = f"figure_{request.request_id}_{var_name}.png"
+                                var_value.savefig(fig_filename)
+                                self.analysis_artifacts[fig_filename] = fig_filename
+                                visualizations.append(fig_filename)
+            except Exception as e:
+                print(f"Analysis execution error: {e}")
+        # Create analysis result
+        result = AnalysisResult(
+            result_id=f"analysis_{request.request_id}",
+            name=f"Analysis of {request.description}",
+            description=request.description,
+            analysis_type=request.analysis_type,
+            code=python_code,
+            visualizations=visualizations,
+            insights=insights,
+            metrics=metrics,
+            attribution=attribution,
+            confidence=confidence
+        )
+        return result
+# For testing
+if __name__ == "__main__":
+    import sys
+    # Set API key for testing
+    os.environ["ANTHROPIC_API_KEY"] = "your_api_key_here"
+    # Create mock data for testing
+    test_df = pd.DataFrame({
+        'date': pd.date_range(start='2023-01-01', periods=12, freq='M'),
+        'region': ['Northeast'] * 12,
+        'sales': [100, 110, 105, 115, 120, 115, 110, 105, 95, 85, 80, 70],
+        'target': [100, 105, 110, 115, 120, 125, 130, 135, 140, 145, 150, 155]
+    })
+    # Create mock data source
+    from dataclasses import dataclass
+    @dataclass
+    class MockDataSource:
+        content: pd.DataFrame
+        name: str
+    data_sources = {
+        "sales_data": MockDataSource(content=test_df, name="Monthly sales data")
+    }
+    # Create mock analysis request
+    class MockAnalysisRequest:
+        def __init__(self):
+            self.request_id = "test"
+            self.description = "Sales trend analysis for the Northeast region"
+            self.data_sources = ["sales_data"]
+            self.analysis_type = "time_series"
+            self.parameters = {"detect_anomalies": True}
+            self.purpose = "Identify factors causing the sales decline"
+    agent = AnalyticsAgent()
+    result = agent.perform_analysis(MockAnalysisRequest(), data_sources)
+    print(f"Generated code:\n{result.code}")
+    print(f"Insights: {json.dumps(result.insights, indent=2)}")
+    print(f"Attribution: {json.dumps(result.attribution, indent=2)}")