Spaces:

jzou19950715
/

Huggingface_AI_Agent_Function_Testing_2

Configuration error

App Files Files Community

jzou19950715 committed on Jan 24, 2025

Commit

344d561

verified ·

1 Parent(s): e68b5c5

Update app.py

Browse files

Files changed (1) hide show

app.py +275 -134

app.py CHANGED Viewed

@@ -1,175 +1,279 @@
 """
-Enhanced Data Analysis Assistant using smolagents for more powerful analysis capabilities.
 """
 import base64
 import io
 import os
 from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Union
 from pathlib import Path
 import gradio as gr
-import pandas as pd
 import numpy as np
 import plotly.express as px
 import plotly.graph_objects as go
 from plotly.subplots import make_subplots
-import matplotlib.pyplot as plt
 import seaborn as sns
-from smolagents import CodeAgent, tool
 # Constants
 SUPPORTED_FILE_TYPES = [".csv", ".xlsx", ".xls"]
-DEFAULT_MODEL = "gpt-4o-mini"
@tool
def create_plotly_visualization(df: pd.DataFrame, plot_type: str, x: str, y: str,
                              color: Optional[str] = None, title: Optional[str] = None) -> str:
    """Render one of the supported Plotly Express charts as an HTML fragment.

    A ValueError is raised when plot_type is not scatter, line, bar or box.

    Args:
        df: DataFrame holding the data to plot
        plot_type: Chart kind (scatter, line, bar, box)
        x: Column mapped to the x-axis
        y: Column mapped to the y-axis
        color: Optional column used for color encoding
        title: Optional title for the chart
    Returns:
        HTML string of the plot
    """
    # Map each supported chart kind to the Plotly Express builder that draws it.
    builders = {
        "scatter": px.scatter,
        "line": px.line,
        "bar": px.bar,
        "box": px.box,
    }
    builder = builders.get(plot_type)
    if builder is None:
        raise ValueError(f"Unsupported plot type: {plot_type}")
    figure = builder(df, x=x, y=y, color=color, title=title)
    # full_html=False yields an embeddable fragment rather than a whole page.
    return figure.to_html(include_plotlyjs=True, full_html=False)
@tool
def calculate_statistics(df: pd.DataFrame, columns: List[str]) -> Dict[str, Any]:
    """Summarize the numeric columns among the requested ones.

    Requested columns that are not numeric are left out of the result.

    Args:
        df: DataFrame to analyze
        columns: Names of the columns to summarize
    Returns:
        Dictionary mapping each numeric column to its mean, median, std, min, max and missing count
    """
    def summarize(series: pd.Series) -> Dict[str, Any]:
        # One summary record; key order matches what callers already receive.
        return {
            "mean": series.mean(),
            "median": series.median(),
            "std": series.std(),
            "min": series.min(),
            "max": series.max(),
            "missing": series.isna().sum(),
        }

    return {
        name: summarize(df[name])
        for name in columns
        if pd.api.types.is_numeric_dtype(df[name])
    }
@tool
def correlation_analysis(df: pd.DataFrame, threshold: float = 0.5) -> str:
    """Generate correlation analysis with interactive heatmap.

    Only numeric columns take part. Cells whose absolute correlation is at
    least the threshold are labelled with their value; weaker cells are left
    unlabelled so the strong relationships stand out.

    Args:
        df: DataFrame to analyze
        threshold: Correlation threshold to highlight
    Returns:
        HTML string of the correlation heatmap
    """
    numeric_df = df.select_dtypes(include=[np.number])
    corr = numeric_df.corr()
    matrix = corr.to_numpy()
    # Label only the cells that reach the threshold. NaN never compares
    # >= threshold, so undefined correlations stay unlabelled.
    labels = [
        [f"{value:.2f}" if abs(value) >= threshold else "" for value in row]
        for row in matrix
    ]
    fig = go.Figure(data=go.Heatmap(
        z=matrix,
        x=list(corr.columns),
        y=list(corr.columns),
        colorscale='RdBu',
        # Pin the diverging scale so zero correlation sits at its midpoint.
        zmin=-1,
        zmax=1,
        text=labels,
        texttemplate="%{text}",
    ))
    fig.update_layout(
        title="Correlation Heatmap",
        height=600,
    )
    return fig.to_html(include_plotlyjs=True, full_html=False)
 class DataAnalysisAssistant:
-    """Enhanced data analysis assistant using smolagents."""
-    def __init__(self, api_key: str, model_id: str = DEFAULT_MODEL):
-        """Initialize the assistant with API key and model."""
-        os.environ["OPENAI_API_KEY"] = api_key
         self.agent = CodeAgent(
-            tools=[
-                create_plotly_visualization,
-                calculate_statistics,
-                correlation_analysis
-            ],
-            model=model_id,
             additional_authorized_imports=[
-                "pandas",
-                "numpy",
-                "plotly.express",
-                "plotly.graph_objects",
-                "seaborn",
-            ]
         )
     def analyze(self, df: pd.DataFrame, query: str) -> str:
-        """Run analysis using the agent.
-        Args:
-            df: DataFrame to analyze
-            query: User's analysis request
-        Returns:
-            HTML string containing analysis and visualizations
-        """
-        context = f"""
-        Available DataFrame (as 'df'):
-        - Shape: {df.shape}
-        - Columns: {', '.join(df.columns)}
-        - Data Types:
-        {chr(10).join([f'  • {col}: {dtype}' for col, dtype in df.dtypes.items()])}
         User Query: {query}
-        Please provide:
-        1. Data insights and findings
-        2. Interactive visualizations where appropriate
-        3. Statistical analysis
-        4. Clear explanations
-        You can use these tools:
-        - create_plotly_visualization: Creates interactive Plotly plots
-        - calculate_statistics: Provides statistical summaries
-        - correlation_analysis: Generates correlation heatmaps
         """
-        try:
-            result = self.agent.run(context, additional_args={"df": df})
-            return str(result)
-        except Exception as e:
-            return f"Analysis failed: {str(e)}"
 def process_file(file: gr.File) -> Optional[pd.DataFrame]:
     """Process uploaded file into DataFrame."""
     if not file:
         return None
     try:
         file_path = Path(file.name)
         if file_path.suffix == '.csv':
@@ -181,19 +285,25 @@ def process_file(file: gr.File) -> Optional[pd.DataFrame]:
     except Exception as e:
         raise RuntimeError(f"Error reading file: {str(e)}")
-def analyze_data(file: gr.File, query: str, api_key: str) -> str:
     """Main analysis function for Gradio interface."""
     if not api_key:
         return "Error: Please provide an API key"
     if not file:
         return "Error: Please upload a data file"
     try:
         df = process_file(file)
         if df is None:
             return "Error: Could not process file"
         assistant = DataAnalysisAssistant(api_key)
         return assistant.analyze(df, query)
@@ -201,7 +311,7 @@ def analyze_data(file: gr.File, query: str, api_key: str) -> str:
         return f"Error: {str(e)}"
 def create_interface():
-    """Create Gradio interface."""
     css = """
     .plot-container {
         margin: 20px 0;
@@ -209,14 +319,34 @@ def create_interface():
         border: 1px solid #e0e0e0;
         border-radius: 8px;
         background: white;
     }
     """
     with gr.Blocks(css=css) as interface:
         gr.Markdown("""
-        # Enhanced Data Analysis Assistant
-        Powered by smolagents for more intelligent analysis
         """)
         with gr.Row():
@@ -227,12 +357,12 @@ def create_interface():
                 )
                 query = gr.Textbox(
                     label="What would you like to analyze?",
-                    placeholder="e.g., Show relationships between variables with interactive plots",
                     lines=3
                 )
                 api_key = gr.Textbox(
-                    label="API Key",
-                    placeholder="Your OpenAI API key",
                     type="password"
                 )
                 analyze_btn = gr.Button("Analyze")
@@ -246,6 +376,17 @@ def create_interface():
             outputs=output
         )
     return interface
 if __name__ == "__main__":

 """
+Advanced Data Analysis Assistant with Interactive Visualizations
+Integrates smolagents, GPT-4, and interactive Plotly visualizations.
 """
 import base64
 import io
+import json
 import os
 from dataclasses import dataclass
 from pathlib import Path
+from typing import Any, Dict, List, Optional, Union, Tuple
 import gradio as gr
 import numpy as np
+import pandas as pd
 import plotly.express as px
 import plotly.graph_objects as go
 from plotly.subplots import make_subplots
 import seaborn as sns
+from smolagents import CodeAgent, LiteLLMModel, tool
+from datetime import datetime, timedelta
 # Constants
 SUPPORTED_FILE_TYPES = [".csv", ".xlsx", ".xls"]
+DEFAULT_MODEL = "gpt-4"
+HISTORY_FILE = "analysis_history.json"
@dataclass
class VisualizationConfig:
    """Rendering options for generated figures.

    Attributes:
        width: Figure width (default 800).
        height: Figure height (default 500).
        template: Name of the Plotly template to apply.
        show_grid: Whether grid lines should be shown.
        interactive: Interactivity flag.
            NOTE(review): no code visible in this file reads it - confirm
            whether it is still needed.
    """

    # Field order is part of the positional constructor signature; keep it.
    width: int = 800
    height: int = 500
    template: str = "plotly_white"
    show_grid: bool = True
    interactive: bool = True
class DataPreprocessor:
    """Handles data preprocessing and validation."""

    @staticmethod
    def preprocess_dataframe(df: pd.DataFrame) -> Tuple[pd.DataFrame, Dict[str, Any]]:
        """Preprocess the dataframe and return metadata.

        Date-like text columns are converted to datetimes, then missing values
        are filled forward and, for leading gaps, backward. The input frame is
        not modified.

        Args:
            df: Frame to preprocess.

        Returns:
            A ``(frame, metadata)`` pair. ``metadata`` holds
            ``original_shape``, ``missing_values``, ``dtypes``,
            ``numeric_columns`` and ``categorical_columns`` as measured on the
            input frame (before any conversion), plus ``temporal_columns``,
            the columns that hold datetimes in the returned frame.
        """
        metadata = {
            "original_shape": df.shape,
            "missing_values": df.isnull().sum().to_dict(),
            "dtypes": df.dtypes.astype(str).to_dict(),
            "numeric_columns": df.select_dtypes(include=[np.number]).columns.tolist(),
            "categorical_columns": df.select_dtypes(include=['object']).columns.tolist(),
            "temporal_columns": []
        }

        # Work on a copy so the caller's frame is never mutated.
        df = df.copy()

        # Only text or already date-like values are candidates for parsing.
        # pd.to_datetime happily reads plain numbers as nanoseconds since the
        # epoch, which would silently turn every numeric column into dates.
        parseable_kinds = {"string", "datetime", "datetime64", "date"}
        for col in df.columns:
            series = df[col]
            if not isinstance(series, pd.Series):
                # Duplicate column labels select a DataFrame; leave those alone.
                continue
            if pd.api.types.infer_dtype(series, skipna=True) not in parseable_kinds:
                continue
            try:
                parsed = pd.to_datetime(series)
            except (ValueError, TypeError, OverflowError):
                # Not a date column; keep the original values.
                continue
            df[col] = parsed
            metadata["temporal_columns"].append(col)

        # ffill()/bfill() replace the deprecated fillna(method=...) spelling.
        df = df.ffill().bfill()
        return df, metadata
class CodeExecutionEnvironment:
    """Runs generated analysis code and collects its output and figures.

    NOTE(security): the code is run with plain ``exec`` inside this process.
    Nothing here sandboxes it, so only code you are prepared to run with the
    application's own privileges should be passed in.
    """

    def __init__(self, visualization_config: Optional["VisualizationConfig"] = None):
        """Set up the names made available to executed code.

        Args:
            visualization_config: Rendering options for captured figures;
                a default ``VisualizationConfig`` is used when omitted.
        """
        self.viz_config = visualization_config or VisualizationConfig()
        # Names pre-bound for every piece of executed code.
        self.globals = {
            'pd': pd,
            'np': np,
            'px': px,
            'go': go,
            'make_subplots': make_subplots,
            'sns': sns
        }
        # Variables created by executed code, carried over between calls.
        self.locals = {}

    def execute(self, code: str, df: Optional[pd.DataFrame] = None) -> Dict[str, Any]:
        """Execute code and capture outputs including visualizations.

        Args:
            code: Python source to run.
            df: Frame exposed to the code as ``df``; when ``None`` any
                previously supplied frame stays in place.

        Returns:
            Dict with keys ``output`` (captured stdout, including anything
            printed before a failure), ``plotly_html`` (HTML for each Plotly
            figure bound or rebound by this call), ``error`` (message or
            ``None``) and ``dataframe_updates`` (the ``df`` object bound by
            executed code when it differs from the one passed in, else
            ``None``).
        """
        import contextlib  # only this method needs it

        if df is not None:
            self.globals['df'] = df

        # Run in ONE namespace dict. With separate globals and locals, exec
        # treats the code like a class body, so functions and comprehensions
        # defined by the code cannot see its own top-level variables.
        namespace = {**self.globals, **self.locals}
        baseline = dict(namespace)

        result = {
            'output': '',
            'plotly_html': [],
            'error': None,
            'dataframe_updates': None
        }

        failure = None
        output_buffer = io.StringIO()
        try:
            # redirect_stdout restores whatever stream was active before,
            # not sys.__stdout__, so outer redirections survive.
            with contextlib.redirect_stdout(output_buffer):
                exec(code, namespace)
        except Exception as e:
            failure = e
        result['output'] = output_buffer.getvalue()

        # Keep every name the code bound or rebound, even after a failure,
        # so later calls can build on it. Dunder names such as the
        # __builtins__ entry that exec inserts are skipped.
        rebound = {
            name: value
            for name, value in namespace.items()
            if not name.startswith('__')
            and (name not in baseline or baseline[name] is not value)
        }
        self.locals.update(rebound)

        if failure is None:
            try:
                # Only figures touched by THIS call are rendered, so a figure
                # from an earlier block is not emitted a second time.
                result['plotly_html'] = [
                    self._render_figure(value)
                    for value in rebound.values()
                    if isinstance(value, go.Figure)
                ]
            except Exception as e:
                failure = e

        if failure is not None:
            result['error'] = f"Error executing code: {str(failure)}"
            return result

        # Report a replacement DataFrame if executed code bound its own `df`.
        if 'df' in self.locals and self.locals['df'] is not df:
            result['dataframe_updates'] = self.locals['df']
        return result

    def _render_figure(self, figure: "go.Figure") -> str:
        """Apply the visualization config to a figure and return its HTML."""
        figure.update_layout(
            width=self.viz_config.width,
            height=self.viz_config.height,
            template=self.viz_config.template,
        )
        # Grid visibility is an axis property, not a layout property.
        figure.update_xaxes(showgrid=self.viz_config.show_grid)
        figure.update_yaxes(showgrid=self.viz_config.show_grid)
        return figure.to_html(
            include_plotlyjs=True,
            full_html=False,
            config={'displayModeBar': True}
        )
class AnalysisHistory:
    """Manages analysis history and persistence."""

    def __init__(self, history_file: str = HISTORY_FILE):
        """Load any existing history from ``history_file``.

        Args:
            history_file: Path of the JSON file that stores the entries.
        """
        self.history_file = history_file
        self.history = self._load_history()

    def _load_history(self) -> List[Dict]:
        """Return the stored entries, or an empty list when none are usable."""
        try:
            with open(self.history_file, 'r', encoding='utf-8') as f:
                loaded = json.load(f)
        except (OSError, ValueError):
            # Missing or unreadable file, or invalid JSON: start fresh.
            return []
        # Anything other than a list cannot be appended to later on.
        return loaded if isinstance(loaded, list) else []

    def add_entry(self, query: str, result: str) -> None:
        """Add new analysis entry to history.

        Args:
            query: The user's request.
            result: The analysis result to record.

        Raises:
            OSError: If the history file cannot be written.
        """
        entry = {
            'timestamp': datetime.now().isoformat(),
            'query': query,
            'result': result
        }
        self.history.append(entry)
        # Write to a sibling file and swap it in, so a failed dump cannot
        # leave the real history file truncated.
        tmp_path = f"{self.history_file}.tmp"
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(self.history, f)
        os.replace(tmp_path, self.history_file)

    def get_recent_analyses(self, limit: int = 5) -> List[Dict]:
        """Get recent analysis entries.

        Args:
            limit: Maximum number of entries to return; values below zero
                are treated as zero.

        Returns:
            Entries ordered newest first by their ``timestamp`` string.
        """
        newest_first = sorted(
            self.history,
            # Entries loaded from disk may lack a timestamp; sort them last.
            key=lambda entry: entry.get('timestamp', ''),
            reverse=True
        )
        return newest_first[:max(limit, 0)]
 class DataAnalysisAssistant:
+    """Enhanced data analysis assistant with visualization capabilities."""
+    def __init__(self, api_key: str):
+        self.model = LiteLLMModel(
+            model_id=DEFAULT_MODEL,
+            api_key=api_key
+        )
+        self.code_env = CodeExecutionEnvironment()
+        self.history = AnalysisHistory()
+        # Initialize agent with tools
         self.agent = CodeAgent(
+            model=self.model,
             additional_authorized_imports=[
+                'pandas', 'numpy', 'plotly.express', 'plotly.graph_objects',
+                'seaborn', 'scipy', 'statsmodels'
+            ],
         )
     def analyze(self, df: pd.DataFrame, query: str) -> str:
+        """Perform analysis with interactive visualizations."""
+        # Preprocess data
+        df, metadata = DataPreprocessor.preprocess_dataframe(df)
+        # Create context for the agent
+        context = self._create_analysis_context(df, metadata, query)
+        try:
+            # Get analysis plan
+            response = self.agent.run(context, additional_args={"df": df})
+            # Extract and execute code blocks
+            results = self._execute_analysis(response, df)
+            # Save to history
+            self.history.add_entry(query, str(response))
+            return self._format_results(response, results)
+        except Exception as e:
+            return f"Analysis failed: {str(e)}"
+    def _create_analysis_context(self, df: pd.DataFrame, metadata: Dict, query: str) -> str:
+        """Create detailed context for analysis."""
+        return f"""
+        Analyze the following data with interactive visualizations.
+        DataFrame Information:
+        - Shape: {metadata['original_shape']}
+        - Numeric columns: {', '.join(metadata['numeric_columns'])}
+        - Categorical columns: {', '.join(metadata['categorical_columns'])}
+        - Temporal columns: {', '.join(metadata['temporal_columns'])}
         User Query: {query}
+        Guidelines:
+        1. Use Plotly for interactive visualizations
+        2. Store figures in variables named 'fig'
+        3. Include clear titles and labels
+        4. Add hover information
+        5. Use color effectively
+        6. Handle errors gracefully
+        The DataFrame is available as 'df'.
         """
+    def _execute_analysis(self, response: str, df: pd.DataFrame) -> List[Dict]:
+        """Execute code blocks from analysis."""
+        import re
+        results = []
+        # Extract code blocks
+        code_blocks = re.findall(r'```python\n(.*?)```', str(response), re.DOTALL)
+        for code in code_blocks:
+            result = self.code_env.execute(code, df)
+            results.append(result)
+        return results
+    def _format_results(self, response: str, results: List[Dict]) -> str:
+        """Format analysis results with visualizations."""
+        output_parts = []
+        # Add analysis text
+        analysis_text = str(response).replace("```python", "").replace("```", "")
+        output_parts.append(f'<div class="analysis-text">{analysis_text}</div>')
+        # Add execution results
+        for result in results:
+            if result['error']:
+                output_parts.append(f'<div class="error">{result["error"]}</div>')
+            else:
+                if result['output']:
+                    output_parts.append(f'<pre>{result["output"]}</pre>')
+                for html in result['plotly_html']:
+                    output_parts.append(
+                        f'<div class="plot-container">{html}</div>'
+                    )
+        return "\n".join(output_parts)
 def process_file(file: gr.File) -> Optional[pd.DataFrame]:
     """Process uploaded file into DataFrame."""
     if not file:
         return None
     try:
         file_path = Path(file.name)
         if file_path.suffix == '.csv':
     except Exception as e:
         raise RuntimeError(f"Error reading file: {str(e)}")
+def analyze_data(
+    file: gr.File,
+    query: str,
+    api_key: str,
+) -> str:
     """Main analysis function for Gradio interface."""
     if not api_key:
         return "Error: Please provide an API key"
     if not file:
         return "Error: Please upload a data file"
     try:
+        # Process file
         df = process_file(file)
         if df is None:
             return "Error: Could not process file"
+        # Create assistant and run analysis
         assistant = DataAnalysisAssistant(api_key)
         return assistant.analyze(df, query)
         return f"Error: {str(e)}"
 def create_interface():
+    """Create enhanced Gradio interface."""
     css = """
     .plot-container {
         margin: 20px 0;
         border: 1px solid #e0e0e0;
         border-radius: 8px;
         background: white;
+        box-shadow: 0 2px 4px rgba(0,0,0,0.1);
+    }
+    .analysis-text {
+        margin: 20px 0;
+        line-height: 1.6;
+    }
+    .error {
+        color: red;
+        padding: 10px;
+        margin: 10px 0;
+        border-left: 4px solid red;
     }
     """
     with gr.Blocks(css=css) as interface:
         gr.Markdown("""
+        # Advanced Data Analysis Assistant
+        Upload your data and get AI-powered analysis with interactive visualizations.
+        **Features:**
+        - Interactive Plotly visualizations
+        - GPT-4 powered analysis
+        - Time series analysis
+        - Statistical insights
+        - Natural language queries
+        **Required:** OpenAI API key
         """)
         with gr.Row():
                 )
                 query = gr.Textbox(
                     label="What would you like to analyze?",
+                    placeholder="e.g., Analyze trends and patterns in the data with interactive visualizations",
                     lines=3
                 )
                 api_key = gr.Textbox(
+                    label="OpenAI API Key",
+                    placeholder="Your API key",
                     type="password"
                 )
                 analyze_btn = gr.Button("Analyze")
             outputs=output
         )
+        # Add examples
+        gr.Examples(
+            examples=[
+                [None, "Show trends over time with interactive visualizations"],
+                [None, "Create a comprehensive analysis of relationships between variables"],
+                [None, "Analyze distributions and statistical patterns"],
+                [None, "Generate financial metrics and performance indicators"],
+            ],
+            inputs=[file, query]
+        )
     return interface
 if __name__ == "__main__":