Spaces:

jzou19950715
/

Huggingface_AI_Agent_Function_Testing_2

Configuration error

App Files Files Community

jzou19950715 committed on Jan 24, 2025

Commit

cce2b52

verified ·

1 Parent(s): f3139c4

Update tools.py

Browse files

Files changed (1) hide show

tools.py +270 -315

tools.py CHANGED Viewed

@@ -1,339 +1,294 @@
-#!/usr/bin/env python
-# coding=utf-8
 """
-Analysis and visualization tools for data analysis assistant.
-Provides a collection of tools for data analysis, statistical computations,
-and interactive visualizations using Plotly.
 """
-import logging
-from typing import Any, Dict, List, Optional, Tuple, Union
-from datetime import datetime
 from pathlib import Path
-import numpy as np
 import pandas as pd
-import plotly.express as px
-import plotly.graph_objects as go
-from plotly.subplots import make_subplots
-import seaborn as sns
-from scipy import stats
-from smolagents import tool
# Configure logging: INFO and above, each record stamped with time,
# logger name and level.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
)
# Module-level logger used by the analysis tools in this file.
logger = logging.getLogger(__name__)
class AnalysisError(Exception):
    """Error type raised by the analysis tools when a step cannot be completed."""
@tool
def create_time_series_plot(
    df: pd.DataFrame,
    time_column: str,
    value_column: str,
    title: Optional[str] = None
) -> Dict[str, Any]:
    """
    Create an interactive time series plot.

    Args:
        df: Input DataFrame
        time_column: Name of the time column
        value_column: Name of the value column to plot
        title: Optional title for the plot

    Returns:
        Dict containing the plotly figure and stats

    Raises:
        AnalysisError: If a column is missing or the plot/statistics cannot be built.
    """
    # NOTE(review): the @tool decorator presumably derives the tool description
    # from the docstring summary and Args section above - keep that layout intact.
    try:
        # Validate inputs
        if time_column not in df.columns or value_column not in df.columns:
            raise AnalysisError(f"Columns {time_column} or {value_column} not found in DataFrame")

        # Create plot; fall back to a generated title when none is supplied
        fig = px.line(
            df,
            x=time_column,
            y=value_column,
            title=title or f"{value_column} over Time",
            template="plotly_white"
        )

        # Hover text: doubled braces survive the f-string so plotly receives
        # its own %{x} / %{y:.2f} placeholders.
        fig.update_traces(
            hovertemplate=(
                f"{time_column}: %{{x}}<br>"
                f"{value_column}: %{{y:.2f}}<br>"
                "<extra></extra>"
            )
        )

        # Calculate basic stats on the plotted column
        series = df[value_column]
        stats_dict = {
            "mean": series.mean(),
            "std": series.std(),
            "min": series.min(),
            "max": series.max()
        }
        return {"figure": fig, "stats": stats_dict}
    except Exception as e:
        logger.error(f"Error in create_time_series_plot: {str(e)}")
        # Chain explicitly so the original failure stays attached as __cause__.
        raise AnalysisError(f"Failed to create time series plot: {str(e)}") from e
@tool
def create_correlation_heatmap(df: pd.DataFrame, numeric_only: bool = True) -> Dict[str, Any]:
    """
    Create an interactive correlation heatmap.

    Args:
        df: Input DataFrame
        numeric_only: Whether to include only numeric columns

    Returns:
        Dict containing the plotly figure and correlation matrix

    Raises:
        AnalysisError: If the correlation matrix or the figure cannot be built.
    """
    # NOTE(review): the @tool decorator presumably derives the tool description
    # from the docstring summary and Args section above - keep that layout intact.
    try:
        # Select numeric columns if requested
        if numeric_only:
            df = df.select_dtypes(include=[np.number])

        # Calculate correlation matrix
        corr_matrix = df.corr()

        # Create heatmap; cells are annotated with the coefficient rounded to
        # two decimals and the colour scale is centred on zero.
        fig = go.Figure(data=go.Heatmap(
            z=corr_matrix,
            x=corr_matrix.columns,
            y=corr_matrix.columns,
            colorscale='RdBu',
            zmid=0,
            text=np.round(corr_matrix, 2),
            texttemplate='%{text:.2f}',
            textfont={"size": 10},
            hoverongaps=False
        ))

        # Update layout
        fig.update_layout(
            title="Correlation Heatmap",
            template="plotly_white",
            width=800,
            height=800
        )
        return {
            "figure": fig,
            "correlation_matrix": corr_matrix.to_dict()
        }
    except Exception as e:
        logger.error(f"Error in create_correlation_heatmap: {str(e)}")
        # Chain explicitly so the original failure stays attached as __cause__.
        raise AnalysisError(f"Failed to create correlation heatmap: {str(e)}") from e
@tool
def create_statistical_summary(df: pd.DataFrame, column: str) -> Dict[str, Any]:
    """
    Create statistical summary with visualization for a column.

    Args:
        df: Input DataFrame
        column: Column name to analyze

    Returns:
        Dict containing summary statistics and visualization

    Raises:
        AnalysisError: If the column is missing or the summary/figure cannot be built.
    """
    # NOTE(review): the @tool decorator presumably derives the tool description
    # from the docstring summary and Args section above - keep that layout intact.
    try:
        if column not in df.columns:
            raise AnalysisError(f"Column {column} not found in DataFrame")
        series = df[column]

        # Calculate summary statistics
        summary_stats = series.describe().to_dict()

        # Shape statistics only make sense for numeric data; NaNs are dropped
        # once and shared by both computations.
        if pd.api.types.is_numeric_dtype(series):
            observed = series.dropna()
            summary_stats.update({
                "skewness": stats.skew(observed),
                "kurtosis": stats.kurtosis(observed)
            })

        # Two stacked panels: histogram on top, box plot below
        fig = make_subplots(rows=2, cols=1)
        fig.add_trace(
            go.Histogram(
                x=series,
                name="Distribution",
                nbinsx=30
            ),
            row=1, col=1
        )
        fig.add_trace(
            go.Box(
                y=series,
                name="Box Plot"
            ),
            row=2, col=1
        )

        # Update layout
        fig.update_layout(
            title=f"Statistical Analysis of {column}",
            showlegend=False,
            template="plotly_white",
            height=800
        )
        return {
            "figure": fig,
            "stats": summary_stats
        }
    except Exception as e:
        logger.error(f"Error in create_statistical_summary: {str(e)}")
        # Chain explicitly so the original failure stays attached as __cause__.
        raise AnalysisError(f"Failed to create statistical summary: {str(e)}") from e
@tool
def detect_outliers(
    df: pd.DataFrame,
    column: str,
    method: str = "zscore",
    threshold: float = 3.0
) -> Dict[str, Any]:
    """
    Detect outliers in a column using various methods.

    Args:
        df: Input DataFrame
        column: Column to analyze
        method: Detection method ('zscore' or 'iqr')
        threshold: Threshold for outlier detection

    Returns:
        Dict containing outlier indices and visualization

    Raises:
        AnalysisError: If the column is missing, the method is unknown, or detection fails.
    """
    # NOTE(review): the @tool decorator presumably derives the tool description
    # from the docstring summary and Args section above - keep that layout intact.
    try:
        if column not in df.columns:
            raise AnalysisError(f"Column {column} not found in DataFrame")

        # Missing values are excluded from detection; everything below is
        # aligned with `values`, not with the full DataFrame.
        values = df[column].dropna()

        if method == "zscore":
            z_scores = np.abs(stats.zscore(values))
            outlier_mask = z_scores > threshold
        elif method == "iqr":
            Q1 = values.quantile(0.25)
            Q3 = values.quantile(0.75)
            IQR = Q3 - Q1
            outlier_mask = (values < (Q1 - threshold * IQR)) | (values > (Q3 + threshold * IQR))
        else:
            raise AnalysisError(f"Unknown outlier detection method: {method}")

        # Normalise to a positional boolean array. The mask has one entry per
        # non-null value, so it must index `values` (and its index), never
        # `df.index`, whose length differs as soon as the column contains NaNs.
        outlier_mask = np.asarray(outlier_mask, dtype=bool)
        normal_points = values[~outlier_mask]
        outlier_points = values[outlier_mask]

        # Create visualization
        fig = go.Figure()

        # Add main scatter plot
        fig.add_trace(
            go.Scatter(
                x=normal_points.index,
                y=normal_points,
                mode='markers',
                name='Normal Points',
                marker=dict(color='blue')
            )
        )

        # Add outliers
        fig.add_trace(
            go.Scatter(
                x=outlier_points.index,
                y=outlier_points,
                mode='markers',
                name='Outliers',
                marker=dict(color='red')
            )
        )
        fig.update_layout(
            title=f"Outlier Detection for {column}",
            template="plotly_white",
            showlegend=True
        )
        return {
            "figure": fig,
            "outlier_indices": outlier_points.index.tolist(),
            # Plain int rather than a numpy scalar.
            "outlier_count": int(outlier_mask.sum())
        }
    except Exception as e:
        logger.error(f"Error in detect_outliers: {str(e)}")
        # Chain explicitly so the original failure stays attached as __cause__.
        raise AnalysisError(f"Failed to detect outliers: {str(e)}") from e
-# Additional utility functions
def validate_dataframe(df: pd.DataFrame) -> Tuple[bool, str]:
    """
    Check whether a DataFrame is usable for analysis.

    Args:
        df: Input DataFrame (may be None)

    Returns:
        Tuple of (is_valid, error_message); the message is empty when valid
    """
    # Checks run in order and stop at the first failure. Each predicate is
    # wrapped in a lambda so later ones are never evaluated on a None frame.
    checks = (
        (lambda: df is None, "DataFrame is None"),
        (lambda: df.empty, "DataFrame is empty"),
        (lambda: df.columns.duplicated().any(), "DataFrame contains duplicate column names"),
    )
    for has_problem, message in checks:
        if has_problem():
            return False, message
    return True, ""
def get_numeric_columns(df: pd.DataFrame) -> List[str]:
    """Return the names of all columns whose dtype is numeric, in frame order."""
    numeric_frame = df.select_dtypes(include=[np.number])
    return numeric_frame.columns.tolist()
def get_temporal_columns(df: pd.DataFrame) -> List[str]:
    """
    Get list of temporal columns from DataFrame.

    A column counts as temporal when it is not numeric and
    ``pd.to_datetime`` can convert it without raising.

    Args:
        df: Input DataFrame

    Returns:
        Column names, in frame order, that look like dates or timestamps
    """
    temporal_cols = []
    for col in df.columns:
        try:
            series = df[col]
            # Plain numbers always "parse" (they are read as epoch offsets),
            # which would label every numeric column as temporal.
            if pd.api.types.is_numeric_dtype(series):
                continue
            pd.to_datetime(series)
        except Exception:
            # Best-effort probe: any conversion failure simply means "not
            # temporal". Unlike a bare except, this lets KeyboardInterrupt
            # and SystemExit through.
            continue
        temporal_cols.append(col)
    return temporal_cols
 if __name__ == "__main__":
-    # Example usage and testing
-    logging.info("Running tools.py tests...")
-    # Create sample data
-    dates = pd.date_range(start='2023-01-01', periods=100, freq='D')
-    df = pd.DataFrame({
-        'date': dates,
-        'value': np.random.normal(100, 10, 100),
-        'category': np.random.choice(['A', 'B', 'C'], 100)
-    })
-    # Test time series plot
-    try:
-        result = create_time_series_plot(df, 'date', 'value')
-        logging.info("Time series plot created successfully")
-    except Exception as e:
-        logging.error(f"Time series plot test failed: {str(e)}")
-    # Add more tests as needed

 """
+Advanced Data Analysis Assistant with Interactive Visualizations
+Integrates smolagents, GPT-4, and interactive Plotly visualizations.
 """
+import json
+import os
+from dataclasses import dataclass
 from pathlib import Path
+from typing import Any, Dict, List, Optional, Union, Tuple
+from datetime import datetime
+import gradio as gr
 import pandas as pd
+from smolagents import CodeAgent, LiteLLMModel
+from tools import (
+    create_time_series_plot,
+    create_correlation_heatmap,
+    create_statistical_summary,
+    detect_outliers,
+    validate_dataframe,
+    get_numeric_columns,
+    get_temporal_columns,
+    AnalysisError
 )
# Constants
# Extensions offered to the upload widget as accepted file types.
SUPPORTED_FILE_TYPES: List[str] = [".csv", ".xlsx", ".xls"]
# Model identifier passed to LiteLLMModel.
DEFAULT_MODEL: str = "gpt-4o-mini"
# Default path of the JSON file in which AnalysisHistory persists entries.
HISTORY_FILE: str = "analysis_history.json"
@dataclass
class VisualizationConfig:
    """Default display settings for visualizations.

    Plain value container; every field has a default, so
    ``VisualizationConfig()`` yields the standard settings.
    """
    # Figure size (NOTE(review): presumably pixels - confirm against the plotting code).
    width: int = 800
    height: int = 500
    # Name of the plot template to apply.
    template: str = "plotly_white"
    # Whether grid lines should be shown.
    show_grid: bool = True
    # Whether the figure should be interactive.
    interactive: bool = True
class DataPreprocessor:
    """Handles data preprocessing and validation."""

    @staticmethod
    def preprocess_dataframe(df: pd.DataFrame) -> Tuple[pd.DataFrame, Dict[str, Any]]:
        """Validate a DataFrame, describe it, and fill its missing values.

        Args:
            df: Input DataFrame.

        Returns:
            Tuple of (filled DataFrame, metadata). The metadata is computed
            before filling, so ``missing_values`` reflects the input as received.

        Raises:
            ValueError: If ``validate_dataframe`` rejects the frame; the
                message is the validator's error text.
        """
        # First validate the dataframe
        is_valid, error_msg = validate_dataframe(df)
        if not is_valid:
            raise ValueError(error_msg)

        metadata = {
            "original_shape": df.shape,
            "missing_values": df.isnull().sum().to_dict(),
            "dtypes": df.dtypes.astype(str).to_dict(),
            "numeric_columns": get_numeric_columns(df),
            "categorical_columns": df.select_dtypes(include=['object']).columns.tolist(),
            "temporal_columns": get_temporal_columns(df)
        }

        # Handle missing values: forward-fill, then back-fill what remains at
        # the top of each column. The dedicated methods replace the deprecated
        # fillna(method=...) form and produce the same result.
        df = df.ffill().bfill()
        return df, metadata
class AnalysisHistory:
    """Manages analysis history and persistence.

    Entries are dicts with ``timestamp``, ``query`` and ``result`` keys, kept
    in memory in ``self.history`` and mirrored to a JSON file on every add.
    """

    def __init__(self, history_file: str = HISTORY_FILE):
        """Remember the backing file and load whatever history it already holds."""
        self.history_file = history_file
        self.history = self._load_history()

    def _load_history(self) -> List[Dict]:
        """Read the history file, returning an empty list when it is unusable.

        A missing, unreadable or corrupt file is treated as "no history"
        rather than an error.
        """
        try:
            # Files written by add_entry are plain ASCII JSON (json.dump
            # escapes non-ASCII by default), so UTF-8 reads them on any platform.
            with open(self.history_file, 'r', encoding='utf-8') as f:
                loaded = json.load(f)
        except (OSError, ValueError):
            # OSError: missing or unreadable file. ValueError: invalid JSON
            # (json.JSONDecodeError) or undecodable bytes.
            return []
        # Only a list of dicts is usable: add_entry appends to the list and
        # get_recent_analyses reads keys from each entry.
        if not isinstance(loaded, list):
            return []
        return [entry for entry in loaded if isinstance(entry, dict)]

    def add_entry(self, query: str, result: str) -> None:
        """Add new analysis entry to history and rewrite the history file.

        Raises:
            OSError: If the history file cannot be written.
        """
        entry = {
            'timestamp': datetime.now().isoformat(),
            'query': query,
            'result': result
        }
        self.history.append(entry)
        with open(self.history_file, 'w', encoding='utf-8') as f:
            json.dump(self.history, f)

    def get_recent_analyses(self, limit: int = 5) -> List[Dict]:
        """Get the ``limit`` most recent entries, newest first."""
        # ISO-8601 timestamps sort chronologically as strings; entries without
        # a timestamp sort last instead of raising KeyError.
        return sorted(
            self.history,
            key=lambda x: str(x.get('timestamp', '')),
            reverse=True
        )[:limit]
class DataAnalysisAssistant:
    """Enhanced data analysis assistant with visualization capabilities.

    Wraps a ``CodeAgent`` backed by ``LiteLLMModel`` and equipped with the
    four plotting/analysis tools, and records each query in an
    ``AnalysisHistory``.
    """

    def __init__(self, api_key: str):
        """Create the model, history store and agent.

        Args:
            api_key: API key passed through to ``LiteLLMModel``.
        """
        self.model = LiteLLMModel(
            model_id=DEFAULT_MODEL,
            api_key=api_key
        )
        self.history = AnalysisHistory()
        self.agent = CodeAgent(
            model=self.model,
            tools=[
                create_time_series_plot,
                create_correlation_heatmap,
                create_statistical_summary,
                detect_outliers
            ],
            additional_authorized_imports=[
                'pandas', 'numpy', 'plotly.express', 'plotly.graph_objects',
                'seaborn', 'scipy', 'statsmodels'
            ],
        )

    def analyze(self, df: pd.DataFrame, query: str) -> str:
        """Perform analysis with interactive visualizations.

        Args:
            df: Data to analyze; it is preprocessed before the agent sees it.
            query: The user's natural-language request.

        Returns:
            The formatted agent response, or an ``"Analysis failed: ..."``
            message. This method does not raise.
        """
        try:
            df, metadata = DataPreprocessor.preprocess_dataframe(df)
            context = self._create_analysis_context(df, metadata, query)
            response = self.agent.run(context, additional_args={"df": df})
            try:
                self.history.add_entry(query, str(response))
            except OSError:
                # History is a convenience log. If its file cannot be written
                # (read-only or full disk), still return the finished analysis
                # instead of reporting the whole run as failed.
                pass
            return self._format_results(response)
        except Exception as e:
            return f"Analysis failed: {str(e)}"

    def _create_analysis_context(self, df: pd.DataFrame, metadata: Dict, query: str) -> str:
        """Create detailed context for analysis.

        Builds the prompt for the agent from the preprocessing metadata
        (shape and column groups), the tool list and the user's query.
        ``df`` itself is not read here; it reaches the agent separately.
        """
        tools_description = """
        Available analysis tools:
        - create_time_series_plot: Create interactive time series visualizations
        - create_correlation_heatmap: Generate correlation analysis with heatmap
        - create_statistical_summary: Compute statistical summaries with visualizations
        - detect_outliers: Identify and visualize outliers
        """
        return f"""
        Analyze the following data with interactive visualizations.
        DataFrame Information:
        - Shape: {metadata['original_shape']}
        - Numeric columns: {', '.join(metadata['numeric_columns'])}
        - Categorical columns: {', '.join(metadata['categorical_columns'])}
        - Temporal columns: {', '.join(metadata['temporal_columns'])}
        {tools_description}
        User Query: {query}
        Guidelines:
        1. Use the provided analysis tools for visualizations
        2. Include clear titles and labels
        3. Handle errors gracefully
        4. Chain multiple analyses when needed
        5. Provide insights along with visualizations
        The DataFrame is available as 'df'.
        """

    def _format_results(self, response: str) -> str:
        """Format analysis results with visualizations.

        Wraps the response in a ``div`` with class ``analysis-text``.
        """
        # NOTE(review): the response is inserted without HTML escaping.
        # Confirm whether the agent is expected to return markup before
        # adding escaping here.
        return f'<div class="analysis-text">{response}</div>'
def process_file(file: Any) -> Optional[pd.DataFrame]:
    """Process uploaded file into DataFrame.

    Args:
        file: The uploaded file: either a filesystem path string or an object
            whose ``name`` attribute holds the path.
            NOTE(review): presumably this is what Gradio's File component
            passes to callbacks - confirm for the installed version.

    Returns:
        The parsed DataFrame, or None when no file was supplied.

    Raises:
        RuntimeError: If the extension is unsupported or reading fails; the
            underlying error is attached as the cause.
    """
    if not file:
        return None
    try:
        # Accept both a wrapper exposing .name and a bare path string.
        file_path = Path(getattr(file, "name", file))
        # Compare case-insensitively so e.g. "DATA.CSV" is accepted.
        suffix = file_path.suffix.lower()
        if suffix == '.csv':
            return pd.read_csv(file_path)
        if suffix in ('.xlsx', '.xls'):
            return pd.read_excel(file_path)
        raise ValueError(f"Unsupported file type: {file_path.suffix}")
    except Exception as e:
        raise RuntimeError(f"Error reading file: {str(e)}") from e
def analyze_data(
    file: Any,
    query: str,
    api_key: str,
) -> str:
    """Main analysis function for Gradio interface.

    Validates the three inputs, loads the uploaded file and runs the
    assistant on it.

    Args:
        file: Uploaded file value from the interface (None when nothing was uploaded).
        query: The user's natural-language request.
        api_key: API key used to construct the assistant.

    Returns:
        The assistant's formatted result, or an ``"Error: ..."`` message.
        This function does not raise.
    """
    # Pasted keys often carry stray whitespace; a blank key is a missing key.
    api_key = (api_key or "").strip()
    if not api_key:
        return "Error: Please provide an API key"
    if not file:
        return "Error: Please upload a data file"
    # Reject an empty request before any model call is made.
    if not (query or "").strip():
        return "Error: Please enter a question to analyze"
    try:
        df = process_file(file)
        if df is None:
            return "Error: Could not process file"
        assistant = DataAnalysisAssistant(api_key)
        return assistant.analyze(df, query)
    except Exception as e:
        return f"Error: {str(e)}"
def create_interface():
    """Build the Gradio Blocks UI for the analysis assistant.

    Layout: an intro panel, then a row with the inputs (file upload, query,
    API key, Analyze button) on the left and an HTML result pane on the
    right, followed by example queries.

    Returns:
        The assembled ``gr.Blocks`` object; the caller launches it.
    """
    # Styles for result containers, analysis text and error messages.
    stylesheet = """
    .plot-container {
        margin: 20px 0;
        padding: 15px;
        border: 1px solid #e0e0e0;
        border-radius: 8px;
        background: white;
        box-shadow: 0 2px 4px rgba(0,0,0,0.1);
    }
    .analysis-text {
        margin: 20px 0;
        line-height: 1.6;
    }
    .error {
        color: red;
        padding: 10px;
        margin: 10px 0;
        border-left: 4px solid red;
    }
    """
    intro_markdown = """
        # Advanced Data Analysis Assistant
        Upload your data and get AI-powered analysis with interactive visualizations.
        **Features:**
        - Interactive Plotly visualizations
        - gpt-4o-mini powered analysis
        - Time series analysis
        - Statistical insights
        - Natural language queries
        **Required:** OpenAI API key
        """
    # Example queries; each is paired with an empty file slot below.
    example_prompts = [
        "Show trends over time with interactive visualizations",
        "Create a comprehensive analysis of relationships between variables",
        "Analyze distributions and statistical patterns",
        "Generate financial metrics and performance indicators",
    ]

    with gr.Blocks(css=stylesheet) as interface:
        gr.Markdown(intro_markdown)
        with gr.Row():
            # Left column: user inputs.
            with gr.Column():
                upload = gr.File(
                    label="Upload Data File",
                    file_types=SUPPORTED_FILE_TYPES
                )
                question = gr.Textbox(
                    label="What would you like to analyze?",
                    placeholder="e.g., Analyze trends and patterns in the data with interactive visualizations",
                    lines=3
                )
                key_box = gr.Textbox(
                    label="OpenAI API Key",
                    placeholder="Your API key",
                    type="password"
                )
                run_button = gr.Button("Analyze")
            # Right column: rendered result.
            with gr.Column():
                result_pane = gr.HTML(label="Analysis Results")

        # Clicking the button sends the three inputs to analyze_data and
        # shows its return value in the result pane.
        run_button.click(
            analyze_data,
            inputs=[upload, question, key_box],
            outputs=result_pane
        )
        gr.Examples(
            examples=[[None, prompt] for prompt in example_prompts],
            inputs=[upload, question]
        )
    return interface
 if __name__ == "__main__":
+    interface = create_interface()
+    interface.launch()