Spaces:

jzou19950715
/

Huggingface_AI_Agent_Function_Testing_2

Configuration error

App Files Files Community

jzou19950715 committed on Jan 24, 2025

Commit

48a1160

verified ·

1 Parent(s): 4e06409

Create tools.py

Browse files

Files changed (1) hide show

tools.py +339 -0

tools.py ADDED Viewed

	@@ -0,0 +1,339 @@

+#!/usr/bin/env python
+# coding=utf-8
+"""
+Analysis and visualization tools for data analysis assistant.
+Provides a collection of tools for data analysis, statistical computations,
+and interactive visualizations using Plotly.
+"""
+import logging
+from typing import Any, Dict, List, Optional, Tuple, Union
+from datetime import datetime
+from pathlib import Path
+import numpy as np
+import pandas as pd
+import plotly.express as px
+import plotly.graph_objects as go
+from plotly.subplots import make_subplots
+import seaborn as sns
+from scipy import stats
+from smolagents import tool
+# Configure logging
+logging.basicConfig(
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+    level=logging.INFO
+)
+logger = logging.getLogger(__name__)
+class AnalysisError(Exception):
+    """Custom exception for analysis errors."""
+    pass
+@tool
+def create_time_series_plot(
+    df: pd.DataFrame,
+    time_column: str,
+    value_column: str,
+    title: Optional[str] = None
+) -> Dict[str, Any]:
+    """
+    Create an interactive time series plot.
+    Args:
+        df: Input DataFrame
+        time_column: Name of the time column
+        value_column: Name of the value column to plot
+        title: Optional title for the plot
+    Returns:
+        Dict containing the plotly figure and stats
+    """
+    try:
+        # Validate inputs
+        if time_column not in df.columns or value_column not in df.columns:
+            raise AnalysisError(f"Columns {time_column} or {value_column} not found in DataFrame")
+        # Create plot
+        fig = px.line(
+            df,
+            x=time_column,
+            y=value_column,
+            title=title or f"{value_column} over Time",
+            template="plotly_white"
+        )
+        # Add hover data
+        fig.update_traces(
+            hovertemplate=(
+                f"{time_column}: %{{x}}<br>"
+                f"{value_column}: %{{y:.2f}}<br>"
+                "<extra></extra>"
+            )
+        )
+        # Calculate basic stats
+        stats_dict = {
+            "mean": df[value_column].mean(),
+            "std": df[value_column].std(),
+            "min": df[value_column].min(),
+            "max": df[value_column].max()
+        }
+        return {"figure": fig, "stats": stats_dict}
+    except Exception as e:
+        logger.error(f"Error in create_time_series_plot: {str(e)}")
+        raise AnalysisError(f"Failed to create time series plot: {str(e)}")
+@tool
+def create_correlation_heatmap(df: pd.DataFrame, numeric_only: bool = True) -> Dict[str, Any]:
+    """
+    Create an interactive correlation heatmap.
+    Args:
+        df: Input DataFrame
+        numeric_only: Whether to include only numeric columns
+    Returns:
+        Dict containing the plotly figure and correlation matrix
+    """
+    try:
+        # Select numeric columns if requested
+        if numeric_only:
+            df = df.select_dtypes(include=[np.number])
+        # Calculate correlation matrix
+        corr_matrix = df.corr()
+        # Create heatmap
+        fig = go.Figure(data=go.Heatmap(
+            z=corr_matrix,
+            x=corr_matrix.columns,
+            y=corr_matrix.columns,
+            colorscale='RdBu',
+            zmid=0,
+            text=np.round(corr_matrix, 2),
+            texttemplate='%{text:.2f}',
+            textfont={"size": 10},
+            hoverongaps=False
+        ))
+        # Update layout
+        fig.update_layout(
+            title="Correlation Heatmap",
+            template="plotly_white",
+            width=800,
+            height=800
+        )
+        return {
+            "figure": fig,
+            "correlation_matrix": corr_matrix.to_dict()
+        }
+    except Exception as e:
+        logger.error(f"Error in create_correlation_heatmap: {str(e)}")
+        raise AnalysisError(f"Failed to create correlation heatmap: {str(e)}")
+@tool
+def create_statistical_summary(df: pd.DataFrame, column: str) -> Dict[str, Any]:
+    """
+    Create statistical summary with visualization for a column.
+    Args:
+        df: Input DataFrame
+        column: Column name to analyze
+    Returns:
+        Dict containing summary statistics and visualization
+    """
+    try:
+        if column not in df.columns:
+            raise AnalysisError(f"Column {column} not found in DataFrame")
+        # Calculate summary statistics
+        summary_stats = df[column].describe().to_dict()
+        # Additional statistics
+        if pd.api.types.is_numeric_dtype(df[column]):
+            summary_stats.update({
+                "skewness": stats.skew(df[column].dropna()),
+                "kurtosis": stats.kurtosis(df[column].dropna())
+            })
+        # Create distribution plot
+        fig = make_subplots(rows=2, cols=1)
+        # Add histogram
+        fig.add_trace(
+            go.Histogram(
+                x=df[column],
+                name="Distribution",
+                nbinsx=30
+            ),
+            row=1, col=1
+        )
+        # Add box plot
+        fig.add_trace(
+            go.Box(
+                y=df[column],
+                name="Box Plot"
+            ),
+            row=2, col=1
+        )
+        # Update layout
+        fig.update_layout(
+            title=f"Statistical Analysis of {column}",
+            showlegend=False,
+            template="plotly_white",
+            height=800
+        )
+        return {
+            "figure": fig,
+            "stats": summary_stats
+        }
+    except Exception as e:
+        logger.error(f"Error in create_statistical_summary: {str(e)}")
+        raise AnalysisError(f"Failed to create statistical summary: {str(e)}")
+@tool
+def detect_outliers(
+    df: pd.DataFrame,
+    column: str,
+    method: str = "zscore",
+    threshold: float = 3.0
+) -> Dict[str, Any]:
+    """
+    Detect outliers in a column using various methods.
+    Args:
+        df: Input DataFrame
+        column: Column to analyze
+        method: Detection method ('zscore' or 'iqr')
+        threshold: Threshold for outlier detection
+    Returns:
+        Dict containing outlier indices and visualization
+    """
+    try:
+        if column not in df.columns:
+            raise AnalysisError(f"Column {column} not found in DataFrame")
+        values = df[column].dropna()
+        if method == "zscore":
+            z_scores = np.abs(stats.zscore(values))
+            outlier_mask = z_scores > threshold
+        elif method == "iqr":
+            Q1 = values.quantile(0.25)
+            Q3 = values.quantile(0.75)
+            IQR = Q3 - Q1
+            outlier_mask = (values < (Q1 - threshold * IQR)) | (values > (Q3 + threshold * IQR))
+        else:
+            raise AnalysisError(f"Unknown outlier detection method: {method}")
+        # Create visualization
+        fig = go.Figure()
+        # Add main scatter plot
+        fig.add_trace(
+            go.Scatter(
+                x=df.index[~outlier_mask],
+                y=values[~outlier_mask],
+                mode='markers',
+                name='Normal Points',
+                marker=dict(color='blue')
+            )
+        )
+        # Add outliers
+        fig.add_trace(
+            go.Scatter(
+                x=df.index[outlier_mask],
+                y=values[outlier_mask],
+                mode='markers',
+                name='Outliers',
+                marker=dict(color='red')
+            )
+        )
+        fig.update_layout(
+            title=f"Outlier Detection for {column}",
+            template="plotly_white",
+            showlegend=True
+        )
+        return {
+            "figure": fig,
+            "outlier_indices": df.index[outlier_mask].tolist(),
+            "outlier_count": sum(outlier_mask)
+        }
+    except Exception as e:
+        logger.error(f"Error in detect_outliers: {str(e)}")
+        raise AnalysisError(f"Failed to detect outliers: {str(e)}")
+# Additional utility functions
+def validate_dataframe(df: pd.DataFrame) -> Tuple[bool, str]:
+    """
+    Validate DataFrame for analysis.
+    Args:
+        df: Input DataFrame
+    Returns:
+        Tuple of (is_valid, error_message)
+    """
+    if df is None:
+        return False, "DataFrame is None"
+    if df.empty:
+        return False, "DataFrame is empty"
+    if df.columns.duplicated().any():
+        return False, "DataFrame contains duplicate column names"
+    return True, ""
+def get_numeric_columns(df: pd.DataFrame) -> List[str]:
+    """Get list of numeric columns from DataFrame."""
+    return df.select_dtypes(include=[np.number]).columns.tolist()
+def get_temporal_columns(df: pd.DataFrame) -> List[str]:
+    """Get list of temporal columns from DataFrame."""
+    temporal_cols = []
+    for col in df.columns:
+        try:
+            pd.to_datetime(df[col])
+            temporal_cols.append(col)
+        except:
+            continue
+    return temporal_cols
+if __name__ == "__main__":
+    # Example usage and testing
+    logging.info("Running tools.py tests...")
+    # Create sample data
+    dates = pd.date_range(start='2023-01-01', periods=100, freq='D')
+    df = pd.DataFrame({
+        'date': dates,
+        'value': np.random.normal(100, 10, 100),
+        'category': np.random.choice(['A', 'B', 'C'], 100)
+    })
+    # Test time series plot
+    try:
+        result = create_time_series_plot(df, 'date', 'value')
+        logging.info("Time series plot created successfully")
+    except Exception as e:
+        logging.error(f"Time series plot test failed: {str(e)}")
+    # Add more tests as needed