File size: 12,809 Bytes
ddabbe4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
"""
Tools for CrewAI agents to interact with NBA data.
"""
import pandas as pd
from crewai.tools import tool
from typing import Optional
from vector_db import get_vector_db


def get_agent_tools(data_path: str):
    """
    Get the list of tools available for agents.

    Every tool closes over `data_path`, so one call to this factory yields a
    tool set bound to a single CSV file.

    Args:
        data_path: Path to the CSV data file read by every tool.

    Returns:
        list: CrewAI tool objects (read, search, summary, semantic search,
            pandas analysis) ready to attach to an agent.
    """
    # Plain helper functions are defined first, then wrapped with @tool, so
    # the core logic stays directly callable and the wrappers stay thin.
    
    def _read_nba_data(limit: int = 10) -> str:
        """Return record/column counts plus the first `limit` rows of the CSV.

        Intended as a quick structural preview, not a full data dump.
        """
        try:
            # Load the full file (cheap for this dataset) but render only a
            # head() slice so the agent's context window is not flooded.
            df = pd.read_csv(data_path)
            sample = df.head(limit)
            header = f"Dataset: {len(df):,} total records, {len(df.columns)} columns"
            cols = f"Column names: {', '.join(df.columns.tolist())}"
            body = f"Sample (first {limit} rows):\n\n{sample.to_string()}"
            return "\n\n".join([header, cols, body])
        except Exception as e:
            return f"Error reading file {data_path}: {str(e)}"
    
    def _search_nba_data(
        query: Optional[str] = None,
        column: Optional[str] = None,
        value: Optional[str] = None,
        limit: int = 100
    ) -> str:
        """Search and filter the NBA CSV; return a size-capped text table.

        Args:
            query: Free-text substring matched (case-insensitively) against
                every string-typed column.
            column: Optional column name to filter on before the free-text
                search.
            value: Substring to match in `column` (case-insensitive).
            limit: Requested row cap; hard-capped at 50 to protect the LLM
                context window.

        Returns:
            str: Matching rows rendered via to_string(), or an error message.
        """
        try:
            df = pd.read_csv(data_path)

            # Column-specific filter first (substring, case-insensitive).
            if column and value:
                if column in df.columns:
                    df = df[df[column].astype(str).str.contains(str(value), case=False, na=False)]
                else:
                    return f"Column '{column}' not found. Available columns: {', '.join(df.columns.tolist())}"

            if query:
                # Free-text search across all string columns.
                # BUG FIX: the mask must share df's index. The previous
                # pd.Series([False] * len(df)) carried a fresh RangeIndex, so
                # after the column filter above `df[mask]` raised an
                # "Unalignable boolean Series" error.
                mask = pd.Series(False, index=df.index)
                for col in df.columns:
                    if df[col].dtype == 'object':
                        mask |= df[col].astype(str).str.contains(query, case=False, na=False)
                df = df[mask]

            # Remember the real match count before truncating for display.
            total_matches = len(df)
            limit = min(limit, 50)  # Cap at 50 rows to prevent token overflow
            df = df.head(limit)

            if total_matches == 0:
                return "No matching records found."

            # Truncate output if too large for the context window.
            result_str = df.to_string()
            if len(result_str) > 2000:
                result_str = df.head(20).to_string() + f"\n\n... (showing first 20 of {total_matches} matching records)"

            return f"Found {total_matches} matching records:\n\n{result_str}"
        except Exception as e:
            return f"Error searching CSV {data_path}: {str(e)}"
    
    def _get_nba_data_summary() -> str:
        """Get a concise summary of the NBA data file.

        Returns record/column counts, unique player/team counts, a date
        range, key numeric columns, and a 3-row sample — or an error string
        if the CSV cannot be read.
        """
        try:
            df = pd.read_csv(data_path)
            
            # Calculate basic stats - keep it concise
            numeric_cols = df.select_dtypes(include=['number']).columns.tolist()
            
            # Each field degrades to 'N/A' when the expected column is absent.
            # NOTE(review): 'Data' in the Date Range line looks like a typo
            # for 'Date' — confirm against the actual CSV header before
            # changing it, since the dataset may genuinely use 'Data'.
            summary = f"""NBA Dataset Summary:
- Total Records: {len(df):,}
- Columns: {len(df.columns)} ({', '.join(df.columns.tolist()[:10])}{'...' if len(df.columns) > 10 else ''})
- Unique Players: {df['Player'].nunique() if 'Player' in df.columns else 'N/A'}
- Unique Teams: {df['Tm'].nunique() if 'Tm' in df.columns else 'N/A'}
- Date Range: {df['Data'].min() if 'Data' in df.columns else 'N/A'} to {df['Data'].max() if 'Data' in df.columns else 'N/A'}
- Key Numeric Columns: {', '.join(numeric_cols[:10]) if numeric_cols else 'None'}

Sample (first 3 rows):
{df.head(3).to_string()}
"""
            return summary
        except Exception as e:
            return f"Error getting CSV summary for {data_path}: {str(e)}"
    
    # Now wrap them with @tool decorator
    @tool("read_nba_data")
    def read_nba_data(limit: int = 10) -> str:
        """
        Read a sample of the NBA data file to understand its structure.
        Use this to see column names and data format, NOT for full analysis.
        
        Args:
            limit: Number of sample rows to return (default: 10, max: 50)
        """
        # Clamp inline so the helper never sees an oversized request.
        return _read_nba_data(min(limit, 50))
    
    @tool("search_nba_data")
    def search_nba_data(
        query: Optional[str] = None,
        column: Optional[str] = None,
        value: Optional[str] = None,
        limit: int = 100
    ) -> str:
        """
        Search and filter NBA data CSV file. Use this to find specific players, teams, or statistics.
        
        Args:
            query: Optional text query to search for in any column (e.g., player name, team name)
            column: Optional column name to filter by (e.g., 'Player', 'Tm', 'PTS')
            value: Optional value to match in the specified column
            limit: Maximum number of rows to return (default: 100)
        """
        # Keyword form makes the argument mapping onto the helper explicit.
        return _search_nba_data(query=query, column=column, value=value, limit=limit)
    
    @tool("get_nba_data_summary")
    def get_nba_data_summary() -> str:
        """
        Get a comprehensive summary of the NBA data file including structure, basic statistics, 
        and data quality information. Use this first to understand the dataset.
        """
        # Thin pass-through; the summary is built in _get_nba_data_summary.
        return _get_nba_data_summary()
    
    def _semantic_search_nba_data(query: str, n_results: int = 10) -> str:
        """
        Run a vector-embedding similarity search over the NBA data.

        Formats each hit with its similarity score and indexed document text,
        and — when the hit's stored row index is valid — the full CSV record.
        """
        try:
            vector_db = get_vector_db(data_path)
            results = vector_db.search(query, n_results=n_results)

            if not results:
                return f"No results found for query: '{query}'"

            # Re-read the source CSV up front so each hit below can be
            # expanded into its full row.
            df = pd.read_csv(data_path)

            output = [f"Semantic search results for: '{query}'\n"]
            output.append(f"Found {len(results)} similar records:\n")
            output.append("=" * 80 + "\n")

            for i, result in enumerate(results, 1):
                similarity = result['similarity']
                output.append(f"\nResult {i} (Similarity: {similarity:.3f}):")
                output.append(f"Document: {result['document']}\n")

                # A missing or stale row_index (-1 sentinel) just skips the
                # full-record section for that hit.
                row_index = result['metadata'].get('row_index', -1)
                if 0 <= row_index < len(df):
                    output.append("Full record:")
                    output.append(df.iloc[row_index].to_string())
                    output.append("\n" + "-" * 80 + "\n")

            return "\n".join(output)
        except Exception as e:
            return f"Error performing semantic search: {str(e)}"
    
    @tool("semantic_search_nba_data")
    def semantic_search_nba_data(query: str, n_results: int = 10) -> str:
        """
        Perform semantic search on NBA data using vector embeddings and natural language understanding.
        This tool understands the meaning of your query, not just exact text matches.
        
        Use this for natural language questions like:
        - "high scoring games"
        - "LeBron James best performances"
        - "games with many assists"
        - "efficient shooters"
        - "close games"
        
        Args:
            query: Natural language query describing what you're looking for
            n_results: Number of results to return (default: 10, max: 50)
        
        Examples:
            semantic_search_nba_data("LeBron James high scoring games")
            semantic_search_nba_data("games with triple doubles", n_results=5)
            semantic_search_nba_data("most efficient three point shooters")
        """
        # Clamp inline so the helper never returns an overwhelming hit list.
        return _semantic_search_nba_data(query, min(n_results, 50))
    
    def _analyze_nba_data(pandas_code: str) -> str:
        """
        Evaluate a pandas expression against the NBA data and format the result.

        The expression must reference the DataFrame as `df`, e.g.:
        - df.groupby('Player')['3P'].sum().sort_values(ascending=False).head(5)
        - df.groupby('Player').agg({'PTS': 'sum', 'AST': 'sum'}).sort_values('PTS', ascending=False)
        - df[df['3P'] > 5].groupby('Player')['3P'].sum().nlargest(5)

        Args:
            pandas_code: A single pandas expression (not statements) that
                uses the variable name 'df'.

        Returns:
            str: A size-capped, human-readable rendering of the result, or
            an error message explaining why evaluation failed.
        """
        try:
            # Load the CSV data fresh for each call.
            df = pd.read_csv(data_path)

            # SECURITY: evaluating model-generated code is arbitrary code
            # execution. Acceptable only because the agent is a trusted
            # caller; never expose this path to untrusted input without a
            # real sandbox (__builtins__ being present here is no barrier).
            namespace = {
                'pd': pd,
                'df': df,
                '__builtins__': __builtins__
            }

            # FIX: eval() instead of exec(f"result = {pandas_code}"). The
            # exec template accepted smuggled statements (e.g. "1\nimport os"
            # became two valid statements); eval() parses the input strictly
            # as one expression while behaving identically for legitimate
            # pandas expressions.
            result = eval(pandas_code, namespace)

            # Render the result, capping size to avoid token overflow.
            if isinstance(result, pd.DataFrame):
                if len(result) > 50:
                    result_str = f"{result.head(50).to_string()}\n\n... (showing first 50 of {len(result)} rows)"
                else:
                    result_str = result.to_string()
                return f"Analysis Result ({result.shape[0]} rows, {result.shape[1]} cols):\n\n{result_str}"
            elif isinstance(result, pd.Series):
                if len(result) > 50:
                    result_str = f"{result.head(50).to_string()}\n\n... (showing first 50 of {len(result)} items)"
                else:
                    result_str = result.to_string()
                return f"Analysis Result ({len(result)} items):\n\n{result_str}"
            else:
                # Scalars and other objects: plain str(), length-limited.
                result_str = str(result)
                if len(result_str) > 2000:
                    result_str = result_str[:2000] + "\n\n... (truncated)"
                return f"Analysis Result:\n\n{result_str}"

        except Exception as e:
            return f"Error executing pandas code: {str(e)}\n\nMake sure your code uses 'df' as the DataFrame variable and returns a result."
    
    @tool("analyze_nba_data")
    def analyze_nba_data(pandas_code: str) -> str:
        """
        Execute pandas operations on NBA data for advanced analysis, aggregations, and statistical queries.
        
        This is the PRIMARY tool for data analysis tasks like:
        - Finding top players by statistics (groupby + aggregation + sorting)
        - Calculating totals, averages, counts per player/team
        - Filtering and aggregating data
        - Statistical analysis
        
        IMPORTANT: Use this tool for queries that require:
        - Aggregating data (sum, mean, count, etc.)
        - Grouping by player, team, etc.
        - Finding top N results
        - Calculating totals or averages
        
        Args:
            pandas_code: Valid pandas code that operates on a DataFrame variable named 'df'
                         The code should return a result (DataFrame, Series, or value)
        
        Examples:
            # Top 5 players by total 3-pointers made
            analyze_nba_data("df.groupby('Player')['3P'].sum().sort_values(ascending=False).head(5)")
            
            # Top 10 players by total points
            analyze_nba_data("df.groupby('Player')['PTS'].sum().sort_values(ascending=False).head(10)")
            
            # Players with highest 3-point percentage (minimum 100 attempts)
            analyze_nba_data("df[df['3PA'] >= 100].groupby('Player').agg({'3P': 'sum', '3PA': 'sum'}).assign(percentage=lambda x: x['3P']/x['3PA']*100).sort_values('percentage', ascending=False).head(5)")
            
            # Top 5 players by assists
            analyze_nba_data("df.groupby('Player')['AST'].sum().sort_values(ascending=False).head(5)")
            
            # Team win rates
            analyze_nba_data("df.groupby('Tm')['Res'].apply(lambda x: (x == 'W').sum() / len(x) * 100).sort_values(ascending=False)")
        """
        # Thin pass-through; evaluation and output capping happen in the
        # _analyze_nba_data helper.
        return _analyze_nba_data(pandas_code)
    
    return [read_nba_data, search_nba_data, get_nba_data_summary, semantic_search_nba_data, analyze_nba_data]