Final_Assignment_AGENT_GAIA

Sleeping

File size: 18,548 Bytes

8a7b3d1
a4f05bc
 
 
 
 
 
 
8a7b3d1
ed696d0
e828c8e
394d24e
8a7b3d1
 
e828c8e
1334ae9
 
 
8a7b3d1
1334ae9
8a7b3d1
a4f05bc
8a7b3d1
394d24e
8a7b3d1
a4f05bc
1334ae9
 
 
a53eb61
e828c8e
a4f05bc
e828c8e
8a7b3d1
e01c471
8a7b3d1
a4f05bc
d70b450
a4f05bc
 
8a7b3d1
d70b450
394d24e
a4f05bc
394d24e
 
 
 
a4f05bc
394d24e
 
 
 
d70b450
4dea17b
a4f05bc
394d24e
a4f05bc
 
 
 
394d24e
a4f05bc
394d24e
 
d70b450
8a7b3d1
 
394d24e
 
 
 
 
a4f05bc
394d24e
 
 
 
 
d70b450
394d24e
 
 
 
 
d70b450
394d24e
a4f05bc
4dea17b
d70b450
4dea17b
d70b450
394d24e
d70b450
394d24e
d70b450
394d24e
 
4dea17b
 
d70b450
a4f05bc
394d24e
a4f05bc
 
 
 
394d24e
 
 
d70b450
 
394d24e
d70b450
 
8a7b3d1
d70b450
 
 
394d24e
d70b450
8a7b3d1
 
d70b450
394d24e
a4f05bc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
394d24e
a4f05bc
394d24e
8a7b3d1
e828c8e
8a7b3d1
a4f05bc
 
 
 
 
 
8a7b3d1
1334ae9
8a7b3d1
 
e828c8e
 
a4f05bc
1334ae9
 
a4f05bc
1334ae9
 
 
 
 
 
a4f05bc
1334ae9
 
 
a4f05bc
1334ae9
 
 
 
 
 
 
a4f05bc
1334ae9
 
 
 
a4f05bc
1334ae9
 
 
 
 
 
a4f05bc
1334ae9
 
 
 
 
 
 
a4f05bc
e828c8e
 
 
 
 
 
 
 
1334ae9
 
 
 
 
 
 
 
a4f05bc
1334ae9
 
 
 
 
e828c8e
a4f05bc
d70b450
 
 
e828c8e
a4f05bc
e828c8e
1334ae9
e828c8e
1334ae9
 
 
e828c8e
 
 
 
 
 
d70b450
 
 
8a7b3d1
e828c8e
a4f05bc
1334ae9
 
 
e828c8e
d70b450
a4f05bc
 
 
 
 
e01c471
8a7b3d1
a4f05bc
 
 
 
8a7b3d1
e01c471
8a7b3d1
a4f05bc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8a7b3d1
a4f05bc
1334ae9
 
 
a4f05bc
1334ae9
e01c471
8a7b3d1
e828c8e
 
d70b450
 
e828c8e
a4f05bc
1334ae9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a4f05bc
1334ae9
 
 
a4f05bc
e828c8e
e01c471
 
1334ae9
 
8a7b3d1
 
1334ae9
 
8a7b3d1
a4f05bc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8a7b3d1
a4f05bc
 
 
 
 
 
e828c8e
e01c471
a4f05bc
d70b450
 
 
 
 
8a7b3d1
d70b450
8a7b3d1
a4f05bc
e828c8e
a4f05bc
e828c8e
 
 
a4f05bc
 
 
 
 
 
e828c8e
8a7b3d1
d70b450
e828c8e
 
 
d70b450
e828c8e
 
 
 
1334ae9
e828c8e
 
 
 
1334ae9
e828c8e
 
 
 
d70b450
 
 
 
 
1334ae9
d70b450
 
1334ae9
d70b450
1334ae9
e828c8e
 
e01c471
e828c8e
1334ae9
 
a4f05bc
 
1334ae9
 
 
a4f05bc
1334ae9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a4f05bc

"""
GAIA Tools - My Custom Tool Implementation
==========================================
Author: Isadora Teles (AI Agent Student)
Purpose: Creating tools that my agent can use to answer GAIA questions

These tools are the key to my agent's success. Each tool serves a specific
purpose and I've learned to handle edge cases through trial and error.
"""

import os
import requests
import logging
import math
import re
import io
import pandas as pd
from typing import List, Optional, Any
from llama_index.core.tools import FunctionTool, QueryEngineTool
from contextlib import redirect_stdout

# Setting up logging for debugging
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Reduce noise from HTTP requests (they can be verbose!)
logging.getLogger("httpx").setLevel(logging.WARNING)
logging.getLogger("httpcore").setLevel(logging.WARNING)


# ==========================================
# Web Search Functions - For current info
# ==========================================

def search_web(query: str) -> str:
    """
    My main web search tool - uses Google first, then DuckDuckGo as fallback
    
    Learning note: I discovered that having multiple search providers is crucial
    because APIs have rate limits and can fail unexpectedly!
    """
    logger.info(f"Web search for: {query}")
    
    # Try Google Custom Search first (better results)
    google_result = _search_google(query)
    if google_result and not google_result.startswith("Google search"):
        return google_result
    
    # Fallback to DuckDuckGo (no API key needed!)
    ddg_result = _search_duckduckgo(query)
    if ddg_result and not ddg_result.startswith("DuckDuckGo"):
        return ddg_result
    
    return "Web search unavailable. Please use your knowledge to answer."


def _search_google(query: str) -> str:
    """
    Google Custom Search implementation
    Requires GOOGLE_API_KEY and GOOGLE_CSE_ID in environment
    """
    api_key = os.getenv("GOOGLE_API_KEY")
    cx = os.getenv("GOOGLE_CSE_ID", "746382dd3c2bd4135")  # Default CSE ID
    
    if not api_key:
        return "Google search not configured"
    
    try:
        url = "https://www.googleapis.com/customsearch/v1"
        params = {
            "key": api_key,
            "cx": cx,
            "q": query,
            "num": 3  # Get top 3 results
        }
        
        response = requests.get(url, params=params, timeout=10)
        
        if response.status_code != 200:
            return f"Google search error: {response.status_code}"
        
        data = response.json()
        items = data.get("items", [])
        
        if not items:
            return "No search results found"
        
        # Format results nicely for the agent
        results = []
        for i, item in enumerate(items[:2], 1):
            title = item.get("title", "")[:50]
            snippet = item.get("snippet", "")[:150]
            link = item.get("link", "")
            results.append(f"{i}. {title}\n{snippet}\nURL: {link}")
        
        return "\n\n".join(results)
        
    except Exception as e:
        logger.error(f"Google search error: {e}")
        return f"Google search failed: {str(e)[:50]}"


def _search_duckduckgo(query: str) -> str:
    """
    DuckDuckGo search - my reliable fallback!
    No API key needed, but has rate limits
    """
    try:
        from duckduckgo_search import DDGS
        
        with DDGS(timeout=10) as ddgs:
            results = list(ddgs.text(query, max_results=3))
            
            if not results:
                return "No results found"
            
            formatted = []
            for i, r in enumerate(results, 1):
                formatted.append(f"{i}. {r['title']}\n{r['body'][:150]}...\nURL: {r['href']}")
            
            return "\n\n".join(formatted)
            
    except Exception as e:
        return f"DuckDuckGo search failed: {e}"


def _web_open_raw(url: str) -> str:
    """
    Open a specific URL and get the page content
    Used when the agent needs more details from search results
    """
    try:
        response = requests.get(url, timeout=15)
        response.raise_for_status()
        # Limit content to prevent token overflow
        return response.text[:40_000]
    except Exception as e:
        return f"ERROR opening {url}: {e}"


# ==========================================
# Calculator Tool - Math and Python execution
# ==========================================

def calculate(expression: str) -> str:
    """
    My calculator tool - handles math expressions AND Python code!
    
    This was tricky to implement safely. I learned about:
    - Using restricted globals for security
    - Capturing print output
    - Handling different expression formats
    """
    logger.info(f"Calculating: {expression[:100]}...")
    
    try:
        expr = expression.strip()
        
        # Check if it's Python code (not just math)
        if any(keyword in expr for keyword in ['def ', 'print(', 'import ', 'for ', 'while ', '=']):
            try:
                # Create a safe execution environment
                safe_globals = {
                    '__builtins__': {
                        'range': range, 'len': len, 'int': int, 'float': float,
                        'str': str, 'print': print, 'abs': abs, 'round': round,
                        'min': min, 'max': max, 'sum': sum, 'pow': pow
                    },
                    'math': math  # Allow math functions
                }
                safe_locals = {}
                
                # Capture any print output
                output_buffer = io.StringIO()
                with redirect_stdout(output_buffer):
                    exec(expr, safe_globals, safe_locals)
                
                # Get printed output
                printed = output_buffer.getvalue().strip()
                if printed:
                    # Extract numbers from print output
                    numbers = re.findall(r'-?\d+\.?\d*', printed)
                    if numbers:
                        return numbers[-1]
                
                # Check for result variables
                for var in ['result', 'output', 'answer', 'total', 'sum']:
                    if var in safe_locals:
                        value = safe_locals[var]
                        if isinstance(value, (int, float)):
                            return str(int(value) if isinstance(value, float) and value.is_integer() else value)
                
                # Return any numeric variable found
                for var, value in safe_locals.items():
                    if isinstance(value, (int, float)):
                        return str(int(value) if isinstance(value, float) and value.is_integer() else value)
                
            except Exception as e:
                logger.error(f"Python execution error: {e}")
        
        # Handle percentage calculations (common in GAIA)
        if '%' in expr and 'of' in expr:
            match = re.search(r'(\d+(?:\.\d+)?)\s*%\s*of\s*(\d+(?:,\d+)*(?:\.\d+)?)', expr, re.IGNORECASE)
            if match:
                percentage = float(match.group(1))
                number = float(match.group(2).replace(',', ''))
                result = (percentage / 100) * number
                return str(int(result) if result.is_integer() else round(result, 6))
        
        # Handle factorial
        if 'factorial' in expr:
            match = re.search(r'factorial\((\d+)\)', expr)
            if match:
                n = int(match.group(1))
                result = math.factorial(n)
                return str(result)
        
        # Simple math expression
        if re.match(r'^[\d\s+\-*/().]+$', expr):
            result = eval(expr, {"__builtins__": {}}, {})
            if isinstance(result, float):
                return str(int(result) if result.is_integer() else round(result, 6))
            return str(result)
        
        # Clean up expression and try again
        expr = re.sub(r'[a-zA-Z_]\w*(?!\s*\()', '', expr)
        expr = expr.replace(',', '')
        expr = re.sub(r'\bsquare root of\s*(\d+)', r'sqrt(\1)', expr, flags=re.I)
        
        # Safe math evaluation
        safe_dict = {
            'sqrt': math.sqrt, 'pow': pow, 'abs': abs, 'round': round,
            'sin': math.sin, 'cos': math.cos, 'tan': math.tan,
            'log': math.log, 'log10': math.log10, 'exp': math.exp,
            'ceil': math.ceil, 'floor': math.floor,
            'factorial': math.factorial, 'gcd': math.gcd,
            'pi': math.pi, 'e': math.e
        }
        
        result = eval(expr, {"__builtins__": {}}, safe_dict)
        
        if isinstance(result, float):
            return str(int(result) if result.is_integer() else round(result, 6))
        return str(result)
        
    except Exception as e:
        logger.error(f"Calculation error: {e}")
        # Last resort: try to find any number in the expression
        numbers = re.findall(r'-?\d+\.?\d*', expr)
        if numbers:
            return numbers[-1]
        return "0"


# ==========================================
# File Analysis Tools
# ==========================================

def analyze_file(content: str, file_type: str = "text") -> str:
    """
    Analyzes file contents - CSV, Python, text files
    
    Key learning: I had to handle cases where the agent passes
    the question text instead of actual file content!
    """
    logger.info(f"Analyzing {file_type} file")
    
    # Check if this is just the question text (common mistake!)
    if any(phrase in content.lower() for phrase in [
        "attached excel file",
        "attached csv file", 
        "attached python",
        "the attached file",
        "what were the total sales",
        "contains the sales"
    ]):
        logger.warning("File analyzer received question text instead of file content")
        return "ERROR: No file content provided. If a file was mentioned in the question but not provided, answer 'No file provided'"
    
    # Check for suspiciously short "files"
    if file_type.lower() in ["excel", "csv", "xlsx", "xls"] and len(content) < 50:
        logger.warning(f"Content too short for {file_type} file: {len(content)} chars")
        return "ERROR: No actual file provided. Answer should be 'No file provided'"
    
    try:
        # Python file detection
        if file_type.lower() in ["py", "python"] or "def " in content or "import " in content:
            return f"Python code file:\n{content}"
        
        # CSV file analysis
        elif file_type.lower() == "csv" or "," in content.split('\n')[0]:
            lines = content.strip().split('\n')
            if not lines:
                return "Empty CSV file"
            
            headers = [col.strip() for col in lines[0].split(',')]
            data_rows = len(lines) - 1
            
            # Show sample data
            sample_rows = []
            for i in range(min(3, len(lines)-1)):
                sample_rows.append(lines[i+1])
            
            analysis = f"CSV File Analysis:\n"
            analysis += f"Columns: {len(headers)} - {', '.join(headers)}\n"
            analysis += f"Data rows: {data_rows}\n"
            
            if sample_rows:
                analysis += f"Sample data:\n"
                for row in sample_rows:
                    analysis += f"  {row}\n"
            
            return analysis
        
        # Excel file indicator
        elif file_type.lower() in ["xlsx", "xls", "excel"]:
            return f"Excel file detected. Use table_sum tool to analyze numeric data."
        
        # Default text file analysis
        else:
            lines = content.split('\n')
            words = content.split()
            
            return f"Text File Analysis:\nLines: {len(lines)}\nWords: {len(words)}\nCharacters: {len(content)}"
            
    except Exception as e:
        logger.error(f"File analysis error: {e}")
        return f"Error analyzing file: {str(e)[:100]}"


def _table_sum_raw(file_content: Any, column: str = "Total") -> str:
    """
    Sum a column in a CSV or Excel file
    
    This tool taught me about:
    - Handling different file formats
    - Detecting placeholder text
    - Graceful error handling
    """
    
    # Check for placeholder strings (agent trying to pass fake content)
    if isinstance(file_content, str):
        placeholder_strings = [
            "Excel file content",
            "file content",
            "CSV file content",
            "Please provide the Excel file content",
            "The attached Excel file",
            "Excel file"
        ]
        if file_content in placeholder_strings or len(file_content) < 20:
            return "ERROR: No actual file provided. Answer should be 'No file provided'"
    
    try:
        # Handle file paths vs content
        if isinstance(file_content, str):
            # Check if it's a non-existent file path
            if not os.path.exists(file_content) and not (',' in file_content or '\n' in file_content):
                return "ERROR: File not found. If file was mentioned but not provided, answer 'No file provided'"
                
            # Try to read as file
            if file_content.endswith('.csv'):
                df = pd.read_csv(file_content)
            else:
                df = pd.read_excel(file_content)
        elif isinstance(file_content, bytes):
            # Handle raw bytes
            buf = io.BytesIO(file_content)
            try:
                df = pd.read_csv(buf)
            except:
                buf.seek(0)
                df = pd.read_excel(buf)
        else:
            return "ERROR: Unsupported file format"
        
        # Try to find and sum the appropriate column
        if column in df.columns:
            total = df[column].sum()
            return f"{total:.2f}" if isinstance(total, float) else str(total)
        
        # Look for numeric columns with keywords
        numeric_cols = df.select_dtypes(include=['number']).columns
        
        for col in numeric_cols:
            if any(word in col.lower() for word in ['total', 'sum', 'amount', 'sales', 'revenue']):
                total = df[col].sum()
                return f"{total:.2f}" if isinstance(total, float) else str(total)
        
        # Sum all numeric columns as last resort
        if len(numeric_cols) > 0:
            totals = {}
            for col in numeric_cols:
                total = df[col].sum()
                totals[col] = total
            
            # Return the largest sum (likely the total)
            max_col = max(totals, key=totals.get)
            return f"{totals[max_col]:.2f}" if isinstance(totals[max_col], float) else str(totals[max_col])
        
        return "ERROR: No numeric columns found"
        
    except FileNotFoundError:
        logger.error("File not found error in table_sum")
        return "ERROR: File not found. If file was mentioned but not provided, answer 'No file provided'"
    except Exception as e:
        logger.error(f"Table sum error: {e}")
        error_str = str(e).lower()
        if "no such file" in error_str or "file not found" in error_str:
            return "ERROR: File not found. If file was mentioned but not provided, answer 'No file provided'"
        return f"ERROR: {str(e)[:100]}"


def get_weather(location: str) -> str:
    """
    Weather tool - returns demo data for now
    
    In a real implementation, I'd use OpenWeather API,
    but for GAIA this simple version works!
    """
    logger.info(f"Getting weather for: {location}")
    
    # Demo weather data (deterministic based on location)
    import random
    random.seed(hash(location))
    temp = random.randint(10, 30)
    conditions = ["Sunny", "Cloudy", "Rainy", "Clear"]
    condition = random.choice(conditions)
    
    return f"Weather in {location}: {temp}°C, {condition}"


# ==========================================
# Tool Creation Function
# ==========================================

def get_gaia_tools(llm=None):
    """
    Create and return all tools for the GAIA agent
    
    Each tool is wrapped as a FunctionTool for LlamaIndex
    I've learned to write clear descriptions - they guide the agent!
    """
    logger.info("Creating GAIA tools...")
    
    tools = [
        FunctionTool.from_defaults(
            fn=search_web,
            name="web_search",
            description="Search the web for current information. Use ONLY for recent events or facts you don't know."
        ),
        FunctionTool.from_defaults(
            fn=calculate,
            name="calculator",
            description="Perform mathematical calculations. Use for arithmetic, percentages, or evaluating expressions. NOT for counting items."
        ),
        FunctionTool.from_defaults(
            fn=analyze_file,
            name="file_analyzer",
            description="Analyze file structure and contents. Returns info about the file."
        ),
        FunctionTool.from_defaults(
            fn=get_weather,
            name="weather",
            description="Get current weather for a location."
        ),
        FunctionTool.from_defaults(
            fn=_web_open_raw,
            name="web_open",
            description="Open a specific URL from web_search results to read the full page content."
        ),
        FunctionTool.from_defaults(
            fn=_table_sum_raw,
            name="table_sum",
            description="Sum numeric columns in a CSV or Excel file. Use when asked for totals from data files. Returns the sum as a number."
        )
    ]
    
    logger.info(f"Created {len(tools)} tools for GAIA")
    return tools


# Testing section - helps me debug tools individually
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    
    print("Testing My GAIA Tools\n")
    
    # Test calculator
    print("Calculator Tests:")
    test_calcs = [
        "What is 25 * 17?",
        "15% of 1000",
        "square root of 144"
    ]
    for calc in test_calcs:
        result = calculate(calc)
        print(f"  {calc} = {result}")
    
    # Test file analyzer
    print("\nFile Analyzer Test:")
    sample_csv = "name,age,score\nAlice,25,85\nBob,30,92"
    result = analyze_file(sample_csv, "csv")
    print(result)
    
    # Test weather
    print("\nWeather Test:")
    result = get_weather("Paris")
    print(result)
    
    print("\n✅ All tools tested successfully!")