# agentbee
#
# Running
#
# File size: 5,840 Bytes

"""
Tool implementations package
Author: @mangubee

This package contains all agent tools:
- web_search: Web search using Tavily/Exa
- file_parser: Multi-format file parsing (PDF/Excel/Word/Text)
- calculator: Safe mathematical expression evaluation
- vision: Multimodal image analysis using LLMs
- youtube: YouTube transcript extraction with Whisper fallback
- audio: Audio transcription using Whisper (ZeroGPU)

Stage 2: All tools implemented with retry logic and error handling
Phase 1: YouTube + Audio transcription added
"""

from src.tools.web_search import search, tavily_search, exa_search
from src.tools.file_parser import parse_file, parse_pdf, parse_excel, parse_word, parse_text
from src.tools.calculator import safe_eval
from src.tools.vision import analyze_image, analyze_image_gemini, analyze_image_claude
from src.tools.youtube import youtube_transcript
from src.tools.audio import transcribe_audio, cleanup

# Tool registry: maps each tool name to its callable plus the metadata stored
# with it (description, parameters, required parameters, category).
# Schema matches LLM function calling requirements (parameters as dict, not list).
# Long descriptions are split across adjacent string literals, which Python
# joins into a single string.
TOOLS = {
    # --- Web search ---
    "web_search": {
        "function": search,
        "description": (
            "Search the web for factual information, current events, Wikipedia "
            "articles, statistics, people, companies, and research. Use when "
            "question requires external knowledge not in context or files."
        ),
        "parameters": {
            "query": {"description": "Search query string", "type": "string"},
            "max_results": {
                "description": "Maximum number of search results to return (default: 5)",
                "type": "integer",
            },
        },
        "required_params": ["query"],
        "category": "information_retrieval",
    },
    # --- Document parsing ---
    "parse_file": {
        "function": parse_file,
        "description": (
            "Extract and parse content from uploaded files (PDF, Excel, Word, "
            "Text, CSV). Use when question references 'the file', 'uploaded "
            "document', 'attachment', or specific file formats. Reads file "
            "structure and text content."
        ),
        "parameters": {
            "file_path": {
                "description": "Absolute or relative path to the file to parse",
                "type": "string",
            },
        },
        "required_params": ["file_path"],
        "category": "file_processing",
    },
    # --- Math evaluation ---
    "calculator": {
        "function": safe_eval,
        "description": (
            "Evaluate mathematical expressions and perform calculations "
            "(arithmetic, algebra, trigonometry, logarithms). Supports operators "
            "(+, -, *, /, **) and functions (sqrt, sin, cos, log, abs, etc). Use "
            "for any numerical computation or formula evaluation."
        ),
        "parameters": {
            "expression": {
                "description": (
                    "Mathematical expression to evaluate "
                    "(e.g., '2 + 2', 'sqrt(16)', '25 * 37 + 100')"
                ),
                "type": "string",
            },
        },
        "required_params": ["expression"],
        "category": "computation",
    },
    # --- Image analysis ---
    "vision": {
        "function": analyze_image,
        "description": (
            "Analyze image files using multimodal AI vision models. Describe "
            "visual content, identify objects, read text from images, answer "
            "questions about photos or screenshots. Use when question mentions "
            "images, photos, pictures, screenshots, or visual content. Supports "
            "JPEG, PNG, GIF, BMP, WebP formats. NOTE: Cannot process videos. For "
            "YouTube videos or video files, use youtube_transcript or "
            "transcribe_audio tools instead."
        ),
        "parameters": {
            "image_path": {
                "description": "Path to the image file to analyze (JPEG, PNG, GIF, BMP, WebP)",
                "type": "string",
            },
            # Listed as optional: only image_path appears in required_params.
            "question": {
                "description": (
                    "Question to ask about the image "
                    "(optional, defaults to 'Describe this image')"
                ),
                "type": "string",
            },
        },
        "required_params": ["image_path"],
        "category": "multimodal",
    },
    # --- YouTube video handling ---
    "youtube_transcript": {
        "function": youtube_transcript,
        "description": (
            "Extract transcript from YouTube video URLs (youtube.com, youtu.be, "
            "shorts) OR analyze video frames visually. Use this tool FIRST when "
            "question mentions YouTube, video, or contains a YouTube URL. This "
            "tool handles video content in two modes: (1) Transcript mode "
            "extracts what is said/discussed via captions or Whisper fallback, "
            "(2) Frame mode extracts and analyzes video frames with vision "
            "models. Mode is controlled by YOUTUBE_MODE env variable. This is "
            "the ONLY tool that can process YouTube URLs directly."
        ),
        "parameters": {
            "url": {
                "description": (
                    "YouTube video URL "
                    "(youtube.com/watch?v=ID, youtu.be/ID, or shorts/ID format)"
                ),
                "type": "string",
            },
        },
        "required_params": ["url"],
        "category": "video_processing",
    },
    # --- Audio transcription ---
    "transcribe_audio": {
        "function": transcribe_audio,
        "description": (
            "Transcribe audio file using Whisper speech-to-text. Supports MP3, "
            "WAV, M4A, OGG, FLAC, AAC formats. Use when question references "
            "audio files, podcasts, voice recordings, or when YouTube video "
            "lacks transcript. Returns transcribed text."
        ),
        "parameters": {
            "file_path": {
                "description": "Path to the audio file to transcribe",
                "type": "string",
            },
        },
        "required_params": ["file_path"],
        "category": "audio_processing",
    },
}

# Public API of the package: the names exported by a star-import.
__all__ = [
    # Unified entry points, one per tool registered in TOOLS
    "search", "parse_file", "safe_eval",
    "analyze_image", "youtube_transcript", "transcribe_audio",
    # Backend-specific implementations (for advanced use)
    "tavily_search", "exa_search",
    "parse_pdf", "parse_excel", "parse_word", "parse_text",
    "analyze_image_gemini", "analyze_image_claude",
    "cleanup",
    # The tool registry itself
    "TOOLS",
]