File size: 5,840 Bytes
bd73133
1041734
e7b4937
bd73133
1041734
 
 
 
 
38cc8e4
 
bd73133
1041734
38cc8e4
bd73133
 
1041734
 
 
 
38cc8e4
 
1041734
 
4eed151
1041734
 
 
5890f66
4eed151
 
 
 
 
 
 
 
 
 
 
1041734
 
 
 
5890f66
4eed151
 
 
 
 
 
 
1041734
 
 
 
5890f66
4eed151
 
5890f66
4eed151
 
 
 
1041734
 
 
 
3ee1e2c
4eed151
 
3ee1e2c
4eed151
 
 
 
 
 
 
 
1041734
 
38cc8e4
 
f1b095a
38cc8e4
 
61ecfdb
38cc8e4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1041734
 
 
 
 
 
 
 
38cc8e4
 
1041734
 
 
 
 
 
 
 
 
38cc8e4
1041734
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
"""
Tool implementations package
Author: @mangubee

This package contains all agent tools:
- web_search: Web search using Tavily/Exa
- file_parser: Multi-format file parsing (PDF/Excel/Word/Text)
- calculator: Safe mathematical expression evaluation
- vision: Multimodal image analysis using LLMs
- youtube: YouTube transcript extraction with Whisper fallback
- audio: Audio transcription using Whisper (ZeroGPU)

Stage 2: All tools implemented with retry logic and error handling
Phase 1: YouTube + Audio transcription added
"""

from src.tools.web_search import search, tavily_search, exa_search
from src.tools.file_parser import parse_file, parse_pdf, parse_excel, parse_word, parse_text
from src.tools.calculator import safe_eval
from src.tools.vision import analyze_image, analyze_image_gemini, analyze_image_claude
from src.tools.youtube import youtube_transcript
from src.tools.audio import transcribe_audio, cleanup

# Registry of agent tools, keyed by tool name.
# Every entry carries the callable plus the metadata needed for LLM function
# calling; "parameters" is a dict keyed by parameter name (not a list).


def _param(type_name, description):
    """Return one parameter schema entry: its description, then its type."""
    return {"description": description, "type": type_name}


TOOLS = {
    # --- information retrieval ---
    "web_search": {
        "function": search,
        "description": (
            "Search the web for factual information, current events, Wikipedia "
            "articles, statistics, people, companies, and research. Use when "
            "question requires external knowledge not in context or files."
        ),
        "parameters": {
            "query": _param("string", "Search query string"),
            "max_results": _param(
                "integer",
                "Maximum number of search results to return (default: 5)",
            ),
        },
        "required_params": ["query"],
        "category": "information_retrieval",
    },
    # --- file processing ---
    "parse_file": {
        "function": parse_file,
        "description": (
            "Extract and parse content from uploaded files (PDF, Excel, Word, "
            "Text, CSV). Use when question references 'the file', 'uploaded "
            "document', 'attachment', or specific file formats. Reads file "
            "structure and text content."
        ),
        "parameters": {
            "file_path": _param(
                "string",
                "Absolute or relative path to the file to parse",
            ),
        },
        "required_params": ["file_path"],
        "category": "file_processing",
    },
    # --- computation ---
    "calculator": {
        "function": safe_eval,
        "description": (
            "Evaluate mathematical expressions and perform calculations "
            "(arithmetic, algebra, trigonometry, logarithms). Supports operators "
            "(+, -, *, /, **) and functions (sqrt, sin, cos, log, abs, etc). Use "
            "for any numerical computation or formula evaluation."
        ),
        "parameters": {
            "expression": _param(
                "string",
                "Mathematical expression to evaluate (e.g., '2 + 2', 'sqrt(16)', '25 * 37 + 100')",
            ),
        },
        "required_params": ["expression"],
        "category": "computation",
    },
    # --- multimodal (still images only) ---
    "vision": {
        "function": analyze_image,
        "description": (
            "Analyze image files using multimodal AI vision models. Describe "
            "visual content, identify objects, read text from images, answer "
            "questions about photos or screenshots. Use when question mentions "
            "images, photos, pictures, screenshots, or visual content. Supports "
            "JPEG, PNG, GIF, BMP, WebP formats. NOTE: Cannot process videos. For "
            "YouTube videos or video files, use youtube_transcript or "
            "transcribe_audio tools instead."
        ),
        "parameters": {
            "image_path": _param(
                "string",
                "Path to the image file to analyze (JPEG, PNG, GIF, BMP, WebP)",
            ),
            "question": _param(
                "string",
                "Question to ask about the image (optional, defaults to 'Describe this image')",
            ),
        },
        "required_params": ["image_path"],
        "category": "multimodal",
    },
    # --- video processing ---
    "youtube_transcript": {
        "function": youtube_transcript,
        "description": (
            "Extract transcript from YouTube video URLs (youtube.com, youtu.be, "
            "shorts) OR analyze video frames visually. Use this tool FIRST when "
            "question mentions YouTube, video, or contains a YouTube URL. This "
            "tool handles video content in two modes: (1) Transcript mode "
            "extracts what is said/discussed via captions or Whisper fallback, "
            "(2) Frame mode extracts and analyzes video frames with vision "
            "models. Mode is controlled by YOUTUBE_MODE env variable. This is "
            "the ONLY tool that can process YouTube URLs directly."
        ),
        "parameters": {
            "url": _param(
                "string",
                "YouTube video URL (youtube.com/watch?v=ID, youtu.be/ID, or shorts/ID format)",
            ),
        },
        "required_params": ["url"],
        "category": "video_processing",
    },
    # --- audio processing ---
    "transcribe_audio": {
        "function": transcribe_audio,
        "description": (
            "Transcribe audio file using Whisper speech-to-text. Supports MP3, "
            "WAV, M4A, OGG, FLAC, AAC formats. Use when question references "
            "audio files, podcasts, voice recordings, or when YouTube video "
            "lacks transcript. Returns transcribed text."
        ),
        "parameters": {
            "file_path": _param("string", "Path to the audio file to transcribe"),
        },
        "required_params": ["file_path"],
        "category": "audio_processing",
    },
}

# Public API of this package: the names exported by a wildcard import.
__all__ = [
    # Main unified tool functions (the callables registered under "function" in TOOLS)
    "search",
    "parse_file",
    "safe_eval",
    "analyze_image",
    "youtube_transcript",
    "transcribe_audio",
    # Specific implementations (for advanced use);
    # "cleanup" is re-exported from the audio module
    "tavily_search",
    "exa_search",
    "parse_pdf",
    "parse_excel",
    "parse_word",
    "parse_text",
    "analyze_image_gemini",
    "analyze_image_claude",
    "cleanup",
    # Tool registry (tool name -> callable + function-calling metadata)
    "TOOLS",
]