File size: 9,694 Bytes
8c162e4
546a399
8c162e4
982aacc
 
3e94ec9
1c67957
8c162e4
982aacc
82cfb9f
982aacc
 
 
 
 
 
 
 
8c162e4
982aacc
79d4225
 
fa1a705
 
 
 
f9f587e
fa1a705
 
 
 
 
58f7a3e
fa1a705
 
982aacc
8c162e4
2bcf72e
 
 
3e94ec9
2bcf72e
 
982aacc
2bcf72e
 
 
 
 
 
 
 
 
 
 
 
 
 
3e94ec9
0b52792
982aacc
3e94ec9
 
 
 
 
 
 
 
 
0b52792
3e94ec9
2bcf72e
82cfb9f
982aacc
 
3e94ec9
982aacc
 
31dbc84
982aacc
 
 
 
 
 
 
 
 
 
 
 
 
 
e1b0e5c
0b52792
982aacc
3e94ec9
 
 
 
 
 
 
982aacc
3e94ec9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0b52792
3e94ec9
 
31dbc84
 
 
3e94ec9
 
 
 
 
 
 
 
 
 
 
 
31dbc84
 
3e94ec9
31dbc84
 
3e94ec9
 
 
 
 
 
 
 
2bcf72e
3e94ec9
 
 
 
cede6a0
3e94ec9
 
22985f3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3e94ec9
cbbbfb4
 
 
 
eb84627
cbbbfb4
 
68a5482
 
 
cbbbfb4
 
 
 
 
eb84627
 
 
 
 
 
cbbbfb4
9d6412f
cbbbfb4
 
 
3e94ec9
 
68a5482
3e94ec9
 
 
 
 
 
22985f3
68a5482
92cc01c
cbbbfb4
1393885
 
bf05a20
bea827c
cbbbfb4
99b8231
3e94ec9
31dbc84
 
 
 
bf05a20
31dbc84
 
 
 
 
 
 
 
 
bf05a20
 
31dbc84
bf05a20
cbbbfb4
982aacc
cbbbfb4
eb84627
cbbbfb4
85dd0e2
cbbbfb4
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
import math
from typing import Optional, Tuple, Literal
from smolagents import tool
import base64
from openai import OpenAI
import joblib
import os

@tool
def download_and_get_path_for_provided_file(path: str) -> str:
    """
    Download and cache a GAIA benchmark attachment; return its local path.

    Tries the "test" split first, then "validation", since an attachment may
    live in either split of the dataset repository.

    Args:
        path (str): File name of the attachment as referenced by the task
            (e.g. "example.xlsx"); resolved under "2023/<split>/" in the repo.

    Returns:
        str: Local filesystem path of the cached file, or None when the file
            is not found in any split.
    """
    from huggingface_hub import hf_hub_download

    for split in ("test", "validation"):
        try:
            file_path = hf_hub_download(
                repo_id="gaia-benchmark/GAIA",
                filename=f"2023/{split}/{path}",
                repo_type="dataset",
                token=os.environ['HF_TOKEN'],
            )
            if file_path:
                return file_path
        except Exception as e:
            # The file may simply be absent from this split; log and try the next.
            print(e)

    # Not found in any split; callers should treat None as "file unavailable".
    return None

        

@tool
def extract_text_from_audio(file_path: str) -> str:
    """
    Extract and return text transcription from an audio file given its path.

    The transcription is cached in a sidecar file ("<file_path>.transcript")
    so repeated calls for the same audio do not re-hit the API.

    Args:
        file_path (str): Path to the audio file to be transcribed.

    Returns:
        str: The extracted text content from the audio file.

    Raises:
        Exception: Propagated from the OpenAI client when transcription fails.

    Examples:
        >>> extract_text_from_audio("meeting_recording.wav")
        "Hello team, welcome to our weekly meeting..."

        >>> extract_text_from_audio("/path/to/audio/interview.mp3")
        "Could you please introduce yourself and your background?"
    """
    cache_path = f"{file_path}.transcript"
    try:
        # Fast path: a previous call already transcribed this file.
        return joblib.load(cache_path)
    except Exception:
        client = OpenAI()
        # Context manager so the audio handle is always closed (the original
        # code leaked the open file object).
        with open(file_path, "rb") as audio_file:
            transcription = client.audio.transcriptions.create(
                model="gpt-4o-transcribe",
                file=audio_file,
                response_format="text"
            )
        # BUG FIX: the original dumped the cache to file_path itself, which
        # overwrote (destroyed) the source audio file. Use a sidecar path.
        joblib.dump(transcription, cache_path)
        return transcription

@tool
def describe_image(request: str, file_path: str) -> str:
    """
    Extract and return the requested information from an image given its path.

    Answers are cached per (image, request) pair in a sidecar file so
    repeated identical calls do not re-hit the API.

    Args:
        request: The information to retrieve from the image. The request must
            be simple, short and precise.
        file_path (str): Path to the image file to analyze.

    Returns:
        str: The extracted text from the image.

    Examples:
        >>> describe_image("how many birds are in the picture", "underwater_picture.jpg")
        "There are 2 birds depicted in an frame placed underwater"

        >>> describe_image("what is the position of the black queen?","chess_board.png")
        "Qd3"
    """
    import hashlib

    # BUG FIX (two issues in the original caching):
    #  1. it dumped the cache to file_path itself, overwriting the image;
    #  2. the key ignored `request`, so a second, different question about
    #     the same image returned the first question's answer.
    request_key = hashlib.md5(request.encode("utf-8")).hexdigest()
    cache_path = f"{file_path}.{request_key}.description"

    try:
        # Fast path: this exact question about this image was answered before.
        return joblib.load(cache_path)
    except Exception:
        client = OpenAI()

        def encode_image(image_path):
            # Base64-encode the raw image bytes for the data-URL payload.
            with open(image_path, "rb") as image_file:
                return base64.b64encode(image_file.read()).decode("utf-8")

        base64_image = encode_image(file_path)

        response = client.responses.create(
            model="gpt-4.1",
            input=[
                {
                    "role": "user",
                    "content": [
                        { "type": "input_text", "text": request },
                        {
                            "type": "input_image",
                            "image_url": f"data:image/jpeg;base64,{base64_image}",
                        },
                    ],
                }
            ],
        )
        joblib.dump(response.output_text, cache_path)
        return response.output_text



@tool
def get_transcript_from_youtube_file_id(file_id: str) -> str:
    """
    Retrieve the transcript for a YouTube video given its id.
    
    Args:
        file_id (str): The YouTube video ID (the alphanumeric string that appears after
                      'v=' in a YouTube URL, e.g., 'dQw4w9WgXcQ').
    
    Returns:
        str: The transcript content for the specified video. a JSON string or formatted
             text containing transcript segments with timestamps.
    """
    # Imported lazily so the dependency is only required when the tool runs.
    from youtube_transcript_api import YouTubeTranscriptApi
    ytt_api = YouTubeTranscriptApi()
    # NOTE(review): fetch() returns a transcript object, not a plain str as the
    # annotation claims — presumably the agent framework stringifies it; confirm.
    transcript = ytt_api.fetch(file_id)  
    return transcript


@tool
def parse_python_file(path: str) -> str:
    """
    Return the full source text of a Python file.

    Args:
        path (str): The file path to the Python file to be read.

    Returns:
        str: The complete contents of the Python file as a string.
    """
    with open(path, "r") as source:
        contents = source.read()
    return contents

@tool
def parse_pdf_file(path: str) -> str:
    """
    Read and return the text contents of a pdf file from its path.

    Args:
        path (str): The file path to the pdf file to be read.

    Returns:
        str: The extracted text of every page (newline-separated), or an
            error message when the path does not end with ".pdf".
    """
    from pypdf import PdfReader

    if not path.endswith(".pdf"):
        return "file does not end with .pdf"

    reader = PdfReader(path)

    out = ""
    # BUG FIX: the original indexed reader.pages[0] inside the loop, so the
    # output was the FIRST page repeated once per page instead of all pages.
    for page in reader.pages:
        text = page.extract_text()
        # extract_text() can return None for image-only pages; treat as empty.
        out += (text or "") + "\n"
    return out


        
class TestAgent:
    """GAIA-benchmark agent.

    Wraps a smolagents CodeAgent wired with file-download, web-search,
    wikipedia, media (audio/image/YouTube) and document (python/pdf) tools,
    plus a system-prompt suffix that enforces the benchmark's strict
    FINAL ANSWER formatting rules.
    """

    def __init__(self):
        """Build the CodeAgent: tools, model, limits, and prompt guidance."""
        # import code agent and basic tool from smolagent
        from smolagents import CodeAgent, OpenAIServerModel, DuckDuckGoSearchTool, FinalAnswerTool, VisitWebpageTool, MCPClient

        # import additional tool from langchain @ https://docs.langchain.com/oss/python/integrations/tools
        #from langchain_community.agent_toolkits import load_tools
        from langchain_community.agent_toolkits.load_tools import load_tools

        from smolagents import Tool
        wikipedia_tool = Tool.from_langchain(load_tools(["wikipedia"])[0])
        wikipedia_tool.top_k_results = 3

        # import tools from MCP servers @ https://github.com/mcp
        #from mcp import StdioServerParameters
        #server_parameters = StdioServerParameters(command="uvx",
        #                                          args=["--quiet", "youtubeqa@0.2.1"],
        #                                          env={"UV_PYTHON": "3.12", **os.environ},
        #                                         )
        #youtube_tools = MCPServerTool(server_params=server_parameters)

        model = OpenAIServerModel(model_id="gpt-4.1-mini")
        #model = InferenceClientModel("Qwen/Qwen2.5-Coder-32B-Instruct")
        # Instantiate the agent
        self.agent = CodeAgent(
            tools=[download_and_get_path_for_provided_file,        # V4. get attached file
                   DuckDuckGoSearchTool(),                         # basic tools from smolagent
                   VisitWebpageTool(),
                   wikipedia_tool,                                 # tool from langchain with extra parameters
                   #youtube_tools,                                 # tool from MCP server
                   get_transcript_from_youtube_file_id,            # V4
                   parse_python_file,                              # V4
                   describe_image,                                 # V4
                   extract_text_from_audio,                        # V4
                   parse_pdf_file,                                 # V5
                   FinalAnswerTool()],
            additional_authorized_imports=["pandas","markdownify","requests","chess","os"],    # V2 add markdownify & requests V5 add chess and os
            model=model,
            max_steps=6,                              # V3 increase steps
            planning_interval=3,                      # V3 add structure
            verbosity_level=0,
            use_structured_outputs_internally=True   # V3. Adds structure
        )
        # V3. add Guidance
        #prompt_for_guidance = "\n10. Provide the answer exactly as it is asked, be concise and precise\n\nNow Begin!"
        #self.agent.prompt_templates['system_prompt'] = self.agent.prompt_templates['system_prompt'] + prompt_for_guidance

        # V4. use prompt from the paper as guidance
        # (typo fix: "foillowing" -> "following" — this text is read by the LLM)
        prompt = """\n\n
                It is very important to remember the following: You are a general AI assistant. I will ask you a question. Report your thoughts, and
                finish your answer with the following template: FINAL ANSWER: [YOUR FINAL ANSWER].
                YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated
                list of numbers and/or strings.
                If you are asked for a number, don’t use comma to write your number neither use units such as $ or
                percent sign unless specified otherwise.
                If you are asked for a string, don’t use articles, neither abbreviations (e.g. for cities), and write the
                digits in plain text unless specified otherwise.
                If you are asked for a comma separated list, apply the above rules depending of whether the element
                to be put in the list is a number or a string.
                \n\n
                Now it's your turn.
                """
        self.agent.prompt_templates['system_prompt'] =  self.agent.prompt_templates['system_prompt']  + prompt


    def __call__(self, question: str) -> str:
        """Run the agent on a question and return its final answer string."""
        print(f"Agent received question (first 50 chars): {question[:50]}...")
        answer = self.agent.run(question)
        print(f"Agent returning his answer: {answer}")
        return answer