RCaz committed on
Commit
982aacc
·
verified ·
1 Parent(s): faa5fe0

2 more LLMs for audio and video processing

Browse files
Files changed (1) hide show
  1. agent.py +83 -18
agent.py CHANGED
@@ -1,22 +1,38 @@
1
  import math
2
  from typing import Optional, Tuple, Literal
3
  from smolagents import tool
 
 
4
 
5
 
 
 
 
 
 
 
 
 
 
 
6
 
 
 
 
 
 
 
 
 
 
7
 
8
  @tool
9
  def extract_text_from_audio(file_path: str) -> str:
10
  """
11
- Extract and return text transcription from an audio file using speech recognition.
12
-
13
- This tool uses Google's speech recognition API to convert spoken audio content
14
- into text. It supports various audio formats including WAV, AIFF, and FLAC
15
- (formats supported by the SpeechRecognition library).
16
 
17
  Args:
18
- file_path (str): Path to the audio file to be transcribed. The file should
19
- be in a format compatible with the SpeechRecognition library.
20
 
21
  Returns:
22
  str: The extracted text content from the audio file.
@@ -32,18 +48,65 @@ def extract_text_from_audio(file_path: str) -> str:
32
  "Could you please introduce yourself and your background?"
33
  """
34
 
35
- import speech_recognition as sr
36
- r = sr.Recognizer()
37
- try:
38
- with sr.AudioFile(file_path) as source:
39
- # listen for the data (load audio to memory)
40
- audio_data = r.record(source)
41
- # recognize (convert from speech to text)
42
- text = r.recognize_google(audio_data)
43
- return text
44
- except Exception as e:
45
- return e
46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
 
48
  class TestAgent:
49
  def __init__(self):
@@ -88,6 +151,8 @@ class TestAgent:
88
  prompt_for_guidance = "\n10. Provide the answer axactly as it is asked, be concise and precise\n\nNow Begin!"
89
  self.agent.prompt_templates['system_prompt'] = self.agent.prompt_templates['system_prompt'] + prompt_for_guidance
90
 
 
 
91
  def __call__(self, question: str) -> str:
92
 
93
  print(f"Agent received question (first 50 chars): {question[:50]}...")
 
1
  import math
2
  from typing import Optional, Tuple, Literal
3
  from smolagents import tool
4
+ import base64
5
+ from openai import OpenAI
6
 
7
 
8
+ @tool
9
+ def download_and_get_path_for_provided_file(path: str):
10
+ """
11
+ Download and cache the provided file. Returns the path of the cached file.
12
+
13
+ Args:
14
+ path (str): Intended file path
15
+
16
+ Returns:
17
+ str: The local path of the cached file.
18
 
19
+ """
20
+ file_path = hf_hub_download(
21
+ repo_id="gaia-benchmark/GAIA",
22
+ filename="2023/test/063800f6-8832-4856-972b-17b877612533.png",
23
+ repo_type="dataset",
24
+ token=os.environ['HF_TOKEN']
25
+ )
26
+ return file_path
27
+
28
 
29
  @tool
30
  def extract_text_from_audio(file_path: str) -> str:
31
  """
32
+ Extract and return text transcription from an audio file.
 
 
 
 
33
 
34
  Args:
35
+ file_path (str): Path to the audio file to be transcribed.
 
36
 
37
  Returns:
38
  str: The extracted text content from the audio file.
 
48
  "Could you please introduce yourself and your background?"
49
  """
50
 
51
+ client = OpenAI()
52
+ audio_file = open(file_path, "rb")
53
+
54
+ transcription = client.audio.transcriptions.create(
55
+ model="gpt-4o-transcribe",
56
+ file=audio_file,
57
+ response_format="text"
58
+ )
59
+ return transcription
 
 
60
 
61
+
62
+ def describe_image(request:str, file_path: str) -> str:
63
+ """
64
+ Extract and return the requested information from an image.
65
+
66
+ Args:
67
+ request: The information to retrieve from the image.
68
+ file_path (str): Path to the image file to be analyzed. The file is read and
69
+ sent to the model as a base64-encoded data URL.
70
+
71
+ Returns:
72
+ str: The extracted text from the image.
73
+
74
+ Examples:
75
+ >>> describe_image("how many birds are in the picture", "underwater_picture.jpg")
76
+ "There are 2 birds depicted in a frame placed underwater"
77
+
78
+ >>> describe_image("what is the position of the black queen?","chess_board.png")
79
+ "Qd3"
80
+ """
81
+
82
+ client = OpenAI()
83
+
84
+ # Function to encode the image
85
+ def encode_image(image_path):
86
+ with open(image_path, "rb") as image_file:
87
+ return base64.b64encode(image_file.read()).decode("utf-8")
88
+
89
+ # Getting the Base64 string
90
+ base64_image = encode_image(file_path)
91
+
92
+
93
+ response = client.responses.create(
94
+ model="gpt-4.1",
95
+ input=[
96
+ {
97
+ "role": "user",
98
+ "content": [
99
+ { "type": "input_text", "text": request },
100
+ {
101
+ "type": "input_image",
102
+ "image_url": f"data:image/jpeg;base64,{base64_image}",
103
+ },
104
+ ],
105
+ }
106
+ ],
107
+ )
108
+
109
+ return response.output_text
110
 
111
  class TestAgent:
112
  def __init__(self):
 
151
  prompt_for_guidance = "\n10. Provide the answer axactly as it is asked, be concise and precise\n\nNow Begin!"
152
  self.agent.prompt_templates['system_prompt'] = self.agent.prompt_templates['system_prompt'] + prompt_for_guidance
153
 
154
+ # V4. use prompt from the paper ?
155
+
156
  def __call__(self, question: str) -> str:
157
 
158
  print(f"Agent received question (first 50 chars): {question[:50]}...")