Alessio-Chiovelli committed on
Commit
4d25efb
·
1 Parent(s): cb9639f

Audio summarization tools enabled

Browse files
Utils/__init__.py CHANGED
@@ -1,18 +1,28 @@
1
  from Utils.base64_to_image import base64_to_pil_image
2
  from Utils.image_to_base_64 import image_to_base64
 
 
 
 
 
3
  from Utils.chains import (
4
  get_chain, execute_chain,
5
  TextContent, FileContent,
6
- Base64ImageContent, UrlImageContent,
7
  Message, Content, ROLES_PROMPTS, IMAGE_MIME_TYPES
8
  )
9
 
10
  __all__ = [
 
 
 
 
11
  'base64_to_pil_image',
12
  'image_to_base64',
 
13
  'get_chain', 'execute_chain',
14
  'TextContent', 'FileContent',
15
- 'Base64ImageContent', 'UrlImageContent',
16
  'Message',
17
  'ROLES_PROMPTS', 'IMAGE_MIME_TYPES', 'Content',
18
  ]
 
1
  from Utils.base64_to_image import base64_to_pil_image
2
  from Utils.image_to_base_64 import image_to_base64
3
+ from Utils.extract_image_mime_type import extract_image_mime_type
4
+ from Utils.open_file_bytes import open_file_bytes
5
+ from Utils.bytes_to_base64 import bytes_to_base64
6
+ from Utils.extract_file_extension import extract_file_extension
7
+ from Utils.get_audio_mime_type_from_audio_file_path import get_audio_mime_type_from_audio_file_path
8
  from Utils.chains import (
9
  get_chain, execute_chain,
10
  TextContent, FileContent,
11
+ Base64ImageContent, Base64AudioContent, UrlImageContent,
12
  Message, Content, ROLES_PROMPTS, IMAGE_MIME_TYPES
13
  )
14
 
15
  __all__ = [
16
+ 'bytes_to_base64',
17
+ 'open_file_bytes',
18
+ 'extract_file_extension',
19
+ 'get_audio_mime_type_from_audio_file_path',
20
  'base64_to_pil_image',
21
  'image_to_base64',
22
+ 'extract_image_mime_type',
23
  'get_chain', 'execute_chain',
24
  'TextContent', 'FileContent',
25
+ 'Base64ImageContent', 'Base64AudioContent', 'UrlImageContent',
26
  'Message',
27
  'ROLES_PROMPTS', 'IMAGE_MIME_TYPES', 'Content',
28
  ]
Utils/bytes_to_base64.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+
3
def bytes_to_base64(data: bytes) -> str:
    """
    Encode raw bytes as a base64 text string.

    Args:
        data (bytes): The binary payload to encode.

    Returns:
        str: The base64 representation of ``data``.
    """
    # b64encode yields ASCII-only bytes; decode them to get a plain str.
    encoded = base64.b64encode(data)
    return encoded.decode('utf-8')
Utils/chains.py CHANGED
@@ -5,10 +5,13 @@ from langchain.chat_models import init_chat_model
5
  from typing import Literal
6
  from pydantic import BaseModel
7
  from typing import List, Literal, Union, Any
 
8
 
9
- from Utils.image_to_base_64 import image_to_base64
10
 
11
  IMAGE_MIME_TYPES = Literal['image/png', 'image/jpeg', 'image/gif', 'image/webp']
 
 
12
  class TextContent(BaseModel):
13
  type: Literal["text"] = "text"
14
  text: str
@@ -25,12 +28,18 @@ class Base64ImageContent(BaseModel):
25
  mime_type: IMAGE_MIME_TYPES = "image/png"
26
  data: str
27
 
 
 
 
 
 
 
28
  class UrlImageContent(BaseModel):
29
  type: Literal["image"] = "image"
30
  source_type: Literal["url"] = "url"
31
  url: str
32
 
33
- Content = Union[TextContent, TextContent, FileContent, Base64ImageContent, UrlImageContent]
34
  ROLES_PROMPTS = Literal['user', 'system', 'assistant']
35
 
36
  class Message(BaseModel):
 
5
  from typing import Literal
6
  from pydantic import BaseModel
7
  from typing import List, Literal, Union, Any
8
+ import re
9
 
10
+ from Utils import image_to_base64, extract_file_extension
11
 
12
  IMAGE_MIME_TYPES = Literal['image/png', 'image/jpeg', 'image/gif', 'image/webp']
13
+ AUDIO_MIME_TYPES = Literal[ "audio/mp3", "audio/wav", "audio/flac","audio/aac", "audio/ogg"]
14
+
15
  class TextContent(BaseModel):
16
  type: Literal["text"] = "text"
17
  text: str
 
28
  mime_type: IMAGE_MIME_TYPES = "image/png"
29
  data: str
30
 
31
class Base64AudioContent(BaseModel):
    # Message content part that carries an audio clip as a base64 string.
    type: Literal["audio"] = "audio"  # fixed discriminator: this part is audio
    source_type: Literal["base64"] = "base64"  # fixed discriminator: payload is given as base64
    mime_type: AUDIO_MIME_TYPES = "audio/mp3"  # must be one of AUDIO_MIME_TYPES; defaults to mp3
    data: str  # the audio payload (presumably base64 text, per source_type — confirm with callers)
36
+
37
  class UrlImageContent(BaseModel):
38
  type: Literal["image"] = "image"
39
  source_type: Literal["url"] = "url"
40
  url: str
41
 
42
+ Content = Union[TextContent, TextContent, FileContent, Base64ImageContent, Base64AudioContent, UrlImageContent]
43
  ROLES_PROMPTS = Literal['user', 'system', 'assistant']
44
 
45
  class Message(BaseModel):
Utils/extract_file_extension.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
def extract_file_extension(filename):
    """
    Return the extension part of a file name.

    The extension keeps its leading dot (e.g. '.txt'); an empty string is
    returned when the name has no extension.
    """
    # splitext returns (root, ext); only the second element is needed.
    return os.path.splitext(filename)[1]
Utils/extract_image_mime_type.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ from io import BytesIO
3
+ from PIL.ImageFile import ImageFile
4
+ from PIL import Image
5
+
6
+ from Utils.chains import IMAGE_MIME_TYPES
7
+
8
def extract_image_mime_type(img:ImageFile) -> IMAGE_MIME_TYPES:
    """
    Derive the MIME type of an opened image from its ``format`` attribute.

    Args:
        img (ImageFile): The opened image whose ``format`` is inspected.

    Returns:
        IMAGE_MIME_TYPES: ``'image/jpeg'``, ``'image/png'`` or ``'image/webp'``.

    Raises:
        AssertionError: If the image has no format, or the format is not one
            of the supported ones. The exception type is kept from the
            previous ``assert``-based checks so callers see no change, but it
            is now raised explicitly and therefore survives ``python -O``.
    """
    image_format: str | None = img.format
    if image_format is None:
        raise AssertionError("The image format could not be determined. Please provide a valid image file.")

    subtype = image_format.lower()
    # 'image/jpg' is not a member of IMAGE_MIME_TYPES; fold the alias into
    # the canonical subtype so the returned value always fits the return type.
    if subtype == 'jpg':
        subtype = 'jpeg'

    mime_type = f'image/{subtype}'
    if mime_type not in ('image/jpeg', 'image/png', 'image/webp'):
        raise AssertionError("Unsupported image format. Supported formats are: jpeg, jpg, png, webp.")
    return mime_type  # type: ignore
Utils/get_audio_mime_type_from_audio_file_path.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from Utils.chains import AUDIO_MIME_TYPES
2
+ from Utils.extract_file_extension import extract_file_extension
3
+
4
# Lookup table from an audio file extension (lower-case, no leading dot)
# to the MIME type string used for that extension.
AUDIO_MIME_TYPE_MAPPING = {
    extension: f"audio/{extension}"
    for extension in ("mp3", "wav", "flac", "aac", "ogg")
}
11
+
12
def get_audio_mime_type_from_audio_file_path(audio_file_path: str) -> AUDIO_MIME_TYPES:
    """
    Return the MIME type matching the extension of an audio file path.

    The extension is compared case-insensitively and without its leading dot.

    Args:
        audio_file_path (str): Path (or bare file name) of the audio file.

    Returns:
        AUDIO_MIME_TYPES: The AUDIO_MIME_TYPE_MAPPING entry for the extension.

    Raises:
        KeyError: If the path has no extension or the extension is not a key
            of AUDIO_MIME_TYPE_MAPPING. The message names the offending path
            and lists the supported extensions.
    """
    extension = extract_file_extension(audio_file_path).replace('.', '').lower()
    try:
        return AUDIO_MIME_TYPE_MAPPING[extension]
    except KeyError:
        # Same exception type as the plain dict lookup, but with enough
        # context to tell which file was rejected and what is accepted.
        supported = ', '.join(sorted(AUDIO_MIME_TYPE_MAPPING))
        raise KeyError(
            f"Unsupported audio file extension {extension!r} for {audio_file_path!r}. "
            f"Supported extensions: {supported}."
        ) from None
Utils/open_file_bytes.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def open_file_bytes(file_path: str) -> bytes:
    """
    Read a file in binary mode and return everything it contains.

    Args:
        file_path (str): Location of the file to read.

    Returns:
        bytes: The complete raw content of the file.
    """
    # The context manager closes the handle even if the read fails.
    with open(file_path, mode='rb') as stream:
        content = stream.read()
    return content
constants.py CHANGED
@@ -8,7 +8,7 @@ from tools import (
8
  # search tools
9
  tavily_search_tool, google_serper_web_search, custom_youtube_transcription_tool,
10
  # multimodal tools
11
- describe_image_tool, extract_text_from_image,
12
  )
13
 
14
  load_dotenv('.env')
@@ -37,7 +37,7 @@ AGENT_TOOLS = [
37
  # search tools
38
  tavily_search_tool, google_serper_web_search, custom_youtube_transcription_tool,
39
  # multimodal tools
40
- describe_image_tool, extract_text_from_image
41
  ]
42
 
43
  LANGFUSE_API_KEY = os.getenv("LANGFUSE_API_KEY")
 
8
  # search tools
9
  tavily_search_tool, google_serper_web_search, custom_youtube_transcription_tool,
10
  # multimodal tools
11
+ describe_image_tool, extract_text_from_image_tool, summarize_audio_content_tool
12
  )
13
 
14
  load_dotenv('.env')
 
37
  # search tools
38
  tavily_search_tool, google_serper_web_search, custom_youtube_transcription_tool,
39
  # multimodal tools
40
+ describe_image_tool, extract_text_from_image_tool, summarize_audio_content_tool
41
  ]
42
 
43
  LANGFUSE_API_KEY = os.getenv("LANGFUSE_API_KEY")
tools/__init__.py CHANGED
@@ -2,7 +2,7 @@ from tools.calculator import calculator_tool, sum, sub, multiply, divide
2
  from tools.reverse_words import reverse_words
3
  from tools.websearch import tavily_search_tool, google_serper_web_search
4
  from tools.youtube_tool import custom_youtube_transcription_tool
5
- from tools.multimodal_llm_tools import describe_image_tool, extract_text_from_image
6
 
7
  __all__ = [
8
  # math tools
@@ -19,5 +19,6 @@ __all__ = [
19
  'custom_youtube_transcription_tool',
20
  # multimodal description tools
21
  'describe_image_tool',
22
- 'extract_text_from_image',
 
23
  ]
 
2
  from tools.reverse_words import reverse_words
3
  from tools.websearch import tavily_search_tool, google_serper_web_search
4
  from tools.youtube_tool import custom_youtube_transcription_tool
5
+ from tools.multimodal_llm_tools import describe_image_tool, extract_text_from_image_tool, summarize_audio_content_tool
6
 
7
  __all__ = [
8
  # math tools
 
19
  'custom_youtube_transcription_tool',
20
  # multimodal description tools
21
  'describe_image_tool',
22
+ 'extract_text_from_image_tool',
23
+ 'summarize_audio_content_tool',
24
  ]
tools/multimodal_llm_tools.py CHANGED
@@ -6,36 +6,41 @@ from functools import partial
6
  import os
7
 
8
  from Utils import *
 
9
 
10
- def extract_image_mime_type(img:ImageFile) -> IMAGE_MIME_TYPES:
11
- img_mime_type : str | None = img.format
12
- assert img_mime_type is not None, "The image format could not be determined. Please provide a valid image file."
13
- img_mime_type = f'image/{img_mime_type.lower()}' if hasattr(img, 'format') else 'image/webp'
14
- assert img_mime_type in ['image/jpeg', 'image/jpg', 'image/png', 'image/webp'], "Unsupported image format. Supported formats are: jpeg, jpg, png, webp."
15
- return img_mime_type # type: ignore
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
- def image_base_tool_template(image : str | ImageFile, system_prompt : str, additional_user_query : Optional[str] = "") -> str:
 
 
 
 
 
 
18
  prompt_messages = [Message(role = 'system', content=[TextContent(text = system_prompt)]), ]
19
-
20
- if isinstance(image, str):
21
- if os.path.exists(image): # the image was passed as a path
22
- image = Image.open(image)
23
- img = Image.open(image) if isinstance(image, str) else image
24
- else: # we passed a base64 string
25
- img = base64_to_pil_image(image)
26
- elif isinstance(image, ImageFile):
27
- img = image
28
  user_message_content = []
29
- user_message_content.append(
30
- Base64ImageContent(
31
- data=image_to_base64(image),
32
- mime_type=extract_image_mime_type(img) # TODO: fix the linter error
33
- ))
34
- if additional_user_query:
35
- user_message_content.append(TextContent(text=additional_user_query))
36
  user_message = Message(role = 'user', content = user_message_content)
37
  prompt_messages.append(user_message)
38
- image_descriptor_llm_chain = get_chain(model_settings ={"model" : "gpt-4.1", "temperature": 0.5})
39
  try:
40
  llm_image_description : str = execute_chain(chain = image_descriptor_llm_chain, messages=prompt_messages)
41
  except Exception as e:
@@ -52,23 +57,131 @@ EXTRACT_TEXT_FROM_IMAGE_SYSTEM_PROMPT = ''.join([
52
  'Additionally, the user may give you some additional info in a query',
53
  'If so, use it to give a more precise answer'
54
  ])
55
- describe_image_tool = Tool(
56
- name = "DescribeImageTool",
57
- description="""
58
- This tool describes the content of an image.
59
- The image can be passed as a base64 string, a PIL image or as a path to an image file.
60
- The images, if passed as paths, are in your mount disk.
61
- Additionally, this tool can take a user query that can be used to provide a more precise answer.
62
- """,
63
- func = partial(image_base_tool_template, system_prompt = DESCRIBE_IMAGE_SYSTEM_PROMPT)
64
- )
65
- extract_text_from_image = Tool(
66
- name = "ExtractImageTextTool",
67
- description="""
68
- This tool extracts the text contained inside of an image.
69
- The image can be passed as a base64 string, a PIL image or as a path to an image file.
70
- The images, if passed as paths, are in your mount disk.
71
- Additionally, this tool can take a user query that can be used to provide a more precise answer.
72
- """,
73
- func = partial(image_base_tool_template, system_prompt = EXTRACT_TEXT_FROM_IMAGE_SYSTEM_PROMPT)
74
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  import os
7
 
8
  from Utils import *
9
+ from langchain_core.tools import tool
10
 
11
def parse_multimodal_contents(**kwargs) -> list[Content]:
    """
    Build the non-text content parts of a multimodal user message.

    Keyword Args:
        image: Optional image. A string is treated as a file path when that
            path exists, otherwise as a base64 string; an ``ImageFile`` is
            used as is. Falsy values are ignored.
        audio: Optional path to an audio file. Falsy values are ignored.

    Returns:
        list[Content]: A ``Base64ImageContent`` when an image was given,
        followed by a ``Base64AudioContent`` when an audio path was given.

    Raises:
        TypeError: If ``image`` is neither a string nor an ``ImageFile``
            (previously this surfaced as an ``UnboundLocalError``).
    """
    contents = []
    if (image := kwargs.get('image')):
        if isinstance(image, str):
            if os.path.exists(image):  # the image was passed as a path
                img = Image.open(image)
            else:  # we passed a base64 string
                img = base64_to_pil_image(image)
        elif isinstance(image, ImageFile):
            img = image
        else:
            raise TypeError(
                f"Unsupported image argument of type {type(image).__name__}; "
                "expected a file path, a base64 string or an ImageFile."
            )
        # NOTE(review): the payload is still encoded from the original `image`
        # argument while the MIME type comes from the opened `img`; confirm
        # that image_to_base64 accepts path/base64 strings as well as images.
        contents.append(Base64ImageContent(data=image_to_base64(image), mime_type=extract_image_mime_type(img)))
    if (audio_path := kwargs.get('audio')):
        audio_base64 = bytes_to_base64(open_file_bytes(audio_path))
        contents.append(Base64AudioContent(data=audio_base64, mime_type=get_audio_mime_type_from_audio_file_path(audio_path)))
    return contents
29
 
30
+ def multimodal_tool_template(
31
+ system_prompt : str,
32
+ image: ImageFile | str | None,
33
+ audio: str | None,
34
+ model_settings : dict = {"model": "gpt-4.1", "temperature": 0.5},
35
+ additional_user_query : Optional[str] = "",
36
+ ) -> str:
37
  prompt_messages = [Message(role = 'system', content=[TextContent(text = system_prompt)]), ]
 
 
 
 
 
 
 
 
 
38
  user_message_content = []
39
+ user_message_content += parse_multimodal_contents(image = image, audio = audio)
40
+ if additional_user_query:user_message_content.append(TextContent(text=additional_user_query))
 
 
 
 
 
41
  user_message = Message(role = 'user', content = user_message_content)
42
  prompt_messages.append(user_message)
43
+ image_descriptor_llm_chain = get_chain(model_settings = model_settings)
44
  try:
45
  llm_image_description : str = execute_chain(chain = image_descriptor_llm_chain, messages=prompt_messages)
46
  except Exception as e:
 
57
  'Additionally, the user may give you some additional info in a query',
58
  'If so, use it to give a more precise answer'
59
  ])
60
# System prompt handed to the model by the audio summarization tool.
SUMMARIZE_AUDIO_SYSTEM_PROMPT = 'Summarize what has been said in this audio file'
63
+
64
+ # describe_image_tool = Tool(
65
+ # name = "DescribeImageTool",
66
+ # description="""
67
+ # This tool describes the content of an image.
68
+ # The image can be passed as a base64 string, a PIL image or as a path to an image file.
69
+ # The images, if passed as paths, are in your mount disk.
70
+ # Additionally, this tool can take a user query that can be used to provide a more precise answer.
71
+
72
+ # Args:
73
+ # image (str | PIL.ImageFile): The image to be described. This parameter is required.
74
+ # additional_user_query (str, optional): Additional information or query from the user to refine the description.
75
+ # Returns:
76
+ # str: The description of the image.
77
+ # Note:
78
+ # The 'image' parameter must be provided, otherwise the tool will not work as expected.
79
+ # """,
80
+ # func = lambda *args, **kwargs: image_base_tool_template(
81
+ # system_prompt=DESCRIBE_IMAGE_SYSTEM_PROMPT,
82
+ # audio=None,
83
+ # **kwargs)
84
+ # # partial(image_base_tool_template, system_prompt = DESCRIBE_IMAGE_SYSTEM_PROMPT, audio = None)
85
+ # )
86
+ # extract_text_from_image = Tool(
87
+ # name = "ExtractImageTextTool",
88
+ # description="""
89
+ # This tool extracts the text contained inside of an image.
90
+ # The image can be passed as a base64 string, a PIL image or as a path to an image file.
91
+ # The images, if passed as paths, are in your mount disk.
92
+ # Additionally, this tool can take a user query that can be used to provide a more precise answer.
93
+
94
+ # Args:
95
+ # image (str | PIL.ImageFile): The image to be described. This parameter is required.
96
+ # additional_user_query (str, optional): Additional information or query from the user to refine the description.
97
+ # Returns:
98
+ # str: The description of the image.
99
+ # Note:
100
+ # The 'image' parameter must be provided, otherwise the tool will not work as expected.
101
+ # """,
102
+ # func = lambda *args, **kwargs: image_base_tool_template(
103
+ # system_prompt=EXTRACT_TEXT_FROM_IMAGE_SYSTEM_PROMPT,
104
+ # audio=None,
105
+ # **kwargs)
106
+ # # partial(image_base_tool_template, system_prompt = EXTRACT_TEXT_FROM_IMAGE_SYSTEM_PROMPT, audio = None)
107
+ # )
108
+ # summarize_audio_content = Tool(
109
+ # name = "AnalizeAudioContentTool",
110
+ # description="""
111
+ # This tool analizes and summarizes the content of an audio file.
112
+
113
+ # Args:
114
+ # audio (str): The path where to extract the audio from.
115
+ # additional_user_query (str, optional): Additional information or query from the user to refine the description.
116
+ # Returns:
117
+ # str: The audio summarized.
118
+ # Note:
119
+ # The 'audio' parameter must be provided, otherwise the tool will not work as expected.
120
+ # """,
121
+ # func = lambda *args, **kwargs: image_base_tool_template(
122
+ # system_prompt=SUMMARIZE_AUDIO_SYSTEM_PROMPT,
123
+ # image=None,
124
+ # **kwargs)
125
+ # # partial(image_base_tool_template, system_prompt = SUMMARIZE_AUDIO_SYSTEM_PROMPT, image = None)
126
+ # )
127
+
128
+
129
@tool("DescribeImageTool", parse_docstring=True)
def describe_image_tool(image: ImageFile | str, additional_user_query: Optional[str] = "") -> str:
    """Describes the content of an image. The image can be a base64 string, a PIL image, or a path to an image file.
    Optionally, a user query can be provided to refine the description.

    Args:
        image (ImageFile | str): path to the image or base64 string of the image or Opened ImageFile.
        additional_user_query (Optional[str], optional): An additional user string query. Defaults to "".

    Returns:
        str: The description of the image.
    """
    # The docstring is parsed by the decorator (parse_docstring=True), so its
    # wording is part of the tool definition and is kept unchanged.
    settings = {"model": "gpt-4.1", "temperature": 0.5}
    # Image-only request: the audio slot of the shared template stays empty.
    description = multimodal_tool_template(
        system_prompt=DESCRIBE_IMAGE_SYSTEM_PROMPT,
        image=image,
        audio=None,
        model_settings=settings,
        additional_user_query=additional_user_query,
    )
    return description
148
+
149
@tool("ExtractImageTextTool", parse_docstring=True)
def extract_text_from_image_tool(image: ImageFile | str, additional_user_query: Optional[str] = "") -> str:
    """Extracts the text contained inside an image. The image can be a base64 string, a PIL image, or a path to an image file.
    Optionally, a user query can be provided to refine the extraction.

    Args:
        image (ImageFile | str): path to the image or base64 string of the image or Opened ImageFile.
        additional_user_query (Optional[str], optional): An additional user string query. Defaults to "".

    Returns:
        str: The extracted text from the image.
    """
    # This tool only ever sends an image, so it uses the same model settings
    # as describe_image_tool. The previous audio model id and "modalities"
    # entry were settings meant for audio input, not for reading images.
    return multimodal_tool_template(
        system_prompt=EXTRACT_TEXT_FROM_IMAGE_SYSTEM_PROMPT,
        model_settings={"model": "gpt-4.1", "temperature": 0.5},
        image=image,
        audio=None,  # image-only request
        additional_user_query=additional_user_query
    )
168
+
169
@tool("AnalizeAudioContentTool", parse_docstring=True)
def summarize_audio_content_tool(audio: str, additional_user_query: Optional[str] = "") -> str:
    """Analyzes and summarizes the content of an audio file. The audio must be provided as a file path.
    Optionally, a user query can be provided to refine the summary.

    Args:
        audio (str): The path to the audio file to be summarized.
        additional_user_query (Optional[str], optional): An additional user string query. Defaults to "".

    Returns:
        str: The summarized content of the audio file.
    """
    # The docstring is parsed by the decorator (parse_docstring=True), so its
    # wording is part of the tool definition and is kept unchanged.
    settings = {"model": "gpt-4o-audio-preview", "temperature": 0.5}
    # Audio-only request: the image slot of the shared template stays empty.
    summary = multimodal_tool_template(
        system_prompt=SUMMARIZE_AUDIO_SYSTEM_PROMPT,
        image=None,
        audio=audio,
        model_settings=settings,
        additional_user_query=additional_user_query,
    )
    return summary