Spaces:

nicolacaione
/

HFAgentsCourse

Sleeping

App Files Files Community

Alessio-Chiovelli committed on Jun 21, 2025

Commit

4d25efb

1 Parent(s): cb9639f

Audio summarization tools enabled

Browse files

Files changed (10) hide show

Utils/__init__.py +12 -2
Utils/bytes_to_base64.py +13 -0
Utils/chains.py +11 -2
Utils/extract_file_extension.py +9 -0
Utils/extract_image_mime_type.py +13 -0
Utils/get_audio_mime_type_from_audio_file_path.py +17 -0
Utils/open_file_bytes.py +12 -0
constants.py +2 -2
tools/__init__.py +3 -2
tools/multimodal_llm_tools.py +157 -44

Utils/__init__.py CHANGED Viewed

@@ -1,18 +1,28 @@
 from Utils.base64_to_image import base64_to_pil_image
 from Utils.image_to_base_64 import image_to_base64
 from Utils.chains import (
     get_chain, execute_chain,
     TextContent, FileContent,
-    Base64ImageContent, UrlImageContent,
     Message, Content, ROLES_PROMPTS, IMAGE_MIME_TYPES
 )
 __all__ = [
     'base64_to_pil_image',
     'image_to_base64',
     'get_chain', 'execute_chain',
     'TextContent', 'FileContent',
-    'Base64ImageContent', 'UrlImageContent',
     'Message',
     'ROLES_PROMPTS', 'IMAGE_MIME_TYPES', 'Content',
 ]

 from Utils.base64_to_image import base64_to_pil_image
 from Utils.image_to_base_64 import image_to_base64
+from Utils.extract_image_mime_type import extract_image_mime_type
+from Utils.open_file_bytes import open_file_bytes
+from Utils.bytes_to_base64 import bytes_to_base64
+from Utils.extract_file_extension import extract_file_extension
+from Utils.get_audio_mime_type_from_audio_file_path import get_audio_mime_type_from_audio_file_path
 from Utils.chains import (
     get_chain, execute_chain,
     TextContent, FileContent,
+    Base64ImageContent, Base64AudioContent, UrlImageContent,
     Message, Content, ROLES_PROMPTS, IMAGE_MIME_TYPES
 )
 __all__ = [
+    'bytes_to_base64',
+    'open_file_bytes',
+    'extract_file_extension',
+    'get_audio_mime_type_from_audio_file_path',
     'base64_to_pil_image',
     'image_to_base64',
+    'extract_image_mime_type',
     'get_chain', 'execute_chain',
     'TextContent', 'FileContent',
+    'Base64ImageContent', 'Base64AudioContent', 'UrlImageContent',
     'Message',
     'ROLES_PROMPTS', 'IMAGE_MIME_TYPES', 'Content',
 ]

Utils/bytes_to_base64.py ADDED Viewed

	@@ -0,0 +1,13 @@

+import base64
def bytes_to_base64(data: bytes) -> str:
    """Encode a bytes object as a base64 text string.

    Args:
        data (bytes): The raw bytes to encode.

    Returns:
        str: The base64 representation of ``data``.
    """
    encoded = base64.b64encode(data)
    # b64encode returns bytes; turn them into text.
    return str(encoded, 'utf-8')

Utils/chains.py CHANGED Viewed

@@ -5,10 +5,13 @@ from langchain.chat_models import init_chat_model
 from typing import Literal
 from pydantic import BaseModel
 from typing import List, Literal, Union, Any
-from Utils.image_to_base_64 import image_to_base64
 IMAGE_MIME_TYPES = Literal['image/png', 'image/jpeg', 'image/gif', 'image/webp']
 class TextContent(BaseModel):
     type: Literal["text"] = "text"
     text: str
@@ -25,12 +28,18 @@ class Base64ImageContent(BaseModel):
     mime_type: IMAGE_MIME_TYPES = "image/png"
     data: str
 class UrlImageContent(BaseModel):
     type: Literal["image"] = "image"
     source_type: Literal["url"] = "url"
     url: str
-Content = Union[TextContent, TextContent, FileContent, Base64ImageContent, UrlImageContent]
 ROLES_PROMPTS = Literal['user', 'system', 'assistant']
 class Message(BaseModel):

 from typing import Literal
 from pydantic import BaseModel
 from typing import List, Literal, Union, Any
+import re
# Import each helper from the module that defines it instead of from the
# ``Utils`` package: ``Utils/__init__.py`` imports ``Utils.chains`` while the
# package is still initialising, so a package-level name may not be bound yet
# (``from Utils import extract_file_extension`` can then resolve to the
# submodule of the same name rather than to the function).
from Utils.image_to_base_64 import image_to_base64
from Utils.extract_file_extension import extract_file_extension
 IMAGE_MIME_TYPES = Literal['image/png', 'image/jpeg', 'image/gif', 'image/webp']
+AUDIO_MIME_TYPES = Literal[ "audio/mp3", "audio/wav", "audio/flac","audio/aac", "audio/ogg"]
 class TextContent(BaseModel):
     type: Literal["text"] = "text"
     text: str
     mime_type: IMAGE_MIME_TYPES = "image/png"
     data: str
+class Base64AudioContent(BaseModel):
+    type: Literal["audio"] = "audio"
+    source_type: Literal["base64"] = "base64"
+    mime_type: AUDIO_MIME_TYPES = "audio/mp3"
+    data: str
 class UrlImageContent(BaseModel):
     type: Literal["image"] = "image"
     source_type: Literal["url"] = "url"
     url: str
+Content = Union[TextContent, TextContent, FileContent, Base64ImageContent, Base64AudioContent, UrlImageContent]
 ROLES_PROMPTS = Literal['user', 'system', 'assistant']
 class Message(BaseModel):

Utils/extract_file_extension.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import os
def extract_file_extension(filename):
    """Return the extension of a file name.

    The result includes the leading dot (e.g. '.txt'); an empty string is
    returned when the name has no extension.
    """
    return os.path.splitext(filename)[1]

Utils/extract_image_mime_type.py ADDED Viewed

	@@ -0,0 +1,13 @@

+import base64
+from io import BytesIO
+from PIL.ImageFile import ImageFile
+from PIL import Image
+from Utils.chains import IMAGE_MIME_TYPES
def extract_image_mime_type(img:ImageFile) -> IMAGE_MIME_TYPES:
    """Return the MIME type of an image, derived from its ``format`` attribute.

    Args:
        img (ImageFile): An image whose ``format`` attribute names its file
            format (e.g. ``'PNG'``).

    Returns:
        IMAGE_MIME_TYPES: ``'image/jpeg'``, ``'image/png'`` or ``'image/webp'``.

    Raises:
        ValueError: If the format is missing, or is not one of
            jpeg, jpg, png, webp.
    """
    img_format : str | None = getattr(img, 'format', None)
    # Explicit raises instead of asserts: asserts are stripped under ``python -O``.
    if not img_format:
        raise ValueError("The image format could not be determined. Please provide a valid image file.")
    normalized_format = img_format.lower()
    # 'jpg' is only an alias; 'image/jpeg' is the spelling IMAGE_MIME_TYPES lists.
    if normalized_format == 'jpg':
        normalized_format = 'jpeg'
    if normalized_format not in ('jpeg', 'png', 'webp'):
        raise ValueError("Unsupported image format. Supported formats are: jpeg, jpg, png, webp.")
    return f'image/{normalized_format}' # type: ignore

Utils/get_audio_mime_type_from_audio_file_path.py ADDED Viewed

	@@ -0,0 +1,17 @@

+from Utils.chains import AUDIO_MIME_TYPES
+from Utils.extract_file_extension import extract_file_extension
# Lower-case file extension (without the dot) -> MIME type string.
AUDIO_MIME_TYPE_MAPPING = {
    "mp3" : "audio/mp3",
    "wav" : "audio/wav",
    "flac" : "audio/flac",
    "aac" : "audio/aac",
    "ogg" : "audio/ogg"
}


def get_audio_mime_type_from_audio_file_path(audio_file_path: str) -> AUDIO_MIME_TYPES:
    """Return the MIME type matching the extension of an audio file path.

    The extension is compared case-insensitively against
    ``AUDIO_MIME_TYPE_MAPPING``.

    Args:
        audio_file_path (str): Path (or file name) of the audio file.

    Returns:
        AUDIO_MIME_TYPES: The MIME type registered for the file's extension.

    Raises:
        KeyError: If the path has no extension or the extension is not in
            ``AUDIO_MIME_TYPE_MAPPING``.
    """
    extension = extract_file_extension(audio_file_path).replace('.', '').lower()
    try:
        return AUDIO_MIME_TYPE_MAPPING[extension]
    except KeyError:
        # Same exception type as the bare lookup, but with a message that
        # names the offending file and the accepted extensions.
        supported = ', '.join(sorted(AUDIO_MIME_TYPE_MAPPING))
        raise KeyError(
            f"Unsupported audio file extension {extension!r} for {audio_file_path!r}. "
            f"Supported extensions are: {supported}."
        ) from None

Utils/open_file_bytes.py ADDED Viewed

	@@ -0,0 +1,12 @@

def open_file_bytes(file_path: str) -> bytes:
    """Read a file in binary mode and return its entire content.

    Args:
        file_path (str): Location of the file to read.

    Returns:
        bytes: Everything the file contains.
    """
    with open(file_path, mode='rb') as handle:
        payload = handle.read()
    return payload

constants.py CHANGED Viewed

@@ -8,7 +8,7 @@ from tools import (
     # search tools
     tavily_search_tool, google_serper_web_search, custom_youtube_transcription_tool,
     # multimodal tools
-    describe_image_tool, extract_text_from_image,
 )
 load_dotenv('.env')
@@ -37,7 +37,7 @@ AGENT_TOOLS = [
     # search tools
     tavily_search_tool, google_serper_web_search, custom_youtube_transcription_tool,
     # multimodal tools
-    describe_image_tool, extract_text_from_image
 ]
 LANGFUSE_API_KEY = os.getenv("LANGFUSE_API_KEY")

     # search tools
     tavily_search_tool, google_serper_web_search, custom_youtube_transcription_tool,
     # multimodal tools
+    describe_image_tool, extract_text_from_image_tool, summarize_audio_content_tool
 )
 load_dotenv('.env')
     # search tools
     tavily_search_tool, google_serper_web_search, custom_youtube_transcription_tool,
     # multimodal tools
+    describe_image_tool, extract_text_from_image_tool, summarize_audio_content_tool
 ]
 LANGFUSE_API_KEY = os.getenv("LANGFUSE_API_KEY")

tools/__init__.py CHANGED Viewed

@@ -2,7 +2,7 @@ from tools.calculator import calculator_tool, sum, sub, multiply, divide
 from tools.reverse_words import reverse_words
 from tools.websearch import  tavily_search_tool, google_serper_web_search
 from tools.youtube_tool import  custom_youtube_transcription_tool
-from tools.multimodal_llm_tools import describe_image_tool, extract_text_from_image
 __all__ = [
     # math tools
@@ -19,5 +19,6 @@ __all__ = [
     'custom_youtube_transcription_tool',
     # multimodal description tools
     'describe_image_tool',
-    'extract_text_from_image',
 ]

 from tools.reverse_words import reverse_words
 from tools.websearch import  tavily_search_tool, google_serper_web_search
 from tools.youtube_tool import  custom_youtube_transcription_tool
+from tools.multimodal_llm_tools import describe_image_tool, extract_text_from_image_tool, summarize_audio_content_tool
 __all__ = [
     # math tools
     'custom_youtube_transcription_tool',
     # multimodal description tools
     'describe_image_tool',
+    'extract_text_from_image_tool',
+    'summarize_audio_content_tool',
 ]

tools/multimodal_llm_tools.py CHANGED Viewed

@@ -6,36 +6,41 @@ from functools import partial
 import os
 from Utils import *
-def extract_image_mime_type(img:ImageFile) -> IMAGE_MIME_TYPES:
-    img_mime_type : str | None = img.format
-    assert img_mime_type is not None, "The image format could not be determined. Please provide a valid image file."
-    img_mime_type = f'image/{img_mime_type.lower()}' if hasattr(img, 'format') else 'image/webp'
-    assert img_mime_type in ['image/jpeg', 'image/jpg', 'image/png', 'image/webp'], "Unsupported image format. Supported formats are: jpeg, jpg, png, webp."
-    return img_mime_type # type: ignore
-def image_base_tool_template(image : str | ImageFile, system_prompt : str, additional_user_query : Optional[str] = "") -> str:
     prompt_messages =  [Message(role = 'system', content=[TextContent(text = system_prompt)]), ]
-    if isinstance(image, str):
-        if os.path.exists(image): # the image was passed as a path
-            image = Image.open(image)
-            img = Image.open(image) if isinstance(image, str) else image
-        else: #  we passed a base64 string
-            img = base64_to_pil_image(image)
-    elif isinstance(image, ImageFile):
-        img = image
     user_message_content = []
-    user_message_content.append(
-        Base64ImageContent(
-        data=image_to_base64(image),
-        mime_type=extract_image_mime_type(img) # TODO: fix the linter error
-    ))
-    if additional_user_query:
-        user_message_content.append(TextContent(text=additional_user_query))
     user_message = Message(role = 'user', content = user_message_content)
     prompt_messages.append(user_message)
-    image_descriptor_llm_chain = get_chain(model_settings ={"model" : "gpt-4.1", "temperature": 0.5})
     try:
         llm_image_description : str = execute_chain(chain = image_descriptor_llm_chain, messages=prompt_messages)
     except Exception as e:
@@ -52,23 +57,131 @@ EXTRACT_TEXT_FROM_IMAGE_SYSTEM_PROMPT = ''.join([
         'Additionally, the user may give you some additional info in a query',
         'If so, use it to give a more precise answer'
     ])
-describe_image_tool = Tool(
-    name = "DescribeImageTool",
-    description="""
-This tool describes the content of an image.
-The image can be passed as a base64 string, a PIL image or as a path to an image file.
-The images, if passed as paths, are in your mount disk.
-Additionally, this tool can take a user query that can be used to provide a more precise answer.
-""",
-    func = partial(image_base_tool_template, system_prompt = DESCRIBE_IMAGE_SYSTEM_PROMPT)
-)
-extract_text_from_image = Tool(
-    name = "ExtractImageTextTool",
-    description="""
-This tool extracts the text contained inside of an image.
-The image can be passed as a base64 string, a PIL image or as a path to an image file.
-The images, if passed as paths, are in your mount disk.
-Additionally, this tool can take a user query that can be used to provide a more precise answer.
-""",
-    func = partial(image_base_tool_template, system_prompt = EXTRACT_TEXT_FROM_IMAGE_SYSTEM_PROMPT)
-)

 import os
 from Utils import *
+from langchain_core.tools import tool
def parse_multimodal_contents(**kwargs) -> list[Content]:
    """Build the multimodal content parts of a user message.

    Keyword Args:
        image (ImageFile | str | None): An opened image, a path to an image
            file, or a base64-encoded image string. Skipped when falsy.
        audio (str | None): Path to an audio file. Skipped when falsy.

    Returns:
        list[Content]: A ``Base64ImageContent`` for the image (if given)
        followed by a ``Base64AudioContent`` for the audio (if given).

    Raises:
        TypeError: If ``image`` is neither a string nor an ``ImageFile``.
    """
    contents = []
    if (image := kwargs.get('image')):
        if isinstance(image, str):
            # An existing path is opened from disk; any other string is
            # treated as base64-encoded image data.
            img = Image.open(image) if os.path.exists(image) else base64_to_pil_image(image)
        elif isinstance(image, ImageFile):
            img = image
        else:
            # Previously this case fell through and failed later with an
            # UnboundLocalError on `img`.
            raise TypeError(
                f"'image' must be a file path, a base64 string or an ImageFile, got {type(image).__name__}."
            )
        # NOTE(review): the raw `image` argument (not the decoded `img`) is
        # what gets encoded, as before — confirm image_to_base64 accepts
        # paths and base64 strings as well as opened images.
        contents.append(Base64ImageContent(data=image_to_base64(image), mime_type=extract_image_mime_type(img)))
    if (audio_path := kwargs.get('audio')):
        audio_base64 = bytes_to_base64(open_file_bytes(audio_path))
        contents.append(Base64AudioContent(data=audio_base64, mime_type=get_audio_mime_type_from_audio_file_path(audio_path)))
    return contents
+def multimodal_tool_template(
+        system_prompt : str,
+        image: ImageFile | str | None,
+        audio: str | None,
+        model_settings : dict = {"model": "gpt-4.1", "temperature": 0.5},
+        additional_user_query : Optional[str] = "",
+    ) -> str:
     prompt_messages =  [Message(role = 'system', content=[TextContent(text = system_prompt)]), ]
     user_message_content = []
+    user_message_content += parse_multimodal_contents(image = image, audio = audio)
+    if additional_user_query:user_message_content.append(TextContent(text=additional_user_query))
     user_message = Message(role = 'user', content = user_message_content)
     prompt_messages.append(user_message)
+    image_descriptor_llm_chain = get_chain(model_settings = model_settings)
     try:
         llm_image_description : str = execute_chain(chain = image_descriptor_llm_chain, messages=prompt_messages)
     except Exception as e:
         'Additionally, the user may give you some additional info in a query',
         'If so, use it to give a more precise answer'
     ])
# System prompt passed to the model by the audio summarization tool.
SUMMARIZE_AUDIO_SYSTEM_PROMPT = 'Summarize what has been said in this audio file'
+# describe_image_tool = Tool(
+#     name = "DescribeImageTool",
+#     description="""
+# This tool describes the content of an image.
+# The image can be passed as a base64 string, a PIL image or as a path to an image file.
+# The images, if passed as paths, are in your mount disk.
+# Additionally, this tool can take a user query that can be used to provide a more precise answer.
+# Args:
+#     image (str | PIL.ImageFile): The image to be described. This parameter is required.
+#     additional_user_query (str, optional): Additional information or query from the user to refine the description.
+# Returns:
+#     str: The description of the image.
+# Note:
+#     The 'image' parameter must be provided, otherwise the tool will not work as expected.
+# """,
+#     func = lambda *args, **kwargs: image_base_tool_template(
+#         system_prompt=DESCRIBE_IMAGE_SYSTEM_PROMPT,
+#         audio=None,
+#         **kwargs)
+#     # partial(image_base_tool_template, system_prompt = DESCRIBE_IMAGE_SYSTEM_PROMPT, audio = None)
+# )
+# extract_text_from_image = Tool(
+#     name = "ExtractImageTextTool",
+#     description="""
+# This tool extracts the text contained inside of an image.
+# The image can be passed as a base64 string, a PIL image or as a path to an image file.
+# The images, if passed as paths, are in your mount disk.
+# Additionally, this tool can take a user query that can be used to provide a more precise answer.
+# Args:
+#     image (str | PIL.ImageFile): The image to be described. This parameter is required.
+#     additional_user_query (str, optional): Additional information or query from the user to refine the description.
+# Returns:
+#     str: The description of the image.
+# Note:
+#     The 'image' parameter must be provided, otherwise the tool will not work as expected.
+# """,
+#     func = lambda *args, **kwargs: image_base_tool_template(
+#         system_prompt=EXTRACT_TEXT_FROM_IMAGE_SYSTEM_PROMPT,
+#         audio=None,
+#         **kwargs)
+#     # partial(image_base_tool_template, system_prompt = EXTRACT_TEXT_FROM_IMAGE_SYSTEM_PROMPT, audio = None)
+# )
+# summarize_audio_content = Tool(
+#     name = "AnalizeAudioContentTool",
+#     description="""
+# This tool analizes and summarizes the content of an audio file.
+# Args:
+#     audio (str): The path where to extract the audio from.
+#     additional_user_query (str, optional): Additional information or query from the user to refine the description.
+# Returns:
+#     str: The audio summarized.
+# Note:
+#     The 'audio' parameter must be provided, otherwise the tool will not work as expected.
+# """,
+#     func = lambda *args, **kwargs: image_base_tool_template(
+#     system_prompt=SUMMARIZE_AUDIO_SYSTEM_PROMPT,
+#     image=None,
+#     **kwargs)
+#     # partial(image_base_tool_template, system_prompt = SUMMARIZE_AUDIO_SYSTEM_PROMPT, image = None)
+# )
@tool("DescribeImageTool", parse_docstring=True)
def describe_image_tool(image: ImageFile | str, additional_user_query: Optional[str] = "") -> str:
    """Describes the content of an image. The image can be a base64 string, a PIL image, or a path to an image file.
    Optionally, a user query can be provided to refine the description.

    Args:
        image (ImageFile | str): path to the image or base64 string of the image or Opened ImageFile.
        additional_user_query (Optional[str], optional): An additional user string query. Defaults to "".

    Returns:
        str: The description of the image.
    """
    # The docstring is parsed by @tool (parse_docstring=True): the blank lines
    # before "Args:" and "Returns:" are what delimit its sections.
    llm_settings = {"model": "gpt-4.1", "temperature": 0.5}
    return multimodal_tool_template(
        system_prompt=DESCRIBE_IMAGE_SYSTEM_PROMPT,
        model_settings=llm_settings,
        image=image,
        audio=None,
        additional_user_query=additional_user_query,
    )
@tool("ExtractImageTextTool", parse_docstring=True)
def extract_text_from_image_tool(image: ImageFile | str, additional_user_query: Optional[str] = "") -> str:
    """Extracts the text contained inside an image. The image can be a base64 string, a PIL image, or a path to an image file.
    Optionally, a user query can be provided to refine the extraction.

    Args:
        image (ImageFile | str): path to the image or base64 string of the image or Opened ImageFile.
        additional_user_query (Optional[str], optional): An additional user string query. Defaults to "".

    Returns:
        str: The extracted text from the image.
    """
    # The docstring is parsed by @tool (parse_docstring=True): the blank lines
    # before "Args:" and "Returns:" are what delimit its sections.
    # NOTE(review): an audio-named model is configured for an image tool, while
    # the image description tool uses "gpt-4.1" — confirm this is intended.
    llm_settings = {"model": "gpt-4o-mini-audio", "temperature": 0.5, "modalities" : ["text"]}
    return multimodal_tool_template(
        system_prompt=EXTRACT_TEXT_FROM_IMAGE_SYSTEM_PROMPT,
        model_settings=llm_settings,
        image=image,
        audio=None,
        additional_user_query=additional_user_query,
    )
@tool("AnalizeAudioContentTool", parse_docstring=True)
def summarize_audio_content_tool(audio: str, additional_user_query: Optional[str] = "") -> str:
    """Analyzes and summarizes the content of an audio file. The audio must be provided as a file path.
    Optionally, a user query can be provided to refine the summary.

    Args:
        audio (str): The path to the audio file to be summarized.
        additional_user_query (Optional[str], optional): An additional user string query. Defaults to "".

    Returns:
        str: The summarized content of the audio file.
    """
    # The docstring is parsed by @tool (parse_docstring=True): the blank lines
    # before "Args:" and "Returns:" are what delimit its sections.
    # The registered tool name is kept exactly as it was, spelling included.
    llm_settings = {"model": "gpt-4o-audio-preview", "temperature": 0.5}
    return multimodal_tool_template(
        system_prompt=SUMMARIZE_AUDIO_SYSTEM_PROMPT,
        model_settings=llm_settings,
        image=None,
        audio=audio,
        additional_user_query=additional_user_query,
    )