Spaces:
Sleeping
Sleeping
Commit ·
4d25efb
1
Parent(s): cb9639f
Audio summarization tools enabled
Browse files- Utils/__init__.py +12 -2
- Utils/bytes_to_base64.py +13 -0
- Utils/chains.py +11 -2
- Utils/extract_file_extension.py +9 -0
- Utils/extract_image_mime_type.py +13 -0
- Utils/get_audio_mime_type_from_audio_file_path.py +17 -0
- Utils/open_file_bytes.py +12 -0
- constants.py +2 -2
- tools/__init__.py +3 -2
- tools/multimodal_llm_tools.py +157 -44
Utils/__init__.py
CHANGED
|
@@ -1,18 +1,28 @@
|
|
| 1 |
from Utils.base64_to_image import base64_to_pil_image
|
| 2 |
from Utils.image_to_base_64 import image_to_base64
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
from Utils.chains import (
|
| 4 |
get_chain, execute_chain,
|
| 5 |
TextContent, FileContent,
|
| 6 |
-
Base64ImageContent, UrlImageContent,
|
| 7 |
Message, Content, ROLES_PROMPTS, IMAGE_MIME_TYPES
|
| 8 |
)
|
| 9 |
|
| 10 |
__all__ = [
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
'base64_to_pil_image',
|
| 12 |
'image_to_base64',
|
|
|
|
| 13 |
'get_chain', 'execute_chain',
|
| 14 |
'TextContent', 'FileContent',
|
| 15 |
-
'Base64ImageContent', 'UrlImageContent',
|
| 16 |
'Message',
|
| 17 |
'ROLES_PROMPTS', 'IMAGE_MIME_TYPES', 'Content',
|
| 18 |
]
|
|
|
|
| 1 |
from Utils.base64_to_image import base64_to_pil_image
|
| 2 |
from Utils.image_to_base_64 import image_to_base64
|
| 3 |
+
from Utils.extract_image_mime_type import extract_image_mime_type
|
| 4 |
+
from Utils.open_file_bytes import open_file_bytes
|
| 5 |
+
from Utils.bytes_to_base64 import bytes_to_base64
|
| 6 |
+
from Utils.extract_file_extension import extract_file_extension
|
| 7 |
+
from Utils.get_audio_mime_type_from_audio_file_path import get_audio_mime_type_from_audio_file_path
|
| 8 |
from Utils.chains import (
|
| 9 |
get_chain, execute_chain,
|
| 10 |
TextContent, FileContent,
|
| 11 |
+
Base64ImageContent, Base64AudioContent, UrlImageContent,
|
| 12 |
Message, Content, ROLES_PROMPTS, IMAGE_MIME_TYPES
|
| 13 |
)
|
| 14 |
|
| 15 |
__all__ = [
|
| 16 |
+
'bytes_to_base64',
|
| 17 |
+
'open_file_bytes',
|
| 18 |
+
'extract_file_extension',
|
| 19 |
+
'get_audio_mime_type_from_audio_file_path',
|
| 20 |
'base64_to_pil_image',
|
| 21 |
'image_to_base64',
|
| 22 |
+
'extract_image_mime_type',
|
| 23 |
'get_chain', 'execute_chain',
|
| 24 |
'TextContent', 'FileContent',
|
| 25 |
+
'Base64ImageContent', 'Base64AudioContent', 'UrlImageContent',
|
| 26 |
'Message',
|
| 27 |
'ROLES_PROMPTS', 'IMAGE_MIME_TYPES', 'Content',
|
| 28 |
]
|
Utils/bytes_to_base64.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import base64
|
| 2 |
+
|
| 3 |
+
def bytes_to_base64(data: bytes) -> str:
    """
    Encode a bytes object as a base64 text string.

    Args:
        data (bytes): The raw data to encode.

    Returns:
        str: The base64 representation of the data.
    """
    # b64encode yields bytes; turn them into text with the same codec as before.
    encoded = base64.b64encode(data)
    return str(encoded, 'utf-8')
|
Utils/chains.py
CHANGED
|
@@ -5,10 +5,13 @@ from langchain.chat_models import init_chat_model
|
|
| 5 |
from typing import Literal
|
| 6 |
from pydantic import BaseModel
|
| 7 |
from typing import List, Literal, Union, Any
|
|
|
|
| 8 |
|
| 9 |
-
from Utils
|
| 10 |
|
| 11 |
IMAGE_MIME_TYPES = Literal['image/png', 'image/jpeg', 'image/gif', 'image/webp']
|
|
|
|
|
|
|
| 12 |
class TextContent(BaseModel):
|
| 13 |
type: Literal["text"] = "text"
|
| 14 |
text: str
|
|
@@ -25,12 +28,18 @@ class Base64ImageContent(BaseModel):
|
|
| 25 |
mime_type: IMAGE_MIME_TYPES = "image/png"
|
| 26 |
data: str
|
| 27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
class UrlImageContent(BaseModel):
|
| 29 |
type: Literal["image"] = "image"
|
| 30 |
source_type: Literal["url"] = "url"
|
| 31 |
url: str
|
| 32 |
|
| 33 |
-
Content = Union[TextContent, TextContent, FileContent, Base64ImageContent, UrlImageContent]
|
| 34 |
ROLES_PROMPTS = Literal['user', 'system', 'assistant']
|
| 35 |
|
| 36 |
class Message(BaseModel):
|
|
|
|
| 5 |
from typing import Literal
|
| 6 |
from pydantic import BaseModel
|
| 7 |
from typing import List, Literal, Union, Any
|
| 8 |
+
import re
|
| 9 |
|
| 10 |
+
from Utils import image_to_base64, extract_file_extension
|
| 11 |
|
| 12 |
IMAGE_MIME_TYPES = Literal['image/png', 'image/jpeg', 'image/gif', 'image/webp']
|
| 13 |
+
AUDIO_MIME_TYPES = Literal[ "audio/mp3", "audio/wav", "audio/flac","audio/aac", "audio/ogg"]
|
| 14 |
+
|
| 15 |
class TextContent(BaseModel):
|
| 16 |
type: Literal["text"] = "text"
|
| 17 |
text: str
|
|
|
|
| 28 |
mime_type: IMAGE_MIME_TYPES = "image/png"
|
| 29 |
data: str
|
| 30 |
|
| 31 |
+
class Base64AudioContent(BaseModel):
    # Content part describing an audio clip embedded inline as base64 text.
    # Comments are used instead of a docstring so the model's __doc__ is unchanged.

    # Content-kind discriminator; the Literal pins it to "audio".
    type: Literal["audio"] = "audio"
    # How the payload is carried; the Literal pins it to inline "base64".
    source_type: Literal["base64"] = "base64"
    # MIME type of the clip; must be a member of AUDIO_MIME_TYPES, MP3 by default.
    mime_type: AUDIO_MIME_TYPES = "audio/mp3"
    # The audio payload as a string (required, no default).
    data: str
|
| 36 |
+
|
| 37 |
class UrlImageContent(BaseModel):
    # Content part describing an image referenced by URL rather than embedded.
    # Comments are used instead of a docstring so the model's __doc__ is unchanged.

    # Content-kind discriminator; the Literal pins it to "image".
    type: Literal["image"] = "image"
    # How the image is supplied; the Literal pins it to "url".
    source_type: Literal["url"] = "url"
    # Location of the image (required, no default).
    url: str
|
| 41 |
|
| 42 |
+
# Union of the supported content-part models. Each member is listed once;
# typing.Union collapses duplicate members, so the resulting type is unchanged.
Content = Union[TextContent, FileContent, Base64ImageContent, Base64AudioContent, UrlImageContent]
|
| 43 |
ROLES_PROMPTS = Literal['user', 'system', 'assistant']
|
| 44 |
|
| 45 |
class Message(BaseModel):
|
Utils/extract_file_extension.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
|
| 3 |
+
def extract_file_extension(filename):
    """
    Extract the file extension from a file name.

    Returns the extension including the leading dot (e.g. '.txt'), or an empty
    string when the name has no extension.
    """
    # splitext returns (root, ext); only the extension part is needed.
    return os.path.splitext(filename)[1]
|
Utils/extract_image_mime_type.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import base64
|
| 2 |
+
from io import BytesIO
|
| 3 |
+
from PIL.ImageFile import ImageFile
|
| 4 |
+
from PIL import Image
|
| 5 |
+
|
| 6 |
+
from Utils.chains import IMAGE_MIME_TYPES
|
| 7 |
+
|
| 8 |
+
def extract_image_mime_type(img:ImageFile) -> IMAGE_MIME_TYPES:
    """Return the MIME type of an image, derived from its ``format`` attribute.

    The format name is lower-cased and prefixed with ``image/``. A ``jpg``
    format is normalised to ``jpeg`` so the result is always a member of
    ``IMAGE_MIME_TYPES``.

    Args:
        img (ImageFile): Image whose ``format`` attribute names its file format
            (for example ``"PNG"`` or ``"JPEG"``).

    Returns:
        IMAGE_MIME_TYPES: One of ``'image/jpeg'``, ``'image/png'`` or
        ``'image/webp'``.

    Raises:
        ValueError: If the format is missing/empty, or is not one of the
            supported formats (jpeg, jpg, png, webp).
    """
    # Explicit raises instead of assert: assertions are stripped under `python -O`,
    # which would silently let unsupported formats through.
    img_format: str | None = getattr(img, 'format', None)
    if not img_format:
        raise ValueError("The image format could not be determined. Please provide a valid image file.")

    subtype = img_format.lower()
    if subtype == 'jpg':
        # 'image/jpg' is not a member of IMAGE_MIME_TYPES; use the canonical name.
        subtype = 'jpeg'
    if subtype not in ('jpeg', 'png', 'webp'):
        raise ValueError("Unsupported image format. Supported formats are: jpeg, jpg, png, webp.")
    return f'image/{subtype}'  # type: ignore
|
Utils/get_audio_mime_type_from_audio_file_path.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from Utils.chains import AUDIO_MIME_TYPES
|
| 2 |
+
from Utils.extract_file_extension import extract_file_extension
|
| 3 |
+
|
| 4 |
+
# Maps a lower-case audio file extension (without the leading dot) to the
# MIME type string returned for it.
AUDIO_MIME_TYPE_MAPPING = {
    "mp3" : "audio/mp3",
    "wav" : "audio/wav",
    "flac" : "audio/flac",
    "aac" : "audio/aac",
    "ogg" : "audio/ogg"
}

def get_audio_mime_type_from_audio_file_path(audio_file_path: str) -> AUDIO_MIME_TYPES:
    """
    Returns the MIME type for the given audio file extension.

    Args:
        audio_file_path (str): Path (or bare file name) of the audio file. Only
            its extension is used to pick the MIME type; case is ignored.

    Returns:
        AUDIO_MIME_TYPES: The value registered for the extension in
        ``AUDIO_MIME_TYPE_MAPPING``.

    Raises:
        KeyError: If the path has no extension, or the extension is not a key
            of ``AUDIO_MIME_TYPE_MAPPING``. The message names the offending
            extension and lists the supported ones.
    """
    extension = extract_file_extension(audio_file_path).replace('.', '').lower()
    try:
        return AUDIO_MIME_TYPE_MAPPING[extension]
    except KeyError:
        # Same exception type as a plain dict lookup (so existing handlers keep
        # working), but with a message that explains what went wrong.
        supported = ', '.join(sorted(AUDIO_MIME_TYPE_MAPPING))
        raise KeyError(
            f"Unsupported audio file extension {extension!r} for {audio_file_path!r}. "
            f"Supported extensions are: {supported}."
        ) from None
|
Utils/open_file_bytes.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def open_file_bytes(file_path: str) -> bytes:
    """
    Read a file in binary mode and return everything it contains.

    Args:
        file_path (str): The path to the file to be read.

    Returns:
        bytes: The full content of the file.
    """
    # The context manager closes the handle before the payload is returned.
    with open(file_path, mode='rb') as handle:
        payload = handle.read()
    return payload
|
constants.py
CHANGED
|
@@ -8,7 +8,7 @@ from tools import (
|
|
| 8 |
# search tools
|
| 9 |
tavily_search_tool, google_serper_web_search, custom_youtube_transcription_tool,
|
| 10 |
# multimodal tools
|
| 11 |
-
describe_image_tool,
|
| 12 |
)
|
| 13 |
|
| 14 |
load_dotenv('.env')
|
|
@@ -37,7 +37,7 @@ AGENT_TOOLS = [
|
|
| 37 |
# search tools
|
| 38 |
tavily_search_tool, google_serper_web_search, custom_youtube_transcription_tool,
|
| 39 |
# multimodal tools
|
| 40 |
-
describe_image_tool,
|
| 41 |
]
|
| 42 |
|
| 43 |
LANGFUSE_API_KEY = os.getenv("LANGFUSE_API_KEY")
|
|
|
|
| 8 |
# search tools
|
| 9 |
tavily_search_tool, google_serper_web_search, custom_youtube_transcription_tool,
|
| 10 |
# multimodal tools
|
| 11 |
+
describe_image_tool, extract_text_from_image_tool, summarize_audio_content_tool
|
| 12 |
)
|
| 13 |
|
| 14 |
load_dotenv('.env')
|
|
|
|
| 37 |
# search tools
|
| 38 |
tavily_search_tool, google_serper_web_search, custom_youtube_transcription_tool,
|
| 39 |
# multimodal tools
|
| 40 |
+
describe_image_tool, extract_text_from_image_tool, summarize_audio_content_tool
|
| 41 |
]
|
| 42 |
|
| 43 |
LANGFUSE_API_KEY = os.getenv("LANGFUSE_API_KEY")
|
tools/__init__.py
CHANGED
|
@@ -2,7 +2,7 @@ from tools.calculator import calculator_tool, sum, sub, multiply, divide
|
|
| 2 |
from tools.reverse_words import reverse_words
|
| 3 |
from tools.websearch import tavily_search_tool, google_serper_web_search
|
| 4 |
from tools.youtube_tool import custom_youtube_transcription_tool
|
| 5 |
-
from tools.multimodal_llm_tools import describe_image_tool,
|
| 6 |
|
| 7 |
__all__ = [
|
| 8 |
# math tools
|
|
@@ -19,5 +19,6 @@ __all__ = [
|
|
| 19 |
'custom_youtube_transcription_tool',
|
| 20 |
# multimodal description tools
|
| 21 |
'describe_image_tool',
|
| 22 |
-
'
|
|
|
|
| 23 |
]
|
|
|
|
| 2 |
from tools.reverse_words import reverse_words
|
| 3 |
from tools.websearch import tavily_search_tool, google_serper_web_search
|
| 4 |
from tools.youtube_tool import custom_youtube_transcription_tool
|
| 5 |
+
from tools.multimodal_llm_tools import describe_image_tool, extract_text_from_image_tool, summarize_audio_content_tool
|
| 6 |
|
| 7 |
__all__ = [
|
| 8 |
# math tools
|
|
|
|
| 19 |
'custom_youtube_transcription_tool',
|
| 20 |
# multimodal description tools
|
| 21 |
'describe_image_tool',
|
| 22 |
+
'extract_text_from_image_tool',
|
| 23 |
+
'summarize_audio_content_tool',
|
| 24 |
]
|
tools/multimodal_llm_tools.py
CHANGED
|
@@ -6,36 +6,41 @@ from functools import partial
|
|
| 6 |
import os
|
| 7 |
|
| 8 |
from Utils import *
|
|
|
|
| 9 |
|
| 10 |
-
def
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
prompt_messages = [Message(role = 'system', content=[TextContent(text = system_prompt)]), ]
|
| 19 |
-
|
| 20 |
-
if isinstance(image, str):
|
| 21 |
-
if os.path.exists(image): # the image was passed as a path
|
| 22 |
-
image = Image.open(image)
|
| 23 |
-
img = Image.open(image) if isinstance(image, str) else image
|
| 24 |
-
else: # we passed a base64 string
|
| 25 |
-
img = base64_to_pil_image(image)
|
| 26 |
-
elif isinstance(image, ImageFile):
|
| 27 |
-
img = image
|
| 28 |
user_message_content = []
|
| 29 |
-
user_message_content
|
| 30 |
-
|
| 31 |
-
data=image_to_base64(image),
|
| 32 |
-
mime_type=extract_image_mime_type(img) # TODO: fix the linter error
|
| 33 |
-
))
|
| 34 |
-
if additional_user_query:
|
| 35 |
-
user_message_content.append(TextContent(text=additional_user_query))
|
| 36 |
user_message = Message(role = 'user', content = user_message_content)
|
| 37 |
prompt_messages.append(user_message)
|
| 38 |
-
image_descriptor_llm_chain = get_chain(model_settings =
|
| 39 |
try:
|
| 40 |
llm_image_description : str = execute_chain(chain = image_descriptor_llm_chain, messages=prompt_messages)
|
| 41 |
except Exception as e:
|
|
@@ -52,23 +57,131 @@ EXTRACT_TEXT_FROM_IMAGE_SYSTEM_PROMPT = ''.join([
|
|
| 52 |
'Additionally, the user may give you some additional info in a query',
|
| 53 |
'If so, use it to give a more precise answer'
|
| 54 |
])
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
import os
|
| 7 |
|
| 8 |
from Utils import *
|
| 9 |
+
from langchain_core.tools import tool
|
| 10 |
|
| 11 |
+
def parse_multimodal_contents(**kwargs) -> list[Content]:
    """
    Parses the contents of the multimodal message.

    Only the ``image`` and ``audio`` keyword arguments are read; falsy values
    (``None``, ``""``) are treated as "not provided".

    Args:
        image (ImageFile | str, optional): An opened ``ImageFile``, a path to an
            existing image file, or (for any other string) base64-encoded image
            data.
        audio (str, optional): Path to an audio file; its bytes are read and
            base64-encoded.

    Returns:
        list[Content]: A ``Base64ImageContent`` for the image (if given)
        followed by a ``Base64AudioContent`` for the audio (if given); empty
        when neither is provided.

    Raises:
        TypeError: If ``image`` is provided but is neither a string nor an
            ``ImageFile``.
    """
    contents = []
    if (image := kwargs.get('image')):
        if isinstance(image, str):
            if os.path.exists(image): # the image was passed as a path
                img = Image.open(image)
            else: # we passed a base64 string
                img = base64_to_pil_image(image)
        elif isinstance(image, ImageFile):
            img = image
        else:
            # Previously this fell through with `img` unbound (UnboundLocalError).
            raise TypeError(
                f"Unsupported image type {type(image).__name__!r}: expected a file path, "
                "a base64 string or an ImageFile."
            )
        # Encode the resolved PIL image rather than the raw input, so a path or
        # base64 string is never handed to image_to_base64.
        # NOTE(review): assumes image_to_base64 accepts a PIL image, as it already
        # received one on the ImageFile branch; confirm against its signature.
        contents.append(Base64ImageContent(data=image_to_base64(img), mime_type=extract_image_mime_type(img)))
    if (audio_path := kwargs.get('audio')):
        audio_base64 = bytes_to_base64(open_file_bytes(audio_path))
        contents.append(Base64AudioContent(data=audio_base64, mime_type=get_audio_mime_type_from_audio_file_path(audio_path)))
    return contents
|
| 29 |
|
| 30 |
+
def multimodal_tool_template(
|
| 31 |
+
system_prompt : str,
|
| 32 |
+
image: ImageFile | str | None,
|
| 33 |
+
audio: str | None,
|
| 34 |
+
model_settings : dict = {"model": "gpt-4.1", "temperature": 0.5},
|
| 35 |
+
additional_user_query : Optional[str] = "",
|
| 36 |
+
) -> str:
|
| 37 |
prompt_messages = [Message(role = 'system', content=[TextContent(text = system_prompt)]), ]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
user_message_content = []
|
| 39 |
+
user_message_content += parse_multimodal_contents(image = image, audio = audio)
|
| 40 |
+
if additional_user_query:user_message_content.append(TextContent(text=additional_user_query))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
user_message = Message(role = 'user', content = user_message_content)
|
| 42 |
prompt_messages.append(user_message)
|
| 43 |
+
image_descriptor_llm_chain = get_chain(model_settings = model_settings)
|
| 44 |
try:
|
| 45 |
llm_image_description : str = execute_chain(chain = image_descriptor_llm_chain, messages=prompt_messages)
|
| 46 |
except Exception as e:
|
|
|
|
| 57 |
'Additionally, the user may give you some additional info in a query',
|
| 58 |
'If so, use it to give a more precise answer'
|
| 59 |
])
|
| 60 |
+
# System prompt for the audio summarization tool. It is a single fragment today,
# kept in joined-fragments form so more sentences can be appended later.
SUMMARIZE_AUDIO_SYSTEM_PROMPT = ''.join((
    'Summarize what has been said in this audio file',
))
|
| 63 |
+
|
| 64 |
+
# describe_image_tool = Tool(
|
| 65 |
+
# name = "DescribeImageTool",
|
| 66 |
+
# description="""
|
| 67 |
+
# This tool describes the content of an image.
|
| 68 |
+
# The image can be passed as a base64 string, a PIL image or as a path to an image file.
|
| 69 |
+
# The images, if passed as paths, are in your mount disk.
|
| 70 |
+
# Additionally, this tool can take a user query that can be used to provide a more precise answer.
|
| 71 |
+
|
| 72 |
+
# Args:
|
| 73 |
+
# image (str | PIL.ImageFile): The image to be described. This parameter is required.
|
| 74 |
+
# additional_user_query (str, optional): Additional information or query from the user to refine the description.
|
| 75 |
+
# Returns:
|
| 76 |
+
# str: The description of the image.
|
| 77 |
+
# Note:
|
| 78 |
+
# The 'image' parameter must be provided, otherwise the tool will not work as expected.
|
| 79 |
+
# """,
|
| 80 |
+
# func = lambda *args, **kwargs: image_base_tool_template(
|
| 81 |
+
# system_prompt=DESCRIBE_IMAGE_SYSTEM_PROMPT,
|
| 82 |
+
# audio=None,
|
| 83 |
+
# **kwargs)
|
| 84 |
+
# # partial(image_base_tool_template, system_prompt = DESCRIBE_IMAGE_SYSTEM_PROMPT, audio = None)
|
| 85 |
+
# )
|
| 86 |
+
# extract_text_from_image = Tool(
|
| 87 |
+
# name = "ExtractImageTextTool",
|
| 88 |
+
# description="""
|
| 89 |
+
# This tool extracts the text contained inside of an image.
|
| 90 |
+
# The image can be passed as a base64 string, a PIL image or as a path to an image file.
|
| 91 |
+
# The images, if passed as paths, are in your mount disk.
|
| 92 |
+
# Additionally, this tool can take a user query that can be used to provide a more precise answer.
|
| 93 |
+
|
| 94 |
+
# Args:
|
| 95 |
+
# image (str | PIL.ImageFile): The image to be described. This parameter is required.
|
| 96 |
+
# additional_user_query (str, optional): Additional information or query from the user to refine the description.
|
| 97 |
+
# Returns:
|
| 98 |
+
# str: The description of the image.
|
| 99 |
+
# Note:
|
| 100 |
+
# The 'image' parameter must be provided, otherwise the tool will not work as expected.
|
| 101 |
+
# """,
|
| 102 |
+
# func = lambda *args, **kwargs: image_base_tool_template(
|
| 103 |
+
# system_prompt=EXTRACT_TEXT_FROM_IMAGE_SYSTEM_PROMPT,
|
| 104 |
+
# audio=None,
|
| 105 |
+
# **kwargs)
|
| 106 |
+
# # partial(image_base_tool_template, system_prompt = EXTRACT_TEXT_FROM_IMAGE_SYSTEM_PROMPT, audio = None)
|
| 107 |
+
# )
|
| 108 |
+
# summarize_audio_content = Tool(
|
| 109 |
+
# name = "AnalizeAudioContentTool",
|
| 110 |
+
# description="""
|
| 111 |
+
# This tool analyzes and summarizes the content of an audio file.
|
| 112 |
+
|
| 113 |
+
# Args:
|
| 114 |
+
# audio (str): The path where to extract the audio from.
|
| 115 |
+
# additional_user_query (str, optional): Additional information or query from the user to refine the description.
|
| 116 |
+
# Returns:
|
| 117 |
+
# str: The audio summarized.
|
| 118 |
+
# Note:
|
| 119 |
+
# The 'audio' parameter must be provided, otherwise the tool will not work as expected.
|
| 120 |
+
# """,
|
| 121 |
+
# func = lambda *args, **kwargs: image_base_tool_template(
|
| 122 |
+
# system_prompt=SUMMARIZE_AUDIO_SYSTEM_PROMPT,
|
| 123 |
+
# image=None,
|
| 124 |
+
# **kwargs)
|
| 125 |
+
# # partial(image_base_tool_template, system_prompt = SUMMARIZE_AUDIO_SYSTEM_PROMPT, image = None)
|
| 126 |
+
# )
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
@tool("DescribeImageTool", parse_docstring=True)
def describe_image_tool(image: ImageFile | str, additional_user_query: Optional[str] = "") -> str:
    """Describes the content of an image. The image can be a base64 string, a PIL image, or a path to an image file.
    Optionally, a user query can be provided to refine the description.

    Args:
        image (ImageFile | str): path to the image or base64 string of the image or Opened ImageFile.
        additional_user_query (Optional[str], optional): An additional user string query. Defaults to "".

    Returns:
        str: The description of the image.
    """
    # The docstring above is handed to the @tool decorator (parse_docstring=True),
    # so its wording is part of the tool definition and is kept verbatim.
    # Image-only call into the shared template: no audio input is forwarded.
    description = multimodal_tool_template(
        system_prompt=DESCRIBE_IMAGE_SYSTEM_PROMPT,
        image=image,
        audio=None,
        model_settings={"model": "gpt-4.1", "temperature": 0.5},
        additional_user_query=additional_user_query,
    )
    return description
|
| 148 |
+
|
| 149 |
+
@tool("ExtractImageTextTool", parse_docstring=True)
def extract_text_from_image_tool(image: ImageFile | str, additional_user_query: Optional[str] = "") -> str:
    """Extracts the text contained inside an image. The image can be a base64 string, a PIL image, or a path to an image file.
    Optionally, a user query can be provided to refine the extraction.

    Args:
        image (ImageFile | str): path to the image or base64 string of the image or Opened ImageFile.
        additional_user_query (Optional[str], optional): An additional user string query. Defaults to "".

    Returns:
        str: The extracted text from the image.
    """
    # The docstring above is handed to the @tool decorator (parse_docstring=True),
    # so its wording is part of the tool definition and is kept verbatim.
    # NOTE(review): the model name contains "audio" although this tool only sends
    # an image; confirm this is the intended model for text extraction.
    ocr_model_settings = {"model": "gpt-4o-mini-audio", "temperature": 0.5, "modalities" : ["text"]}
    # Image-only call into the shared template: no audio input is forwarded.
    extracted_text = multimodal_tool_template(
        system_prompt=EXTRACT_TEXT_FROM_IMAGE_SYSTEM_PROMPT,
        image=image,
        audio=None,
        model_settings=ocr_model_settings,
        additional_user_query=additional_user_query,
    )
    return extracted_text
|
| 168 |
+
|
| 169 |
+
@tool("AnalizeAudioContentTool", parse_docstring=True)
def summarize_audio_content_tool(audio: str, additional_user_query: Optional[str] = "") -> str:
    """Analyzes and summarizes the content of an audio file. The audio must be provided as a file path.
    Optionally, a user query can be provided to refine the summary.

    Args:
        audio (str): The path to the audio file to be summarized.
        additional_user_query (Optional[str], optional): An additional user string query. Defaults to "".

    Returns:
        str: The summarized content of the audio file.
    """
    # The docstring above is handed to the @tool decorator (parse_docstring=True),
    # so its wording is part of the tool definition and is kept verbatim.
    # Audio-only call into the shared template: no image input is forwarded.
    summary = multimodal_tool_template(
        system_prompt=SUMMARIZE_AUDIO_SYSTEM_PROMPT,
        image=None,
        audio=audio,
        model_settings={"model": "gpt-4o-audio-preview", "temperature": 0.5},
        additional_user_query=additional_user_query,
    )
    return summary
|