lwant's picture
Add `YoutubeTranscriptReader` tool integration, update dependencies, and enhance agent functionality.
84c7ca2
raw
history blame
3.03 kB
from llama_index.core.schema import ImageDocument
from llama_index.core.tools import FunctionTool
from llama_index.core.tools.ondemand_loader_tool import OnDemandLoaderTool
from llama_index.core.tools.tool_spec.base import BaseToolSpec
from llama_index.core.tools.tool_spec.load_and_search import LoadAndSearchToolSpec
from llama_index.multi_modal_llms.mistralai import MistralAIMultiModal
from llama_index.multi_modal_llms.nebius import NebiusMultiModal
from llama_index.readers.web import SimpleWebPageReader
from llama_index.readers.youtube_transcript import YoutubeTranscriptReader
from tavily import AsyncTavilyClient
from gaia_solving_agent import TAVILY_API_KEY, NEBIUS_API_KEY, MISTRAL_API_KEY
def load_and_search_tools_from_toolspec(tool_spec: BaseToolSpec) -> list[FunctionTool]:
    """Wrap each tool of ``tool_spec`` in a ``LoadAndSearchToolSpec`` and flatten.

    Args:
        tool_spec: Spec whose ``to_tool_list()`` output is wrapped tool by tool.

    Returns:
        One flat list holding, in order, every tool returned by
        ``LoadAndSearchToolSpec.from_defaults(tool).to_tool_list()`` for each
        tool of ``tool_spec``.
    """
    return [
        wrapped_tool
        for base_tool in tool_spec.to_tool_list()
        for wrapped_tool in LoadAndSearchToolSpec.from_defaults(base_tool).to_tool_list()
    ]
async def tavily_search_web(query: str) -> str:
    """Useful for using the web to answer questions."""
    # The docstring above is kept verbatim: presumably it is surfaced to the
    # agent as the tool description -- confirm before rewording it.
    #
    # A key is rejected when it is missing or contains an "x" anywhere.
    # NOTE(review): the "x" test looks like a placeholder check (e.g. "xxxx");
    # a genuine key containing "x" would be rejected too -- confirm the format.
    key_is_usable = TAVILY_API_KEY is not None and "x" not in TAVILY_API_KEY
    if not key_is_usable:
        raise ValueError("Tavily API key not set.")

    # A fresh client is built per call; the raw search payload is stringified
    # so the caller always receives plain text.
    search_result = await AsyncTavilyClient(api_key=TAVILY_API_KEY).search(query)
    return str(search_result)
async def vllm_ask_image(query: str, images: ImageDocument | list[ImageDocument]) -> str:
    """
    Asynchronously processes a visual-linguistic query paired with image data
    and returns corresponding results. This function leverages visual
    understanding and language processing to answer the provided query based
    on the content of the given image(s).

    Parameters:
        query: str
            The question or request related to the provided image(s).
        images: ImageDocument | list[ImageDocument]
            Image data provided as a llamaindex ImageDocument or list of.

    Returns:
        str
            The result or response to the provided query based on the processed
            image content.
    """
    # Function-scope import so this block stays self-contained.
    import asyncio

    # A new model client is configured on every call (low temperature, up to
    # five retries), so it is only ever used by the worker thread below.
    multimodal_llm = MistralAIMultiModal(
        model="mistral-small-2506",
        api_key=MISTRAL_API_KEY,
        temperature=0.1,
        max_retries=5,
    )

    # Accept a single document as a convenience; the model call takes a list.
    if not isinstance(images, list):
        images = [images]

    # `complete` is a synchronous call. Invoking it directly inside this
    # coroutine would stall the event loop for the whole request (including
    # retries), so it is run in a worker thread and awaited instead.
    vllm_output = await asyncio.to_thread(
        multimodal_llm.complete,
        prompt=query,
        image_documents=images,
    )
    return vllm_output.text
# Loader tool built on SimpleWebPageReader with HTML-to-text conversion enabled.
simple_web_page_reader_tool = OnDemandLoaderTool.from_defaults(
    SimpleWebPageReader(html_to_text=True),
    description="Tool for loading content from a web page and return it as text",
    name="simple_web_page_reader_tool",
)

# The same tool wrapped in a LoadAndSearchToolSpec.
simple_web_page_reader_toolspec = LoadAndSearchToolSpec.from_defaults(
    simple_web_page_reader_tool,
)
# Loader tool built on YoutubeTranscriptReader with its default settings.
youtube_transcript_reader_tool = OnDemandLoaderTool.from_defaults(
    YoutubeTranscriptReader(),
    description="Tool for loading the audio transcript from a youtube video and return it as text",
    name="youtube_transcript_reader_tool",
)

# The same tool wrapped in a LoadAndSearchToolSpec.
youtube_transcript_reader_toolspec = LoadAndSearchToolSpec.from_defaults(
    youtube_transcript_reader_tool,
)