import mimetypes
import os
import tempfile
import typing
from base64 import b64encode
from io import StringIO

import httpx
import pandas as pd
from anyio import Path
from asyncer import asyncify
from langchain.tools import BaseTool
from langchain.tools import tool
from langchain_community.document_loaders import ArxivLoader
from langchain_community.document_loaders import WikipediaLoader
from langchain_core.messages import HumanMessage
from langchain_tavily import TavilyExtract
from langchain_tavily import TavilySearch
from langgraph.prebuilt import create_react_agent
from langgraph.prebuilt import InjectedState
from pydantic import Field
from typing_extensions import Annotated

from config import GOOGLE_API_KEY, AGENT_MODEL_NAME, TAVILY_API_KEY
from utils import get_llm
# System prompt shared by the analyze-* tools below. It steers a tool-less
# multimodal sub-agent toward terse, grader-friendly answers prefixed with
# "FINAL ANSWER:". NOTE: this string is sent to the model at runtime — do not
# reword it casually; the answer-format rules are load-bearing.
MULTIMODAL_FILE_ANALYZER_PROMPT = """
You are a specialized file analysis AI assistant focused on extracting information from various file formats including images, videos, audio, and structured data.
Core Analysis Guidelines:
- Systematic processing: Analyze file contents step by step
- Precise responses: Provide answers in the most concise format - raw numbers, single words, or comma-delimited lists
- Format requirements:
* Numbers: No formatting (no commas, units, or symbols)
* Lists: Pure comma-separated values
* Text: Minimal words, no explanations
- Analysis approach:
* Images: Focus on visual elements, objects, text, and scene composition
* Audio: Identify sounds, speech, music, and audio characteristics
* Video: Analyze visual content, motion, and temporal elements
* Excel/CSV: Extract relevant data points and patterns
- Verification focus: Base answers solely on file contents
- Answer format: Always prefix with 'FINAL ANSWER: '
- Counting tasks: Return only the count
- Listing tasks: Return only the items
- Sorting tasks: Return only the ordered list
Example Responses:
Q: Count people in image? A: 3
Q: List colors in logo? A: blue, red, white
Q: Main topic of audio? A: weather forecast
Q: Excel total sales? A: 15420
Q: Video duration? A: 45
"""
class SmolagentToolWrapper(BaseTool):
    """Adapter exposing a smolagents tool through the LangChain ``BaseTool`` interface.

    The wrapped tool's ``name`` and ``description`` are mirrored so agents see
    the same identity they would under the native smolagents runtime. Failures
    are reported back to the agent as error strings instead of raising.
    """

    # The underlying smolagents tool; it is invoked as a callable with a
    # single query string.
    wrapped_tool: object = Field(description="Smolagents tool (wrapped)")

    def __init__(self, tool):
        super().__init__(
            name=tool.name,
            description=tool.description,
            return_direct=False,
            wrapped_tool=tool,
        )

    def _run(self, query: str) -> str:
        """Invoke the wrapped tool synchronously.

        Args:
            query: Input string forwarded to the wrapped tool.

        Returns:
            The wrapped tool's output, or an error description on failure.
        """
        try:
            return self.wrapped_tool(query)
        except Exception as e:
            # Tools should surface failures to the agent, not crash the graph.
            return f"Error using SmolagentToolWrapper: {str(e)}"

    async def _arun(self, *args: typing.Any, **kwargs: typing.Any) -> typing.Any:
        """Async version of the tool.

        Fix: declared ``async def`` (the original was a sync method returning
        a coroutine, which only worked by accident of LangChain awaiting the
        return value). ``asyncify`` runs the blocking call in a worker thread
        so the event loop is not blocked.
        """
        return await asyncify(self._run, cancellable=True)(*args, **kwargs)
# Module-level Tavily page-extraction tool instance, ready to be registered
# with an agent alongside the @tool functions below.
tavily_extract_tool = TavilyExtract(tavily_api_key=TAVILY_API_KEY)
@tool("search-tavily-tool", parse_docstring=True)
async def search_tavily(
    query: str,
    state: Annotated[dict, InjectedState],
    included_domains: list[str] = None,
    max_results: int = 5,
) -> dict[str, str]:
    """
    Search the web using Tavily API with optional domain filtering.

    This function performs a search using the Tavily search engine and returns formatted results.
    You can specify domains to include in the search results for more targeted information.

    Args:
        query (str): The search query to search the web for
        included_domains (list[str], optional): List of domains to include in search results
            (e.g., ["wikipedia.org", "cnn.com"]). Defaults to None.
        max_results (int, optional): Maximum number of results to return. Defaults to 5.

    Returns:
        dict[str, str]: A dictionary with key 'tavily_results' containing formatted search results,
            and, when Tavily provides one, a 'tavily_answer' key with its synthesized answer.

    Example:
        results = await search_tavily("How many albums did Michael Jackson produce", included_domains=[])
        # Returns filtered results about Michael Jackson
    """
    # Configure Tavily search with the caller-provided parameters.
    tavily_search_tool = TavilySearch(
        tavily_api_key=TAVILY_API_KEY,
        max_results=max_results,
        topic="general",
        include_domains=included_domains if included_domains else None,
        search_depth="advanced",
        include_answer="advanced",
    )
    # BUG FIX: search the agent-crafted `query` argument. The original passed
    # state["question"] here, silently ignoring `query` (and the docstring).
    # `state` is kept in the signature for interface compatibility.
    search_docs = await tavily_search_tool.arun(query)
    # Concatenate title + content per hit, separated so the LLM can tell
    # individual results apart.
    formatted_search_docs = "\n\n---\n\n".join(
        [
            f'{doc.get("title", "No Title")}\n{doc.get("content", "")}\n'
            for doc in search_docs.get("results", [])
        ]
    )
    results = {"tavily_results": formatted_search_docs}
    # Tavily may also return a synthesized answer; pass it along when present.
    answer = search_docs.get("answer", None)
    if answer:
        results["tavily_answer"] = answer
    return results
@tool("search-arxiv-tool", parse_docstring=True)
async def search_arxiv(query: str, max_num_result: int = 5) -> dict[str, str]:
    """
    Search arXiv for academic papers matching the provided query.

    This function queries the arXiv database for scholarly articles related to the
    search query and returns a formatted collection of the results.

    Args:
        query (str): The search query to find relevant academic papers.
        max_num_result (int, optional): Maximum number of results to return. Defaults to 5.

    Returns:
        dict[str, str]: A dictionary with key 'arxiv_results' containing formatted search results.
            Each result includes document source, page information, and content.

    Example:
        results = await search_arxiv("quantum computing", 3)
        # Returns dictionary with up to 3 formatted arXiv papers about quantum computing
    """
    search_docs = await ArxivLoader(query=query, load_max_docs=max_num_result).aload()
    # Separate papers with a visible delimiter so the LLM can tell them apart.
    formatted_search_docs = "\n\n---\n\n".join(
        [
            f'\n{doc.page_content}\n'
            for doc in search_docs
        ]
    )
    # BUG FIX: result key was misspelled "arvix_results", contradicting the
    # documented 'arxiv_results' key above.
    return {"arxiv_results": formatted_search_docs}
@tool("search-wikipedia-tool", parse_docstring=True)
async def search_wikipedia(query: str, max_num_result: int = 5) -> dict[str, str]:
    """
    Search Wikipedia for articles matching the provided query.

    This function queries the Wikipedia database for articles related to the
    search term and returns a formatted collection of the results.

    Args:
        query (str): The search query to find relevant Wikipedia articles.
        max_num_result (int, optional): Maximum number of results to return. Defaults to 5.

    Returns:
        dict[str, str]: A dictionary with key 'wikipedia_results' containing formatted search results.
            Each result includes document source, page information, and content.

    Example:
        results = await search_wikipedia("neural networks", 3)
        # Returns dictionary with up to 3 formatted Wikipedia articles about neural networks
    """
    loader = WikipediaLoader(
        query=query,
        load_max_docs=max_num_result,
        load_all_available_meta=True,
        doc_content_chars_max=128000,
    )
    articles = await loader.aload()
    # Join article bodies with a visible separator so the LLM can tell the
    # individual pages apart.
    sections = [f'\n{doc.page_content}\n' for doc in articles]
    return {"wikipedia_results": "\n\n---\n\n".join(sections)}
@tool("download-file-for-task-tool", parse_docstring=True)
async def download_file_for_task(task_id: str, filename: str | None = None) -> str:
    """
    Download a file for task_id, save to a temporary file, and return path

    Args:
        task_id: The task id file to download
        filename: Optional filename (will be generated if not provided)

    Returns:
        String path to the downloaded file
    """
    # Default the on-disk name to the task id itself.
    target_name = filename if filename is not None else task_id
    destination = Path(tempfile.gettempdir()) / target_name
    download_url = f"https://agents-course-unit4-scoring.hf.space/files/{task_id}"
    async with httpx.AsyncClient() as client:
        # Stream the body to disk in 4 KiB chunks to bound memory usage.
        async with client.stream("GET", download_url) as response:
            response.raise_for_status()
            async with await destination.open("wb") as out:
                async for chunk in response.aiter_bytes(chunk_size=4096):
                    await out.write(chunk)
    return str(destination)
@tool("read-file-contents-tool", parse_docstring=True)
async def read_file_contents(file_path: str) -> str:
    """
    Read a file and return its contents

    Args:
        file_path: String path to file to read

    Returns:
        Contents of the file at file_path
    """
    # Asynchronous text read via anyio's Path wrapper.
    return await Path(file_path).read_text()
@tool("analyze-image-tool", parse_docstring=True)
async def analyze_image(state: Annotated[dict, InjectedState], image_path: str) -> str:
    """
    Analyze the image at image_path

    Args:
        image_path: String path where the image file is located on disk

    Returns:
        Answer to the question about the image file
    """
    # FIX: guess the mime type from the file extension instead of always
    # claiming image/png (the original mislabeled JPEGs, GIFs, etc.).
    # Unknown/non-image extensions fall back to the original image/png default.
    mime_type, _ = mimetypes.guess_type(image_path)
    if not mime_type or not mime_type.startswith("image/"):
        mime_type = "image/png"
    path = Path(image_path)
    async with await path.open("rb") as rb:
        img_base64 = b64encode(await rb.read()).decode("utf-8")
    llm = get_llm(
        llm_provider_api_key=GOOGLE_API_KEY,
        model_name=AGENT_MODEL_NAME,
    )
    # Tool-less ReAct agent: the multimodal model itself performs the
    # analysis, guided by the shared file-analyzer prompt.
    file_agent = create_react_agent(
        model=llm,
        tools=[],
        prompt=MULTIMODAL_FILE_ANALYZER_PROMPT
    )
    message = HumanMessage(
        content=[
            {"type": "text", "text": state["question"]},
            {
                "type": "image",
                "source_type": "base64",
                "mime_type": mime_type,
                "data": img_base64,
            },
        ]
    )
    messages = await file_agent.ainvoke({"messages": [message]})
    # Return only the final assistant message's content.
    return messages["messages"][-1].content
@tool("analyze-excel-tool", parse_docstring=True)
async def analyze_excel(state: Annotated[dict, InjectedState], excel_path: str) -> str:
    """
    Analyze the excel file at excel_path

    Args:
        excel_path: String path where the excel file is located on disk

    Returns:
        Answer to the question about the excel file
    """
    # Convert the spreadsheet to CSV text so it can be attached to the
    # message as a base64-encoded text/csv file part.
    frame = pd.read_excel(excel_path)
    buffer = StringIO()
    frame.to_csv(buffer, index=False)
    encoded_csv = b64encode(buffer.getvalue().encode("utf-8")).decode("utf-8")
    llm = get_llm(
        llm_provider_api_key=GOOGLE_API_KEY,
        model_name=AGENT_MODEL_NAME,
    )
    # Tool-less ReAct agent: the model answers directly from the attachment.
    file_agent = create_react_agent(
        model=llm,
        tools=[],
        prompt=MULTIMODAL_FILE_ANALYZER_PROMPT
    )
    question = HumanMessage(
        content=[
            {"type": "text", "text": state["question"]},
            {
                "type": "file",
                "source_type": "base64",
                "mime_type": "text/csv",
                "data": encoded_csv,
            },
        ],
    )
    result = await file_agent.ainvoke({"messages": [question]})
    return result["messages"][-1].content
@tool("analyze-audio-tool", parse_docstring=True)
async def analyze_audio(state: Annotated[dict, InjectedState], audio_path: str) -> str:
    """
    Analyze the audio at audio_path

    Args:
        audio_path: String path where the audio file is located on disk

    Returns:
        Answer to the question about the audio file
    """
    # FIX: guess the mime type from the extension instead of assuming every
    # file is MP3 (the original mislabeled WAV, OGG, etc.). Unknown or
    # non-audio extensions fall back to the original audio/mpeg default.
    audio_mime_type, _ = mimetypes.guess_type(audio_path)
    if not audio_mime_type or not audio_mime_type.startswith("audio/"):
        audio_mime_type = "audio/mpeg"
    path = Path(audio_path)
    async with await path.open("rb") as rb:
        encoded_audio = b64encode(await rb.read()).decode("utf-8")
    llm = get_llm(
        llm_provider_api_key=GOOGLE_API_KEY,
        model_name=AGENT_MODEL_NAME,
    )
    # Tool-less ReAct agent: the multimodal model analyzes the clip directly.
    file_agent = create_react_agent(
        model=llm,
        tools=[],
        prompt=MULTIMODAL_FILE_ANALYZER_PROMPT
    )
    message = HumanMessage(
        content=[
            {"type": "text", "text": state["question"]},
            {"type": "media", "data": encoded_audio, "mime_type": audio_mime_type},
        ],
    )
    messages = await file_agent.ainvoke({"messages": [message]})
    return messages["messages"][-1].content
@tool("analyze-video-tool", parse_docstring=True)
async def analyze_video(state: Annotated[dict, InjectedState], video_url: str) -> str:
    """
    Analyze the video at video_url

    Args:
        video_url: URL where the video is located

    Returns:
        Answer to the question about the video url
    """
    llm = get_llm(
        llm_provider_api_key=GOOGLE_API_KEY,
        model_name=AGENT_MODEL_NAME,
    )
    # Tool-less ReAct agent: the multimodal model inspects the video by URI.
    video_agent = create_react_agent(
        model=llm,
        tools=[],
        prompt=MULTIMODAL_FILE_ANALYZER_PROMPT
    )
    # The video is passed by reference (file_uri), not downloaded/encoded.
    prompt_message = HumanMessage(
        content=[
            {"type": "text", "text": state["question"]},
            {
                "type": "media",
                "mime_type": "video/mp4",
                "file_uri": video_url,
            },
        ],
    )
    result = await video_agent.ainvoke({"messages": [prompt_message]})
    return result["messages"][-1].content