# Author: Katya Beresneva
# fix
# commit: 13317d6
import os
from langchain_core.messages import HumanMessage
from langchain_core.runnables.config import RunnableConfig
from langgraph.checkpoint.memory import MemorySaver
from langchain.globals import set_debug
from langchain.globals import set_verbose
from langgraph.prebuilt import create_react_agent
from langgraph.prebuilt import ToolNode
from langgraph.prebuilt.chat_agent_executor import AgentState
from smolagents import DuckDuckGoSearchTool
from smolagents import PythonInterpreterTool
from tools import analyze_audio
from tools import analyze_excel
from tools import analyze_image
from tools import analyze_video
from tools import download_file_for_task
from tools import read_file_contents
from tools import search_arxiv
from tools import search_tavily
from tools import search_wikipedia
from tools import SmolagentToolWrapper
from tools import tavily_extract_tool
from utils import get_llm
from config import GOOGLE_API_KEY, AGENT_MODEL_NAME
# Read runtime configuration from the environment.
# NOTE(review): these assignments shadow the GOOGLE_API_KEY / AGENT_MODEL_NAME
# names imported from `config` above — the environment values (or the defaults
# here) always win. Confirm the `config` import is still needed.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY", "")
if not GOOGLE_API_KEY:
    # Fail fast at import time: every LLM call below requires this key.
    raise ValueError("GOOGLE_API_KEY environment variable is not set.")
AGENT_MODEL_NAME = os.getenv("AGENT_MODEL_NAME", "gemini-2.0-flash-lite")
# System prompt for the ReAct agent. It enforces terse, exact-match answer
# formatting; the literal "FINAL ANSWER: " prefix it mandates is parsed back
# out of the model's reply in MultiModalAgent.__call__.
MULTIMODAL_TASK_SOLVER_PROMPT = """
You are a specialized multimodal task-solving AI assistant capable of handling complex data analysis and information retrieval tasks.
Core Operating Guidelines:
- Employ systematic analysis: Break down problems into logical steps
- Maintain brevity: Provide answers in the most concise format possible - raw numbers, single words, or comma-delimited lists
- Format compliance:
* Numbers: No commas, units, or currency symbols
* Lists: Pure comma-separated values without additional text
* Text: Bare minimum words, no sentences or explanations
- Tool utilization:
* For multimedia content (images, audio, video) - use dedicated analysis tools
* For data processing (Excel, structured data) - use appropriate parsers
* For information retrieval - leverage search tools
- Verification principle: Never guess - use available tools to verify information
- Code usage: Implement Python code for calculations and data transformations
- Answer format: Always prefix final answers with 'FINAL ANSWER: '
- Counting queries: Return only the numerical count
- Listing queries: Return only the comma-separated items
- Sorting queries: Return only the ordered list
Sample Responses:
Q: Current Bitcoin price in USD? A: 47392
Q: Sort these colors: blue, red, azure A: azure, blue, red
Q: Capital of France? A: Paris
Q: Count vowels in 'hello' A: 2
Q: Temperature scale used in USA? A: Fahrenheit
Q: List prime numbers under 10 A: 2, 3, 5, 7
Q: Most streamed artist 2023? A: Taylor Swift
"""
#set_debug(True)
#set_verbose(True)
class MultiModalTaskState(AgentState):
    """LangGraph agent state extended with per-task metadata fields."""

    # Unique identifier of the task being solved.
    task_identifier: str
    # The natural-language question text for the task.
    query_text: str
    # Path/name of an optional input file attached to the task
    # (a sentinel string is used when no file is present).
    input_file_path: str
class MultiModalAgent:
    """ReAct agent for multimodal tasks (text, files, audio/video, web search).

    Wraps a LangGraph ``create_react_agent`` built from a Gemini LLM, a
    toolbox of download/analysis/search tools, and an in-memory checkpointer
    keyed by the task identifier.
    """

    def __init__(self, model_name: str | None = None):
        """Build the underlying ReAct agent.

        Args:
            model_name: LLM model identifier; defaults to ``AGENT_MODEL_NAME``
                when ``None``.
        """
        if model_name is None:
            model_name = AGENT_MODEL_NAME
        llm = self._get_llm(model_name)
        tools = self._get_tools()
        self.agent = create_react_agent(
            model=llm,
            tools=tools,
            prompt=MULTIMODAL_TASK_SOLVER_PROMPT,
            # Fix: wire in the custom state schema so the per-task fields
            # supplied in __call__ are actually part of the graph state.
            # Previously the default AgentState was used and the extra keys
            # passed to ainvoke matched no declared field.
            state_schema=MultiModalTaskState,
            checkpointer=MemorySaver(),
        )

    def _get_llm(self, model_name: str):
        """Return the chat LLM for ``model_name`` using the Google API key."""
        return get_llm(
            llm_provider_api_key=GOOGLE_API_KEY,
            model_name=model_name,
        )

    def _get_tools(self):
        """Return a ``ToolNode`` bundling search, download, and media-analysis tools."""
        tools = [
            SmolagentToolWrapper(DuckDuckGoSearchTool()),
            SmolagentToolWrapper(PythonInterpreterTool()),
            download_file_for_task,
            read_file_contents,
            analyze_audio,
            analyze_image,
            analyze_excel,
            analyze_video,
            search_arxiv,
            search_tavily,
            search_wikipedia,
            tavily_extract_tool,
        ]
        return ToolNode(tools)

    async def __call__(
        self, task_identifier: str, query_text: str, input_file_path: str | None = None
    ) -> str:
        """Run the agent on one task and return the extracted final answer.

        Args:
            task_identifier: Unique id for the task; also used as the
                checkpointer thread id so repeated calls share history.
            query_text: The question to answer.
            input_file_path: Optional file attached to the task; ``None``
                is replaced with a sentinel string for the prompt.

        Returns:
            The text after ``"FINAL ANSWER: "`` if present, otherwise the
            raw final message content.
        """
        execution_config = RunnableConfig(
            recursion_limit=64,
            configurable={"thread_id": task_identifier},
        )
        if not input_file_path:
            input_file_path = "None - no file present"
        user_input = HumanMessage(
            content=[
                {
                    "type": "text",
                    "text": f"Task Id: {task_identifier}, Question: {query_text}, Filename: {input_file_path}. If a filename is present (and is not 'None'), download the file for the task that's referenced in the question. If there isn't a filename present, please use tools where applicable.",
                }
            ]
        )
        response = await self.agent.ainvoke(
            {
                "messages": [user_input],
                # Fix: use the field names declared on MultiModalTaskState
                # (the original passed "question"/"task_id"/"file_name",
                # which matched no state field).
                "query_text": query_text,
                "task_identifier": task_identifier,
                "input_file_path": input_file_path,
            },
            execution_config,
        )
        final_response = response["messages"][-1].content
        # NOTE(review): message content is assumed to be a plain string here;
        # confirm the model never returns a structured (list) content payload.
        if "FINAL ANSWER: " in final_response:
            return final_response.split("FINAL ANSWER: ", 1)[1].strip()
        return final_response