"""Multimodal ReAct agent for concise question answering.

Builds a LangGraph ReAct agent wired with web-search, file, and media-analysis
tools, driven by a prompt that forces terse 'FINAL ANSWER: ...' responses.
"""

import os

from langchain_core.messages import HumanMessage
from langchain_core.runnables.config import RunnableConfig
from langgraph.checkpoint.memory import MemorySaver
from langchain.globals import set_debug
from langchain.globals import set_verbose
from langgraph.prebuilt import create_react_agent
from langgraph.prebuilt import ToolNode
from langgraph.prebuilt.chat_agent_executor import AgentState
from smolagents import DuckDuckGoSearchTool
from smolagents import PythonInterpreterTool

from tools import analyze_audio
from tools import analyze_excel
from tools import analyze_image
from tools import analyze_video
from tools import download_file_for_task
from tools import read_file_contents
from tools import search_arxiv
from tools import search_tavily
from tools import search_wikipedia
from tools import SmolagentToolWrapper
from tools import tavily_extract_tool
from utils import get_llm
from config import GOOGLE_API_KEY, AGENT_MODEL_NAME

# NOTE(review): the two names imported from `config` above are immediately
# shadowed by environment reads below — the environment always wins. Confirm
# whether the `config` import is still needed or can be dropped.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY", "")
if not GOOGLE_API_KEY:
    # Fail fast at import time: every agent run requires this credential.
    raise ValueError("GOOGLE_API_KEY environment variable is not set.")

AGENT_MODEL_NAME = os.getenv("AGENT_MODEL_NAME", "gemini-2.0-flash-lite")

MULTIMODAL_TASK_SOLVER_PROMPT = """
You are a specialized multimodal task-solving AI assistant capable of handling complex data analysis and information retrieval tasks.

Core Operating Guidelines:
- Employ systematic analysis: Break down problems into logical steps
- Maintain brevity: Provide answers in the most concise format possible - raw numbers, single words, or comma-delimited lists
- Format compliance:
  * Numbers: No commas, units, or currency symbols
  * Lists: Pure comma-separated values without additional text
  * Text: Bare minimum words, no sentences or explanations
- Tool utilization:
  * For multimedia content (images, audio, video) - use dedicated analysis tools
  * For data processing (Excel, structured data) - use appropriate parsers
  * For information retrieval - leverage search tools
- Verification principle: Never guess - use available tools to verify information
- Code usage: Implement Python code for calculations and data transformations
- Answer format: Always prefix final answers with 'FINAL ANSWER: '
- Counting queries: Return only the numerical count
- Listing queries: Return only the comma-separated items
- Sorting queries: Return only the ordered list

Sample Responses:
Q: Current Bitcoin price in USD?
A: 47392
Q: Sort these colors: blue, red, azure
A: azure, blue, red
Q: Capital of France?
A: Paris
Q: Count vowels in 'hello'
A: 2
Q: Temperature scale used in USA?
A: Fahrenheit
Q: List prime numbers under 10
A: 2, 3, 5, 7
Q: Most streamed artist 2023?
A: Taylor Swift
"""

#set_debug(True)
#set_verbose(True)


class MultiModalTaskState(AgentState):
    """Agent state extended with per-task metadata.

    Inherits `messages` (and step bookkeeping) from ``AgentState``.
    """

    # Unique id of the task being solved; also used as the checkpoint thread id.
    task_identifier: str
    # The raw question text to answer.
    query_text: str
    # Path/name of an attached input file, or a sentinel string when absent.
    input_file_path: str


class MultiModalAgent:
    """Thin async wrapper around a LangGraph ReAct agent with multimodal tools."""

    def __init__(self, model_name: str | None = None):
        """Build the agent graph.

        Args:
            model_name: LLM model identifier; defaults to ``AGENT_MODEL_NAME``.
        """
        if model_name is None:
            model_name = AGENT_MODEL_NAME
        llm = self._get_llm(model_name)
        tools = self._get_tools()
        self.agent = create_react_agent(
            model=llm,
            tools=tools,
            prompt=MULTIMODAL_TASK_SOLVER_PROMPT,
            # BUGFIX: the custom state schema was declared but never wired in,
            # so the task metadata passed to ainvoke() was silently dropped.
            state_schema=MultiModalTaskState,
            checkpointer=MemorySaver()
        )

    def _get_llm(self, model_name: str):
        """Return the chat model configured with the Google API key."""
        return get_llm(
            llm_provider_api_key=GOOGLE_API_KEY,
            model_name=model_name,
        )

    def _get_tools(self):
        """Return a ToolNode bundling search, file, and media-analysis tools."""
        tools = [
            SmolagentToolWrapper(DuckDuckGoSearchTool()),
            SmolagentToolWrapper(PythonInterpreterTool()),
            download_file_for_task,
            read_file_contents,
            analyze_audio,
            analyze_image,
            analyze_excel,
            analyze_video,
            search_arxiv,
            search_tavily,
            search_wikipedia,
            tavily_extract_tool,
        ]
        return ToolNode(tools)

    async def __call__(
        self,
        task_identifier: str,
        query_text: str,
        input_file_path: str | None = None
    ) -> str:
        """Run the agent on one task and return the extracted final answer.

        Args:
            task_identifier: Unique task id; used as the checkpointer thread id.
            query_text: The question to answer.
            input_file_path: Optional attached file reference; ``None`` means
                no file is associated with the task.

        Returns:
            The text after 'FINAL ANSWER: ' if present, otherwise the full
            content of the agent's last message.
        """
        execution_config = RunnableConfig(
            recursion_limit=64,
            configurable={
                "thread_id": task_identifier
            }
        )
        if not input_file_path:
            # Sentinel the prompt below tells the model to treat as "no file".
            input_file_path = "None - no file present"
        user_input = HumanMessage(
            content=[
                {
                    "type": "text",
                    "text": f"Task Id: {task_identifier}, Question: {query_text}, Filename: {input_file_path}. If a filename is present (and is not 'None'), download the file for the task that's referenced in the question. If there isn't a filename present, please use tools where applicable."
                }
            ]
        )
        response = await self.agent.ainvoke(
            {
                "messages": [user_input],
                # BUGFIX: keys now match the MultiModalTaskState schema
                # (previously "question"/"task_id"/"file_name", which exist in
                # no schema and were dropped by the graph).
                "task_identifier": task_identifier,
                "query_text": query_text,
                "input_file_path": input_file_path
            },
            execution_config)

        final_response = response['messages'][-1].content
        if "FINAL ANSWER: " in final_response:
            return final_response.split("FINAL ANSWER: ", 1)[1].strip()
        else:
            return final_response