| | import os |
| | from langchain_core.messages import HumanMessage |
| | from langchain_core.runnables.config import RunnableConfig |
| | from langgraph.checkpoint.memory import MemorySaver |
| | from langchain.globals import set_debug |
| | from langchain.globals import set_verbose |
| | from langgraph.prebuilt import create_react_agent |
| | from langgraph.prebuilt import ToolNode |
| | from langgraph.prebuilt.chat_agent_executor import AgentState |
| |
|
| | from smolagents import DuckDuckGoSearchTool |
| | from smolagents import PythonInterpreterTool |
| | from tools import analyze_audio |
| | from tools import analyze_excel |
| | from tools import analyze_image |
| | from tools import analyze_video |
| | from tools import download_file_for_task |
| | from tools import read_file_contents |
| | from tools import search_arxiv |
| | from tools import search_tavily |
| | from tools import search_wikipedia |
| | from tools import SmolagentToolWrapper |
| | from tools import tavily_extract_tool |
| | from utils import get_llm |
| | from config import GOOGLE_API_KEY, AGENT_MODEL_NAME |
| |
|
| |
|
# NOTE(review): these assignments shadow the GOOGLE_API_KEY / AGENT_MODEL_NAME
# names imported from `config` above — the environment variables always win
# here. Confirm whether the `config` import is still needed.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY", "")
if not GOOGLE_API_KEY:
    # Fail fast at import time: the agent cannot call the LLM without a key.
    raise ValueError("GOOGLE_API_KEY environment variable is not set.")

# Model identifier used by the agent; overridable via AGENT_MODEL_NAME env var.
AGENT_MODEL_NAME = os.getenv("AGENT_MODEL_NAME", "gemini-2.0-flash-lite")
| |
|
# System prompt for the ReAct agent. It forces terse, bare-value answers and
# mandates the 'FINAL ANSWER: ' prefix that MultiModalAgent.__call__ strips
# when extracting the final response.
MULTIMODAL_TASK_SOLVER_PROMPT = """
You are a specialized multimodal task-solving AI assistant capable of handling complex data analysis and information retrieval tasks.
Core Operating Guidelines:
- Employ systematic analysis: Break down problems into logical steps
- Maintain brevity: Provide answers in the most concise format possible - raw numbers, single words, or comma-delimited lists
- Format compliance:
  * Numbers: No commas, units, or currency symbols
  * Lists: Pure comma-separated values without additional text
  * Text: Bare minimum words, no sentences or explanations
- Tool utilization:
  * For multimedia content (images, audio, video) - use dedicated analysis tools
  * For data processing (Excel, structured data) - use appropriate parsers
  * For information retrieval - leverage search tools
- Verification principle: Never guess - use available tools to verify information
- Code usage: Implement Python code for calculations and data transformations
- Answer format: Always prefix final answers with 'FINAL ANSWER: '
- Counting queries: Return only the numerical count
- Listing queries: Return only the comma-separated items
- Sorting queries: Return only the ordered list

Sample Responses:
Q: Current Bitcoin price in USD? A: 47392
Q: Sort these colors: blue, red, azure A: azure, blue, red
Q: Capital of France? A: Paris
Q: Count vowels in 'hello' A: 2
Q: Temperature scale used in USA? A: Fahrenheit
Q: List prime numbers under 10 A: 2, 3, 5, 7
Q: Most streamed artist 2023? A: Taylor Swift
"""
| |
|
| | |
| | |
| |
|
| |
|
class MultiModalTaskState(AgentState):
    """Agent graph state extended with per-task metadata.

    NOTE(review): this schema is not passed to ``create_react_agent`` (no
    ``state_schema`` argument), and ``MultiModalAgent.__call__`` invokes the
    agent with differently named keys (``task_id``, ``question``,
    ``file_name``) — confirm the intended wiring.
    """

    # Unique identifier of the task (also used as the checkpoint thread_id).
    task_identifier: str
    # The user's question text.
    query_text: str
    # Local path of an optional attachment, or a placeholder string.
    input_file_path: str
| |
|
| |
|
class MultiModalAgent:
    """ReAct agent that solves tasks using multimodal analysis and search tools.

    Wraps a LangGraph ``create_react_agent`` graph (LLM + tool node) with an
    in-memory checkpointer; each task runs on its own conversation thread
    keyed by the task identifier.
    """

    def __init__(self, model_name: str | None = None):
        """Build the agent graph.

        Args:
            model_name: LLM model identifier; falls back to AGENT_MODEL_NAME
                when None.
        """
        if model_name is None:
            model_name = AGENT_MODEL_NAME
        llm = self._get_llm(model_name)
        tools = self._get_tools()
        self.agent = create_react_agent(
            model=llm,
            tools=tools,
            prompt=MULTIMODAL_TASK_SOLVER_PROMPT,
            # Fix: register the custom state schema so the per-task fields
            # passed in __call__ are valid state channels. Previously the
            # invoke payload used keys ("question", "task_id", "file_name")
            # that matched no declared schema at all.
            state_schema=MultiModalTaskState,
            checkpointer=MemorySaver(),
        )

    def _get_llm(self, model_name: str):
        """Return the chat model configured with the Google API key."""
        return get_llm(
            llm_provider_api_key=GOOGLE_API_KEY,
            model_name=model_name,
        )

    def _get_tools(self):
        """Assemble the agent's tool set and wrap it in a ToolNode."""
        tools = [
            # Smolagents tools need a wrapper to expose the LangChain tool
            # interface.
            SmolagentToolWrapper(DuckDuckGoSearchTool()),
            SmolagentToolWrapper(PythonInterpreterTool()),
            download_file_for_task,
            read_file_contents,
            analyze_audio,
            analyze_image,
            analyze_excel,
            analyze_video,
            search_arxiv,
            search_tavily,
            search_wikipedia,
            tavily_extract_tool,
        ]
        return ToolNode(tools)

    async def __call__(
        self, task_identifier: str, query_text: str, input_file_path: str | None = None
    ) -> str:
        """Run the agent on one task and return its final answer.

        Args:
            task_identifier: Unique task id; reused as the checkpoint thread id
                so repeated calls for the same task share history.
            query_text: The question to answer.
            input_file_path: Optional path/name of an attached file.

        Returns:
            The model's answer with any 'FINAL ANSWER: ' prefix stripped.
        """
        execution_config = RunnableConfig(
            recursion_limit=64,
            configurable={"thread_id": task_identifier},
        )

        # The prompt tells the model to skip downloads when no file is given;
        # this placeholder string is what it checks for.
        if not input_file_path:
            input_file_path = "None - no file present"

        user_input = HumanMessage(
            content=[
                {
                    "type": "text",
                    "text": f"Task Id: {task_identifier}, Question: {query_text}, Filename: {input_file_path}. If a filename is present (and is not 'None'), download the file for the task that's referenced in the question. If there isn't a filename present, please use tools where applicable.",
                }
            ]
        )

        # Fix: state keys now match MultiModalTaskState's declared fields
        # (the schema registered in __init__).
        response = await self.agent.ainvoke(
            {
                "messages": [user_input],
                "task_identifier": task_identifier,
                "query_text": query_text,
                "input_file_path": input_file_path,
            },
            execution_config,
        )

        final_response = response["messages"][-1].content
        # The system prompt mandates the 'FINAL ANSWER: ' prefix; strip it
        # when present, otherwise return the raw content unchanged.
        if "FINAL ANSWER: " in final_response:
            return final_response.split("FINAL ANSWER: ", 1)[1].strip()
        return final_response
| |
|
| |
|