File size: 5,231 Bytes
6800d28
 
523e34e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6800d28
7cdf11d
6800d28
523e34e
6800d28
 
 
 
13317d6
b97b365
523e34e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b97b365
 
523e34e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fc287f3
523e34e
854a96f
f61d1e4
523e34e
 
 
 
6800d28
523e34e
6800d28
 
523e34e
6800d28
523e34e
 
 
 
 
 
 
 
 
 
 
 
6800d28
523e34e
6800d28
 
523e34e
6800d28
523e34e
 
6800d28
523e34e
b97b365
6800d28
523e34e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import os
from langchain_core.messages import HumanMessage
from langchain_core.runnables.config import RunnableConfig
from langgraph.checkpoint.memory import MemorySaver
from langchain.globals import set_debug
from langchain.globals import set_verbose
from langgraph.prebuilt import create_react_agent
from langgraph.prebuilt import ToolNode
from langgraph.prebuilt.chat_agent_executor import AgentState

from smolagents import DuckDuckGoSearchTool
from smolagents import PythonInterpreterTool
from tools import analyze_audio
from tools import analyze_excel
from tools import analyze_image
from tools import analyze_video
from tools import download_file_for_task
from tools import read_file_contents
from tools import search_arxiv
from tools import search_tavily
from tools import search_wikipedia
from tools import SmolagentToolWrapper
from tools import tavily_extract_tool
from utils import get_llm
from config import GOOGLE_API_KEY, AGENT_MODEL_NAME


# Runtime configuration resolved from environment variables.
# NOTE(review): these assignments shadow the GOOGLE_API_KEY and
# AGENT_MODEL_NAME names imported from `config` above, so the values read
# here from os.environ are the ones actually used — confirm the duplicate
# import is intentional.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY", "")
if not GOOGLE_API_KEY:
    raise ValueError("GOOGLE_API_KEY environment variable is not set.")

# Model identifier for the agent's LLM; overridable via the environment.
AGENT_MODEL_NAME = os.getenv("AGENT_MODEL_NAME", "gemini-2.0-flash-lite")

# System prompt for the ReAct agent. It enforces terse, strictly formatted
# answers (bare numbers, single words, comma-separated lists), tool-first
# verification, and the "FINAL ANSWER: " prefix that
# MultiModalAgent.__call__ strips from the model's reply.
MULTIMODAL_TASK_SOLVER_PROMPT = """
You are a specialized multimodal task-solving AI assistant capable of handling complex data analysis and information retrieval tasks.
Core Operating Guidelines:
- Employ systematic analysis: Break down problems into logical steps
- Maintain brevity: Provide answers in the most concise format possible - raw numbers, single words, or comma-delimited lists
- Format compliance:
  * Numbers: No commas, units, or currency symbols
  * Lists: Pure comma-separated values without additional text
  * Text: Bare minimum words, no sentences or explanations
- Tool utilization: 
  * For multimedia content (images, audio, video) - use dedicated analysis tools
  * For data processing (Excel, structured data) - use appropriate parsers
  * For information retrieval - leverage search tools
- Verification principle: Never guess - use available tools to verify information
- Code usage: Implement Python code for calculations and data transformations
- Answer format: Always prefix final answers with 'FINAL ANSWER: '
- Counting queries: Return only the numerical count
- Listing queries: Return only the comma-separated items
- Sorting queries: Return only the ordered list

Sample Responses:
Q: Current Bitcoin price in USD? A: 47392
Q: Sort these colors: blue, red, azure A: azure, blue, red
Q: Capital of France? A: Paris
Q: Count vowels in 'hello' A: 2
Q: Temperature scale used in USA? A: Fahrenheit
Q: List prime numbers under 10 A: 2, 3, 5, 7
Q: Most streamed artist 2023? A: Taylor Swift
"""

# Uncomment to enable LangChain's global debug / verbose logging:
#set_debug(True)
#set_verbose(True)


class MultiModalTaskState(AgentState):
    """LangGraph agent state extended with per-task metadata fields."""

    # Unique identifier of the task being solved (also used as thread id).
    task_identifier: str
    # The user's question text.
    query_text: str
    # Path/name of an input file attached to the task, if any.
    input_file_path: str


class MultiModalAgent:
    """ReAct agent for multimodal question-answering tasks.

    Wraps a LangGraph ``create_react_agent`` around an LLM and a set of
    search / file / media-analysis tools, then parses the model's
    ``FINAL ANSWER:`` marker out of the reply.
    """

    # Marker the system prompt instructs the model to emit before its answer.
    # Hoisted to one constant so the membership check and the split below
    # can never drift apart.
    _ANSWER_PREFIX = "FINAL ANSWER: "

    def __init__(self, model_name: str | None = None):
        """Build the agent graph.

        Args:
            model_name: LLM model identifier; defaults to ``AGENT_MODEL_NAME``.
        """
        if model_name is None:
            model_name = AGENT_MODEL_NAME
        llm = self._get_llm(model_name)
        tools = self._get_tools()
        self.agent = create_react_agent(
            model=llm,
            tools=tools,
            prompt=MULTIMODAL_TASK_SOLVER_PROMPT,
            # Declare the extended state so the task metadata passed in
            # __call__ is actually part of the graph state instead of being
            # dropped as unknown keys.
            state_schema=MultiModalTaskState,
            # In-memory checkpointer: one conversation thread per task id.
            checkpointer=MemorySaver(),
        )

    def _get_llm(self, model_name: str):
        """Construct the LLM client for *model_name* using the configured key."""
        return get_llm(
            llm_provider_api_key=GOOGLE_API_KEY,
            model_name=model_name,
        )

    def _get_tools(self):
        """Assemble the agent's tool set, wrapped in a ToolNode."""
        tools = [
            SmolagentToolWrapper(DuckDuckGoSearchTool()),
            SmolagentToolWrapper(PythonInterpreterTool()),
            download_file_for_task,
            read_file_contents,
            analyze_audio,
            analyze_image,
            analyze_excel,
            analyze_video,
            search_arxiv,
            search_tavily,
            search_wikipedia,
            tavily_extract_tool,
        ]
        return ToolNode(tools)

    async def __call__(
        self, task_identifier: str, query_text: str, input_file_path: str | None = None
    ) -> str:
        """Run the agent on one task and return the extracted final answer.

        Args:
            task_identifier: Unique id for the task; used as the checkpoint
                thread id so repeated calls share conversation state.
            query_text: The question to answer.
            input_file_path: Optional file attached to the task.

        Returns:
            The text after ``FINAL ANSWER:`` in the model's last message,
            or the whole last message if the marker is absent.
        """
        execution_config = RunnableConfig(
            recursion_limit=64,
            configurable={"thread_id": task_identifier},
        )

        if not input_file_path:
            input_file_path = "None - no file present"

        user_input = HumanMessage(
            content=
            [
                {
                    "type": "text",
                    "text": f"Task Id: {task_identifier}, Question: {query_text}, Filename: {input_file_path}. If a filename is present (and is not 'None'), download the file for the task that's referenced in the question. If there isn't a filename present, please use tools where applicable."
                }
            ]
        )

        response = await self.agent.ainvoke(
            {
                "messages": [user_input],
                # Key names must match the fields declared on
                # MultiModalTaskState (the original passed "question",
                # "task_id" and "file_name", which matched no declared field).
                "query_text": query_text,
                "task_identifier": task_identifier,
                "input_file_path": input_file_path,
            },
            execution_config,
        )

        final_response = response["messages"][-1].content
        if self._ANSWER_PREFIX in final_response:
            return final_response.split(self._ANSWER_PREFIX, 1)[1].strip()
        return final_response