akrstova committed on
Commit
a52d511
·
1 Parent(s): a3aa2a4

Add image processing tool

Browse files
Files changed (2) hide show
  1. agent.py +2 -1
  2. tools/image_video_tools.py +45 -0
agent.py CHANGED
@@ -8,6 +8,7 @@ from langchain_core.messages import SystemMessage, HumanMessage
8
  from langchain_core.tools import tool
9
  from tools.math_tools import add, subtract, multiply, divide, modulus, power, sqrt
10
  from tools.search_tools import search_wikipedia, web_search
 
11
 
12
 
13
  def build_graph():
@@ -20,7 +21,7 @@ def build_graph():
20
  max_retries=2,
21
  google_api_key=os.getenv("GOOGLE_API_KEY") # Get API key from environment variable
22
  )
23
- tools = [add, subtract, multiply, divide, modulus, power, sqrt, web_search, search_wikipedia]
24
 
25
  llm_with_tools = llm.bind_tools(tools)
26
 
 
8
  from langchain_core.tools import tool
9
  from tools.math_tools import add, subtract, multiply, divide, modulus, power, sqrt
10
  from tools.search_tools import search_wikipedia, web_search
11
+ from tools.image_video_tools import query_image
12
 
13
 
14
  def build_graph():
 
21
  max_retries=2,
22
  google_api_key=os.getenv("GOOGLE_API_KEY") # Get API key from environment variable
23
  )
24
+ tools = [add, subtract, multiply, divide, modulus, power, sqrt, web_search, search_wikipedia, query_image]
25
 
26
  llm_with_tools = llm.bind_tools(tools)
27
 
tools/image_video_tools.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """This module contains tools for processing images or videos."""
2
+
3
+ import os
4
+ import base64
5
+ import mimetypes
6
+ from langchain_core.tools import tool
7
+ from langchain_google_genai import ChatGoogleGenerativeAI
8
+
9
@tool
def query_image(image_path: str, query: str) -> str:
    """Uses a multimodal LLM to answer a query for a given image.

    The image is read from disk, base64-encoded and sent to the model
    together with the query as a single multimodal human message.

    Args:
        image_path (str): The path to the image to process
        query (str): The query to be answered based on the image

    Returns:
        str: Answer of the query based on the image

    Raises:
        OSError: If the image file cannot be opened or read.
    """
    # Imported locally so the block does not depend on a module-level import
    # that this file does not have; langchain_core is already a dependency.
    from langchain_core.messages import HumanMessage

    llm = ChatGoogleGenerativeAI(
        model="gemini-2.0-flash-001",
        temperature=0.8,
        max_tokens=None,
        timeout=None,
        max_retries=2,
        google_api_key=os.getenv("GOOGLE_API_KEY"),  # Get API key from environment variable
    )

    with open(image_path, "rb") as f:
        image_bytes = f.read()

    # Fall back to JPEG when the extension gives no hint about the type.
    mime_type = mimetypes.guess_type(image_path)[0] or "image/jpeg"
    image_b64 = base64.b64encode(image_bytes).decode("utf-8")

    # Chat models take images as content parts of a message; `invoke` has no
    # `images=` keyword, so the image must travel inside the message itself.
    message = HumanMessage(
        content=[
            {"type": "text", "text": query},
            {
                "type": "image_url",
                "image_url": f"data:{mime_type};base64,{image_b64}",
            },
        ]
    )
    response = llm.invoke([message])

    # `content` is usually a plain string but may be a list of parts;
    # normalise so the declared `str` return type always holds.
    content = response.content
    if isinstance(content, str):
        return content
    return "".join(
        part if isinstance(part, str) else str(part.get("text", ""))
        for part in content
        if isinstance(part, (str, dict))
    )
43
+
44
+
45
+