giulia-fontanella's picture
Update tools.py
7a40984 verified
raw
history blame
5.3 kB
import base64
import mimetypes

from langchain.tools import tool
from langchain_community.document_loaders import ArxivLoader
from langchain_community.document_loaders import WikipediaLoader
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_core.messages import AnyMessage, HumanMessage, AIMessage
@tool
def extract_text(img_path: str) -> str:
    """
    Extract text from an image file using a multimodal model.

    Args:
        img_path: Path to a local image file (e.g., PNG, JPEG).

    Returns:
        A single string containing the text extracted from the image, or an
        empty string if the image could not be read or processed.
    """
    try:
        # Read the image from disk and encode it as base64 for a data URL.
        with open(img_path, "rb") as image_file:
            image_base64 = base64.b64encode(image_file.read()).decode("utf-8")

        # Label the payload with the MIME type guessed from the file
        # extension instead of always claiming PNG; keep PNG as the fallback
        # when the extension is unknown or not an image type.
        mime_type = mimetypes.guess_type(img_path)[0] or ""
        if not mime_type.startswith("image/"):
            mime_type = "image/png"

        # Prepare the prompt including the base64 image data.
        message = [
            HumanMessage(
                content=[
                    {
                        "type": "text",
                        "text": (
                            "Extract all the text from this image. "
                            "Return only the extracted text, no explanations."
                        ),
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{image_base64}"
                        },
                    },
                ]
            )
        ]

        # Call the vision-capable model.
        # NOTE(review): `vision_llm` is not defined in the visible part of
        # this module; it presumably has to be provided as a module-level
        # global before this tool is invoked -- confirm. If it is missing,
        # the NameError is caught below and "" is returned.
        response = vision_llm.invoke(message)
        return response.content.strip()
    except Exception as e:
        # Best-effort tool: report the failure and return an empty result
        # rather than raising into the agent loop.
        error_msg = f"Error extracting text: {str(e)}"
        print(error_msg)
        return ""
@tool
def describe_image(img_path: str, query: str) -> str:
    """
    Generate a detailed description of an image using a multimodal model.

    This function reads an image from a local file path, encodes it, and
    sends it to a vision-capable language model to obtain a comprehensive,
    natural language description of the image's content, including its
    objects, actions, and context, following a specific query.

    Args:
        img_path: Path to a local image file (e.g., PNG, JPEG).
        query: Information to extract from the image.

    Returns:
        A single string containing a detailed description of the image, or an
        empty string if the image could not be read or processed.
    """
    try:
        # Read the image from disk and encode it as base64 for a data URL.
        with open(img_path, "rb") as image_file:
            image_base64 = base64.b64encode(image_file.read()).decode("utf-8")

        # Label the payload with the MIME type guessed from the file
        # extension instead of always claiming PNG; keep PNG as the fallback
        # when the extension is unknown or not an image type.
        mime_type = mimetypes.guess_type(img_path)[0] or ""
        if not mime_type.startswith("image/"):
            mime_type = "image/png"

        # Prepare message payload: the instruction text followed by the image.
        message = [
            HumanMessage(
                content=[
                    {
                        "type": "text",
                        "text": (
                            "Describe this image in rich detail. Include objects, people, "
                            "setting, background elements, and any inferred actions or "
                            "context. Avoid technical jargon. In particular, extract the "
                            f"following information: {query}"
                        ),
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{image_base64}"
                        },
                    },
                ]
            )
        ]

        # NOTE(review): `vision_llm` is not defined in the visible part of
        # this module; it presumably has to be provided as a module-level
        # global before this tool is invoked -- confirm. If it is missing,
        # the NameError is caught below and "" is returned.
        response = vision_llm.invoke(message)
        return response.content.strip()
    except Exception as e:
        # Best-effort tool: report the failure and return an empty result
        # rather than raising into the agent loop.
        error_msg = f"Error describing image: {str(e)}"
        print(error_msg)
        return ""
@tool
def wiki_search(query: str) -> dict[str, str]:
    """Search Wikipedia for a query and return maximum 2 results.

    Args:
        query: The search query.

    Returns:
        A dict with a single key, "wiki_results", whose value is the
        formatted text of the matching pages."""
    search_docs = WikipediaLoader(query=query, load_max_docs=2).load()

    # Wrap each page in a pseudo-XML <Document> envelope carrying its source
    # (and page, when the loader supplies one) so results stay attributable
    # once they are joined into one string.
    formatted_search_docs = "\n\n---\n\n".join(
        f'<Document source="{doc.metadata["source"]}" page="{doc.metadata.get("page", "")}"/>\n{doc.page_content}\n</Document>'
        for doc in search_docs
    )
    return {"wiki_results": formatted_search_docs}
@tool
def web_search(query: str) -> dict[str, str]:
    """Search Tavily for a query and return maximum 3 results.

    Args:
        query: The search query.

    Returns:
        A dict with a single key, "web_results", whose value is the
        formatted text of the search hits."""
    # The tool input must be passed positionally: `invoke(query=query)` leaves
    # the required `input` parameter unfilled and raises TypeError.
    search_results = TavilySearchResults(max_results=3).invoke(query)

    # NOTE(review): on an API failure the Tavily tool appears to return the
    # error text as a plain string instead of a list of hits -- confirm
    # against the installed langchain_community version. Pass it through so
    # the agent sees what went wrong.
    if isinstance(search_results, str):
        return {"web_results": search_results}

    # Tavily hits are plain dicts (keys such as "url" and "content"), not
    # Document objects, so they have no `.metadata` / `.page_content`.
    formatted_hits = []
    for hit in search_results:
        source = hit.get("url", "")
        content = hit.get("content", "")
        # The empty `page` attribute keeps the envelope shape identical to the
        # other search tools in this module.
        formatted_hits.append(
            f'<Document source="{source}" page=""/>\n{content}\n</Document>'
        )
    return {"web_results": "\n\n---\n\n".join(formatted_hits)}
@tool
def arxiv_search(query: str) -> dict[str, str]:
    """Search Arxiv for a query and return maximum 3 results.

    Args:
        query: The search query.

    Returns:
        A dict with a single key, "arvix_results", whose value is the
        formatted text of the matching papers (first 1000 characters each)."""
    search_docs = ArxivLoader(query=query, load_max_docs=3).load()

    formatted_papers = []
    for doc in search_docs:
        metadata = doc.metadata
        # ArxivLoader metadata is not guaranteed to contain a "source" key
        # (its default keys are Published/Title/Authors/Summary), so indexing
        # it directly raises KeyError. Fall back to the entry id or the title.
        source = (
            metadata.get("source")
            or metadata.get("entry_id")
            or metadata.get("Title", "")
        )
        page = metadata.get("page", "")
        # Papers can be very long; keep only the first 1000 characters.
        excerpt = doc.page_content[:1000]
        formatted_papers.append(
            f'<Document source="{source}" page="{page}"/>\n{excerpt}\n</Document>'
        )

    # NOTE: the key is spelled "arvix_results" (sic); it is kept unchanged
    # because callers may already depend on it.
    return {"arvix_results": "\n\n---\n\n".join(formatted_papers)}