import base64
import mimetypes

import ffmpeg
import pandas as pd
import yt_dlp
from langchain.tools import tool
from langchain_community.document_loaders import WikipediaLoader, ArxivLoader
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_core.messages import HumanMessage
@tool
def read_excel(file_path: str) -> str:
    """
    Extract readable text from an Excel file (.xlsx or .xls).
    Args:
        file_path: Path to the Excel file.
    Returns:
        A string representation of all sheets and their content.
    """
    try:
        # sheet_name=None asks pandas for every sheet, keyed by sheet name.
        workbook = pd.read_excel(file_path, sheet_name=None)
        sections = [
            f"Sheet: {name}\n{frame.to_string(index=False)}"
            for name, frame in workbook.items()
        ]
    except Exception as e:
        # Failures are reported as text rather than raised, so the caller
        # always receives a string.
        return f"Error reading Excel file: {str(e)}"
    return "\n\n".join(sections)
@tool
def read_python(file_path: str) -> str:
    """
    Extract source code from a Python (.py) file.
    Args:
        file_path: Path to the Python file.
    Returns:
        A string containing the full source code of the file.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as source_file:
            source_code = source_file.read()
    except Exception as e:
        # Failures are reported as text rather than raised, so the caller
        # always receives a string.
        return f"Error reading Python file: {str(e)}"
    return source_code
class ExtractTextFromImage:
    """Callable that asks a vision-capable chat model to read the text in an image."""

    def __init__(self, multimodal_model):
        # Object whose ``invoke(messages)`` returns a reply exposing ``.content``.
        self.multimodal_model = multimodal_model

    def __call__(self, img_path: str) -> str:
        """
        Extract text from an image file.

        The file is read from disk, base64-encoded, and sent to the model as a
        data URL together with an instruction to return only the text.

        Args:
            img_path: A string representing the path to an image (e.g., PNG, JPEG).
        Returns:
            The model's reply with surrounding whitespace stripped, or an empty
            string if anything fails (the error is printed, not raised).
        """
        try:
            # Read image and encode as base64
            with open(img_path, "rb") as image_file:
                image_base64 = base64.b64encode(image_file.read()).decode("utf-8")

            # Label the data URL with the real image type; fall back to PNG
            # when the extension is unknown or not an image type.
            mime_type, _ = mimetypes.guess_type(img_path)
            if not mime_type or not mime_type.startswith("image/"):
                mime_type = "image/png"

            # Prepare the prompt including the base64 image data
            message = [
                HumanMessage(
                    content=[
                        {
                            "type": "text",
                            "text": (
                                "Extract all the text from this image. "
                                "Return only the extracted text, no explanations."
                            ),
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{mime_type};base64,{image_base64}"
                            },
                        },
                    ]
                )
            ]

            # Call the vision-capable model
            response = self.multimodal_model.invoke(message)
            # NOTE(review): assumes response.content is a str -- confirm for the model in use.
            return response.content.strip()
        except Exception as e:
            # Deliberate best-effort: report the problem and return no text.
            error_msg = f"Error extracting text: {str(e)}"
            print(error_msg)
            return ""
class DescribeImage:
    """Callable that asks a vision-capable chat model to describe an image."""

    def __init__(self, multimodal_model):
        # Object whose ``invoke(messages)`` returns a reply exposing ``.content``.
        self.multimodal_model = multimodal_model

    def __call__(self, img_path: str, query: str) -> str:
        """
        Generate a detailed description of an image.

        This function reads an image from a local file path, base64-encodes it,
        and sends it to a vision-capable language model together with a prompt
        asking for a rich natural-language description (objects, actions,
        context) and for the specific information named in ``query``.

        Args:
            img_path: A string representing the path to an image (e.g., PNG, JPEG).
            query: Information to extract from the image.
        Returns:
            The model's reply with surrounding whitespace stripped, or an empty
            string if anything fails (the error is printed, not raised).
        """
        try:
            # Read image and encode as base64
            with open(img_path, "rb") as image_file:
                image_base64 = base64.b64encode(image_file.read()).decode("utf-8")

            # Label the data URL with the real image type; fall back to PNG
            # when the extension is unknown or not an image type.
            mime_type, _ = mimetypes.guess_type(img_path)
            if not mime_type or not mime_type.startswith("image/"):
                mime_type = "image/png"

            # Prepare message payload
            message = [
                HumanMessage(
                    content=[
                        {
                            "type": "text",
                            "text": (
                                "Describe this image in rich detail. Include objects, people, "
                                "setting, background elements, and any inferred actions or "
                                "context. Avoid technical jargon. In particular, extract the "
                                f"following information: {query}"
                            ),
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:{mime_type};base64,{image_base64}"
                            },
                        },
                    ]
                )
            ]
            response = self.multimodal_model.invoke(message)
            # NOTE(review): assumes response.content is a str -- confirm for the model in use.
            return response.content.strip()
        except Exception as e:
            # Deliberate best-effort: report the problem and return no description.
            error_msg = f"Error describing image: {str(e)}"
            print(error_msg)
            return ""
class TranscribeAudio:
    """Callable that asks an audio-capable chat model to transcribe an MP3 file."""

    def __init__(self, multimodal_model):
        # Object whose ``invoke(messages)`` returns a reply exposing ``.content``.
        self.multimodal_model = multimodal_model

    def __call__(self, audio_path: str, query: str) -> str:
        """
        Transcribe an MP3 file.

        The file is read from disk, base64-encoded, and sent to the model as an
        audio content block together with a transcription instruction.

        Args:
            audio_path: Path to the MP3 audio file.
            query: Accepted for interface compatibility; currently not used
                when building the prompt.
        Returns:
            Transcribed text as a string, or an empty string if anything fails
            (the error is printed, not raised).
        """
        try:
            with open(audio_path, "rb") as audio_file:
                audio_base64 = base64.b64encode(audio_file.read()).decode("utf-8")

            message = [
                HumanMessage(
                    content=[
                        {
                            "type": "text",
                            "text": (
                                "Transcribe the speech from this audio file. "
                                "Return only the transcribed text, with no extra commentary."
                            ),
                        },
                        {
                            # Base64 audio content block.
                            # NOTE(review): confirm the configured model/provider
                            # accepts this block shape.
                            "type": "audio",
                            "source_type": "base64",
                            "mime_type": "audio/mpeg",  # MP3 MIME type
                            "data": audio_base64,
                        },
                    ]
                )
            ]
            # Use the model stored by __init__; no ``audio_llm`` attribute is ever set.
            response = self.multimodal_model.invoke(message)
            return response.content.strip()
        except Exception as e:
            # Deliberate best-effort: report the problem and return no transcript.
            error_msg = f"Error transcribing audio: {str(e)}"
            print(error_msg)
            return ""
@tool
def download_youtube_video(youtube_url: str, output_path: str) -> str:
    """
    Download a YouTube video as an MP4 file.
    Args:
        youtube_url: The YouTube video URL.
        output_path: Desired output path for the downloaded MP4 file.
    Returns:
        Path to the saved video file.
    """
    options = dict(
        # Format selector: MP4 video + M4A audio first, then progressively
        # looser fallbacks.
        format='bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
        outtmpl=output_path,
        merge_output_format='mp4',
        quiet=True,
    )
    downloader = yt_dlp.YoutubeDL(options)
    with downloader:
        downloader.download([youtube_url])
    return output_path
@tool
def extract_audio_from_video(video_path: str, audio_output: str) -> str:
    """
    Extracts audio from an MP4 video file and saves it as MP3.
    Only the first 60 seconds of audio are written.
    Args:
        video_path: Path to the input MP4 video file.
        audio_output: Path for the output MP3 file.
    Returns:
        Path to the audio file.
    Raises:
        RuntimeError: If ffmpeg fails; the message carries ffmpeg's stderr.
    """
    try:
        (
            ffmpeg
            .input(video_path)
            .output(audio_output, format='mp3', acodec='libmp3lame', t=60)  # limit to 60 sec
            .overwrite_output()
            .run(quiet=True)
        )
        return audio_output
    except ffmpeg.Error as e:
        # Decode defensively: stderr may be missing or contain non-UTF-8 bytes,
        # and a failure here would hide the real ffmpeg error.
        details = e.stderr.decode(errors="replace") if e.stderr else str(e)
        raise RuntimeError(f"FFmpeg error: {details}") from e
@tool
def wiki_search(query: str) -> dict:
    """Search Wikipedia for a query and return maximum 2 results.

    Args:
        query: The search query.

    Returns:
        A dict with the single key "wiki_results" holding the page contents
        joined by "---" separators.
    """
    search_docs = WikipediaLoader(query=query, load_max_docs=2).load()
    # Each document's text is wrapped in newlines and separated by a rule line.
    formatted_search_docs = "\n\n---\n\n".join(
        f'\n{doc.page_content}\n' for doc in search_docs
    )
    return {"wiki_results": formatted_search_docs}
@tool
def web_search(query: str) -> dict:
    """Search Tavily for a query and return maximum 3 results.

    Args:
        query: The search query.

    Returns:
        A dict with the single key "web_results" holding the result snippets
        joined by "---" separators (or Tavily's error text if the search failed).
    """
    search_results = TavilySearchResults(max_results=3).invoke(query)

    # The Tavily tool reports a failed search as a plain string, not a list.
    if isinstance(search_results, str):
        return {"web_results": search_results}

    snippets = []
    for hit in search_results:
        if isinstance(hit, dict):
            # Tavily results are dicts; the snippet text is under "content".
            text = hit.get("content", "")
        else:
            # Fallback for Document-like objects exposing ``page_content``.
            text = getattr(hit, "page_content", str(hit))
        snippets.append(f'\n{text}\n')

    return {"web_results": "\n\n---\n\n".join(snippets)}
@tool
def arxiv_search(query: str) -> dict:
    """Search Arxiv for a query and return maximum 3 results.

    Args:
        query: The search query.

    Returns:
        A dict with the single key "arvix_results" holding the first 1000
        characters of each paper, joined by "---" separators.
    """
    search_docs = ArxivLoader(query=query, load_max_docs=3).load()
    # Each paper is truncated to its first 1000 characters.
    formatted_search_docs = "\n\n---\n\n".join(
        f'\n{doc.page_content[:1000]}\n' for doc in search_docs
    )
    # NOTE: the key is misspelled ("arvix"); kept as-is because callers may
    # already depend on it.
    return {"arvix_results": formatted_search_docs}