import base64
import contextlib
import operator
import os
from functools import reduce
from typing import List

import ffmpeg
import pandas as pd
import whisper
import yt_dlp
from langchain.tools import tool
from langchain.tools.tavily_search import TavilySearchResults
from langchain_community.document_loaders import ArxivLoader, WikipediaLoader
from langchain_core.messages import HumanMessage


@tool
def read_excel(file_path: str) -> str:
    """Extract readable text from an Excel file (.xlsx or .xls).

    Args:
        file_path: Path to the Excel file.

    Returns:
        A string representation of all sheets and their content
        ("Sheet: <name>" followed by the sheet rendered as JSON records),
        or an error message string if the file cannot be read.
    """
    try:
        # sheet_name=None loads every sheet into a {name: DataFrame} dict.
        df_dict = pd.read_excel(file_path, sheet_name=None)
        full_text = ""
        for sheet_name, sheet_df in df_dict.items():
            sheet_text = sheet_df.to_json(orient="records", lines=False)
            full_text += f"Sheet: {sheet_name}\n{sheet_text}\n\n"
        return full_text
    except Exception as e:
        return f"Error reading Excel file: {str(e)}"


@tool
def read_python(file_path: str) -> str:
    """Extract source code from a Python (.py) file.

    Args:
        file_path: Path to the Python file.

    Returns:
        A string containing the full source code of the file, or an
        error message string if the file cannot be read.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read()
    except Exception as e:
        return f"Error reading Python file: {str(e)}"


class ExtractTextFromImage:
    """Class to initialize the extract_text_from_image tool."""

    def __init__(self, multimodal_model):
        """Store the vision-capable chat model used for OCR-style extraction."""
        self.multimodal_model = multimodal_model

    # NOTE(review): this name looks like it was intended to be __call__;
    # kept as-is so any existing callers are not broken — confirm with callers.
    def __call_extract_text_from_image__(self, img_path: str) -> str:
        """Extract text from an image file.

        Args:
            img_path: A string representing the path to an image
                (e.g., PNG, JPEG).

        Returns:
            A single string containing the concatenated text extracted
            from the image, or "" if extraction fails.
        """
        all_text = ""
        try:
            # Read image and encode as base64 for the data-URL payload.
            with open(img_path, "rb") as image_file:
                image_bytes = image_file.read()
            image_base64 = base64.b64encode(image_bytes).decode("utf-8")

            # Prepare the prompt including the base64 image data.
            message = [
                HumanMessage(
                    content=[
                        {
                            "type": "text",
                            "text": (
                                "Extract all the text from this image. "
                                "Return only the extracted text, no explanations."
                            ),
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/png;base64,{image_base64}"
                            },
                        },
                    ]
                )
            ]

            # Call the vision-capable model.
            response = self.multimodal_model.invoke(message)

            # Append extracted text.
            all_text += response.content + "\n\n"
            return all_text.strip()
        except Exception as e:
            error_msg = f"Error extracting text: {str(e)}"
            print(error_msg)
            return ""


class DescribeImage:
    """Class to initialize the describe_image tool."""

    def __init__(self, multimodal_model):
        """Store the vision-capable chat model used for image description."""
        self.multimodal_model = multimodal_model

    # NOTE(review): this name looks like it was intended to be __call__;
    # kept as-is so any existing callers are not broken — confirm with callers.
    def __call_describe_image__(self, img_path: str, query: str) -> str:
        """Generate a detailed description of an image.

        This function reads an image from a path, encodes it, and sends it
        to a vision-capable language model to obtain a comprehensive,
        natural language description of the image's content, including its
        objects, actions, and context, following a specific query.

        Args:
            img_path: A string representing the path to an image
                (e.g., PNG, JPEG).
            query: Information to extract from the image.

        Returns:
            A single string containing a detailed description of the image,
            or "" if description fails.
        """
        try:
            # Read image and encode as base64 for the data-URL payload.
            with open(img_path, "rb") as image_file:
                image_bytes = image_file.read()
            image_base64 = base64.b64encode(image_bytes).decode("utf-8")

            # Prepare message payload.
            message = [
                HumanMessage(
                    content=[
                        {
                            "type": "text",
                            "text": (
                                "Describe this image in rich detail. "
                                "Include objects, people, setting, "
                                "background elements, and any inferred "
                                "actions or context. Avoid technical jargon. \n"
                                "In particular, extract the following "
                                f"information: {query}"
                            ),
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/png;base64,{image_base64}"
                            },
                        },
                    ]
                )
            ]

            response = self.multimodal_model.invoke(message)
            return response.content.strip()
        except Exception as e:
            error_msg = f"Error describing image: {str(e)}"
            print(error_msg)
            return ""


@tool
def transcribe_audio(audio_path: str) -> str:
    """Transcribe an MP3 file.

    Args:
        audio_path: Path to the MP3 audio file.

    Returns:
        Transcribed text as a string, or "" if transcription fails.
    """
    try:
        # Other model sizes: "tiny", "small", "medium", "large".
        model = whisper.load_model("small")
        result = model.transcribe(audio_path)
        # BUG FIX: transcribe() returns a dict; the docstring (and the
        # -> str annotation) promise text, so return the "text" field
        # instead of the whole dict.
        return result["text"]
    except Exception as e:
        error_msg = f"Error transcribing audio: {str(e)}"
        print(error_msg)
        return ""


@tool
def download_youtube_video(youtube_url: str, output_path: str) -> str:
    """Download a YouTube video as an MP4 file.

    Args:
        youtube_url: The YouTube video URL.
        output_path: Desired output path for the downloaded MP4 file.

    Returns:
        Path to the saved video file.
    """
    ydl_opts = {
        "format": "bestvideo+bestaudio/best",
        "outtmpl": output_path,
        "merge_output_format": "mp4",
        "quiet": True,
    }
    # BUG FIX: the devnull handle was opened but never closed (resource
    # leak); a with-statement now guarantees it is released. stderr is
    # silenced because yt-dlp prints progress/warnings there.
    with open(os.devnull, "w") as devnull:
        with contextlib.redirect_stderr(devnull):
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                ydl.download([youtube_url])
    return output_path


@tool
def extract_audio_from_video(video_path: str, audio_output: str) -> str:
    """Extracts audio from an MP4 video file and saves it as MP3.

    Args:
        video_path: Path to the input MP4 video file.
        audio_output: Path for the output MP3 file.

    Returns:
        Path to the audio file, or "" if extraction fails.
    """
    try:
        (
            ffmpeg.input(video_path)
            .output(
                audio_output, format="mp3", acodec="libmp3lame", t=60
            )  # limit to 60 sec
            .overwrite_output()
            .run(quiet=True)
        )
        return audio_output
    except Exception as e:
        # BUG FIX: the error message previously said "transcribing" —
        # a copy-paste from transcribe_audio; this tool extracts audio.
        error_msg = f"Error extracting audio: {str(e)}"
        print(error_msg)
        return ""


@tool
def wiki_search(query: str) -> dict:
    """Search Wikipedia for a query and return maximum 2 results.

    Args:
        query: The search query.
    """
    # Annotation fixed from -> str: this tool returns a single-key dict.
    search_docs = WikipediaLoader(query=query, load_max_docs=2).load()
    formatted_search_docs = "\n\n---\n\n".join(
        [f"\n{doc.page_content}\n" for doc in search_docs]
    )
    return {"wiki_results": formatted_search_docs}


@tool
def web_search(query: str) -> dict:
    """Search Tavily for a query and return maximum 3 results.

    Args:
        query: The search query.
    """
    # Annotation fixed from -> str: this tool returns a single-key dict.
    search_docs = TavilySearchResults(max_results=3).invoke(query)
    formatted_search_docs = "\n\n---\n\n".join(
        [f'\n{doc["content"]}\n' for doc in search_docs]
    )
    return {"web_results": formatted_search_docs}


@tool
def arxiv_search(query: str) -> dict:
    """Search Arxiv for a paper.

    Args:
        query: The search query to retrieve a specific paper, consisting of
            title and/or authors name and/or year of publication.
    """
    # Annotation fixed from -> str: this tool returns a single-key dict.
    search_docs = ArxivLoader(query=query, load_max_docs=2).load()
    formatted_search_docs = "\n\n---\n\n".join(
        [
            (
                f"\n"
                f'Summary: {doc.metadata.get("Summary", "")}\n\n'
                f"{doc.page_content}\n"
                f""
            )
            for doc in search_docs
        ]
    )
    # NOTE(review): "arvix_results" is a typo for "arxiv_results", but the
    # key may be load-bearing for downstream consumers — confirm before renaming.
    return {"arvix_results": formatted_search_docs}


@tool
def add(numbers: List[float]) -> float:
    """Calculates the sum of a list of numbers.

    Args:
        numbers: A list of numeric values to be summed.

    Returns:
        The sum of all numbers in the list (0 for an empty list).
    """
    return sum(numbers)


@tool
def multiply(numbers: List[float]) -> float:
    """Calculates the product of a list of numbers.

    Args:
        numbers: A list of numeric values to be multiplied.

    Returns:
        The product of all numbers in the list (1.0 for an empty list).
    """
    return reduce(operator.mul, numbers, 1.0)


@tool
def divide(a: int, b: int) -> float:
    """Divide a and b.

    Args:
        a: first number
        b: second number

    Raises:
        ZeroDivisionError: If b is 0.
    """
    return a / b