"""Tool definitions for a multimodal LangChain agent.

Covers document reading (Excel, Python source), image OCR and description,
audio transcription, YouTube download, audio extraction from video, and
Wikipedia / Tavily / Arxiv search.
"""

import base64

import ffmpeg
import pandas as pd
import yt_dlp
from langchain.tools import tool
from langchain_community.document_loaders import ArxivLoader, WikipediaLoader
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_core.messages import HumanMessage


@tool
def read_excel(file_path: str) -> str:
    """
    Extract readable text from an Excel file (.xlsx or .xls).

    Args:
        file_path: Path to the Excel file.

    Returns:
        A string representation of all sheets and their content, or an
        error message string if the file cannot be read.
    """
    try:
        # sheet_name=None loads every sheet into a {name: DataFrame} dict.
        df_dict = pd.read_excel(file_path, sheet_name=None)
        result = []
        for sheet_name, sheet_df in df_dict.items():
            sheet_text = sheet_df.to_string(index=False)
            result.append(f"Sheet: {sheet_name}\n{sheet_text}")
        return "\n\n".join(result)
    except Exception as e:
        return f"Error reading Excel file: {str(e)}"


@tool
def read_python(file_path: str) -> str:
    """
    Extract source code from a Python (.py) file.

    Args:
        file_path: Path to the Python file.

    Returns:
        A string containing the full source code of the file, or an
        error message string if the file cannot be read.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read()
    except Exception as e:
        return f"Error reading Python file: {str(e)}"


class ExtractTextFromImage:
    """Callable that OCRs an image through a vision-capable chat model."""

    def __init__(self, multimodal_model):
        # Model must accept `image_url` content blocks (data: URLs).
        self.multimodal_model = multimodal_model

    def __call__(self, img_path: str) -> str:
        """
        Extract text from an image file.

        Args:
            img_path: Path to an image (e.g., PNG, JPEG).

        Returns:
            The text extracted from the image, or "" on failure
            (the error is printed, not raised).
        """
        try:
            # Read the image and encode it as base64 for a data: URL.
            with open(img_path, "rb") as image_file:
                image_base64 = base64.b64encode(image_file.read()).decode("utf-8")

            message = [
                HumanMessage(
                    content=[
                        {
                            "type": "text",
                            "text": (
                                "Extract all the text from this image. "
                                "Return only the extracted text, no explanations."
                            ),
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/png;base64,{image_base64}"
                            },
                        },
                    ]
                )
            ]

            response = self.multimodal_model.invoke(message)
            return response.content.strip()
        except Exception as e:
            print(f"Error extracting text: {str(e)}")
            return ""


class DescribeImage:
    """Callable that produces a detailed, query-focused image description."""

    def __init__(self, multimodal_model):
        # Model must accept `image_url` content blocks (data: URLs).
        self.multimodal_model = multimodal_model

    def __call__(self, img_path: str, query: str) -> str:
        """
        Generate a detailed description of an image.

        Reads an image from disk, encodes it, and sends it to a
        vision-capable language model to obtain a natural-language
        description of the image's content guided by *query*.

        Args:
            img_path: Path to an image (e.g., PNG, JPEG).
            query: Information to extract from the image.

        Returns:
            A detailed description of the image, or "" on failure
            (the error is printed, not raised).
        """
        try:
            # Read the image and encode it as base64 for a data: URL.
            with open(img_path, "rb") as image_file:
                image_base64 = base64.b64encode(image_file.read()).decode("utf-8")

            message = [
                HumanMessage(
                    content=[
                        {
                            "type": "text",
                            "text": (
                                f"Describe this image in rich detail. Include objects, people, setting, background elements, and any inferred actions or context. Avoid technical jargon. In particular, extract the following information: {query}"
                            ),
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/png;base64,{image_base64}"
                            },
                        },
                    ]
                )
            ]

            response = self.multimodal_model.invoke(message)
            return response.content.strip()
        except Exception as e:
            print(f"Error describing image: {str(e)}")
            return ""


class TranscribeAudio:
    """Callable that transcribes an MP3 file through a multimodal chat model."""

    def __init__(self, multimodal_model):
        # Model must accept inline-media audio content blocks
        # (e.g. langchain-google-genai) — TODO confirm for your provider.
        self.multimodal_model = multimodal_model

    def __call__(self, audio_path: str, query: str = "") -> str:
        """
        Transcribe an MP3 file.

        Args:
            audio_path: Path to the MP3 audio file.
            query: Optional extra instruction appended to the prompt
                (ignored when empty).

        Returns:
            Transcribed text as a string, or "" on failure
            (the error is printed, not raised).
        """
        try:
            with open(audio_path, "rb") as audio_file:
                audio_base64 = base64.b64encode(audio_file.read()).decode("utf-8")

            prompt = (
                "Transcribe the speech from this audio file. "
                "Return only the transcribed text, with no extra commentary."
            )
            if query:
                prompt += f" {query}"

            # BUG FIX: the original built an `AudioFile(...)` object — a name
            # that is defined nowhere — so every call raised NameError and
            # returned "". Replaced with an inline base64 media content block
            # as accepted by langchain-google-genai.
            message = [
                HumanMessage(
                    content=[
                        {"type": "text", "text": prompt},
                        {
                            "type": "media",
                            "mime_type": "audio/mpeg",  # MP3 MIME type
                            "data": audio_base64,
                        },
                    ]
                )
            ]

            # BUG FIX: original invoked `self.audio_llm`, which was never
            # assigned (always AttributeError); __init__ stores the model
            # as `self.multimodal_model`.
            response = self.multimodal_model.invoke(message)
            return response.content.strip()
        except Exception as e:
            print(f"Error transcribing audio: {str(e)}")
            return ""


@tool
def download_youtube_video(youtube_url: str, output_path: str) -> str:
    """
    Download a YouTube video as an MP4 file.

    Args:
        youtube_url: The YouTube video URL.
        output_path: Desired output path for the downloaded MP4 file.

    Returns:
        Path to the saved video file.
    """
    ydl_opts = {
        # Prefer separate best video+audio (merged to mp4); fall back to
        # the best single mp4, then to the best available format.
        'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best',
        'outtmpl': output_path,
        'merge_output_format': 'mp4',
        'quiet': True,
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([youtube_url])
    return output_path


@tool
def extract_audio_from_video(video_path: str, audio_output: str) -> str:
    """
    Extract audio from an MP4 video file and save it as MP3.

    Only the first 60 seconds of audio are kept (ffmpeg `t=60`).

    Args:
        video_path: Path to the input MP4 video file.
        audio_output: Path for the output MP3 file.

    Returns:
        Path to the audio file.

    Raises:
        RuntimeError: If ffmpeg fails (wraps the ffmpeg.Error).
    """
    try:
        (
            ffmpeg
            .input(video_path)
            .output(audio_output, format='mp3', acodec='libmp3lame', t=60)  # limit to 60 sec
            .overwrite_output()
            .run(quiet=True)
        )
        return audio_output
    except ffmpeg.Error as e:
        raise RuntimeError(f"FFmpeg error: {e.stderr.decode()}") from e


@tool
def wiki_search(query: str) -> dict:
    """Search Wikipedia for a query and return maximum 2 results.

    Args:
        query: The search query.

    Returns:
        {"wiki_results": <page contents joined by "---" separators>}.
    """
    search_docs = WikipediaLoader(query=query, load_max_docs=2).load()
    formatted_search_docs = "\n\n---\n\n".join(
        f'\n{doc.page_content}\n' for doc in search_docs
    )
    return {"wiki_results": formatted_search_docs}


@tool
def web_search(query: str) -> dict:
    """Search Tavily for a query and return maximum 3 results.

    Args:
        query: The search query.

    Returns:
        {"web_results": <result contents joined by "---" separators>}.
    """
    # BUG FIX: TavilySearchResults.invoke() returns a list of plain dicts
    # (with a "content" key), not Document objects — the original
    # `doc.page_content` raised AttributeError on every call.
    search_docs = TavilySearchResults(max_results=3).invoke(query)
    formatted_search_docs = "\n\n---\n\n".join(
        f'\n{doc["content"]}\n' for doc in search_docs
    )
    return {"web_results": formatted_search_docs}


@tool
def arxiv_search(query: str) -> dict:
    """Search Arxiv for a query and return maximum 3 results.

    Args:
        query: The search query.

    Returns:
        {"arvix_results": <first 1000 chars of each result, "---" joined>}.
    """
    search_docs = ArxivLoader(query=query, load_max_docs=3).load()
    formatted_search_docs = "\n\n---\n\n".join(
        f'\n{doc.page_content[:1000]}\n' for doc in search_docs
    )
    # NOTE(review): key is misspelled ("arvix" vs "arxiv") but kept as-is
    # because downstream consumers may already index this exact key.
    return {"arvix_results": formatted_search_docs}