from typing import List
from langchain_core.tools import tool
from langchain_community.document_loaders import WikipediaLoader, YoutubeLoader
from langchain_community.tools import DuckDuckGoSearchResults
from langchain_community.utilities import DuckDuckGoSearchAPIWrapper
from langchain_ollama import ChatOllama
from langchain_sandbox import PyodideSandbox
import base64
from langchain_core.messages import HumanMessage, SystemMessage
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from docling.document_converter import DocumentConverter
from langchain_tavily import TavilySearch

# Single docling converter instance shared by every tool in this module that
# turns a URL or file into markdown.
doc_converter = DocumentConverter()


# NOTE: functions decorated with @tool expose their docstring to the model as
# the tool description, so docstrings here are kept short and model-facing;
# maintainer notes live in `#` comments instead.
@tool
def wikipedia_search(query: str) -> str:
    """
    Search Wikipedia for a given query and return max 1 result.

    Args:
        query: The search query.
    """
    # Find the best matching article (at most one document is loaded).
    search_docs = WikipediaLoader(query=query, load_max_docs=1).load()

    # Re-fetch each hit through docling using metadata["source"] (presumably
    # the article URL -- confirm against WikipediaLoader) to get markdown.
    pages = [
        doc_converter.convert(doc.metadata["source"]).document.export_to_markdown()
        for doc in search_docs
    ]

    # Drop everything before the "From Wikipedia" marker when it is present;
    # str.find returns -1 when it is missing, in which case the page is kept whole.
    marker = "From Wikipedia"
    trimmed = [page[max(page.find(marker), 0):] for page in pages]

    return "\n\n---\n\n".join(f"\n{page}\n" for page in trimmed)


@tool
def youtube_transcript(url: str) -> str:
    """Returns the transcript of a YouTube video given its URL.

    This is a text-based tool and should not be used for visual information of the video.

    Args:
        url: The YouTube video URL.
    """
    max_tries = 3
    for _ in range(max_tries):
        try:
            transcripts = YoutubeLoader.from_youtube_url(url, add_video_info=False).load()
            # An empty result raises IndexError here, which is handled like any
            # other failed attempt (logged, then retried).
            return f"Video Transcript: {transcripts[0].page_content}"
        except Exception as e:
            # Best-effort tool: report the failure and try again rather than
            # raising into the agent loop.
            print(f"Attempt failed: {e}")

    # Every attempt failed: return a message the model can act on.
    return "No transcript available. This video might not have a transcript or the URL is invalid."
# NOTE: functions decorated with @tool expose their docstring to the model as
# the tool description, so docstrings below are kept short and model-facing;
# maintainer notes live in `#` comments instead.
@tool
def web_search(query: str) -> str:
    """
    Perform a web search for the given query and return the results.
    Use this when you need to find current or factual information.

    Args:
        query: The search query.
    """
    # Query Tavily for the top three hits.
    tavily_search = TavilySearch(max_results=3)
    search_docs = tavily_search.invoke(query)

    # Keep only the text content of each hit, separated by horizontal rules.
    formatted_search_docs = "\n\n---\n\n".join(
        f'\n{doc["content"]}\n' for doc in search_docs["results"]
    )
    return f"Web search results for '{query}':\n\n{formatted_search_docs}"


@tool
def add_numbers(numbers: List[float]) -> float:
    """
    Add a list of numbers together. E.g [1, 2, 3] -> 6

    Args:
        numbers: A list of numbers to add.
    """
    return sum(numbers)


@tool
def multiply_numbers(numbers: List[float]) -> float:
    """
    Multiply a list of numbers together. E.g [3, 2, 3] -> 18

    Args:
        numbers: A list of numbers to multiply.
    """
    # An empty list yields the multiplicative identity, 1.
    result = 1
    for number in numbers:
        result *= number
    return result


# Vision-capable chat model used by image_question_answering.
vision_llm = ChatOllama(model="gemma3:27b")


# might be better to use supervisor method..
@tool
def image_question_answering(img_path: str, question: str) -> str:
    """
    Given an image path and a question, return the answer to the question based on the image.
    Just pass the initial question from the human as a query.

    Args:
        img_path: The path to the image.
        question: The question to ask about the image.
    """
    system_prompt = """
    You are a helpful assistant that can answer questions about images.
    You need to think step by step carefully, provide your thinking process
    and finish your answer with the following template:
    FINAL ANSWER: [YOUR FINAL ANSWER]
    """

    try:
        # Read the image and encode it as base64 so it can be inlined in the
        # message as a data URL.
        with open(img_path, "rb") as image_file:
            image_bytes = image_file.read()
        image_base64 = base64.b64encode(image_bytes).decode("utf-8")

        # The caller's `question` is forwarded unchanged. A previous revision
        # overwrote it here with a fixed chess prompt, which made the tool
        # ignore its argument.
        message = [
            SystemMessage(content=system_prompt),
            HumanMessage(
                content=[
                    {
                        "type": "text",
                        "text": question,
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            # The data URL is always labelled image/png,
                            # whatever the file's actual format.
                            "url": f"data:image/png;base64,{image_base64}"
                        },
                    },
                ]
            ),
        ]

        # Call the vision-capable model.
        response = vision_llm.invoke(message)
        return response.content
    except Exception as e:
        # Report failures (missing file, model error, ...) as text so the
        # agent can react instead of crashing.
        error_msg = f"Error image questioning: {str(e)}"
        print(error_msg)
        return error_msg


# Prefer Apple's MPS backend (the original hard-coded choice); fall back to
# CUDA and then CPU so the module still imports on other machines.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Local checkpoint directory for the Whisper speech-recognition model.
checkpoint = "./whisper-large-v3"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    checkpoint,
    torch_dtype=torch.float32,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model.to(device)

processor = AutoProcessor.from_pretrained(checkpoint)

# Module-level ASR pipeline, built once at import and reused by speech_to_text.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch.float32,
    device=device,
)


@tool
def speech_to_text(audio_path: str) -> str:
    """
    Convert speech to text using a given audio file. Not for youtube links.

    Args:
        audio_path: The path to the audio file.
    """
    try:
        try:
            result = pipe(audio_path)
        except Exception:
            # Retry with timestamps enabled -- presumably needed for
            # long-form audio; confirm against the transformers ASR docs.
            result = pipe(audio_path, return_timestamps=True)
        return result["text"].strip()
    except Exception as e:
        # Reached when the fallback fails too. In the original this handler
        # was a second sibling `except Exception` and could never run.
        return f"Error processing audio file: {e}"


@tool
def read_file_content(path: str) -> str:
    """
    Read the content of a file (pdf, docs, xlsx, etc.) but also from a URL
    (like arxiv or websites) and returns it as markdown.

    Args:
        path: The path to the file.
    """
    try:
        doc = doc_converter.convert(path).document
        markdown = doc.export_to_markdown()
        return f"File Content:\n\n{markdown}"
    except Exception as e:
        # Conversion errors are returned as text rather than raised.
        return f"Error reading file: {str(e)}"


sandbox = PyodideSandbox(
    # Allow Pyodide to install python packages that
    # might be required.
    allow_net=True,
)


# NOTE: the parameter name `input` shadows the builtin, but it is part of the
# tool's public argument schema and is therefore left unchanged.
@tool
async def run_python_code(input_type: str, input: str) -> str:
    """
    Run Python code in a sandboxed environment.
    You can provide either a code snippet or a file path.
    1. If input_type is "code", input should be a string containing the Python code to run.
    2. If input_type is "file", input should be a string containing the path to the file.

    Args:
        input_type: The type of input, code or file.
        input: The Python code to run or the path to the file.
    """
    try:
        if input_type == "code":
            code = input
        elif input_type == "file":
            # Python source defaults to UTF-8, so read it explicitly as such
            # instead of relying on the platform's locale encoding.
            with open(input, "r", encoding="utf-8") as file:
                code = file.read()
        else:
            return "Invalid input type. Please provide 'code' or 'file' as input_type."

        result = await sandbox.execute(code)
        return (
            f"Result execution: result: {result.result}, stdout: {result.stdout}, "
            f"stderr: {result.stderr}, status: {result.status}"
        )
    except Exception as e:
        # Covers unreadable files as well as sandbox failures.
        return f"Error executing Python code: {str(e)}"


@tool
def reverse_string(input: str) -> str:
    """
    Reverse a given string.

    Args:
        input: The string to reverse.
    """
    return input[::-1]


# Every tool exported to the agent.
TOOLS = [
    wikipedia_search,
    web_search,
    youtube_transcript,
    add_numbers,
    multiply_numbers,
    image_question_answering,
    speech_to_text,
    read_file_content,
    run_python_code,
    reverse_string,
]