from globals import *
from global_functions import *
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torch
import torchaudio.transforms as T
import pydub
import numpy as np

# ------------------------------------------------------ #
# CONSTANTS FOR TOOLS
# ------------------------------------------------------ #
audio_model_dir = './models_for_proj/wav2vec2-base-960h'


# ------------------------------------------------------ #
# FUNCTIONS FOR TOOLS
# ------------------------------------------------------ #
def read_mp3(f, normalized=False):
    """Read MP3 file to numpy array.

    Inputs:
        f: path to an .mp3 file
        normalized: if True, scale int16 samples to float32 in [-1, 1)
    Outputs:
        (sample_rate, samples) tuple; samples is 1-D (stereo input is
        reduced to a single channel, see note below).
    """
    a = pydub.AudioSegment.from_mp3(f)
    y = np.array(a.get_array_of_samples())
    if a.channels == 2:
        # Interleaved stereo -> (n_frames, 2)
        y = y.reshape((-1, 2))
        # NOTE(review): keeps only the second (right) channel; the averaged
        # alternative was left commented out by the author — presumably a
        # deliberate choice, so it is preserved here. Confirm before changing.
        # y = y.mean(axis=1)
        y = y[:, 1]
    if normalized:
        # int16 full scale is 2**15, so this maps samples into [-1, 1).
        return a.frame_rate, np.float32(y) / 2**15
    else:
        return a.frame_rate, y


# ------------------------------------------------------ #
# MODELS FOR TOOLS
# ------------------------------------------------------ #
client = Together()

# audio: local wav2vec2 checkpoint used for speech-to-text
model = Wav2Vec2ForCTC.from_pretrained(audio_model_dir)
processor = Wav2Vec2Processor.from_pretrained(audio_model_dir)


# ------------------------------------------------------ #
# TOOLS
# ------------------------------------------------------ #
# search
search_tool = DuckDuckGoSearchRun()


# png
def describe_image_tool(file_name: str) -> str:
    """
    This tool receives a file name of an image, uploads the image and returns a detailed description of the image.
    Inputs: file_name as str
    Outputs: image detailed description as str
    """
    # Validate explicitly rather than with `assert`: asserts are stripped
    # under `python -O` and would silently skip this check.
    if '.png' not in file_name:
        raise ValueError(f"describe_image_tool expects a .png file, got {file_name!r}")
    # BUG FIX: the log prefix '[describe_image_tool] ' was previously embedded
    # inside the path string itself, so the file could never be found. Build
    # the path like the other tools and print the tag separately.
    pic_dir = f'files/{file_name}'
    print(f"[describe_image_tool] {pic_dir=}")
    getDescriptionPrompt = "What is in the image? describe in detail. Use professional notations when applicable. For example, if the image is a chess position, describe the position of ALL pieces with classical chess algebraic notation. BE PRECISE!"
    base64_image = encode_image(pic_dir)
    model_out = client.chat.completions.create(
        # model="meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
        model="meta-llama/Llama-Vision-Free",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": getDescriptionPrompt},
                    {
                        "type": "image_url",
                        # BUG FIX: the data URL previously declared image/jpeg
                        # even though this tool only accepts .png files.
                        "image_url": {"url": f"data:image/png;base64,{base64_image}"},
                    },
                ],
            }
        ],
        stream=False,
    )
    description = model_out.choices[0].message.content
    # state["messages"] += [HumanMessage(content='Do not use the image. Use the description provided further by tools.')]
    return f"Do not use the image. Instead, use the description provided further by the tool. Here is the detailed description of the image. {description}"


# mp3
def describe_audio_tool(file_name: str) -> str:
    """
    This tool receives a file name of an audio, uploads the audio and returns a detailed description of the audio.
    Inputs: file_name as str
    Outputs: audio detailed description as str
    """
    # --------------------------------------------------------------------------- #
    file_dir = f'files/{file_name}'
    print(f"[describe_audio_tool] {file_dir=}")
    # Decode as normalized float32 in [-1, 1]: wav2vec2's feature extractor
    # expects a float waveform, and resampling well-scaled floats is better
    # conditioned than resampling raw int16-magnitude values.
    audio_input_sr, audio_input_np = read_mp3(file_dir, normalized=True)
    audio_input_t = torch.tensor(audio_input_np, dtype=torch.float32)
    target_sr = 16000  # wav2vec2-base-960h was trained on 16 kHz audio
    resampler = T.Resample(audio_input_sr, target_sr, dtype=audio_input_t.dtype)
    resampled_audio_input_t: torch.Tensor = resampler(audio_input_t)
    resampled_audio_input_np = resampled_audio_input_t.numpy()
    # --------------------------------------------------------------------------- #
    inputs = processor(resampled_audio_input_np, sampling_rate=target_sr, return_tensors="pt", padding=True)
    # Inference (no gradients needed for transcription)
    with torch.no_grad():
        logits = model(**inputs).logits
    # Decode: greedy CTC — argmax over vocab, then collapse via the processor
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0])
    return transcription


# py
def python_repl_tool(file_name: str) -> str:
    """
    This tool receives a file name of a python code and executes it. Then, it returns a an output of the code.
    Inputs: file_name as str
    Outputs: code's output as str
    """
    file_dir = f'files/{file_name}'
    print(f"[python_repl_tool] {file_dir=}")
    # Guard clause: bail out early when the file is missing.
    if not os.path.exists(file_dir):
        return 'No such file.'
    # SECURITY NOTE: this executes arbitrary Python from the file; only run
    # trusted inputs. List-form argv (shell=False) avoids shell injection.
    result = subprocess.run(["python", file_dir], capture_output=True, text=True)
    # Only stdout is returned (stderr is discarded), matching the original contract.
    return result.stdout


# xlsx
def excel_repl_tool(file_name: str) -> str:
    """
    This tool receives a file name of an Excel file and reads it. Then, it returns a string of the content of the file.
    Inputs: file_name as str
    Outputs: file's content as str
    """
    file_dir = f'files/{file_name}'
    print(f"{file_dir=}")
    # mode="elements" makes the loader emit per-element documents whose
    # metadata carries an HTML rendering of the sheet.
    loader = UnstructuredExcelLoader(file_dir, mode="elements")
    docs = loader.load()
    # NOTE(review): returns only the first element's HTML; multi-sheet or
    # multi-element workbooks are truncated — confirm this is intended.
    return docs[0].metadata['text_as_html']


# youtube
def youtube_extractor_tool(url: str) -> str:
    """
    This tool receives a url of the youtube video and reads it. Then, it returns a string of the content of the video.
    Inputs: url as str
    Outputs: video's content as str
    """
    file_name = 'my_audio_file'
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': f'files/{file_name}.%(ext)s',  # <-- set your custom filename here
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
    }
    # Download best audio, transcode to MP3, then delegate to the audio tool
    # for transcription.
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])
    return describe_audio_tool(file_name=f'{file_name}.mp3')


# wiki
def wikipedia_tool(query: str) -> str:
    """
    This tool receives a query to search inside the Wikipedia website, reads the page and returns the relevant information as a string.
    Inputs: query as str
    Outputs: Wikipedia's relevant content as str
    """
    print(f"[wiki tool] {query=}")
    wikipedia = WikipediaQueryRun(api_wrapper=WikipediaAPIWrapper())
    respond = wikipedia.run(query)
    return respond


# pdf
# web