# Scraped-page residue (author avatar caption "ArseniyPerchik's picture",
# commit message "Clean state", commit hash 45b200f) — kept as a comment
# so the module remains importable.
from globals import *
from global_functions import *

import sys

import numpy as np
import pydub
import torch
import torchaudio.transforms as T
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
# ------------------------------------------------------ #
# CONSTANTS FOR TOOLS
# ------------------------------------------------------ #
# Local directory holding the wav2vec2-base-960h checkpoint used for speech-to-text.
audio_model_dir = './models_for_proj/wav2vec2-base-960h'
# ------------------------------------------------------ #
# FUNCTIONS FOR TOOLS
# ------------------------------------------------------ #
def read_mp3(f, normalized=False):
    """Read an MP3 file into a numpy array.

    Parameters
    ----------
    f : str or file-like
        Path (or handle) of the MP3 file; forwarded to pydub.
    normalized : bool, optional
        When True, samples are scaled from the int16 range to float32
        values in [-1.0, 1.0). When False, raw sample magnitudes are
        returned.

    Returns
    -------
    tuple
        ``(frame_rate, samples)`` where ``samples`` is a 1-D numpy array.
        Stereo input is down-mixed to mono by averaging the two channels.
        (Bug fix: the previous code kept only the right channel,
        ``y[:, 1]``, silently discarding half the signal; the commented-out
        ``mean`` line showed the intended down-mix.)
    """
    segment = pydub.AudioSegment.from_mp3(f)
    samples = np.array(segment.get_array_of_samples())
    if segment.channels == 2:
        # Samples are interleaved L/R -> reshape to (n, 2), then average
        # the two channels into a mono track.
        samples = samples.reshape((-1, 2)).mean(axis=1)
    if normalized:
        # int16 full-scale is 2**15, so this maps into [-1.0, 1.0).
        return segment.frame_rate, np.float32(samples) / 2 ** 15
    return segment.frame_rate, samples
# ------------------------------------------------------ #
# MODELS FOR TOOLS
# ------------------------------------------------------ #
# LLM client for vision requests (Together comes in via the star import from globals).
client = Together()
# audio: local wav2vec2 checkpoint for speech-to-text, loaded once at import time
model = Wav2Vec2ForCTC.from_pretrained(audio_model_dir)
processor = Wav2Vec2Processor.from_pretrained(audio_model_dir)
# ------------------------------------------------------ #
# TOOLS
# ------------------------------------------------------ #
# search: stateless DuckDuckGo web-search tool (class provided by the star imports)
search_tool = DuckDuckGoSearchRun()
# png
def describe_image_tool(file_name: str) -> str:
    """
    This tool receives a file name of an image, uploads the image and returns a detailed description of the image.
    Inputs: file_name as str
    Outputs: image detailed description as str
    """
    # Explicit validation instead of `assert` (asserts are stripped under `python -O`).
    if '.png' not in file_name:
        raise ValueError(f"describe_image_tool expects a .png file, got {file_name!r}")
    # BUG FIX: the log tag '[describe_image_tool] ' had been pasted into the
    # path itself ('[describe_image_tool] files/...'), so the image could
    # never be found. Build the real path and log the tag separately,
    # matching the sibling tools.
    pic_dir = f'files/{file_name}'
    print(f"[describe_image_tool] {pic_dir=}")
    getDescriptionPrompt = "What is in the image? describe in detail. Use professional notations when applicable. For example, if the image is a chess position, describe the position of ALL pieces with classical chess algebraic notation. BE PRECISE!"
    base64_image = encode_image(pic_dir)
    model_out = client.chat.completions.create(
        # model="meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
        model="meta-llama/Llama-Vision-Free",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": getDescriptionPrompt,
                    },
                    {
                        "type": "image_url",
                        # FIX: declare the correct MIME type — the file is
                        # validated above to be a PNG, not a JPEG.
                        "image_url": {"url": f"data:image/png;base64,{base64_image}"},
                    },
                ],
            }
        ],
        stream=False,
    )
    description = model_out.choices[0].message.content
    # Steer the downstream agent to rely on this text, not the raw image.
    return f"Do not use the image. Instead, use the description provided further by the tool. Here is the detailed description of the image. {description}"
# mp3
def describe_audio_tool(file_name: str) -> str:
    """
    This tool receives a file name of an audio, uploads the audio and returns a detailed description of the audio.
    Inputs: file_name as str
    Outputs: audio detailed description as str
    """
    # --------------------------------------------------------------------------- #
    file_dir = f'files/{file_name}'
    print(f"[describe_audio_tool] {file_dir=}")
    # Decode the mp3, then resample to the 16 kHz rate wav2vec2 expects.
    source_rate, raw_samples = read_mp3(file_dir)
    waveform = torch.tensor(raw_samples, dtype=torch.float32)
    resampler = T.Resample(source_rate, 16000, dtype=waveform.dtype)
    waveform_16k = resampler(waveform).numpy()
    # --------------------------------------------------------------------------- #
    # NOTE(review): samples are fed un-normalized (raw int16 magnitudes);
    # confirm the processor's feature extractor normalizes, or consider
    # read_mp3(..., normalized=True).
    inputs = processor(waveform_16k, sampling_rate=16000, return_tensors="pt", padding=True)
    # Inference without gradient tracking.
    with torch.no_grad():
        logits = model(**inputs).logits
    # Greedy CTC decode: take the highest-scoring token at each frame.
    best_token_ids = torch.argmax(logits, dim=-1)
    return processor.decode(best_token_ids[0])
# py
def python_repl_tool(file_name: str) -> str:
    """
    This tool receives a file name of a python code and executes it. Then, it returns the output of the code.
    Inputs: file_name as str
    Outputs: code's output as str (stdout; on failure, stderr and the exit
    code are appended so errors are not silently lost)
    """
    file_dir = f'files/{file_name}'
    print(f"[python_repl_tool] {file_dir=}")
    if not os.path.exists(file_dir):
        return 'No such file.'
    # Use the interpreter running this process — a bare "python" may resolve
    # to a different (or missing) binary on PATH.
    result = subprocess.run([sys.executable, file_dir], capture_output=True, text=True)
    if result.returncode != 0:
        # Previously stderr was dropped, so a crashing script looked like an
        # empty success. Surface the failure to the caller instead.
        return f"{result.stdout}\n[error] exit code {result.returncode}:\n{result.stderr}"
    return result.stdout
# xlsx
def excel_repl_tool(file_name: str) -> str:
    """
    This tool receives a file name of an Excel file and reads it. Then, it returns a string of the content of the file.
    Inputs: file_name as str
    Outputs: file's content as str (HTML rendering of the first sheet element)
    """
    file_dir = f'files/{file_name}'
    # Tagged log line for consistency with the other tools' output.
    print(f"[excel_repl_tool] {file_dir=}")
    loader = UnstructuredExcelLoader(file_dir, mode="elements")
    docs = loader.load()
    # Guard against an empty workbook — docs[0] would raise IndexError.
    if not docs:
        return 'No content could be read from the file.'
    return docs[0].metadata['text_as_html']
# youtube
def youtube_extractor_tool(url: str) -> str:
    """
    This tool receives a url of the youtube video and reads it. Then, it returns a string of the content of the video.
    Inputs: url as str
    Outputs: video's content as str
    """
    file_name = 'my_audio_file'
    # Ask yt-dlp for the best audio stream and transcode it to mp3 via ffmpeg.
    download_options = {
        'format': 'bestaudio/best',
        'outtmpl': f'files/{file_name}.%(ext)s',  # fixed output name under files/
        'postprocessors': [
            {
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'mp3',
                'preferredquality': '192',
            }
        ],
    }
    with yt_dlp.YoutubeDL(download_options) as downloader:
        downloader.download([url])
    # Hand the downloaded mp3 to the audio tool for transcription.
    return describe_audio_tool(file_name=f'{file_name}.mp3')
# wiki
def wikipedia_tool(query: str) -> str:
    """
    This tool receives a query to search inside the Wikipedia website, reads the page and returns the relevant information as a string.
    Inputs: query as str
    Outputs: Wikipedia's relevant content as str
    """
    print(f"[wiki tool] {query=}")
    # Build a fresh query runner per call and return its result directly.
    wiki_runner = WikipediaQueryRun(api_wrapper=WikipediaAPIWrapper())
    return wiki_runner.run(query)
# pdf
# web