# Scraped-page residue (author avatar caption "ArseniyPerchik's picture",
# commit message "Clean state", commit hash 45b200f) — kept as a comment
# so the module remains importable.
from globals import *
from global_functions import *

import sys

import numpy as np
import pydub
import torch
import torchaudio.transforms as T
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
# ------------------------------------------------------ #
# CONSTANTS FOR TOOLS
# ------------------------------------------------------ #
# Local directory holding the wav2vec2-base-960h checkpoint used for speech-to-text.
audio_model_dir = './models_for_proj/wav2vec2-base-960h'
# ------------------------------------------------------ #
# FUNCTIONS FOR TOOLS
# ------------------------------------------------------ #
def read_mp3(f, normalized=False):
    """Read an MP3 file into a numpy array.

    Parameters
    ----------
    f : str or file-like
        Path (or handle) of the MP3 file; forwarded to pydub.
    normalized : bool, optional
        When True, samples are scaled from the int16 range to float32
        values in [-1.0, 1.0). When False, raw sample magnitudes are
        returned.

    Returns
    -------
    tuple
        ``(frame_rate, samples)`` where ``samples`` is a 1-D numpy array.
        Stereo input is down-mixed to mono by averaging the two channels.
        (Bug fix: the previous code kept only the right channel,
        ``y[:, 1]``, silently discarding half the signal; the commented-out
        ``mean`` line showed the intended down-mix.)
    """
    segment = pydub.AudioSegment.from_mp3(f)
    samples = np.array(segment.get_array_of_samples())
    if segment.channels == 2:
        # Samples are interleaved L/R -> reshape to (n, 2), then average
        # the two channels into a mono track.
        samples = samples.reshape((-1, 2)).mean(axis=1)
    if normalized:
        # int16 full-scale is 2**15, so this maps into [-1.0, 1.0).
        return segment.frame_rate, np.float32(samples) / 2 ** 15
    return segment.frame_rate, samples
# ------------------------------------------------------ #
# MODELS FOR TOOLS
# ------------------------------------------------------ #
# LLM client for vision requests (Together comes in via the star import from globals).
client = Together()
# audio: local wav2vec2 checkpoint for speech-to-text, loaded once at import time
model = Wav2Vec2ForCTC.from_pretrained(audio_model_dir)
processor = Wav2Vec2Processor.from_pretrained(audio_model_dir)
# ------------------------------------------------------ #
# TOOLS
# ------------------------------------------------------ #
# search: stateless DuckDuckGo web-search tool (class provided by the star imports)
search_tool = DuckDuckGoSearchRun()
# png
def describe_image_tool(file_name: str) -> str:
    """
    This tool receives a file name of an image, uploads the image and returns a detailed description of the image.
    Inputs: file_name as str
    Outputs: image detailed description as str
    """
    # Explicit validation instead of `assert` (asserts are stripped under `python -O`).
    if '.png' not in file_name:
        raise ValueError(f"describe_image_tool expects a .png file, got {file_name!r}")
    # BUG FIX: the log tag '[describe_image_tool] ' had been pasted into the
    # path itself ('[describe_image_tool] files/...'), so the image could
    # never be found. Build the real path and log the tag separately,
    # matching the sibling tools.
    pic_dir = f'files/{file_name}'
    print(f"[describe_image_tool] {pic_dir=}")
    getDescriptionPrompt = "What is in the image? describe in detail. Use professional notations when applicable. For example, if the image is a chess position, describe the position of ALL pieces with classical chess algebraic notation. BE PRECISE!"
    base64_image = encode_image(pic_dir)
    model_out = client.chat.completions.create(
        # model="meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
        model="meta-llama/Llama-Vision-Free",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": getDescriptionPrompt,
                    },
                    {
                        "type": "image_url",
                        # FIX: declare the correct MIME type — the file is
                        # validated above to be a PNG, not a JPEG.
                        "image_url": {"url": f"data:image/png;base64,{base64_image}"},
                    },
                ],
            }
        ],
        stream=False,
    )
    description = model_out.choices[0].message.content
    # Steer the downstream agent to rely on this text, not the raw image.
    return f"Do not use the image. Instead, use the description provided further by the tool. Here is the detailed description of the image. {description}"
# mp3
def describe_audio_tool(file_name: str) -> str:
    """
    This tool receives a file name of an audio, uploads the audio and returns a detailed description of the audio.
    Inputs: file_name as str
    Outputs: audio detailed description as str
    """
    # --------------------------------------------------------------------------- #
    file_dir = f'files/{file_name}'
    print(f"[describe_audio_tool] {file_dir=}")
    # Decode the mp3, then resample to the 16 kHz rate wav2vec2 expects.
    source_rate, raw_samples = read_mp3(file_dir)
    waveform = torch.tensor(raw_samples, dtype=torch.float32)
    resampler = T.Resample(source_rate, 16000, dtype=waveform.dtype)
    waveform_16k = resampler(waveform).numpy()
    # --------------------------------------------------------------------------- #
    # NOTE(review): samples are fed un-normalized (raw int16 magnitudes);
    # confirm the processor's feature extractor normalizes, or consider
    # read_mp3(..., normalized=True).
    inputs = processor(waveform_16k, sampling_rate=16000, return_tensors="pt", padding=True)
    # Inference without gradient tracking.
    with torch.no_grad():
        logits = model(**inputs).logits
    # Greedy CTC decode: take the highest-scoring token at each frame.
    best_token_ids = torch.argmax(logits, dim=-1)
    return processor.decode(best_token_ids[0])
# py
def python_repl_tool(file_name: str) -> str:
    """
    This tool receives a file name of a python code and executes it. Then, it returns the output of the code.
    Inputs: file_name as str
    Outputs: code's output as str (stdout; on failure, stderr and the exit
    code are appended so errors are not silently lost)
    """
    file_dir = f'files/{file_name}'
    print(f"[python_repl_tool] {file_dir=}")
    if not os.path.exists(file_dir):
        return 'No such file.'
    # Use the interpreter running this process — a bare "python" may resolve
    # to a different (or missing) binary on PATH.
    result = subprocess.run([sys.executable, file_dir], capture_output=True, text=True)
    if result.returncode != 0:
        # Previously stderr was dropped, so a crashing script looked like an
        # empty success. Surface the failure to the caller instead.
        return f"{result.stdout}\n[error] exit code {result.returncode}:\n{result.stderr}"
    return result.stdout
# xlsx
def excel_repl_tool(file_name: str) -> str:
    """
    This tool receives a file name of an Excel file and reads it. Then, it returns a string of the content of the file.
    Inputs: file_name as str
    Outputs: file's content as str (HTML rendering of the first sheet element)
    """
    file_dir = f'files/{file_name}'
    # Tagged log line for consistency with the other tools' output.
    print(f"[excel_repl_tool] {file_dir=}")
    loader = UnstructuredExcelLoader(file_dir, mode="elements")
    docs = loader.load()
    # Guard against an empty workbook — docs[0] would raise IndexError.
    if not docs:
        return 'No content could be read from the file.'
    return docs[0].metadata['text_as_html']
# youtube
def youtube_extractor_tool(url: str) -> str:
    """
    This tool receives a url of the youtube video and reads it. Then, it returns a string of the content of the video.
    Inputs: url as str
    Outputs: video's content as str
    """
    file_name = 'my_audio_file'
    # Ask yt-dlp for the best audio stream and transcode it to mp3 via ffmpeg.
    download_options = {
        'format': 'bestaudio/best',
        'outtmpl': f'files/{file_name}.%(ext)s',  # fixed output name under files/
        'postprocessors': [
            {
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'mp3',
                'preferredquality': '192',
            }
        ],
    }
    with yt_dlp.YoutubeDL(download_options) as downloader:
        downloader.download([url])
    # Hand the downloaded mp3 to the audio tool for transcription.
    return describe_audio_tool(file_name=f'{file_name}.mp3')
# wiki
def wikipedia_tool(query: str) -> str:
    """
    This tool receives a query to search inside the Wikipedia website, reads the page and returns the relevant information as a string.
    Inputs: query as str
    Outputs: Wikipedia's relevant content as str
    """
    print(f"[wiki tool] {query=}")
    # Build a fresh query runner per call and return its result directly.
    wiki_runner = WikipediaQueryRun(api_wrapper=WikipediaAPIWrapper())
    return wiki_runner.run(query)
# pdf
# web