Spaces:
Build error
Build error
File size: 6,291 Bytes
45b200f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 |
from globals import *
from global_functions import *
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torch
import torchaudio.transforms as T
import pydub
import numpy as np
# ------------------------------------------------------ #
# CONSTANTS FOR TOOLS
# ------------------------------------------------------ #
# Local path of the pre-downloaded Wav2Vec2 ASR checkpoint used by the audio tools.
audio_model_dir = './models_for_proj/wav2vec2-base-960h'
# ------------------------------------------------------ #
# FUNCTIONS FOR TOOLS
# ------------------------------------------------------ #
def read_mp3(f, normalized=False):
    """Decode an MP3 file into a numpy sample array.

    Returns a ``(frame_rate, samples)`` tuple. For stereo input only the
    second channel is kept. When *normalized* is true, samples are scaled
    to float32 in [-1, 1); otherwise the raw integer samples are returned.
    """
    segment = pydub.AudioSegment.from_mp3(f)
    samples = np.array(segment.get_array_of_samples())
    if segment.channels == 2:
        # Keep a single channel of the interleaved stereo stream.
        samples = samples.reshape((-1, 2))[:, 1]
    if not normalized:
        return segment.frame_rate, samples
    return segment.frame_rate, np.float32(samples) / 2**15
# ------------------------------------------------------ #
# MODELS FOR TOOLS
# ------------------------------------------------------ #
# Together API client; used by describe_image_tool for the vision-model call.
client = Together()
# audio
# Speech-to-text model + processor, loaded once at import time from the local
# checkpoint directory and shared by describe_audio_tool below.
model = Wav2Vec2ForCTC.from_pretrained(audio_model_dir)
processor = Wav2Vec2Processor.from_pretrained(audio_model_dir)
# ------------------------------------------------------ #
# TOOLS
# ------------------------------------------------------ #
# search
# LangChain DuckDuckGo search tool (presumably handed to an agent elsewhere — confirm).
search_tool = DuckDuckGoSearchRun()
# png
def describe_image_tool(file_name: str) -> str:
    """
    This tool receives a file name of an image, uploads the image and returns a detailed description of the image.
    Inputs: file_name as str
    Outputs: image detailed description as str
    Raises: ValueError if the file name is not a .png file.
    """
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if '.png' not in file_name:
        raise ValueError(f"describe_image_tool expects a .png file, got {file_name!r}")
    # Bug fix: the log prefix used to be embedded in the path itself
    # ('[describe_image_tool] files/...'), so the file could never be found.
    # Log and path are now separate, matching the other tools in this module.
    pic_dir = f'files/{file_name}'
    print(f"[describe_image_tool] {pic_dir=}")
    getDescriptionPrompt = "What is in the image? describe in detail. Use professional notations when applicable. For example, if the image is a chess position, describe the position of ALL pieces with classical chess algebraic notation. BE PRECISE!"
    base64_image = encode_image(pic_dir)
    model_out = client.chat.completions.create(
        # model="meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
        model="meta-llama/Llama-Vision-Free",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": getDescriptionPrompt},
                    {
                        "type": "image_url",
                        # Bug fix: MIME type said image/jpeg although only PNG input is accepted.
                        "image_url": {"url": f"data:image/png;base64,{base64_image}",},
                    },
                ],
            }
        ],
        stream=False,
    )
    description = model_out.choices[0].message.content
    return f"Do not use the image. Instead, use the description provided further by the tool. Here is the detailed description of the image. {description}"
# mp3
def describe_audio_tool(file_name: str) -> str:
    """
    This tool receives a file name of an audio, uploads the audio and returns a detailed description of the audio.
    Inputs: file_name as str
    Outputs: audio detailed description (transcription) as str
    """
    # --------------------------------------------------------------------------- #
    file_dir = f'files/{file_name}'
    print(f"[describe_audio_tool] {file_dir=}")
    audio_input_sr, audio_input_np = read_mp3(file_dir)
    audio_input_t = torch.tensor(audio_input_np, dtype=torch.float32)
    # Resample to the model's expected rate; target_sr is now the single source
    # of truth (the processor call below used a duplicated literal 16000).
    target_sr = 16000
    resampler = T.Resample(audio_input_sr, target_sr, dtype=audio_input_t.dtype)
    resampled_audio_input_t: torch.Tensor = resampler(audio_input_t)
    resampled_audio_input_np = resampled_audio_input_t.numpy()
    # NOTE(review): read_mp3 is called without normalized=True, so raw integer
    # samples are cast to float32; confirm the processor's expected input range.
    # --------------------------------------------------------------------------- #
    inputs = processor(resampled_audio_input_np, sampling_rate=target_sr, return_tensors="pt", padding=True)
    # Inference — no gradients needed.
    with torch.no_grad():
        logits = model(**inputs).logits
    # Decode greedy CTC predictions into text.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0])
    return transcription
# py
def python_repl_tool(file_name: str) -> str:
    """
    This tool receives a file name of a python code and executes it. Then, it returns an output of the code.
    Inputs: file_name as str
    Outputs: code's output as str ('No such file.' when the file does not exist)
    """
    import sys  # local import so the block stays self-contained
    file_dir = f'files/{file_name}'
    print(f"[python_repl_tool] {file_dir=}")
    if not os.path.exists(file_dir):
        return 'No such file.'
    # sys.executable guarantees the same interpreter that runs this module;
    # a bare "python" may not exist on PATH (many systems only ship "python3").
    result = subprocess.run([sys.executable, file_dir], capture_output=True, text=True)
    if result.returncode != 0:
        # Surface the failure instead of silently returning an empty stdout.
        return result.stdout + result.stderr
    return result.stdout
# xlsx
def excel_repl_tool(file_name: str) -> str:
    """
    This tool receives a file name of an Excel file and reads it. Then, it returns a string of the content of the file.
    Inputs: file_name as str
    Outputs: file's content as str (HTML rendering of the first sheet element)
    """
    file_dir = f'files/{file_name}'
    # Consistency fix: tag the log line with the tool name like the sibling tools do.
    print(f"[excel_repl_tool] {file_dir=}")
    loader = UnstructuredExcelLoader(file_dir, mode="elements")
    docs = loader.load()
    # Guard against an empty workbook instead of raising IndexError on docs[0].
    if not docs:
        return 'No content found in file.'
    return docs[0].metadata['text_as_html']
# youtube
def youtube_extractor_tool(url: str) -> str:
    """
    This tool receives a url of the youtube video and reads it. Then, it returns a string of the content of the video.
    Inputs: url as str
    Outputs: video's content as str
    """
    file_name = 'my_audio_file'
    # Extract the best audio stream and transcode it to mp3 via ffmpeg.
    audio_postprocessor = {
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'mp3',
        'preferredquality': '192',
    }
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': f'files/{file_name}.%(ext)s',  # <-- set your custom filename here
        'postprocessors': [audio_postprocessor],
    }
    with yt_dlp.YoutubeDL(ydl_opts) as downloader:
        downloader.download([url])
    # Reuse the audio transcription tool on the freshly downloaded file.
    return describe_audio_tool(file_name=f'{file_name}.mp3')
# wiki
def wikipedia_tool(query: str) -> str:
    """
    This tool receives a query to search inside the Wikipedia website, reads the page and returns the relevant information as a string.
    Inputs: query as str
    Outputs: Wikipedia's relevant content as str
    """
    print(f"[wiki tool] {query=}")
    # Build the LangChain Wikipedia runner on demand and execute the query.
    runner = WikipediaQueryRun(api_wrapper=WikipediaAPIWrapper())
    return runner.run(query)
# pdf
# web |