File size: 6,291 Bytes
45b200f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
from globals import *
from global_functions import *
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torch
import torchaudio.transforms as T
import pydub
import numpy as np

# ------------------------------------------------------ #
# CONSTANTS FOR TOOLS
# ------------------------------------------------------ #
# Local checkout of the wav2vec2-base-960h speech-to-text model used by
# describe_audio_tool below (loaded in the MODELS section).
audio_model_dir = './models_for_proj/wav2vec2-base-960h'


# ------------------------------------------------------ #
# FUNCTIONS FOR TOOLS
# ------------------------------------------------------ #
def read_mp3(f, normalized=False):
    """Read an MP3 file into a numpy array.

    Parameters
    ----------
    f : str or file-like
        Path (or file object) handed to ``pydub.AudioSegment.from_mp3``.
    normalized : bool, default False
        When True, scale samples to float32 in [-1, 1) using the file's
        actual sample width; otherwise return the raw integer samples.

    Returns
    -------
    tuple
        ``(frame_rate, samples)`` where ``samples`` is a 1-D numpy array.
        For stereo input only the second (right) channel is kept.
    """
    a = pydub.AudioSegment.from_mp3(f)
    y = np.array(a.get_array_of_samples())
    if a.channels == 2:
        # Samples are interleaved L/R; reshape to (n, 2) and keep the
        # right channel only. (Averaging the channels was considered and
        # deliberately not used by the original author.)
        y = y.reshape((-1, 2))
        y = y[:, 1]
    if normalized:
        # Full-scale value derived from the sample width: 2**15 for the
        # common 16-bit case (identical to the previous hard-coded value),
        # generalized so 8/24/32-bit files normalize correctly too.
        full_scale = 2 ** (8 * a.sample_width - 1)
        return a.frame_rate, np.float32(y) / full_scale
    else:
        return a.frame_rate, y


# ------------------------------------------------------ #
# MODELS FOR TOOLS
# ------------------------------------------------------ #
# Together AI API client, used by describe_image_tool for vision calls.
client = Together()

# audio
# wav2vec2 CTC model + processor, loaded once at import time so every
# describe_audio_tool call reuses the same instances.
model = Wav2Vec2ForCTC.from_pretrained(audio_model_dir)
processor = Wav2Vec2Processor.from_pretrained(audio_model_dir)

# ------------------------------------------------------ #
# TOOLS
# ------------------------------------------------------ #
# search
# DuckDuckGo web-search tool (LangChain community wrapper); used as-is.
search_tool = DuckDuckGoSearchRun()


# png
def describe_image_tool(file_name: str) -> str:
    """
    This tool receives a file name of an image, uploads the image and returns a detailed description of the image.
    Inputs: file_name as str (must contain '.png'; resolved under the local 'files/' directory)
    Outputs: image detailed description as str
    """
    # NOTE(review): assert is stripped under `python -O`; consider raising
    # ValueError for real input validation.
    assert '.png' in file_name
    # BUG FIX: the log tag used to be embedded in the path itself
    # ('[describe_image_tool] files/...'), so the file could never be found.
    # The tag now goes to the log line, matching the other tools.
    pic_dir = f'files/{file_name}'
    print(f"[describe_image_tool] {pic_dir=}")
    getDescriptionPrompt = "What is in the image? describe in detail. Use professional notations when applicable. For example, if the image is a chess position, describe the position of ALL pieces with classical chess algebraic notation. BE PRECISE!"
    base64_image = encode_image(pic_dir)
    model_out = client.chat.completions.create(
        # model="meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
        model="meta-llama/Llama-Vision-Free",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": getDescriptionPrompt},
                    {
                        "type": "image_url",
                        # NOTE(review): MIME says jpeg but the input is asserted
                        # to be a .png — the endpoint appears to tolerate it;
                        # confirm before changing.
                        "image_url": {"url": f"data:image/jpeg;base64,{base64_image}",},
                    },
                ],
            }
        ],
        stream=False,
    )
    description = model_out.choices[0].message.content
    return f"Do not use the image. Instead, use the description provided further by the tool. Here is the detailed description of the image. {description}"


# mp3
def describe_audio_tool(file_name: str) -> str:
    """
    Transcribe an MP3 file with the module-level wav2vec2 CTC model.

    Inputs: file_name as str (resolved under the local 'files/' directory)
    Outputs: audio transcription as str
    """
    target_sr = 16000  # wav2vec2-base-960h expects 16 kHz input
    path = f'files/{file_name}'
    print(f"[describe_audio_tool] file_dir={path!r}")

    # Load the waveform and resample it to the model's expected rate.
    source_sr, samples = read_mp3(path)
    waveform = torch.tensor(samples, dtype=torch.float32)
    waveform_16k = T.Resample(source_sr, target_sr, dtype=waveform.dtype)(waveform)

    # Tokenize, run CTC inference without gradients, then greedy-decode.
    batch = processor(
        waveform_16k.numpy(), sampling_rate=target_sr, return_tensors="pt", padding=True
    )
    with torch.no_grad():
        logits = model(**batch).logits
    best_ids = torch.argmax(logits, dim=-1)
    return processor.decode(best_ids[0])


# py
def python_repl_tool(file_name: str) -> str:
    """
    Execute a python file from the local 'files/' directory and return its output.

    Inputs: file_name as str
    Outputs: the script's stdout as str. If the script exits non-zero its
    stderr is appended so failures are visible to the caller (previously
    stderr was silently dropped and a crashing script looked like empty
    output). Returns 'No such file.' when the file does not exist.
    """
    import sys  # local import: keeps the file's top-level import block untouched

    file_dir = f'files/{file_name}'
    print(f"[python_repl_tool] {file_dir=}")
    if not os.path.exists(file_dir):
        return 'No such file.'
    # sys.executable runs the same interpreter as this process, so the tool
    # also works on systems that only install a `python3` launcher.
    result = subprocess.run([sys.executable, file_dir], capture_output=True, text=True)
    if result.returncode != 0:
        return result.stdout + result.stderr
    return result.stdout


# xlsx
def excel_repl_tool(file_name: str) -> str:
    """
    Read an Excel file from the local 'files/' directory and return its
    content rendered as an HTML table string.

    Inputs: file_name as str
    Outputs: file's content (HTML) as str; 'No such file.' when the file
    does not exist.
    """
    file_dir = f'files/{file_name}'
    # Consistency fix: tag the log line like every other tool in this module.
    print(f"[excel_repl_tool] {file_dir=}")
    # Robustness: mirror python_repl_tool's missing-file convention instead
    # of letting the loader raise from deep inside its parser.
    if not os.path.exists(file_dir):
        return 'No such file.'
    loader = UnstructuredExcelLoader(file_dir, mode="elements")
    docs = loader.load()
    # NOTE(review): assumes the first element carries the HTML rendering —
    # true for UnstructuredExcelLoader in "elements" mode; verify on upgrade.
    return docs[0].metadata['text_as_html']


# youtube
def youtube_extractor_tool(url: str) -> str:
    """
    Download a YouTube video's audio track as MP3 and return its transcription.

    Inputs: url as str
    Outputs: video's content as str
    """
    base_name = 'my_audio_file'
    # Fixed output template so the transcription step below can always find
    # the downloaded file under files/.
    download_options = {
        'format': 'bestaudio/best',
        'outtmpl': f'files/{base_name}.%(ext)s',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
    }

    with yt_dlp.YoutubeDL(download_options) as downloader:
        downloader.download([url])

    # Hand the extracted mp3 to the audio tool for transcription.
    return describe_audio_tool(file_name=f'{base_name}.mp3')


# wiki
def wikipedia_tool(query: str) -> str:
    """
    Search Wikipedia for a query and return the relevant page content.

    Inputs: query as str
    Outputs: Wikipedia's relevant content as str
    """
    print(f"[wiki tool] {query=}")
    wiki_runner = WikipediaQueryRun(api_wrapper=WikipediaAPIWrapper())
    return wiki_runner.run(query)


# pdf


# web