File size: 6,291 Bytes
45b200f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
from globals import *
from global_functions import *
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import torch
import torchaudio.transforms as T
import pydub
import numpy as np

# ------------------------------------------------------ #
# CONSTANTS FOR TOOLS
# ------------------------------------------------------ #
# Local checkout of the wav2vec2-base-960h speech-to-text model used by
# describe_audio_tool below (loaded in the MODELS section).
audio_model_dir = './models_for_proj/wav2vec2-base-960h'


# ------------------------------------------------------ #
# FUNCTIONS FOR TOOLS
# ------------------------------------------------------ #
def read_mp3(f, normalized=False):
    """Read an MP3 file into a numpy array.

    Parameters
    ----------
    f : str or file-like
        Path (or file object) handed to ``pydub.AudioSegment.from_mp3``.
    normalized : bool, default False
        When True, scale samples to float32 in [-1, 1) using the file's
        actual sample width; otherwise return the raw integer samples.

    Returns
    -------
    tuple
        ``(frame_rate, samples)`` where ``samples`` is a 1-D numpy array.
        For stereo input only the second (right) channel is kept.
    """
    a = pydub.AudioSegment.from_mp3(f)
    y = np.array(a.get_array_of_samples())
    if a.channels == 2:
        # Samples are interleaved L/R; reshape to (n, 2) and keep the
        # right channel only. (Averaging the channels was considered and
        # deliberately not used by the original author.)
        y = y.reshape((-1, 2))
        y = y[:, 1]
    if normalized:
        # Full-scale value derived from the sample width: 2**15 for the
        # common 16-bit case (identical to the previous hard-coded value),
        # generalized so 8/24/32-bit files normalize correctly too.
        full_scale = 2 ** (8 * a.sample_width - 1)
        return a.frame_rate, np.float32(y) / full_scale
    else:
        return a.frame_rate, y


# ------------------------------------------------------ #
# MODELS FOR TOOLS
# ------------------------------------------------------ #
# Together AI API client, used by describe_image_tool for vision calls.
client = Together()

# audio
# wav2vec2 CTC model + processor, loaded once at import time so every
# describe_audio_tool call reuses the same instances.
model = Wav2Vec2ForCTC.from_pretrained(audio_model_dir)
processor = Wav2Vec2Processor.from_pretrained(audio_model_dir)

# ------------------------------------------------------ #
# TOOLS
# ------------------------------------------------------ #
# search
# DuckDuckGo web-search tool (LangChain community wrapper); used as-is.
search_tool = DuckDuckGoSearchRun()


# png
def describe_image_tool(file_name: str) -> str:
    """
    This tool receives a file name of an image, uploads the image and returns a detailed description of the image.
    Inputs: file_name as str (must contain '.png'; resolved under the local 'files/' directory)
    Outputs: image detailed description as str
    """
    # NOTE(review): assert is stripped under `python -O`; consider raising
    # ValueError for real input validation.
    assert '.png' in file_name
    # BUG FIX: the log tag used to be embedded in the path itself
    # ('[describe_image_tool] files/...'), so the file could never be found.
    # The tag now goes to the log line, matching the other tools.
    pic_dir = f'files/{file_name}'
    print(f"[describe_image_tool] {pic_dir=}")
    getDescriptionPrompt = "What is in the image? describe in detail. Use professional notations when applicable. For example, if the image is a chess position, describe the position of ALL pieces with classical chess algebraic notation. BE PRECISE!"
    base64_image = encode_image(pic_dir)
    model_out = client.chat.completions.create(
        # model="meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
        model="meta-llama/Llama-Vision-Free",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": getDescriptionPrompt},
                    {
                        "type": "image_url",
                        # NOTE(review): MIME says jpeg but the input is asserted
                        # to be a .png — the endpoint appears to tolerate it;
                        # confirm before changing.
                        "image_url": {"url": f"data:image/jpeg;base64,{base64_image}",},
                    },
                ],
            }
        ],
        stream=False,
    )
    description = model_out.choices[0].message.content
    return f"Do not use the image. Instead, use the description provided further by the tool. Here is the detailed description of the image. {description}"


# mp3
def describe_audio_tool(file_name: str) -> str:
    """
    Transcribe an MP3 file with the module-level wav2vec2 CTC model.

    Inputs: file_name as str (resolved under the local 'files/' directory)
    Outputs: audio transcription as str
    """
    target_sr = 16000  # wav2vec2-base-960h expects 16 kHz input
    path = f'files/{file_name}'
    print(f"[describe_audio_tool] file_dir={path!r}")

    # Load the waveform and resample it to the model's expected rate.
    source_sr, samples = read_mp3(path)
    waveform = torch.tensor(samples, dtype=torch.float32)
    waveform_16k = T.Resample(source_sr, target_sr, dtype=waveform.dtype)(waveform)

    # Tokenize, run CTC inference without gradients, then greedy-decode.
    batch = processor(
        waveform_16k.numpy(), sampling_rate=target_sr, return_tensors="pt", padding=True
    )
    with torch.no_grad():
        logits = model(**batch).logits
    best_ids = torch.argmax(logits, dim=-1)
    return processor.decode(best_ids[0])


# py
def python_repl_tool(file_name: str) -> str:
    """
    Execute a python file from the local 'files/' directory and return its output.

    Inputs: file_name as str
    Outputs: the script's stdout as str. If the script exits non-zero its
    stderr is appended so failures are visible to the caller (previously
    stderr was silently dropped and a crashing script looked like empty
    output). Returns 'No such file.' when the file does not exist.
    """
    import sys  # local import: keeps the file's top-level import block untouched

    file_dir = f'files/{file_name}'
    print(f"[python_repl_tool] {file_dir=}")
    if not os.path.exists(file_dir):
        return 'No such file.'
    # sys.executable runs the same interpreter as this process, so the tool
    # also works on systems that only install a `python3` launcher.
    result = subprocess.run([sys.executable, file_dir], capture_output=True, text=True)
    if result.returncode != 0:
        return result.stdout + result.stderr
    return result.stdout


# xlsx
def excel_repl_tool(file_name: str) -> str:
    """
    Read an Excel file from the local 'files/' directory and return its
    content rendered as an HTML table string.

    Inputs: file_name as str
    Outputs: file's content (HTML) as str; 'No such file.' when the file
    does not exist.
    """
    file_dir = f'files/{file_name}'
    # Consistency fix: tag the log line like every other tool in this module.
    print(f"[excel_repl_tool] {file_dir=}")
    # Robustness: mirror python_repl_tool's missing-file convention instead
    # of letting the loader raise from deep inside its parser.
    if not os.path.exists(file_dir):
        return 'No such file.'
    loader = UnstructuredExcelLoader(file_dir, mode="elements")
    docs = loader.load()
    # NOTE(review): assumes the first element carries the HTML rendering —
    # true for UnstructuredExcelLoader in "elements" mode; verify on upgrade.
    return docs[0].metadata['text_as_html']


# youtube
def youtube_extractor_tool(url: str) -> str:
    """
    Download a YouTube video's audio track as MP3 and return its transcription.

    Inputs: url as str
    Outputs: video's content as str
    """
    base_name = 'my_audio_file'
    # Fixed output template so the transcription step below can always find
    # the downloaded file under files/.
    download_options = {
        'format': 'bestaudio/best',
        'outtmpl': f'files/{base_name}.%(ext)s',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
    }

    with yt_dlp.YoutubeDL(download_options) as downloader:
        downloader.download([url])

    # Hand the extracted mp3 to the audio tool for transcription.
    return describe_audio_tool(file_name=f'{base_name}.mp3')


# wiki
def wikipedia_tool(query: str) -> str:
    """
    Search Wikipedia for a query and return the relevant page content.

    Inputs: query as str
    Outputs: Wikipedia's relevant content as str
    """
    print(f"[wiki tool] {query=}")
    wiki_runner = WikipediaQueryRun(api_wrapper=WikipediaAPIWrapper())
    return wiki_runner.run(query)


# pdf


# web