Spaces:
Build error
Build error
| from globals import * | |
| from global_functions import * | |
| from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor | |
| import torch | |
| import torchaudio.transforms as T | |
| import pydub | |
| import numpy as np | |
# ------------------------------------------------------ #
# CONSTANTS FOR TOOLS
# ------------------------------------------------------ #
# Local path to the pretrained Wav2Vec2 ASR checkpoint used by the audio tool.
# NOTE(review): the path name suggests a local copy of facebook/wav2vec2-base-960h
# — confirm the directory actually holds that checkpoint.
audio_model_dir = './models_for_proj/wav2vec2-base-960h'
# ------------------------------------------------------ #
# FUNCTIONS FOR TOOLS
# ------------------------------------------------------ #
def read_mp3(f, normalized=False):
    """Read an MP3 file into a numpy array.

    Args:
        f: path or file-like object accepted by ``pydub.AudioSegment.from_mp3``.
        normalized: when True, return samples as float32 scaled to [-1.0, 1.0);
            when False (default), return the raw integer samples.

    Returns:
        tuple ``(frame_rate, samples)`` — the sample rate in Hz and a 1-D
        numpy array. For stereo input only the second channel is kept, so the
        output is always mono.
    """
    audio = pydub.AudioSegment.from_mp3(f)
    samples = np.array(audio.get_array_of_samples())
    if audio.channels == 2:
        # Interleaved stereo -> (n, 2); keep a single channel. Averaging the
        # two channels was considered but deliberately left out originally.
        samples = samples.reshape((-1, 2))[:, 1]
    if normalized:
        # Scale by the actual sample width so 8/24/32-bit audio is handled too.
        # (The original hard-coded 2**15; for 16-bit audio this is identical.)
        full_scale = float(2 ** (8 * audio.sample_width - 1))
        return audio.frame_rate, np.float32(samples) / full_scale
    return audio.frame_rate, samples
# ------------------------------------------------------ #
# MODELS FOR TOOLS
# ------------------------------------------------------ #
# Vision-chat client used by describe_image_tool.
# NOTE(review): `Together` is not imported in this chunk — presumably it comes
# in via the `from globals import *` star import; confirm.
client = Together()
# audio: load the ASR model + processor once at import time so every
# describe_audio_tool call reuses the same instances.
model = Wav2Vec2ForCTC.from_pretrained(audio_model_dir)
processor = Wav2Vec2Processor.from_pretrained(audio_model_dir)
# ------------------------------------------------------ #
# TOOLS
# ------------------------------------------------------ #
# search: DuckDuckGo web-search tool.
# NOTE(review): `DuckDuckGoSearchRun` is not imported in this chunk —
# presumably provided by the star imports above; confirm.
search_tool = DuckDuckGoSearchRun()
# png
def describe_image_tool(file_name: str) -> str:
    """
    This tool receives a file name of an image, uploads the image and returns a detailed description of the image.
    Inputs: file_name as str (must be a .png file under the local 'files/' directory)
    Outputs: image detailed description as str
    Raises: ValueError if file_name is not a .png file.
    """
    # Validate with a real exception — `assert` disappears under `python -O`.
    if '.png' not in file_name:
        raise ValueError(f"describe_image_tool expects a .png file, got: {file_name}")
    # BUG FIX: the debug prefix '[describe_image_tool] ' was baked into the
    # path string itself, so the file could never be found on disk. The path
    # and the debug log line are now separate (matching the sibling tools).
    pic_dir = f'files/{file_name}'
    print(f"[describe_image_tool] {pic_dir=}")
    getDescriptionPrompt = "What is in the image? describe in detail. Use professional notations when applicable. For example, if the image is a chess position, describe the position of ALL pieces with classical chess algebraic notation. BE PRECISE!"
    base64_image = encode_image(pic_dir)
    model_out = client.chat.completions.create(
        # model="meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
        model="meta-llama/Llama-Vision-Free",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": getDescriptionPrompt},
                    {
                        "type": "image_url",
                        # The tool only accepts .png input, so advertise the
                        # PNG MIME type (was incorrectly image/jpeg).
                        "image_url": {"url": f"data:image/png;base64,{base64_image}"},
                    },
                ],
            }
        ],
        stream=False,
    )
    description = model_out.choices[0].message.content
    return f"Do not use the image. Instead, use the description provided further by the tool. Here is the detailed description of the image. {description}"
# mp3
def describe_audio_tool(file_name: str) -> str:
    """
    Transcribe an MP3 audio file with the module-level Wav2Vec2 CTC model.
    Inputs: file_name as str (an .mp3 file under the local 'files/' directory)
    Outputs: audio detailed description (the transcription) as str
    """
    # --------------------------------------------------------------------------- #
    file_dir = f'files/{file_name}'
    print(f"[describe_audio_tool] {file_dir=}")
    # Load the MP3 and resample the waveform to the 16 kHz rate the model expects.
    sample_rate, waveform_np = read_mp3(file_dir)
    waveform = torch.tensor(waveform_np, dtype=torch.float32)
    resampled = T.Resample(sample_rate, 16000, dtype=waveform.dtype)(waveform)
    # --------------------------------------------------------------------------- #
    # Feature-extract, run the model without gradient tracking, greedy-decode.
    features = processor(resampled.numpy(), sampling_rate=16000, return_tensors="pt", padding=True)
    with torch.no_grad():
        ctc_logits = model(**features).logits
    best_token_ids = torch.argmax(ctc_logits, dim=-1)
    return processor.decode(best_token_ids[0])
# py
def python_repl_tool(file_name: str) -> str:
    """
    This tool receives a file name of a python code and executes it. Then, it returns the output of the code.
    Inputs: file_name as str (a .py file under the local 'files/' directory)
    Outputs: the script's stdout as str; on a non-zero exit status the exit
        code and stderr are appended so failures are not silently swallowed.
        Returns 'No such file.' when the file does not exist.
    """
    import sys  # local import: only needed here, keeps the file-level imports untouched
    file_dir = f'files/{file_name}'
    print(f"[python_repl_tool] {file_dir=}")
    if not os.path.exists(file_dir):
        return 'No such file.'
    # sys.executable runs the same interpreter as this process; a bare
    # "python" command may not exist on python3-only systems.
    result = subprocess.run([sys.executable, file_dir], capture_output=True, text=True)
    if result.returncode != 0:
        # BUG FIX: the original returned only stdout, so a crashing script
        # silently yielded an empty string. Surface the error instead.
        return f"{result.stdout}\n[error] exit code {result.returncode}:\n{result.stderr}"
    return result.stdout
# xlsx
def excel_repl_tool(file_name: str) -> str:
    """
    This tool receives a file name of an Excel file and reads it. Then, it returns a string of the content of the file.
    Inputs: file_name as str (an .xlsx file under the local 'files/' directory)
    Outputs: file's content as an HTML-table string
    """
    file_dir = f'files/{file_name}'
    # CONSISTENCY FIX: every sibling tool prefixes its debug line with the
    # tool name; this one was missing the prefix.
    print(f"[excel_repl_tool] {file_dir=}")
    loader = UnstructuredExcelLoader(file_dir, mode="elements")
    docs = loader.load()
    if not docs:
        # An empty/unreadable workbook previously raised IndexError on docs[0].
        return 'No content found in file.'
    return docs[0].metadata['text_as_html']
# youtube
def youtube_extractor_tool(url: str) -> str:
    """
    Download the audio track of a YouTube video and transcribe it.
    Inputs: url as str
    Outputs: video's content (the audio transcription) as str
    """
    audio_basename = 'my_audio_file'
    # Grab the best audio stream and have ffmpeg convert it to MP3.
    download_options = {
        'format': 'bestaudio/best',
        'outtmpl': f'files/{audio_basename}.%(ext)s',  # <-- set your custom filename here
        'postprocessors': [
            {
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'mp3',
                'preferredquality': '192',
            }
        ],
    }
    with yt_dlp.YoutubeDL(download_options) as downloader:
        downloader.download([url])
    # Hand the downloaded MP3 to the audio transcription tool.
    return describe_audio_tool(file_name=f'{audio_basename}.mp3')
# wiki
def wikipedia_tool(query: str) -> str:
    """
    Search Wikipedia for a query and return the relevant page content.
    Inputs: query as str
    Outputs: Wikipedia's relevant content as str
    """
    print(f"[wiki tool] {query=}")
    searcher = WikipediaQueryRun(api_wrapper=WikipediaAPIWrapper())
    return searcher.run(query)
| # web |