Files changed (1) hide show
  1. tools.py +86 -17
tools.py CHANGED
@@ -1,6 +1,12 @@
1
  from smolagents import DuckDuckGoSearchTool
2
  from smolagents import Tool
3
  from huggingface_hub import InferenceClient
 
 
 
 
 
 
4
 
5
  class Web_research(Tool):
6
  name="web_research"
@@ -61,36 +67,99 @@ class translate_everything(Tool):
61
  translated_sentence = " ".join(right_sentence[::-1])
62
  return f"The translated sentence is : {translated_sentence}"
63
 
64
- class image_interpreter(Tool):
65
  name="multimodal_tool"
66
- description = "Allows you to answer any question which relies on image input."
67
  inputs = {
68
- 'image': {"type": "image", "description": "the image of interest"},
69
  'prompt': {"type": "string", "description": "Any specific question you have on the image. For example, the prompt can be : Summarise this image in one sentence."}
70
  }
71
  output_type = "string"
72
 
73
  def forward(self, prompt, image):
74
- model_sdxl = "meta-llama/Llama-3.1-8B-Instruct"
75
- client = InferenceClient(model_sdxl)
76
- output = client.chat.completions.create(
77
- messages=[
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  {
79
  "role": "user",
80
  "content": [
81
- {
82
- "type": "image",
83
- "image": {image},
84
- },
85
- {
86
- "type": "text",
87
- "text": {prompt},
88
- },
89
  ],
90
  },
91
- ],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  )
93
- return output
 
 
 
 
94
 
95
  class Wikipedia_reader(Tool):
96
  name="wikipedia_tool"
 
1
  from smolagents import DuckDuckGoSearchTool
2
  from smolagents import Tool
3
  from huggingface_hub import InferenceClient
4
+ import soundfile as sf
5
+ from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor
6
+ from qwen_omni_utils import process_mm_info
7
+ import torch
8
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
9
+ from datasets import load_dataset
10
 
11
  class Web_research(Tool):
12
  name="web_research"
 
67
  translated_sentence = " ".join(right_sentence[::-1])
68
  return f"The translated sentence is : {translated_sentence}"
69
 
70
class multimodal_interpreter(Tool):
    """smolagents Tool that answers a free-form question about an image/video
    using the Qwen2.5-Omni-7B multimodal model.

    Returns the model's decoded text answer (list of strings from
    ``batch_decode``, matching the original behavior) and writes the model's
    generated speech to ``output.wav`` as a side effect.
    """

    name = "multimodal_tool"
    description = "Allows you to answer any question which relies on image or video input."
    inputs = {
        'image': {"type": "image", "description": "the image or video of interest"},
        'prompt': {"type": "string", "description": "Any specific question you have on the image. For example, the prompt can be : Summarise this image in one sentence."}
    }
    output_type = "string"

    def forward(self, prompt, image):
        # Load the model on the available device(s).
        # NOTE(review): reloading a 7B checkpoint on every call is very slow —
        # consider caching model/processor on the instance; kept here to
        # preserve the original behavior.
        model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
            "Qwen/Qwen2.5-Omni-7B", torch_dtype="auto", device_map="auto"
        )
        processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")

        conversation = [
            {
                "role": "system",
                "content": [
                    {"type": "text", "text": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."}
                ],
            },
            {
                "role": "user",
                "content": [
                    # BUG FIX: was {"image": {image}} — a *set literal* the
                    # processor cannot consume; pass the image object directly.
                    {"type": "image", "image": image},
                    # BUG FIX: the user's prompt was never put into the
                    # conversation, so the model ignored the actual question.
                    {"type": "text", "text": prompt},
                ],
            },
        ]

        # Keep audio tracks of video inputs when extracting multimodal info.
        USE_AUDIO_IN_VIDEO = True

        # Preparation for inference.
        text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
        audios, images, videos = process_mm_info(conversation, use_audio_in_video=USE_AUDIO_IN_VIDEO)
        inputs = processor(
            text=text, audio=audios, images=images, videos=videos,
            return_tensors="pt", padding=True,
            use_audio_in_video=USE_AUDIO_IN_VIDEO,
        )
        inputs = inputs.to(model.device).to(model.dtype)

        # Inference: generation of the output text ids and speech waveform.
        text_ids, audio = model.generate(**inputs, use_audio_in_video=USE_AUDIO_IN_VIDEO)

        text = processor.batch_decode(text_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
        # Side effect: persist the generated speech (Qwen Omni emits 24 kHz audio).
        sf.write(
            "output.wav",
            audio.reshape(-1).detach().cpu().numpy(),
            samplerate=24000,
        )

        return text
129
class audio_or_mp3__interpreter(Tool):
    """smolagents Tool that transcribes an audio input to text with
    openai/whisper-large-v3 via the transformers ASR pipeline.

    Returns the transcription string from the pipeline result's "text" key.
    """

    # BUG FIX: was "multimodal_tool", which collided with the image/video
    # tool's name — duplicate tool names make the agent unable to address
    # the two tools separately.
    name = "audio_tool"
    description = "Allows you to convert audio into text. It uses Whisper, it is a state-of-the-art model for automatic speech recognition (ASR) and speech translation"
    inputs = {
        'audio': {"type": "audio", "description": "the audio of interest"}
    }
    output_type = "string"

    # BUG FIX: dropped the undeclared `prompt` parameter. smolagents invokes
    # forward() with exactly the keys declared in `inputs` (only `audio`), so
    # the extra required positional argument raised TypeError on every call.
    def forward(self, audio):
        # Prefer GPU with fp16 when available; fall back to CPU fp32.
        device = "cuda:0" if torch.cuda.is_available() else "cpu"
        torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

        model_id = "openai/whisper-large-v3"

        # NOTE(review): reloading the checkpoint per call is slow — consider
        # caching on the instance; kept to preserve original behavior.
        model = AutoModelForSpeechSeq2Seq.from_pretrained(
            model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
        )
        model.to(device)

        processor = AutoProcessor.from_pretrained(model_id)

        pipe = pipeline(
            "automatic-speech-recognition",
            model=model,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            torch_dtype=torch_dtype,
            device=device,
        )

        # BUG FIX: was `sample = {audio}[0]["audio"]` — a set literal, which
        # is unordered and unindexable (TypeError at runtime). Pass the audio
        # input straight to the pipeline, which accepts paths/arrays/dicts.
        result = pipe(audio)
        return result["text"]
163
 
164
  class Wikipedia_reader(Tool):
165
  name="wikipedia_tool"