Update tools.py
tools.py
CHANGED

@@ -2,11 +2,11 @@ from smolagents import DuckDuckGoSearchTool
 from smolagents import Tool
 from huggingface_hub import InferenceClient
 import soundfile as sf
-from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor
-from qwen_omni_utils import process_mm_info
 import torch
 from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
 from datasets import load_dataset
+from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
+from qwen_vl_utils import process_vision_info
 
 class Web_research(Tool):
     name="web_research"
@@ -72,59 +72,69 @@ class multimodal_interpreter(Tool):
     description = "Allows you to answer any question which relies on image or video input."
     inputs = {
         'image': {"type": "image", "description": "the image or video of interest"},
+        'prompt': {"type": "string", "description": "Any specific question you have on the image. For example, the prompt can be: Describe this image."}
     }
     output_type = "string"
 
     def forward(self, prompt, image):
+
         # default: Load the model on the available device(s)
+        model = Qwen2VLForConditionalGeneration.from_pretrained(
+            "Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
+        )
 
+        # We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
+        # model = Qwen2VLForConditionalGeneration.from_pretrained(
+        #     "Qwen/Qwen2-VL-7B-Instruct",
+        #     torch_dtype=torch.bfloat16,
         #     attn_implementation="flash_attention_2",
+        #     device_map="auto",
         # )
 
+        # default processor
+        processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
 
+        # The default range for the number of visual tokens per image in the model is 4-16384. You can set min_pixels and max_pixels according to your needs, such as a token count range of 256-1280, to balance speed and memory usage.
+        # min_pixels = 256*28*28
+        # max_pixels = 1280*28*28
+        # processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)
+
+        messages = [
             {
                 "role": "user",
                 "content": [
+                    {
+                        "type": "image",
+                        "image": image,
+                    },
+                    {"type": "text", "text": prompt},
                 ],
+            }
         ]
 
-        # set use audio in video
-        USE_AUDIO_IN_VIDEO = True
         # Preparation for inference
+        text = processor.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+        image_inputs, video_inputs = process_vision_info(messages)
+        inputs = processor(
+            text=[text],
+            images=image_inputs,
+            videos=video_inputs,
+            padding=True,
+            return_tensors="pt",
+        )
+        inputs = inputs.to("cuda")
 
-        # Inference: Generation of the output
-            samplerate=24000,
+        # Inference: Generation of the output
+        generated_ids = model.generate(**inputs, max_new_tokens=128)
+        generated_ids_trimmed = [
+            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        output_text = processor.batch_decode(
+            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
         )
-        return text
+        return output_text[0]  # batch_decode returns a list; the tool declares a string output
 
 class audio_or_mp3__interpreter(Tool):
     name="multimodal_tool"
 
@@ -156,7 +166,7 @@ class audio_or_mp3__interpreter(Tool):
             device=device,
         )
 
-        sample = {audio}[0]["audio"]
+        sample = audio  # sample must be of the form dataset[0]["audio"]
 
         result = pipe(sample)
         return result["text"]
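
A minimal usage sketch of the updated multimodal_interpreter (not part of this commit; the import path, image file, and direct forward() call are illustrative assumptions):

# Illustrative only: exercise the Qwen2-VL-backed tool outside of an agent run.
from PIL import Image

from tools import multimodal_interpreter  # assumes tools.py is importable from the working directory

tool = multimodal_interpreter()
img = Image.open("example.jpg")  # any local image; the filename is a placeholder
answer = tool.forward(prompt="Describe this image.", image=img)
print(answer)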