CindyDelage committed on
Commit
c0a5526
·
verified ·
1 Parent(s): be157ca

Update tools.py

Browse files
Files changed (1) hide show
  1. tools.py +47 -37
tools.py CHANGED
@@ -2,11 +2,11 @@ from smolagents import DuckDuckGoSearchTool
2
  from smolagents import Tool
3
  from huggingface_hub import InferenceClient
4
  import soundfile as sf
5
- from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor
6
- from qwen_omni_utils import process_mm_info
7
  import torch
8
  from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
9
  from datasets import load_dataset
 
 
10
 
11
  class Web_research(Tool):
12
  name="web_research"
@@ -72,59 +72,69 @@ class multimodal_interpreter(Tool):
72
  description = "Allows you to answer any question which relies on image or video input."
73
  inputs = {
74
  'image': {"type": "image", "description": "the image or video of interest"},
75
- 'prompt': {"type": "string", "description": "Any specific question you have on the image. For example, the prompt can be : Summarise this image in one sentence."}
76
  }
77
  output_type = "string"
78
 
79
  def forward(self, prompt, image):
 
80
  # default: Load the model on the available device(s)
81
- model = Qwen2_5OmniForConditionalGeneration.from_pretrained("Qwen/Qwen2.5-Omni-7B", torch_dtype="auto", device_map="auto")
 
 
82
 
83
- # We recommend enabling flash_attention_2 for better acceleration and memory saving.
84
- # model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
85
- # "Qwen/Qwen2.5-Omni-7B",
86
- # torch_dtype="auto",
87
- # device_map="auto",
88
  # attn_implementation="flash_attention_2",
 
89
  # )
90
 
91
- processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")
 
92
 
93
- conversation = [
94
- {
95
- "role": "system",
96
- "content": [
97
- {"type": "text", "text": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."}
98
- ],
99
- },
100
  {
101
  "role": "user",
102
  "content": [
103
- {"type": "image", "image": {image}},
 
 
 
 
104
  ],
105
- },
106
  ]
107
 
108
- # set use audio in video
109
- USE_AUDIO_IN_VIDEO = True
110
-
111
  # Preparation for inference
112
- text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
113
- audios, images, videos = process_mm_info(conversation, use_audio_in_video=USE_AUDIO_IN_VIDEO)
114
- inputs = processor(text=text, audio=audios, images=images, videos=videos, return_tensors="pt", padding=True, use_audio_in_video=USE_AUDIO_IN_VIDEO)
115
- inputs = inputs.to(model.device).to(model.dtype)
 
 
 
 
 
 
 
 
116
 
117
- # Inference: Generation of the output text and audio
118
- text_ids, audio = model.generate(**inputs, use_audio_in_video=USE_AUDIO_IN_VIDEO)
119
-
120
- text = processor.batch_decode(text_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
121
- sf.write(
122
- "output.wav",
123
- audio.reshape(-1).detach().cpu().numpy(),
124
- samplerate=24000,
125
  )
126
-
127
- return text
128
 
129
  class audio_or_mp3__interpreter(Tool):
130
  name="multimodal_tool"
@@ -156,7 +166,7 @@ class audio_or_mp3__interpreter(Tool):
156
  device=device,
157
  )
158
 
159
- sample = {audio}[0]["audio"]
160
 
161
  result = pipe(sample)
162
  return result["text"]
 
2
  from smolagents import Tool
3
  from huggingface_hub import InferenceClient
4
  import soundfile as sf
 
 
5
  import torch
6
  from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
7
  from datasets import load_dataset
8
+ from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
9
+ from qwen_vl_utils import process_vision_info
10
 
11
  class Web_research(Tool):
12
  name="web_research"
 
72
  description = "Allows you to answer any question which relies on image or video input."
73
  inputs = {
74
  'image': {"type": "image", "description": "the image or video of interest"},
75
+ 'prompt': {"type": "string", "description": "Any specific question you have on the image. For example, the prompt can be : Describe this image."}
76
  }
77
  output_type = "string"
78
 
79
  def forward(self, prompt, image):
80
+
81
  # default: Load the model on the available device(s)
82
+ model = Qwen2VLForConditionalGeneration.from_pretrained(
83
+ "Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
84
+ )
85
 
86
+ # We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
87
+ # model = Qwen2VLForConditionalGeneration.from_pretrained(
88
+ # "Qwen/Qwen2-VL-7B-Instruct",
89
+ # torch_dtype=torch.bfloat16,
 
90
  # attn_implementation="flash_attention_2",
91
+ # device_map="auto",
92
  # )
93
 
94
+ # default processor
95
+ processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
96
 
97
+ # The default range for the number of visual tokens per image in the model is 4-16384. You can set min_pixels and max_pixels according to your needs, such as a token count range of 256-1280, to balance speed and memory usage.
98
+ # min_pixels = 256*28*28
99
+ # max_pixels = 1280*28*28
100
+ # processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)
101
+
102
+ messages = [
 
103
  {
104
  "role": "user",
105
  "content": [
106
+ {
107
+ "type": "image",
108
+ "image": {image},
109
+ },
110
+ {"type": "text", "text": {prompt}},
111
  ],
112
+ }
113
  ]
114
 
 
 
 
115
  # Preparation for inference
116
+ text = processor.apply_chat_template(
117
+ messages, tokenize=False, add_generation_prompt=True
118
+ )
119
+ image_inputs, video_inputs = process_vision_info(messages)
120
+ inputs = processor(
121
+ text=[text],
122
+ images=image_inputs,
123
+ videos=video_inputs,
124
+ padding=True,
125
+ return_tensors="pt",
126
+ )
127
+ inputs = inputs.to("cuda")
128
 
129
+ # Inference: Generation of the output
130
+ generated_ids = model.generate(**inputs, max_new_tokens=128)
131
+ generated_ids_trimmed = [
132
+ out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
133
+ ]
134
+ output_text = processor.batch_decode(
135
+ generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
 
136
  )
137
+ return output_text
 
138
 
139
  class audio_or_mp3__interpreter(Tool):
140
  name="multimodal_tool"
 
166
  device=device,
167
  )
168
 
169
+ sample = {audio} #sample must be of the type dataset[0]["audio"]
170
 
171
  result = pipe(sample)
172
  return result["text"]