CindyDelage committed on
Commit
e1d7362
·
verified ·
1 Parent(s): a9a996b

Update tools.py

Browse files
Files changed (1) hide show
  1. tools.py +16 -38
tools.py CHANGED
@@ -74,54 +74,32 @@ class translate_everything(Tool):
74
  return f"The translated sentence is : {translated_sentence}"
75
 
76
  class multimodal_interpreter(Tool):
77
- name="multimodal_tool"
78
  description = "Allows you to answer any question which relies on image or video input."
79
  inputs = {
80
- 'image': {"type": "image", "description": "the image or video of interest"},
81
- 'prompt': {"type": "string", "description": "Any specific question you have on the image. For example, the prompt can be : Describe this image."}
82
  }
83
  output_type = "string"
84
-
85
  def forward(self, prompt, image):
86
-
87
- # default: Load the model on the available device(s)
88
  model = Qwen2VLForConditionalGeneration.from_pretrained(
89
- "Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
90
  )
91
-
92
- # We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
93
- # model = Qwen2VLForConditionalGeneration.from_pretrained(
94
- # "Qwen/Qwen2-VL-7B-Instruct",
95
- # torch_dtype=torch.bfloat16,
96
- # attn_implementation="flash_attention_2",
97
- # device_map="auto",
98
- # )
99
-
100
- # default processer
101
  processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
102
 
103
- # The default range for the number of visual tokens per image in the model is 4-16384. You can set min_pixels and max_pixels according to your needs, such as a token count range of 256-1280, to balance speed and memory usage.
104
- # min_pixels = 256*28*28
105
- # max_pixels = 1280*28*28
106
- # processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)
107
-
108
  messages = [
109
  {
110
  "role": "user",
111
  "content": [
112
- {
113
- "type": "image",
114
- "image": {image},
115
- },
116
- {"type": "text", "text": {prompt}},
117
  ],
118
  }
119
  ]
120
-
121
- # Preparation for inference
122
- text = processor.apply_chat_template(
123
- messages, tokenize=False, add_generation_prompt=True
124
- )
125
  image_inputs, video_inputs = process_vision_info(messages)
126
  inputs = processor(
127
  text=[text],
@@ -129,18 +107,18 @@ class multimodal_interpreter(Tool):
129
  videos=video_inputs,
130
  padding=True,
131
  return_tensors="pt",
132
- )
133
- inputs = inputs.to("cuda")
134
-
135
- # Inference: Generation of the output
136
  generated_ids = model.generate(**inputs, max_new_tokens=128)
137
  generated_ids_trimmed = [
138
- out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
139
  ]
140
  output_text = processor.batch_decode(
141
  generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
142
  )
143
- return output_text
 
 
144
 
145
  class audio_or_mp3__interpreter(Tool):
146
  name="audio_tool"
 
74
  return f"The translated sentence is : {translated_sentence}"
75
 
76
class multimodal_interpreter(Tool):
    """Tool that answers free-form questions about an image or video via Qwen2-VL.

    The underlying 7B checkpoint is loaded lazily and cached at class level,
    so repeated calls to ``forward`` reuse the same model and processor
    instead of re-downloading / re-materializing them on every invocation.
    """

    name = "multimodal_tool"
    description = "Allows you to answer any question which relies on image or video input."
    inputs = {
        'image': {"type": "image", "description": "The image or video of interest"},
        'prompt': {"type": "string", "description": "Any specific question you have on the image. For example: Describe this image."}
    }
    output_type = "string"

    # Shared, lazily-initialized backbone: loading a 7B model per call would
    # dominate both latency and memory.
    _CHECKPOINT = "Qwen/Qwen2-VL-7B-Instruct"
    _model = None
    _processor = None

    @classmethod
    def _load(cls):
        """Load the model/processor pair once and return it on every call."""
        if cls._model is None:
            cls._model = Qwen2VLForConditionalGeneration.from_pretrained(
                cls._CHECKPOINT, torch_dtype="auto", device_map="auto"
            )
            cls._processor = AutoProcessor.from_pretrained(cls._CHECKPOINT)
        return cls._model, cls._processor

    def forward(self, prompt, image):
        """Answer ``prompt`` about ``image`` and return the decoded answer string.

        Args:
            prompt: Natural-language question about the visual input.
            image: Image (or video) input accepted by Qwen2-VL's chat template.

        Returns:
            The model's first decoded answer as a plain string.
        """
        model, processor = self._load()

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        # Send inputs to wherever device_map="auto" actually placed the model,
        # rather than guessing "cuda"/"cpu" independently.
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        ).to(model.device)

        generated_ids = model.generate(**inputs, max_new_tokens=128)
        # Strip the echoed prompt tokens so only newly generated text remains.
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )
        return output_text[0]
122
 
123
  class audio_or_mp3__interpreter(Tool):
124
  name="audio_tool"