CindyDelage committed on
Commit
e1d7362
·
verified ·
1 Parent(s): a9a996b

Update tools.py

Browse files
Files changed (1) hide show
  1. tools.py +16 -38
tools.py CHANGED
@@ -74,54 +74,32 @@ class translate_everything(Tool):
74
  return f"The translated sentence is : {translated_sentence}"
75
 
76
  class multimodal_interpreter(Tool):
77
- name="multimodal_tool"
78
  description = "Allows you to answer any question which relies on image or video input."
79
  inputs = {
80
- 'image': {"type": "image", "description": "the image or video of interest"},
81
- 'prompt': {"type": "string", "description": "Any specific question you have on the image. For example, the prompt can be : Describe this image."}
82
  }
83
  output_type = "string"
84
-
85
  def forward(self, prompt, image):
86
-
87
- # default: Load the model on the available device(s)
88
  model = Qwen2VLForConditionalGeneration.from_pretrained(
89
- "Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
90
  )
91
-
92
- # We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
93
- # model = Qwen2VLForConditionalGeneration.from_pretrained(
94
- # "Qwen/Qwen2-VL-7B-Instruct",
95
- # torch_dtype=torch.bfloat16,
96
- # attn_implementation="flash_attention_2",
97
- # device_map="auto",
98
- # )
99
-
100
- # default processer
101
  processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
102
 
103
- # The default range for the number of visual tokens per image in the model is 4-16384. You can set min_pixels and max_pixels according to your needs, such as a token count range of 256-1280, to balance speed and memory usage.
104
- # min_pixels = 256*28*28
105
- # max_pixels = 1280*28*28
106
- # processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)
107
-
108
  messages = [
109
  {
110
  "role": "user",
111
  "content": [
112
- {
113
- "type": "image",
114
- "image": {image},
115
- },
116
- {"type": "text", "text": {prompt}},
117
  ],
118
  }
119
  ]
120
-
121
- # Preparation for inference
122
- text = processor.apply_chat_template(
123
- messages, tokenize=False, add_generation_prompt=True
124
- )
125
  image_inputs, video_inputs = process_vision_info(messages)
126
  inputs = processor(
127
  text=[text],
@@ -129,18 +107,18 @@ class multimodal_interpreter(Tool):
129
  videos=video_inputs,
130
  padding=True,
131
  return_tensors="pt",
132
- )
133
- inputs = inputs.to("cuda")
134
-
135
- # Inference: Generation of the output
136
  generated_ids = model.generate(**inputs, max_new_tokens=128)
137
  generated_ids_trimmed = [
138
- out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
139
  ]
140
  output_text = processor.batch_decode(
141
  generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
142
  )
143
- return output_text
 
 
144
 
145
  class audio_or_mp3__interpreter(Tool):
146
  name="audio_tool"
 
74
  return f"The translated sentence is : {translated_sentence}"
75
 
76
class multimodal_interpreter(Tool):
    """Tool that answers free-form questions about an image or video via Qwen2-VL.

    The underlying 7B checkpoint is loaded lazily and cached at class level,
    so repeated calls to ``forward`` reuse the same model and processor
    instead of re-downloading / re-materializing them on every invocation.
    """

    name = "multimodal_tool"
    description = "Allows you to answer any question which relies on image or video input."
    inputs = {
        'image': {"type": "image", "description": "The image or video of interest"},
        'prompt': {"type": "string", "description": "Any specific question you have on the image. For example: Describe this image."}
    }
    output_type = "string"

    # Shared, lazily-initialized backbone: loading a 7B model per call would
    # dominate both latency and memory.
    _CHECKPOINT = "Qwen/Qwen2-VL-7B-Instruct"
    _model = None
    _processor = None

    @classmethod
    def _load(cls):
        """Load the model/processor pair once and return it on every call."""
        if cls._model is None:
            cls._model = Qwen2VLForConditionalGeneration.from_pretrained(
                cls._CHECKPOINT, torch_dtype="auto", device_map="auto"
            )
            cls._processor = AutoProcessor.from_pretrained(cls._CHECKPOINT)
        return cls._model, cls._processor

    def forward(self, prompt, image):
        """Answer ``prompt`` about ``image`` and return the decoded answer string.

        Args:
            prompt: Natural-language question about the visual input.
            image: Image (or video) input accepted by Qwen2-VL's chat template.

        Returns:
            The model's first decoded answer as a plain string.
        """
        model, processor = self._load()

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        # Send inputs to wherever device_map="auto" actually placed the model,
        # rather than guessing "cuda"/"cpu" independently.
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        ).to(model.device)

        generated_ids = model.generate(**inputs, max_new_tokens=128)
        # Strip the echoed prompt tokens so only newly generated text remains.
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )
        return output_text[0]
122
 
123
  class audio_or_mp3__interpreter(Tool):
124
  name="audio_tool"