Files changed (1) hide show
  1. tools.py +86 -17
tools.py CHANGED
@@ -1,6 +1,12 @@
1
  from smolagents import DuckDuckGoSearchTool
2
  from smolagents import Tool
3
  from huggingface_hub import InferenceClient
 
 
 
 
 
 
4
 
5
  class Web_research(Tool):
6
  name="web_research"
@@ -61,36 +67,99 @@ class translate_everything(Tool):
61
  translated_sentence = " ".join(right_sentence[::-1])
62
  return f"The translated sentence is : {translated_sentence}"
63
 
64
- class image_interpreter(Tool):
65
  name="multimodal_tool"
66
- description = "Allows you to answer any question which relies on image input."
67
  inputs = {
68
- 'image': {"type": "image", "description": "the image of interest"},
69
  'prompt': {"type": "string", "description": "Any specific question you have on the image. For example, the prompt can be : Summarise this image in one sentence."}
70
  }
71
  output_type = "string"
72
 
73
  def forward(self, prompt, image):
74
- model_sdxl = "meta-llama/Llama-3.1-8B-Instruct"
75
- client = InferenceClient(model_sdxl)
76
- output = client.chat.completions.create(
77
- messages=[
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  {
79
  "role": "user",
80
  "content": [
81
- {
82
- "type": "image",
83
- "image": {image},
84
- },
85
- {
86
- "type": "text",
87
- "text": {prompt},
88
- },
89
  ],
90
  },
91
- ],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  )
93
- return output
 
 
 
 
94
 
95
  class Wikipedia_reader(Tool):
96
  name="wikipedia_tool"
 
1
  from smolagents import DuckDuckGoSearchTool
2
  from smolagents import Tool
3
  from huggingface_hub import InferenceClient
4
+ import soundfile as sf
5
+ from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor
6
+ from qwen_omni_utils import process_mm_info
7
+ import torch
8
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
9
+ from datasets import load_dataset
10
 
11
  class Web_research(Tool):
12
  name="web_research"
 
67
  translated_sentence = " ".join(right_sentence[::-1])
68
  return f"The translated sentence is : {translated_sentence}"
69
 
70
class multimodal_interpreter(Tool):
    """smolagents Tool that answers a free-form question about an image/video
    using the Qwen2.5-Omni-7B multimodal model.

    Returns the model's decoded text answer (list of strings from
    ``batch_decode``, matching the original behavior) and writes the model's
    generated speech to ``output.wav`` as a side effect.
    """

    name = "multimodal_tool"
    description = "Allows you to answer any question which relies on image or video input."
    inputs = {
        'image': {"type": "image", "description": "the image or video of interest"},
        'prompt': {"type": "string", "description": "Any specific question you have on the image. For example, the prompt can be : Summarise this image in one sentence."}
    }
    output_type = "string"

    def forward(self, prompt, image):
        # Load the model on the available device(s).
        # NOTE(review): reloading a 7B checkpoint on every call is very slow —
        # consider caching model/processor on the instance; kept here to
        # preserve the original behavior.
        model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
            "Qwen/Qwen2.5-Omni-7B", torch_dtype="auto", device_map="auto"
        )
        processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")

        conversation = [
            {
                "role": "system",
                "content": [
                    {"type": "text", "text": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."}
                ],
            },
            {
                "role": "user",
                "content": [
                    # BUG FIX: was {"image": {image}} — a *set literal* the
                    # processor cannot consume; pass the image object directly.
                    {"type": "image", "image": image},
                    # BUG FIX: the user's prompt was never put into the
                    # conversation, so the model ignored the actual question.
                    {"type": "text", "text": prompt},
                ],
            },
        ]

        # Keep audio tracks of video inputs when extracting multimodal info.
        USE_AUDIO_IN_VIDEO = True

        # Preparation for inference.
        text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
        audios, images, videos = process_mm_info(conversation, use_audio_in_video=USE_AUDIO_IN_VIDEO)
        inputs = processor(
            text=text, audio=audios, images=images, videos=videos,
            return_tensors="pt", padding=True,
            use_audio_in_video=USE_AUDIO_IN_VIDEO,
        )
        inputs = inputs.to(model.device).to(model.dtype)

        # Inference: generation of the output text ids and speech waveform.
        text_ids, audio = model.generate(**inputs, use_audio_in_video=USE_AUDIO_IN_VIDEO)

        text = processor.batch_decode(text_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
        # Side effect: persist the generated speech (Qwen Omni emits 24 kHz audio).
        sf.write(
            "output.wav",
            audio.reshape(-1).detach().cpu().numpy(),
            samplerate=24000,
        )

        return text
129
class audio_or_mp3__interpreter(Tool):
    """smolagents Tool that transcribes an audio input to text with
    openai/whisper-large-v3 via the transformers ASR pipeline.

    Returns the transcription string from the pipeline result's "text" key.
    """

    # BUG FIX: was "multimodal_tool", which collided with the image/video
    # tool's name — duplicate tool names make the agent unable to address
    # the two tools separately.
    name = "audio_tool"
    description = "Allows you to convert audio into text. It uses Whisper, it is a state-of-the-art model for automatic speech recognition (ASR) and speech translation"
    inputs = {
        'audio': {"type": "audio", "description": "the audio of interest"}
    }
    output_type = "string"

    # BUG FIX: dropped the undeclared `prompt` parameter. smolagents invokes
    # forward() with exactly the keys declared in `inputs` (only `audio`), so
    # the extra required positional argument raised TypeError on every call.
    def forward(self, audio):
        # Prefer GPU with fp16 when available; fall back to CPU fp32.
        device = "cuda:0" if torch.cuda.is_available() else "cpu"
        torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

        model_id = "openai/whisper-large-v3"

        # NOTE(review): reloading the checkpoint per call is slow — consider
        # caching on the instance; kept to preserve original behavior.
        model = AutoModelForSpeechSeq2Seq.from_pretrained(
            model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
        )
        model.to(device)

        processor = AutoProcessor.from_pretrained(model_id)

        pipe = pipeline(
            "automatic-speech-recognition",
            model=model,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            torch_dtype=torch_dtype,
            device=device,
        )

        # BUG FIX: was `sample = {audio}[0]["audio"]` — a set literal, which
        # is unordered and unindexable (TypeError at runtime). Pass the audio
        # input straight to the pipeline, which accepts paths/arrays/dicts.
        result = pipe(audio)
        return result["text"]
163
 
164
  class Wikipedia_reader(Tool):
165
  name="wikipedia_tool"