"""Custom smolagents tools.

Provides tools for web search, Wikipedia URL building and page reading,
un-reversing reversed text, image question answering and audio transcription.
"""

import requests
import soundfile as sf
import torch
from bs4 import BeautifulSoup
from datasets import load_dataset
from huggingface_hub import InferenceClient
from qwen_omni_utils import process_mm_info
from smolagents import DuckDuckGoSearchTool, Tool
from transformers import (
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    Qwen2_5OmniForConditionalGeneration,
    Qwen2_5OmniProcessor,
    pipeline,
)


class Web_research(Tool):
    """Run a DuckDuckGo search on a topic and return the results as text."""

    name = "web_research"
    description = "Web search on a specific topic."
    inputs = {
        "topic": {
            "type": "string",
            "description": "The topic on which the user wants the latest news",
        }
    }
    output_type = "string"

    def forward(self, topic: str) -> str:
        """Search the web for *topic*.

        Args:
            topic: Free-text search query.

        Returns:
            A sentence embedding whatever the search tool returned.
        """
        search_tool = DuckDuckGoSearchTool()
        results = search_tool(topic)
        # The original f-string contained the literal text "str(...)" around
        # the placeholder; the placeholder alone is what was intended.
        return f"Here is what we can find on the web for {topic} : {results}"


class Find_wikipedia_URL(Tool):
    """Build the English Wikipedia URL for a subject (words joined by '_')."""

    name = "wiki_url"
    description = "Always use to check a wikipedia ENGLISH URL page before trying to acces the URL. For another langage, you just have to change the beginning of the url (here, it is en for english)"
    inputs = {
        "subject": {
            "type": "string",
            "description": "The name or topic on which you want the Wikipedia URL",
        }
    }
    output_type = "string"

    def forward(self, subject: str) -> str:
        """Return a message containing the candidate Wikipedia URL.

        Args:
            subject: Page title; whitespace-separated words are joined with
                underscores. Capitalisation is kept as given.

        Returns:
            A sentence containing the URL and a hint on what to try if the
            URL does not resolve.
        """
        url_wiki = "https://en.wikipedia.org/wiki/" + "_".join(subject.split())
        return f"Here is what we url to use : {url_wiki}. If it does not work, change the first letters of {subject} to be upper or lower, but never change anything else"


class translate_everything(Tool):
    """Un-reverse a sentence that was written backwards."""

    name = "translator"
    description = "You do not understand a sentence? It does not look like any language you know? Try this tool, maybe the sentence is just reversed!"
    inputs = {
        "sentence": {
            "type": "string",
            "description": "The sentence to translate",
        }
    }
    output_type = "string"

    def forward(self, sentence: str) -> str:
        """Reverse the characters of every word and the order of the words.

        Args:
            sentence: The (possibly reversed) sentence. Runs of whitespace
                are collapsed to single spaces in the output.

        Returns:
            A sentence containing the un-reversed text.
        """
        translated_sentence = " ".join(
            word[::-1] for word in reversed(sentence.split())
        )
        return f"The translated sentence is : {translated_sentence}"


class multimodal_interpreter(Tool):
    """Answer a question about an image with the Qwen2.5-Omni model."""

    name = "multimodal_tool"
    description = "Allows you to answer any question which relies on image or video input."
    inputs = {
        "image": {"type": "image", "description": "the image or video of interest"},
        "prompt": {"type": "string", "description": "Any specific question you have on the image. For example, the prompt can be : Summarise this image in one sentence."},
    }
    output_type = "string"

    def _load_model(self):
        """Load the Qwen2.5-Omni model and processor once and cache them."""
        if getattr(self, "_model", None) is None:
            model_id = "Qwen/Qwen2.5-Omni-7B"
            # Passing attn_implementation="flash_attention_2" here is the
            # upstream-recommended way to speed up and save memory, when
            # flash-attn is installed.
            self._model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
                model_id, torch_dtype="auto", device_map="auto"
            )
            self._processor = Qwen2_5OmniProcessor.from_pretrained(model_id)
        return self._model, self._processor

    def forward(self, prompt, image):
        """Ask the model *prompt* about *image*.

        Args:
            prompt: The question to answer about the image.
            image: The image to analyse. NOTE(review): the tool description
                mentions video, but the content is always sent to the model
                with type "image" - confirm whether video is expected to work.

        Returns:
            The decoded model output as a single string (one line per
            generated sequence).

        Side effects:
            Writes the generated speech to "output.wav" in the current
            working directory.
        """
        model, processor = self._load_model()

        conversation = [
            {
                "role": "system",
                "content": [
                    {"type": "text", "text": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."}
                ],
            },
            {
                "role": "user",
                "content": [
                    # Pass the image itself (the original wrapped it in a set
                    # literal) and include the question, which was previously
                    # never sent to the model.
                    {"type": "image", "image": image},
                    {"type": "text", "text": prompt},
                ],
            },
        ]

        use_audio_in_video = True

        # Preparation for inference
        text = processor.apply_chat_template(
            conversation, add_generation_prompt=True, tokenize=False
        )
        audios, images, videos = process_mm_info(
            conversation, use_audio_in_video=use_audio_in_video
        )
        model_inputs = processor(
            text=text,
            audio=audios,
            images=images,
            videos=videos,
            return_tensors="pt",
            padding=True,
            use_audio_in_video=use_audio_in_video,
        )
        model_inputs = model_inputs.to(model.device).to(model.dtype)

        # Inference: generation of the output text and audio
        text_ids, audio = model.generate(
            **model_inputs, use_audio_in_video=use_audio_in_video
        )
        decoded = processor.batch_decode(
            text_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )

        sf.write(
            "output.wav",
            audio.reshape(-1).detach().cpu().numpy(),
            samplerate=24000,
        )

        # batch_decode returns a list; the tool declares a string output.
        return "\n".join(decoded)


class audio_or_mp3__interpreter(Tool):
    """Transcribe audio to text with the Whisper large-v3 model."""

    # Must differ from multimodal_interpreter.name, otherwise the two tools
    # collide when both are registered by name.
    name = "audio_tool"
    description = "Allows you to convert audio into text. It uses Whisper, it is a state-of-the-art model for automatic speech recognition (ASR) and speech translation"
    inputs = {
        "audio": {"type": "audio", "description": "the audio of interest"},
        # Declared so that the inputs schema matches the forward() signature.
        "prompt": {"type": "string", "description": "Free-text context for the request; the transcription itself does not use it."},
    }
    output_type = "string"

    def _load_pipeline(self):
        """Build the Whisper ASR pipeline once and cache it on the instance."""
        if getattr(self, "_pipe", None) is None:
            has_cuda = torch.cuda.is_available()
            device = "cuda:0" if has_cuda else "cpu"
            torch_dtype = torch.float16 if has_cuda else torch.float32
            model_id = "openai/whisper-large-v3"

            model = AutoModelForSpeechSeq2Seq.from_pretrained(
                model_id,
                torch_dtype=torch_dtype,
                low_cpu_mem_usage=True,
                use_safetensors=True,
            )
            model.to(device)
            processor = AutoProcessor.from_pretrained(model_id)

            self._pipe = pipeline(
                "automatic-speech-recognition",
                model=model,
                tokenizer=processor.tokenizer,
                feature_extractor=processor.feature_extractor,
                torch_dtype=torch_dtype,
                device=device,
            )
        return self._pipe

    def forward(self, prompt, audio):
        """Transcribe *audio*.

        Args:
            prompt: Unused; kept for signature compatibility.
            audio: The audio to transcribe. It is handed to the ASR pipeline
                unchanged. NOTE(review): assumes the value is in a form the
                pipeline accepts (file path, raw bytes, numpy array or a
                dict with the samples and sampling rate) - confirm against
                what the agent framework actually passes.

        Returns:
            The transcribed text.
        """
        pipe = self._load_pipeline()
        # The original indexed a set literal ({audio}[0]["audio"]), which
        # always raised TypeError; pass the audio directly instead.
        result = pipe(audio)
        return result["text"]


class Wikipedia_reader(Tool):
    """Download a Wikipedia page and return its text content."""

    name = "wikipedia_tool"
    description = "To be used whenever you need to read a Wikipedia page. Will return all the text of the Wikipedia page, to easily read it and find information"
    inputs = {
        "url": {
            "type": "string",
            "description": "The wikippedia url page",
        }
    }
    output_type = "string"

    def forward(self, url: str) -> str:
        """Fetch *url* and return the page text with HTML markup stripped.

        Args:
            url: Address of the page to read.

        Returns:
            The text of the page, or a message starting with
            "Error downloading page:" when the request fails or the server
            answers with an HTTP error status.
        """
        try:
            page = requests.get(url, timeout=30)
            page.raise_for_status()
        except requests.RequestException as e:
            # Report the failure to the caller instead of falling through
            # to an unbound `page` variable.
            return f"Error downloading page: {e}"

        soup = BeautifulSoup(page.text, "html.parser")
        return soup.text