| from smolagents import DuckDuckGoSearchTool |
| from smolagents import Tool |
| from huggingface_hub import InferenceClient |
| import soundfile as sf |
| import torch |
| from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline |
| from datasets import load_dataset |
| from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor |
| from qwen_vl_utils import process_vision_info |
|
|
class Web_research(Tool):
    """Tool that runs a DuckDuckGo search and returns the results as text."""

    name = "web_research"
    description = "Web search on a specific topic."
    inputs = {
        "topic": {
            "type": "string",
            "description": "The topic on which the user wants the latest news"
        }
    }
    output_type = "string"

    def forward(self, topic: str):
        """Search the web for ``topic``.

        Args:
            topic: The search query, passed unchanged to DuckDuckGoSearchTool.

        Returns:
            A sentence that embeds the search tool's results.
        """
        search_tool = DuckDuckGoSearchTool()
        results = search_tool(topic)
        # Interpolate the results directly: the previous "str({results})"
        # emitted the literal characters "str(" and ")" around them.
        return f"Here is what we can find on the web for {topic} : {results}"
|
|
class Find_wikipedia_URL(Tool):
    """Tool that builds the English Wikipedia URL for a subject."""

    name = "wiki_url"
    description = "Always use to check a wikipedia ENGLISH URL page before trying to acces the URL. For another langage, you just have to change the beginning of the url (here, it is en for english)"
    inputs = {
        "subject": {
            "type": "string",
            "description": "The name or topic on which you want the Wikipedia URL"
        }
    }
    output_type = "string"

    def forward(self, subject: str):
        """Build the candidate Wikipedia URL for ``subject``.

        Args:
            subject: Free-text page title; its whitespace-separated words are
                joined with underscores.

        Returns:
            A sentence containing the URL and a hint on how to adjust it.
            No request is made, so the URL is not checked for existence.
        """
        # split() without arguments collapses any run of whitespace, so the
        # joined title never contains empty segments.
        url_wiki = "https://en.wikipedia.org/wiki/" + "_".join(subject.split())
        # The URL is interpolated bare: the previous "str({url_wiki})" wrapped
        # it in the literal characters "str(" and ")".
        return f"Here is what we url to use : {url_wiki}. If it does not work, change the first letters of {subject} to be upper or lower, but never change anything else"
|
|
class translate_everything(Tool):
    """Tool that un-reverses a sentence that was written backwards."""

    name = "translator"
    description = "You do not understand a sentence? It does not look like any language you know? Try this tool, maybe the sentence is just reversed!"
    inputs = {
        "sentence": {
            "type": "string",
            "description": "The sentence to translate"
        }
    }
    output_type = "string"

    def forward(self, sentence: str):
        """Reverse the word order and the letters of every word.

        Args:
            sentence: The text to un-reverse; it is split on whitespace.

        Returns:
            A sentence embedding the restored text, words joined by one space.
        """
        # Walk the words back to front and flip the letters of each one.
        flipped = [token[::-1] for token in reversed(sentence.split())]
        return "The translated sentence is : " + " ".join(flipped)
|
|
class multimodal_interpreter(Tool):
    """Tool that answers a question about an image with Qwen2-VL-7B-Instruct."""

    name = "multimodal_tool"
    description = "Allows you to answer any question which relies on image or video input."
    inputs = {
        'image': {"type": "image", "description": "the image or video of interest"},
        'prompt': {"type": "string", "description": "Any specific question you have on the image. For example, the prompt can be : Describe this image."}
    }
    output_type = "string"

    def _load_model(self):
        """Return the (model, processor) pair, loading it on first use only."""
        if getattr(self, "_model", None) is None:
            model_id = "Qwen/Qwen2-VL-7B-Instruct"
            # Loading the checkpoint is expensive, so keep it on the
            # instance instead of reloading it on every forward() call.
            self._model = Qwen2VLForConditionalGeneration.from_pretrained(
                model_id, torch_dtype="auto", device_map="auto"
            )
            self._processor = AutoProcessor.from_pretrained(model_id)
        return self._model, self._processor

    def forward(self, prompt, image):
        """Ask the vision-language model ``prompt`` about ``image``.

        Args:
            prompt: The question or instruction about the image.
            image: The image to inspect, in any form accepted by
                ``process_vision_info`` (presumably a PIL image, path or URL
                -- confirm against the caller).

        Returns:
            The generated answer as a single string (at most 128 new tokens).
        """
        model, processor = self._load_model()

        # Pass the values themselves: "{image}" / "{prompt}" were set
        # literals, which wrapped each value in a one-element set.
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": prompt},
                ],
            }
        ]

        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        model_inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        # Follow the device the model was placed on rather than assuming
        # CUDA, since device_map="auto" may choose another device.
        model_inputs = model_inputs.to(model.device)

        generated_ids = model.generate(**model_inputs, max_new_tokens=128)
        # Drop the prompt tokens so only the newly generated answer is decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids):]
            for in_ids, out_ids in zip(model_inputs.input_ids, generated_ids)
        ]
        output_text = processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )
        # The batch holds exactly one conversation; return its text so the
        # result matches the declared "string" output type.
        return output_text[0]
|
|
class audio_or_mp3__interpreter(Tool):
    """Tool that transcribes audio to text with openai/whisper-large-v3."""

    # Distinct from the image tool's "multimodal_tool" so the two tools do
    # not share one name.
    name = "audio_transcriber"
    description = "Allows you to convert audio into text. It uses Whisper, it is a state-of-the-art model for automatic speech recognition (ASR) and speech translation"
    inputs = {
        'audio': {"type": "audio", "description": "the audio of interest"}
    }
    output_type = "string"

    def _load_pipeline(self):
        """Return the speech-recognition pipeline, building it on first use."""
        if getattr(self, "_pipe", None) is None:
            device = "cuda:0" if torch.cuda.is_available() else "cpu"
            # Half precision only on GPU; CPU inference stays in float32.
            torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

            model_id = "openai/whisper-large-v3"
            model = AutoModelForSpeechSeq2Seq.from_pretrained(
                model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
            )
            model.to(device)
            processor = AutoProcessor.from_pretrained(model_id)

            # Cached on the instance so the checkpoint is loaded only once.
            self._pipe = pipeline(
                "automatic-speech-recognition",
                model=model,
                tokenizer=processor.tokenizer,
                feature_extractor=processor.feature_extractor,
                torch_dtype=torch_dtype,
                device=device,
            )
        return self._pipe

    def forward(self, audio):
        """Transcribe ``audio`` to text.

        The unused ``prompt`` parameter was removed: it is not declared in
        ``inputs``, so a call built from ``inputs`` could never supply it.

        Args:
            audio: The audio to transcribe. A torch tensor is converted to a
                NumPy array; any other value is passed to the pipeline as is
                (presumably a path, bytes or array -- confirm against the
                caller).

        Returns:
            The transcription text.
        """
        pipe = self._load_pipeline()

        if isinstance(audio, torch.Tensor):
            audio = audio.detach().cpu().numpy()

        # Pass the audio itself: "{audio}" was a set literal.
        result = pipe(audio)
        return result["text"]
|
|
class Wikipedia_reader(Tool):
    """Tool that downloads a web page and returns its text content."""

    name = "wikipedia_tool"
    description = "To be used whenever you need to read a Wikipedia page. Will return all the text of the Wikipedia page, to easily read it and find information"
    inputs = {
        "url": {
            "type": "string",
            "description": "The wikippedia url page"
        }
    }
    output_type = "string"

    def forward(self, url: str):
        """Fetch ``url`` and return the text extracted from its HTML.

        Args:
            url: Address of the page to download.

        Returns:
            The page text as produced by BeautifulSoup's ``html.parser``, or
            a message starting with "Error downloading page:" when the
            request fails or the server answers with an HTTP error status.
        """
        # Imported locally because the module's import block does not
        # provide these two names.
        import requests
        from bs4 import BeautifulSoup

        try:
            # The timeout keeps an unresponsive server from blocking forever.
            page = requests.get(url, timeout=30)
            page.raise_for_status()
        except requests.RequestException as e:
            print('Error downloading page: ', e)
            # Return the failure as text; continuing would read the unbound
            # ``page`` variable.
            return f"Error downloading page: {e}"

        soup = BeautifulSoup(page.text, 'html.parser')
        return soup.text