File size: 7,075 Bytes
0dbf350
 
 
a4bb35a
 
 
 
 
 
0dbf350
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63b8e47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
db9f047
bdba95b
db9f047
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a4bb35a
0dbf350
a4bb35a
0dbf350
a4bb35a
0dbf350
 
 
 
 
a4bb35a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0dbf350
 
 
a4bb35a
0dbf350
 
a4bb35a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0dbf350
a4bb35a
 
 
 
 
b596b30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
from smolagents import DuckDuckGoSearchTool
from smolagents import Tool
from huggingface_hub import InferenceClient
import soundfile as sf
from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor
from qwen_omni_utils import process_mm_info
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from datasets import load_dataset

class Web_research(Tool):
    """smolagents Tool: run a DuckDuckGo web search on a given topic."""

    name = "web_research"
    description = "Web search on a specific topic."
    inputs = {
        "topic": {
            "type": "string",
            "description": "The topic on which the user wants the latest news"
        }
    }
    output_type = "string"

    def forward(self, topic: str) -> str:
        """Search the web for *topic* and return the results as text.

        Args:
            topic: Free-text query to search for.

        Returns:
            A sentence embedding the raw search results.
        """
        search_tool = DuckDuckGoSearchTool()
        results = search_tool(topic)
        # BUG FIX: the original f-string was `str({results})`, which emitted the
        # literal characters "str(" and ")" around the results instead of just
        # interpolating them.
        return f"Here is what we can find on the web for {topic} : {results}"

class Find_wikipedia_URL(Tool):
    """smolagents Tool: build the English Wikipedia URL for a subject."""

    name = "wiki_url"
    description = "Always use to check a wikipedia ENGLISH URL page before trying to acces the URL. For another langage, you just have to change the beginning of the url (here, it is en for english)"
    inputs = {
        "subject": {
            "type": "string",
            "description": "The name or topic on which you want the Wikipedia URL"
        }
    }
    output_type = "string"

    def forward(self, subject: str) -> str:
        """Return the candidate English Wikipedia URL for *subject*.

        Args:
            subject: Article name, possibly several words.

        Returns:
            A sentence containing the URL plus retry advice for the agent.
        """
        # Wikipedia article titles join words with underscores; the original
        # index loop is equivalent to a simple join.
        url_wiki = "https://en.wikipedia.org/wiki/" + "_".join(subject.split())
        # BUG FIX: the original f-string was `str({url_wiki})`, which emitted
        # the literal text "str(...)" around the URL.
        return f"Here is the url to use : {url_wiki}. If it does not work, change the first letters of {subject} to be upper or lower, but never change anything else"

class translate_everything(Tool):
    """smolagents Tool: decode a sentence written entirely backwards."""

    name = "translator"
    description = "You do not understand a sentence? It does not look like any language you know? Try this tool, maybe the sentence is just reversed!"
    inputs = {
        "sentence": {
            "type": "string",
            "description": "The sentence to translate"
        }
    }
    output_type = "string"

    def forward(self, sentence: str):
        """Reverse both the word order and the letters of each word.

        Equivalent to reading the whole sentence back-to-front while
        keeping single spaces between words.
        """
        decoded = " ".join(word[::-1] for word in reversed(sentence.split()))
        return f"The translated sentence is : {decoded}"

class multimodal_interpreter(Tool):
    """smolagents Tool: answer a question about an image/video via Qwen2.5-Omni."""

    name = "multimodal_tool"
    description = "Allows you to answer any question which relies on image or video input."
    inputs = {
        'image': {"type": "image", "description": "the image or video of interest"},
        'prompt': {"type": "string", "description": "Any specific question you have on the image. For example, the prompt can be : Summarise this image in one sentence."}
    }
    output_type = "string"

    def forward(self, prompt, image):
        """Run Qwen2.5-Omni on *image* with *prompt* and return the decoded text.

        Also writes the generated speech to ``output.wav`` as a side effect.

        Args:
            prompt: The question to ask about the image.
            image: The image (or video) input, in a form accepted by
                ``process_mm_info`` — assumed to be a path/URL/PIL image;
                TODO confirm against callers.

        Returns:
            The batch-decoded generated text (a list of strings).
        """
        # NOTE: loads the 7B model on every call; acceptable for a demo tool,
        # but a cache would avoid repeated downloads/allocations.
        model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
            "Qwen/Qwen2.5-Omni-7B", torch_dtype="auto", device_map="auto"
        )

        # flash_attention_2 can be enabled here for better speed/memory if the
        # environment supports it (see Qwen2.5-Omni model card).

        processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")

        conversation = [
            {
                "role": "system",
                "content": [
                    {"type": "text", "text": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."}
                ],
            },
            {
                "role": "user",
                "content": [
                    # BUG FIX: the original wrote {image}, a one-element *set*,
                    # which the processor cannot consume. Pass the image itself.
                    {"type": "image", "image": image},
                    # BUG FIX: the declared `prompt` input was silently ignored;
                    # include it so the model actually answers the question.
                    {"type": "text", "text": prompt},
                ],
            },
        ]

        # Whether to use the audio track of video inputs.
        USE_AUDIO_IN_VIDEO = True

        # Preparation for inference.
        text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
        audios, images, videos = process_mm_info(conversation, use_audio_in_video=USE_AUDIO_IN_VIDEO)
        inputs = processor(text=text, audio=audios, images=images, videos=videos, return_tensors="pt", padding=True, use_audio_in_video=USE_AUDIO_IN_VIDEO)
        inputs = inputs.to(model.device).to(model.dtype)

        # Generation of the output text and audio.
        text_ids, audio = model.generate(**inputs, use_audio_in_video=USE_AUDIO_IN_VIDEO)

        text = processor.batch_decode(text_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
        # Persist the generated speech; 24 kHz is Qwen2.5-Omni's output rate.
        sf.write(
            "output.wav",
            audio.reshape(-1).detach().cpu().numpy(),
            samplerate=24000,
        )

        return text

class audio_or_mp3__interpreter(Tool):
    """smolagents Tool: transcribe audio to text with Whisper large-v3."""

    # BUG FIX: the original reused the name "multimodal_tool", colliding with
    # multimodal_interpreter — two tools registered under one name would shadow
    # each other in the agent's tool registry.
    name = "audio_tool"
    description = "Allows you to convert audio into text. It uses Whisper, it is a state-of-the-art model for automatic speech recognition (ASR) and speech translation"
    inputs = {
        'audio': {"type": "audio", "description": "the audio of interest"}
    }
    output_type = "string"

    def forward(self, audio):
        """Transcribe *audio* and return the recognized text.

        Args:
            audio: The audio input in a form the ASR pipeline accepts
                (path, array dict, etc.) — TODO confirm against callers.

        Returns:
            The transcription string.
        """
        # BUG FIX: the original signature was forward(self, prompt, audio), but
        # `prompt` is not a declared input, so the Tool framework's call with
        # only `audio` would raise TypeError. Dropped the unused parameter.
        device = "cuda:0" if torch.cuda.is_available() else "cpu"
        torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

        model_id = "openai/whisper-large-v3"

        model = AutoModelForSpeechSeq2Seq.from_pretrained(
            model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
        )
        model.to(device)

        processor = AutoProcessor.from_pretrained(model_id)

        pipe = pipeline(
            "automatic-speech-recognition",
            model=model,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            torch_dtype=torch_dtype,
            device=device,
        )

        # BUG FIX: the original did `{audio}[0]["audio"]`, indexing a *set*
        # literal (a TypeError) — leftover from a datasets example. The tool's
        # input is already the audio sample; pass it straight to the pipeline.
        result = pipe(audio)
        return result["text"]

class Wikipedia_reader(Tool):
    """smolagents Tool: fetch a Wikipedia page and return its plain text."""

    name = "wikipedia_tool"
    description = "To be used whenever you need to read a Wikipedia page. Will return all the text of the Wikipedia page, to easily read it and find information"
    inputs = {
        "url": {
            "type": "string",
            # BUG FIX: "wikippedia" typo in the agent-facing description.
            "description": "The wikipedia url page"
        }
    }
    output_type = "string"

    def forward(self, url: str) -> str:
        """Download *url* and return all visible text extracted from the HTML.

        Args:
            url: The Wikipedia page URL.

        Returns:
            The page's text content, or an error message if the download fails.
        """
        # BUG FIX: requests and BeautifulSoup were used but never imported
        # anywhere visible in this file; import them here so the tool is
        # self-contained.
        import requests
        from bs4 import BeautifulSoup

        try:
            page = requests.get(url, timeout=30)
            page.raise_for_status()
        except Exception as e:
            # BUG FIX: the original only printed the error, then fell through
            # and raised NameError on the undefined `page`. Report the failure
            # to the agent instead.
            return f"Error downloading page {url}: {e}"
        soup = BeautifulSoup(page.text, 'html.parser')
        return soup.text