Final_Assignment_Template_V2

Sleeping

App Files Files Community

Final_Assignment_Template_V2 / tools.py

CindyDelage

Update tools.py

a4bb35a verified 9 months ago

raw

history blame

7.08 kB

	from smolagents import DuckDuckGoSearchTool
	from smolagents import Tool
	from huggingface_hub import InferenceClient
	import soundfile as sf
	from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor
	from qwen_omni_utils import process_mm_info
	import torch
	from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
	from datasets import load_dataset

	class Web_research(Tool):
	    """Agent tool that runs a DuckDuckGo search and returns the results as text."""

	    name = "web_research"
	    description = "Web search on a specific topic."
	    inputs = {
	        "topic": {
	            "type": "string",
	            "description": "The topic on which the user wants the latest news"
	        }
	    }
	    output_type = "string"

	    def forward(self, topic: str) -> str:
	        """Search the web for ``topic``.

	        Args:
	            topic: Free-text query handed to ``DuckDuckGoSearchTool``.

	        Returns:
	            A sentence that embeds the topic followed by whatever the search
	            tool returned.
	        """
	        search_tool = DuckDuckGoSearchTool()
	        results = search_tool(str(topic))
	        # The results are interpolated directly; the previous template wrapped
	        # them in the literal text "str(...)", which leaked into the answer.
	        return f"Here is what we can find on the web for {topic} : {results}"

	class Find_wikipedia_URL(Tool):
	    """Agent tool that builds the English Wikipedia URL for a subject."""

	    name = "wiki_url"
	    description = "Always use to check a wikipedia ENGLISH URL page before trying to acces the URL. For another langage, you just have to change the beginning of the url (here, it is en for english)"
	    inputs = {
	        "subject": {
	            "type": "string",
	            "description": "The name or topic on which you want the Wikipedia URL"
	        }
	    }
	    output_type = "string"

	    def forward(self, subject: str) -> str:
	        """Build the candidate Wikipedia URL for ``subject``.

	        The subject is split on whitespace and its words are joined with
	        underscores after the ``https://en.wikipedia.org/wiki/`` prefix. No
	        network request is made, so the URL is not checked for existence.

	        Args:
	            subject: Name or topic of the wanted page.

	        Returns:
	            A sentence containing the URL plus a hint on how to adjust the
	            capitalisation if the page cannot be found.
	        """
	        url_wiki = "https://en.wikipedia.org/wiki/" + "_".join(subject.split())
	        # The URL is interpolated directly; the previous template emitted the
	        # literal text "str(<url>)", which is not a usable link.
	        return f"Here is what we url to use : {url_wiki}. If it does not work, change the first letters of {subject} to be upper or lower, but never change anything else"

	class translate_everything(Tool):
	    """Agent tool that restores a sentence which was written backwards."""

	    name = "translator"
	    description = "You do not understand a sentence? It does not look like any language you know? Try this tool, maybe the sentence is just reversed!"
	    inputs = {
	        "sentence": {
	            "type": "string",
	            "description": "The sentence to translate"
	        }
	    }
	    output_type = "string"

	    def forward(self, sentence: str):
	        """Reverse the word order and the letters of every word in ``sentence``.

	        The sentence is split on whitespace, so runs of spaces collapse to a
	        single space in the result.

	        Returns:
	            The restored sentence, prefixed with a short label.
	        """
	        # Walk the words last-to-first and flip the characters of each one.
	        restored_words = [token[::-1] for token in reversed(sentence.split())]
	        translated_sentence = " ".join(restored_words)
	        return f"The translated sentence is : {translated_sentence}"

	class multimodal_interpreter(Tool):
	    """Agent tool that answers a question about an image with Qwen2.5-Omni."""

	    name = "multimodal_tool"
	    description = "Allows you to answer any question which relies on image or video input."
	    inputs = {
	        'image': {"type": "image", "description": "the image or video of interest"},
	        'prompt': {"type": "string", "description": "Any specific question you have on the image. For example, the prompt can be : Summarise this image in one sentence."}
	    }
	    output_type = "string"

	    def _load_model(self):
	        """Return the ``(model, processor)`` pair, loading it on first use only.

	        The pair is cached on the instance so that repeated tool calls do not
	        reload the checkpoint every time.
	        """
	        cached = getattr(self, "_model_and_processor", None)
	        if cached is None:
	            model_id = "Qwen/Qwen2.5-Omni-7B"
	            # NOTE: passing attn_implementation="flash_attention_2" here is the
	            # recommended way to get better acceleration and memory saving.
	            model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
	                model_id, torch_dtype="auto", device_map="auto"
	            )
	            processor = Qwen2_5OmniProcessor.from_pretrained(model_id)
	            cached = (model, processor)
	            self._model_and_processor = cached
	        return cached

	    def forward(self, prompt, image):
	        """Ask the model ``prompt`` about ``image``.

	        Args:
	            prompt: Question or instruction about the image.
	            image: The image to analyse; it is placed as-is in the conversation
	                handed to ``process_mm_info`` (accepted formats are defined by
	                that helper -- TODO confirm against the caller).

	        Returns:
	            The decoded model output as a single string (one line per decoded
	            sequence).

	        Side effects:
	            Writes the generated speech to ``output.wav`` in the current
	            working directory at 24 kHz.
	        """
	        model, processor = self._load_model()

	        conversation = [
	            {
	                "role": "system",
	                "content": [
	                    {"type": "text", "text": "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech."}
	                ],
	            },
	            {
	                "role": "user",
	                "content": [
	                    # The image object itself is passed; it used to be wrapped
	                    # in a set literal ({image}).
	                    {"type": "image", "image": image},
	                    # The question was previously never sent to the model.
	                    {"type": "text", "text": prompt},
	                ],
	            },
	        ]

	        # Same flag must be given to preprocessing and to generation.
	        use_audio_in_video = True

	        # Preparation for inference
	        text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
	        audios, images, videos = process_mm_info(conversation, use_audio_in_video=use_audio_in_video)
	        model_inputs = processor(
	            text=text,
	            audio=audios,
	            images=images,
	            videos=videos,
	            return_tensors="pt",
	            padding=True,
	            use_audio_in_video=use_audio_in_video,
	        )
	        model_inputs = model_inputs.to(model.device).to(model.dtype)

	        # Inference: generation of the output text and audio
	        text_ids, audio = model.generate(**model_inputs, use_audio_in_video=use_audio_in_video)

	        decoded = processor.batch_decode(text_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
	        sf.write(
	            "output.wav",
	            audio.reshape(-1).detach().cpu().numpy(),
	            samplerate=24000,
	        )

	        # batch_decode yields a list; the tool declares a string output.
	        return "\n".join(decoded)

	class audio_or_mp3__interpreter(Tool):
	    """Agent tool that transcribes audio to text with Whisper large-v3."""

	    # Distinct from multimodal_interpreter's "multimodal_tool": two tools
	    # registered under the same name would shadow one another.
	    name = "audio_tool"
	    description = "Allows you to convert audio into text. It uses Whisper, it is a state-of-the-art model for automatic speech recognition (ASR) and speech translation"
	    inputs = {
	        'audio': {"type": "audio", "description": "the audio of interest"}
	    }
	    output_type = "string"

	    def _get_pipeline(self):
	        """Return the speech-recognition pipeline, building it on first use only.

	        Uses ``cuda:0`` with float16 when CUDA is available, otherwise the CPU
	        with float32. The pipeline is cached on the instance so the model is
	        not reloaded for every transcription.
	        """
	        pipe = getattr(self, "_asr_pipeline", None)
	        if pipe is None:
	            use_cuda = torch.cuda.is_available()
	            device = "cuda:0" if use_cuda else "cpu"
	            torch_dtype = torch.float16 if use_cuda else torch.float32

	            model_id = "openai/whisper-large-v3"

	            model = AutoModelForSpeechSeq2Seq.from_pretrained(
	                model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
	            )
	            model.to(device)

	            processor = AutoProcessor.from_pretrained(model_id)

	            pipe = pipeline(
	                "automatic-speech-recognition",
	                model=model,
	                tokenizer=processor.tokenizer,
	                feature_extractor=processor.feature_extractor,
	                torch_dtype=torch_dtype,
	                device=device,
	            )
	            self._asr_pipeline = pipe
	        return pipe

	    def forward(self, audio):
	        """Transcribe ``audio`` and return the recognised text.

	        The signature lists exactly the keys declared in ``inputs``; the
	        former unused ``prompt`` parameter was not declared there, so a call
	        supplying only ``audio`` could not succeed.

	        Args:
	            audio: The audio to transcribe. A mapping with an ``"audio"`` key
	                is unwrapped, and a torch tensor is converted to a NumPy
	                array; anything else is handed to the pipeline unchanged.
	                NOTE(review): raw arrays are presumably expected at the
	                feature extractor's sampling rate -- confirm with the caller.

	        Returns:
	            The ``"text"`` field of the pipeline result.
	        """
	        sample = audio
	        # Previously `{audio}[0]["audio"]`, which subscripts a set and always
	        # raised TypeError.
	        if isinstance(sample, dict) and "audio" in sample:
	            sample = sample["audio"]
	        if isinstance(sample, torch.Tensor):
	            sample = sample.detach().cpu().numpy()

	        result = self._get_pipeline()(sample)
	        return result["text"]

	class Wikipedia_reader(Tool):
	    """Agent tool that downloads a web page and returns its visible text."""

	    name = "wikipedia_tool"
	    description = "To be used whenever you need to read a Wikipedia page. Will return all the text of the Wikipedia page, to easily read it and find information"
	    inputs = {
	        "url": {
	            "type": "string",
	            "description": "The wikippedia url page"
	        }
	    }
	    output_type = "string"

	    def forward(self, url: str) -> str:
	        """Fetch ``url`` and return the text content of the page.

	        Args:
	            url: Address of the page to read.

	        Returns:
	            The text extracted from the HTML, or a message starting with
	            ``"Error downloading page: "`` when the request fails or the
	            server answers with an HTTP error status.
	        """
	        # Imported here because the module never imports these names at the
	        # top of the file, which made this method fail with NameError.
	        import requests
	        from bs4 import BeautifulSoup

	        try:
	            # A timeout keeps the agent from hanging on an unresponsive host.
	            page = requests.get(url, timeout=30)
	            page.raise_for_status()
	        except requests.RequestException as e:
	            print('Error downloading page: ', e)
	            # Report the failure to the caller; the old code went on to use
	            # the unbound `page` variable and crashed.
	            return f"Error downloading page: {e}"

	        soup = BeautifulSoup(page.text, 'html.parser')
	        return soup.text