Spaces:

nvidia
/

audio-flamingo-2

Paused

audio-flamingo-2 / my_ms_clap /src /zero_shot_predictions.py

root

initial commit

a344f64 10 months ago

1.55 kB

	"""
	This is an example using CLAP for zero-shot inference.
	"""
	from CLAPWrapper import CLAPWrapper
	import torch.nn.functional as F

	# Define classes for zero-shot
	# Should be in lower case and can be more than one word
	classes = ['coughing','sneezing','drinking sipping', 'breathing', 'brushing teeth']
	ground_truth = ['coughing']
	# Add prompt
	prompt = 'this is a sound of '
	class_prompts = [prompt + x for x in classes]
	#Load audio files
	audio_files = ['audio_file']

	# Load and initialize CLAP
	weights_path = "weights_path"
	# Setting use_cuda = True will load the model on a GPU using CUDA
	clap_model = CLAPWrapper(weights_path, version = '2023', use_cuda=False)

	# compute text embeddings from natural text
	text_embeddings = clap_model.get_text_embeddings(class_prompts)

	# compute the audio embeddings from an audio file
	audio_embeddings = clap_model.get_audio_embeddings(audio_files, resample=True)

	# compute the similarity between audio_embeddings and text_embeddings
	similarity = clap_model.compute_similarity(audio_embeddings, text_embeddings)

	similarity = F.softmax(similarity, dim=1)
	values, indices = similarity[0].topk(5)

	# Print the results
	print("Ground Truth: {}".format(ground_truth))
	print("Top predictions:\n")
	for value, index in zip(values, indices):
	print(f"{classes[index]:>16s}: {100 * value.item():.2f}%")

	"""
	The output (the exact numbers may vary):

	Ground Truth: coughing
	Top predictions:

	coughing: 98.55%
	sneezing: 1.24%
	drinking sipping: 0.15%
	breathing: 0.02%
	brushing teeth: 0.01%
	"""