Spaces:
Sleeping
Sleeping
| """ | |
| This is an example how to load the model from Huggingface and use it to | |
| - Recognize IPA phones | |
| - Extract CNN features | |
| - Extract Transformer Encoder features | |
| """ | |
from decoder.ctc_decoder import decode_lattice
from phonetics.ipa import symbol_to_descriptor, to_symbol
from model.wav2vec2 import Wav2Vec2
from torchinfo import summary
import torch
import numpy as np
def main():
    """Run the Wav2Vec2_CommonPhone example end to end.

    Loads the pretrained model from the Huggingface hub, prints a model
    summary, feeds one second of (random) standardized audio through the
    network, CTC-decodes the output lattice into IPA phone indices, and
    prints the recognized IPA symbol sequence plus one descriptor example.
    """
    # Pick GPU when available, otherwise fall back to CPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Load model from Huggingface hub and switch to inference mode.
    wav2vec2 = Wav2Vec2.from_pretrained("pklumpp/Wav2Vec2_CommonPhone")
    wav2vec2.to(device)
    wav2vec2.eval()

    # Print model summary for batch_size 1 and a single second of audio samples
    # (the model expects 16 kHz audio, hence 16_000 samples per second).
    summary(wav2vec2, input_size=(1, 16_000), depth=8, device=device)

    # Create new random audio (you can load your own audio here to get actual predictions)
    rand_audio = np.random.rand(1, 16_000)

    # IMPORTANT: Always standardize input audio (zero mean, unit variance).
    # The epsilon guards against division by zero for constant signals.
    mean = rand_audio.mean()
    std = rand_audio.std()
    rand_audio = (rand_audio - mean) / (std + 1e-9)

    # Create torch tensor, move to device and feed the model.
    rand_audio = torch.tensor(
        rand_audio,
        dtype=torch.float,
        device=device,
    )
    with torch.no_grad():
        y_pred, enc_features, cnn_features = wav2vec2(rand_audio)

    # Decode CTC output for the first sample in the batch.
    phone_sequence, enc_feats, cnn_feats, probs = decode_lattice(
        lattice=y_pred[0].cpu().numpy(),
        enc_feats=enc_features[0].cpu().numpy(),
        cnn_feats=cnn_features[0].cpu().numpy(),
    )

    # phone_sequence contains indices right now. Convert to actual IPA symbols.
    # Fix: the example previously computed this sequence but never showed it.
    symbol_sequence = [to_symbol(i) for i in phone_sequence]
    print(symbol_sequence)

    # Example to convert [œ] to the descriptor "front open-mid rounded vowel"
    print(symbol_to_descriptor("œ"))


if __name__ == "__main__":
    main()