Spaces:
Runtime error
Runtime error
| import torch | |
| from TTS.api import TTS | |
| #Andy edited: import losses | |
| import audio_diffusion_attacks_forhf.src.losses | |
| from audiotools import AudioSignal | |
| import numpy as np | |
| import torchaudio | |
| import random | |
| import string | |
| import os | |
class XTTS_Eval:
    """Measure how an audio protection perturbation affects XTTS-v2 voice cloning.

    The original clip and its protected counterpart are each used as the
    speaker reference for the same fixed sentence.  The two generated clips
    are then compared with each other and with their references using a
    mean absolute (L1) waveform difference and a multi-scale mel-spectrogram
    loss.
    """

    # Source rate handed to the resampler for the synthesized waveform.
    # NOTE(review): presumed to be the XTTS-v2 output rate -- confirm against
    # the TTS model configuration.
    XTTS_SAMPLE_RATE = 24000

    # Directory (relative to the working directory) that holds the temporary
    # speaker reference files written by generate_audio.
    SPEAKER_WAV_DIR = "test_audio"

    def __init__(self, input_sample_rate, text="The quick brown fox jumps over the lazy dog."):
        """Load XTTS-v2 onto the GPU and build the mel-spectrogram loss.

        Args:
            input_sample_rate: Sample rate of the clips passed to ``eval``;
                generated audio is resampled to this rate as well.
            text: Sentence synthesized for every speaker reference.
        """
        # The file-level ``import audio_diffusion_attacks_forhf.src.losses``
        # only binds the top-level package name, so the bare name ``losses``
        # is not defined there.  Bind it locally to avoid a NameError.
        from audio_diffusion_attacks_forhf.src import losses

        self.model = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
        self.model = self.model.to(device='cuda')
        self.text = text
        self.input_sample_rate = input_sample_rate
        self.mel_loss = losses.MelSpectrogramLoss(
            n_mels=[5, 10, 20, 40, 80, 160, 320],
            window_lengths=[32, 64, 128, 256, 512, 1024, 2048],
            mel_fmin=[0, 0, 0, 0, 0, 0, 0],
            pow=1.0,
            clamp_eps=1.0e-5,
            mag_weight=0.0,
        )

    @staticmethod
    def _trim_to_common_length(first, second):
        """Cut two (channels, samples) tensors to the shorter sample count."""
        common_len = min(first.shape[1], second.shape[1])
        return first[:, :common_len], second[:, :common_len]

    def eval(self, original_audio, protected_audio):
        """Compare voice clones made from the original and the protected clip.

        Args:
            original_audio: Batched audio; only element ``[0]`` is used and it
                is indexed as (channels, samples).
                NOTE(review): it is subtracted from tensors moved to 'cuda',
                so it presumably must already live on the GPU -- confirm
                against the caller.
            protected_audio: Protected version of ``original_audio`` in the
                same layout.

        Returns:
            A tuple ``(eval_dict, unprotected_gen, protected_gen)``.
            ``eval_dict`` maps ``"<pair>_l1"`` and ``"<pair>_mel"`` to the
            two losses for each compared pair.  The two generated clips are
            returned trimmed to their common length.
        """
        original_audio = original_audio[0]
        protected_audio = protected_audio[0]
        unprotected_gen = self.generate_audio(original_audio).to(device='cuda')
        protected_gen = self.generate_audio(protected_audio).to(device='cuda')

        # Each pair is trimmed independently because the two generated clips
        # are not guaranteed to have the same length.  The "protected gen"
        # comparisons use the clip cloned from the protected audio; they used
        # to reuse the unprotected clone by mistake.
        comparisons = (
            ("original_unprotectedgen", original_audio, unprotected_gen),
            ("original_protectedgen", original_audio, protected_gen),
            ("protected_protectedgen", protected_audio, protected_gen),
            ("protectedgen_unprotectedgen", protected_gen, unprotected_gen),
        )

        eval_dict = {}
        for pair_name, left, right in comparisons:
            left, right = self._trim_to_common_length(left, right)
            eval_dict[f"{pair_name}_l1"] = torch.mean(torch.abs(left - right))
            eval_dict[f"{pair_name}_mel"] = self.mel_loss(
                AudioSignal(left, self.input_sample_rate),
                AudioSignal(right, self.input_sample_rate),
            )

        protected_gen, unprotected_gen = self._trim_to_common_length(protected_gen, unprotected_gen)
        return eval_dict, unprotected_gen, protected_gen

    def generate_audio(self, audio):
        """Synthesize ``self.text`` in the voice of ``audio``.

        The clip is written to a uniquely named temporary WAV file because
        the TTS call takes the speaker reference as a file path.  The file is
        removed afterwards, even if saving or synthesis fails.

        Args:
            audio: Two-channel tensor indexed as (channels, samples); it is
                reshaped to ``(2, samples)`` before being saved.

        Returns:
            A ``(2, samples)`` CPU tensor holding the mono synthesized
            waveform copied into both channels and resampled to
            ``self.input_sample_rate``.
        """
        random_str = ''.join(random.choices(string.ascii_uppercase + string.digits, k=50))
        speaker_path = os.path.join(self.SPEAKER_WAV_DIR, f"{random_str}.wav")
        # The scratch directory may not exist on a fresh checkout.
        os.makedirs(self.SPEAKER_WAV_DIR, exist_ok=True)
        try:
            torchaudio.save(
                speaker_path,
                torch.reshape(audio.detach().cpu(), (2, audio.shape[1])),
                self.input_sample_rate,
                format="wav",
            )
            # Fixed seed so both clones are generated with the same sampling
            # randomness and differ only through the speaker reference.
            torch.manual_seed(0)
            wav = self.model.tts(text=self.text,
                                 speaker_wav=speaker_path,
                                 language="en")
        finally:
            # Never leave the temporary speaker file behind.
            try:
                os.remove(speaker_path)
            except FileNotFoundError:
                pass

        wav = torch.from_numpy(np.array(wav))
        # Duplicate the mono output into both channels of a float32 buffer.
        stereo_wave = torch.zeros((2, wav.shape[0]))
        stereo_wave[:, :] = wav
        transform = torchaudio.transforms.Resample(self.XTTS_SAMPLE_RATE, self.input_sample_rate)
        return transform(stereo_wave)
| # # Init TTS | |
| # tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device) | |
| # | |
| # # Run TTS | |
| # # ❗ Since this model is multi-lingual voice cloning model, we must set the target speaker_wav and language | |
| # # Text to speech list of amplitude values as output | |
| # # wav = tts.tts(text="Hello world!", speaker_wav="path/to/speaker.wav", language="en") | |
| # # Text to speech to a file | |
| # tts.tts_to_file(text="Hello world!", | |
| # speaker_wav="/media/willie/1caf5422-4135-4f2c-9619-c44041b51146/audio_data/DS_10283_3443/VCTK-Corpus-0.92/wav48_silence_trimmed/p227/p227_023_mic1.flac", | |
| # language="en", | |
| # file_path="/home/willie/eclipse-workspace/audio_diffusion_attacks/src/test_audio/speech/output.wav") |