Spaces:
Sleeping
Sleeping
Commit ·
678fd0b
1
Parent(s): b6809ba
Changed to 16 kHz and added speaker embedding
Browse files
app.py
CHANGED
|
@@ -4,6 +4,7 @@ import voicebox.src.attacks.offline.perturbation.voicebox.voicebox as vb #To acc
|
|
| 4 |
#import voicebox.src.attacks.online.voicebox_streamer as streamer #To access VoiceBoxStreamer class
|
| 5 |
import numpy as np
|
| 6 |
from voicebox.src.constants import PPG_PRETRAINED_PATH
|
|
|
|
| 7 |
|
| 8 |
#Set voicebox default parameters
|
| 9 |
LOOKAHEAD = 5
|
|
@@ -28,7 +29,19 @@ voicebox_kwargs={'win_length': 256,
|
|
| 28 |
'projection_norm': float('inf'),
|
| 29 |
'conditioning_dim': 512}
|
| 30 |
|
| 31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
model = vb.VoiceBox(**voicebox_kwargs)
|
| 33 |
model.load_state_dict(torch.load('voicebox/pretrained/voicebox/voicebox_final.pt', map_location=torch.device('cpu')), strict=True)
|
| 34 |
model.eval()
|
|
@@ -41,6 +54,12 @@ def float32_to_int16(waveform):
|
|
| 41 |
waveform = waveform.ravel()
|
| 42 |
return waveform
|
| 43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
#Define predict function:
|
| 45 |
def predict(inp):
|
| 46 |
#How to transform audio from string to tensor
|
|
@@ -51,9 +70,15 @@ def predict(inp):
|
|
| 51 |
waveform = transform_to_16hz(waveform)
|
| 52 |
sample_rate = 16000
|
| 53 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
#Run model without changing weights
|
| 55 |
with torch.no_grad():
|
| 56 |
-
waveform = model(waveform)
|
| 57 |
|
| 58 |
#Transform output audio into gradio-readable format
|
| 59 |
waveform = waveform.numpy()
|
|
|
|
| 4 |
#import voicebox.src.attacks.online.voicebox_streamer as streamer #To access VoiceBoxStreamer class
|
| 5 |
import numpy as np
|
| 6 |
from voicebox.src.constants import PPG_PRETRAINED_PATH
|
| 7 |
+
from voicebox.src.models import ResNetSE34V2
|
| 8 |
|
| 9 |
#Set voicebox default parameters
|
| 10 |
LOOKAHEAD = 5
|
|
|
|
| 29 |
'projection_norm': float('inf'),
|
| 30 |
'conditioning_dim': 512}
|
| 31 |
|
| 32 |
+
'''
|
| 33 |
+
#Set streamer default parameters:
|
| 34 |
+
config_path = 'voicebox/pretrained/voicebox/voicebox_final.yaml'
|
| 35 |
+
with open(config_path) as f:
|
| 36 |
+
config = yaml.safe_load(f)
|
| 37 |
+
|
| 38 |
+
#Load pretrained model (streamer):
|
| 39 |
+
model = streamer.VoiceBoxStreamer(**config)
|
| 40 |
+
model.load_state_dict(torch.load('voicebox/pretrained/voicebox/voicebox_final.pt', map_location=torch.device('cpu')), strict=True)
|
| 41 |
+
model.eval()
|
| 42 |
+
'''
|
| 43 |
+
|
| 44 |
+
#Load pretrained model (VoiceBox):
|
| 45 |
model = vb.VoiceBox(**voicebox_kwargs)
|
| 46 |
model.load_state_dict(torch.load('voicebox/pretrained/voicebox/voicebox_final.pt', map_location=torch.device('cpu')), strict=True)
|
| 47 |
model.eval()
|
|
|
|
| 54 |
waveform = waveform.ravel()
|
| 55 |
return waveform
|
| 56 |
|
| 57 |
+
# Lazily-built speaker encoder, shared by every call to get_embedding().
_SPEAKER_ENCODER = None


def _get_speaker_encoder():
    """Build the ResNetSE34V2 speaker encoder once and reuse it afterwards."""
    global _SPEAKER_ENCODER
    if _SPEAKER_ENCODER is None:
        encoder = ResNetSE34V2(nOut=512, encoder_type='ASP')
        # Inference only: put the encoder in eval mode, mirroring the
        # model.eval() call made on the VoiceBox model in this file.
        encoder.eval()
        _SPEAKER_ENCODER = encoder
    return _SPEAKER_ENCODER


def get_embedding(recording):
    """Compute a speaker embedding for a single recording.

    The recording is flattened into a batch of one and passed through a
    ResNetSE34V2 encoder constructed with ``nOut=512`` and
    ``encoder_type='ASP'``.

    NOTE(review): no checkpoint is loaded into the encoder here; unless the
    ResNetSE34V2 constructor restores pretrained weights on its own, the
    embedding comes from freshly initialised parameters. Confirm whether a
    pretrained state dict should be loaded.

    Args:
        recording: Audio tensor holding one recording; it is reshaped to
            ``(1, n_samples)`` before encoding.

    Returns:
        The tensor returned by the encoder for that single-item batch
        (presumably of shape ``(1, 512)`` given ``nOut=512`` -- TODO confirm).
    """
    resnet = _get_speaker_encoder()
    # reshape() rather than view(): also accepts non-contiguous input.
    recording = recording.reshape(1, -1)
    # No gradients are needed to extract an embedding.
    with torch.no_grad():
        embedding = resnet(recording)
    return embedding
|
| 62 |
+
|
| 63 |
#Define predict function:
|
| 64 |
def predict(inp):
|
| 65 |
#How to transform audio from string to tensor
|
|
|
|
| 70 |
waveform = transform_to_16hz(waveform)
|
| 71 |
sample_rate = 16000
|
| 72 |
|
| 73 |
+
#Get speaker embedding
|
| 74 |
+
condition_tensor = get_embedding(waveform)
|
| 75 |
+
condition_tensor = condition_tensor.reshape(1, 1, -1)
|
| 76 |
+
n_frames = waveform.shape[1]
|
| 77 |
+
condition_tensor = condition_tensor.repeat(1, n_frames, 1)
|
| 78 |
+
|
| 79 |
#Run model without changing weights
|
| 80 |
with torch.no_grad():
|
| 81 |
+
waveform = model(x=waveform, y=condition_tensor)
|
| 82 |
|
| 83 |
#Transform output audio into gradio-readable format
|
| 84 |
waveform = waveform.numpy()
|