Spaces:

ALeLacheur
/

voiceblock

Sleeping

ALeLacheur committed on Jul 10, 2024

Commit

d09f267

1 Parent(s): 6caf132

Bug fix: Speaker embedding

Files changed (1) hide show

app.py CHANGED Viewed

@@ -55,39 +55,32 @@ def float32_to_int16(waveform):
     return waveform
 def get_embedding(recording):
-    print("Getting ResNet")
     resnet = ResNetSE34V2(nOut=512, encoder_type='ASP')
     recording = recording.view(1, -1)
-    print("Running ResNet")
     embedding = resnet(recording)
     return embedding
 #Define predict function:
 def predict(inp):
     #How to transform audio from string to tensor
-    print("Transforming audio to tensor")
     waveform, sample_rate = torchaudio.load(inp)
     #Resample to 16kHz
-    print("Resampling to 16Hz")
     transform_to_16hz = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
     waveform = transform_to_16hz(waveform)
     sample_rate = 16000
     #Get speaker embedding
-    print("Getting speaker embedding")
     condition_tensor = get_embedding(waveform)
     condition_tensor = condition_tensor.reshape(1, 1, -1)
     n_frames = waveform.shape[1]
     condition_tensor = condition_tensor.repeat(1, n_frames, 1)
     #Run model without changing weights
-    print("Running the model")
     with torch.no_grad():
         waveform = model(x=waveform, y=condition_tensor)
     #Transform output audio into gradio-readable format
-    print("Transforming returned audio")
     waveform = waveform.numpy()
     waveform = float32_to_int16(waveform)
     return sample_rate, waveform

     return waveform
 def get_embedding(recording):
     """Compute a speaker embedding for an audio tensor.

     Args:
         recording: Audio tensor. It is flattened to shape
             ``(1, total_elements)`` before being passed to the encoder, so
             any channel dimension is folded into the sample axis.

     Returns:
         The output of ``ResNetSE34V2(nOut=512, encoder_type='ASP')`` applied
         to the flattened recording (presumably a 512-dimensional embedding,
         given ``nOut=512`` -- TODO confirm against the encoder definition).
     """
     # NOTE(review): the encoder is constructed on every call and no
     # checkpoint is loaded in this function; confirm that the constructor
     # restores pretrained weights, and consider building it once at module
     # level and switching it to eval mode.
     encoder = ResNetSE34V2(nOut=512, encoder_type='ASP')
     # reshape() yields the same (1, N) layout as view() but also accepts
     # tensors whose memory layout cannot be viewed (e.g. transposed data),
     # where view() would raise a RuntimeError.
     flattened = recording.reshape(1, -1)
     return encoder(flattened)
 #Define predict function:
 def predict(inp):
     """Run the model on an audio file and return Gradio-style audio output.

     Args:
         inp: Path (or other source accepted by ``torchaudio.load``) of the
             input audio.

     Returns:
         A ``(sample_rate, waveform)`` tuple where ``sample_rate`` is 16000
         and ``waveform`` is the model output converted to a NumPy array and
         passed through ``float32_to_int16``.
     """
     target_sample_rate = 16000

     # Load the audio together with its native sample rate.
     waveform, source_rate = torchaudio.load(inp)

     # Resample to 16 kHz.
     resampler = torchaudio.transforms.Resample(orig_freq=source_rate, new_freq=target_sample_rate)
     waveform = resampler(waveform)

     # Inference only: keep both the speaker-embedding pass and the model
     # pass inside no_grad so no autograd graph is recorded for either.
     with torch.no_grad():
         # Speaker embedding, flattened to (1, 1, embedding_size).
         condition_tensor = get_embedding(waveform)
         condition_tensor = condition_tensor.reshape(1, 1, -1)
         # Repeat the embedding once per entry along the waveform's second
         # axis so the conditioning has one row per sample.
         n_frames = waveform.shape[1]
         condition_tensor = condition_tensor.repeat(1, n_frames, 1)
         waveform = model(x=waveform, y=condition_tensor)

     # Transform output audio into a Gradio-readable format.
     waveform = waveform.numpy()
     waveform = float32_to_int16(waveform)
     return target_sample_rate, waveform