ALeLacheur's picture
Voiceblock demo: Attempt 8
957e2dc
"""
Pipeline for enrolling:
1. Provide Recording
2. Convert to 16 kHz
3. Divide into recordings
4. Get embeddings for each recording
5. Find centroid
6. Save the conditioning embedding to disk.
"""
import os
import argbind
import sounddevice as sd
import soundfile
import torch
import numpy as np
import sys
sys.path.append('.')
from src.constants import CONDITIONING_FILENAME, CONDITIONING_FOLDER
from src.data import DataProperties
from src.models import ResNetSE34V2
# Not referenced by the code visible in this file.
# NOTE(review): presumably the minimum number of windows needed for
# enrollment -- confirm against the rest of the project or remove.
MIN_WINDOWS = 10
# Not referenced by the code visible in this file.
# NOTE(review): presumably a window length in samples (64_000 samples would be
# 4 s at the 16 kHz mentioned in the module docstring) -- TODO confirm.
WINDOW_SIZE = 64_000
# Number of frames per block: used both as the input stream's blocksize and
# as the frame count requested by each read() call in record_from_user().
BLOCK_SIZE = 256
# Prompt passed to input() by record_from_user() before recording starts.
RECORDING_TEXT = """
This script will record you speaking, and will create an embedding
to be used for conditioning Voicebox. This will overwrite any previous
embeddings. We recommend at least 10 seconds of non-stop voice recording.
Press enter to begin recording. To stop recording, press ctrl-C.
"""
def get_streams(input_name: str, block_size: int) -> sd.InputStream:
    """
    Build (but do not start) a single-channel input stream.

    :param input_name: Index or name of the input audio interface. A value
        that parses as an integer is forwarded as a device index; any other
        string is forwarded unchanged as a device name. ``None`` is forwarded
        as-is, which sounddevice documents as "use the default device".
    :type input_name: str, optional
    :param block_size: Number of frames per block for the stream.
    :type block_size: int
    :return: An ``sd.InputStream`` configured with one channel and the sample
        rate reported by ``DataProperties.get('sample_rate')``.
    """
    try:
        device = int(input_name)
    except (TypeError, ValueError):
        # ValueError: a non-numeric device name such as "USB Microphone".
        # TypeError: ``None`` (the CLI default) -- int(None) raises TypeError,
        # which previously escaped and crashed the default invocation.
        device = input_name
    return sd.InputStream(
        device=device,
        samplerate=DataProperties.get('sample_rate'),
        channels=1,
        blocksize=block_size,
    )
def record_from_user(input_name: str) -> torch.Tensor:
    """
    Record audio from the chosen input device until the user presses ctrl-C.

    The prompt in ``RECORDING_TEXT`` is shown first; recording starts once the
    user presses enter. Audio is read in blocks of ``BLOCK_SIZE`` frames and
    accumulated until a ``KeyboardInterrupt`` is received.

    :param input_name: Index or name of the input audio interface, forwarded
        to ``get_streams``.
    :type input_name: str, optional
    :return: All recorded blocks flattened into a one-dimensional tensor.
    :raises KeyboardInterrupt: If ctrl-C is pressed at the prompt, i.e. before
        recording has started (unchanged from the previous behaviour).
    """
    input_stream = get_streams(input_name, BLOCK_SIZE)
    all_frames = []
    try:
        input(RECORDING_TEXT)
        input_stream.start()
        try:
            print("Recording...")
            while True:
                # read() also returns an overflow flag, which is ignored here.
                frames, _ = input_stream.read(BLOCK_SIZE)
                all_frames.append(frames)
        except KeyboardInterrupt:
            # ctrl-C is the documented way for the user to end the recording.
            print("Stopped Recording.")
    finally:
        # Always release the audio device, whether recording finished
        # normally or the user aborted at the prompt; previously the stream
        # was left open.
        input_stream.stop()
        input_stream.close()
    # Stack the per-block arrays, convert to a tensor and flatten to 1-D.
    recording = torch.Tensor(np.array(all_frames)).reshape(-1)
    return recording
def get_embedding(recording) -> torch.Tensor:
    """
    Compute a speaker embedding for a recording.

    :param recording: Audio tensor; it is viewed as a single-row batch
        ``(1, -1)`` before being passed to the model.
    :return: The output of ``ResNetSE34V2`` for the batched recording.
    """
    # NOTE(review): a new model is constructed on every call, and this
    # function neither loads weights nor switches to eval()/no_grad() --
    # confirm that ResNetSE34V2 takes care of this internally.
    speaker_encoder = ResNetSE34V2(nOut=512, encoder_type='ASP')
    batched = recording.view(1, -1)
    return speaker_encoder(batched)
def save(embedding, audio) -> None:
    """
    Persist the conditioning embedding and the audio it was computed from.

    Creates ``CONDITIONING_FOLDER`` if needed, writes ``embedding`` to
    ``CONDITIONING_FILENAME`` with ``torch.save`` and writes ``audio`` to
    ``conditioning_audio.wav`` inside ``CONDITIONING_FOLDER``.

    :param embedding: Object to serialise with ``torch.save``.
    :param audio: Tensor holding the recorded audio; it is detached and moved
        to the CPU before being written.
    """
    os.makedirs(CONDITIONING_FOLDER, exist_ok=True)
    torch.save(embedding, CONDITIONING_FILENAME)

    # The `/` join assumes CONDITIONING_FOLDER is path-like (e.g. pathlib.Path).
    wav_path = CONDITIONING_FOLDER / 'conditioning_audio.wav'
    waveform = audio.detach().cpu()
    sample_rate = DataProperties.get('sample_rate')
    soundfile.write(wav_path, waveform, sample_rate)
@argbind.bind(positional=True, without_prefix=True)
def main(input: str = None):
    """
    Creating a conditioning vector for VoiceBox from your voice
    :param input: Index or name of input audio interface. Defaults to current device
    :type input: str, optional
    """
    # NOTE: argbind is documented to build CLI help from this docstring, so
    # its wording is deliberately left unchanged. The parameter name `input`
    # is part of the CLI interface and is kept even though it shadows the
    # builtin inside this function.
    voice_sample = record_from_user(input)
    # Embed the recording, then store both the embedding and the raw audio.
    save(get_embedding(voice_sample), voice_sample)
if __name__ == "__main__":
    # Parse command-line arguments with argbind and run main() inside the
    # resulting scope so the bound `input` argument is applied.
    cli_args = argbind.parse_args()
    with argbind.scope(cli_args):
        main()