Spaces:

ALeLacheur
/

voiceblock

Sleeping

App Files Files Community

voiceblock / voicebox /scripts /streamer /enroll.py

ALeLacheur

Voiceblock demo: Attempt 8

957e2dc over 1 year ago

raw

history blame contribute delete

2.75 kB

	"""
	Pipeline for enrolling:
	1. Provide Recording
	2. Convert to 16 kHz
	3. Divide into recordings
	4. Get embeddings for each recording
	5. Find centroid
	6. Save conditioning as some value.
	"""
	import os
	import argbind
	import sounddevice as sd
	import soundfile
	import torch
	import numpy as np

	import sys

	sys.path.append('.')

	from src.constants import CONDITIONING_FILENAME, CONDITIONING_FOLDER
	from src.data import DataProperties
	from src.models import ResNetSE34V2


	# Not referenced anywhere in the visible part of this file.
	# NOTE(review): presumably the minimum number of windows needed for
	# enrollment — confirm against the rest of the project.
	MIN_WINDOWS = 10
	# Not referenced anywhere in the visible part of this file.
	# NOTE(review): presumably a window length in samples (4 s if the sample
	# rate is 16 kHz) — confirm.
	WINDOW_SIZE = 64_000
	# Number of frames requested per read from the input stream; also passed
	# as the stream's blocksize when the stream is created.
	BLOCK_SIZE = 256

	# Prompt passed to input() before recording starts; the user presses enter
	# to begin and ctrl-C to stop.
	RECORDING_TEXT = """
	This script will record you speaking, and will create an embedding
	to be used for conditioning Voicebox. This will overwrite any previous
	embeddings. We recommend at least 10 seconds of non-stop voice recording.
	Press enter to begin recording. To stop recording, press ctrl-C.
	"""


	def get_streams(input_name: str, block_size: int) -> sd.InputStream:
	    """
	    Create a single-channel input stream on the requested audio device.

	    :param input_name: Device index (an int or a numeric string), a device
	        name, or None to let sounddevice pick its default input device.
	    :param block_size: Passed through unchanged as the stream's ``blocksize``.
	    :return: The newly constructed ``sd.InputStream``; this function does
	        not start it.
	    """
	    # None means "use the default device". It must bypass int(), because
	    # int(None) raises TypeError, which the ValueError handler would not catch.
	    if input_name is not None:
	        try:
	            # A numeric string from the command line is a device index.
	            input_name = int(input_name)
	        except ValueError:
	            # Not numeric: keep the string so it is matched as a device name.
	            pass
	    return sd.InputStream(
	        device=input_name,
	        samplerate=DataProperties.get('sample_rate'),
	        channels=1,
	        blocksize=block_size,
	    )


	def record_from_user(input_name: str) -> torch.Tensor:
	    """
	    Record audio from the given input device until the user presses ctrl-C.

	    Shows RECORDING_TEXT, waits for enter, then reads BLOCK_SIZE frames at a
	    time from the input stream until a KeyboardInterrupt arrives.

	    :param input_name: Device index or name forwarded to ``get_streams``.
	    :return: 1-D tensor holding every recorded sample in order. It is empty
	        if recording is interrupted before the first block is read.
	    """
	    input_stream = get_streams(input_name, BLOCK_SIZE)
	    all_frames = []
	    try:
	        # Block until the user confirms; a ctrl-C here still propagates.
	        input(RECORDING_TEXT)
	        input_stream.start()
	        try:
	            print("Recording...")
	            while True:
	                # The second return value (overflow flag) is ignored.
	                frames, _ = input_stream.read(BLOCK_SIZE)
	                all_frames.append(frames)
	        except KeyboardInterrupt:
	            # ctrl-C is the documented way to end the recording.
	            print("Stopped Recording.")
	    finally:
	        # Always release the audio device, even if setup or reading failed.
	        input_stream.close()
	    all_frames = torch.Tensor(np.array(all_frames))
	    # Flatten the stacked blocks into one contiguous sample sequence.
	    recording = all_frames.reshape(-1)
	    return recording


	def get_embedding(recording) -> torch.Tensor:
	    """
	    Run a freshly constructed ResNetSE34V2 encoder over the recording.

	    :param recording: Tensor of audio samples; it is viewed as a single
	        row of shape (1, N) before being passed to the model.
	    :return: Whatever the model returns for that input.
	    """
	    # NOTE(review): no weights are loaded here and the model is neither put
	    # in eval mode nor run under torch.no_grad() — confirm this is intended.
	    encoder = ResNetSE34V2(nOut=512, encoder_type='ASP')
	    batched = recording.view(1, -1)
	    return encoder(batched)


	def save(embedding, audio) -> None:
	    """
	    Persist the conditioning embedding together with the audio it came from.

	    :param embedding: Object written with ``torch.save`` to
	        CONDITIONING_FILENAME.
	    :param audio: Tensor of recorded samples; detached and moved to CPU
	        before being written as ``conditioning_audio.wav`` inside
	        CONDITIONING_FOLDER.
	    """
	    # Make sure the output folder exists before anything is written.
	    os.makedirs(CONDITIONING_FOLDER, exist_ok=True)
	    torch.save(embedding, CONDITIONING_FILENAME)

	    wav_path = CONDITIONING_FOLDER / 'conditioning_audio.wav'
	    samples = audio.detach().cpu()
	    soundfile.write(wav_path, samples, DataProperties.get('sample_rate'))


	@argbind.bind(positional=True, without_prefix=True)
	def main(input: str = None):
	    """
	    Creating a conditioning vector for VoiceBox from your voice

	    :param input: Index or name of input audio interface. Defaults to current device
	    :type input: str, optional
	    """
	    # NOTE(review): the docstring text is kept verbatim because argbind may
	    # use it to build the command-line help — confirm before rewording.
	    # Record once, then save both the embedding and the audio behind it.
	    captured_audio = record_from_user(input)
	    save(get_embedding(captured_audio), captured_audio)


	if __name__ == "__main__":
	    # Parse the command line and call main() inside the resulting argbind
	    # scope, so that its bound arguments are supplied from there.
	    with argbind.scope(argbind.parse_args()):
	        main()