| import torch, torchaudio
|
| import requests
|
| import IPython.display as display
|
| import gradio as gr
|
| import os
|
|
|
|
|
|
|
def get_file_size_in_mb(file_path):
    """Return the size of *file_path* in mebibytes (1 MiB = 1024 * 1024 bytes).

    NOTE(review): this helper is not referenced anywhere in this file —
    confirm whether it is still needed.
    """
    size_in_bytes = os.path.getsize(file_path)
    return size_in_bytes / (1024 * 1024)
|
|
|
|
|
|
|
|
|
# Build the three stages of the soft-VC pipeline from torch.hub:
#   HuBERT-Soft content encoder -> acoustic model -> HiFi-GAN vocoder.
# pretrained=False for the acoustic model and vocoder because local CPU
# checkpoints are loaded below (the hub weights would be discarded anyway).
hubert = torch.hub.load("bshall/hubert:main", "hubert_soft", trust_repo=True)
acoustic = torch.hub.load("bshall/acoustic-model:main", "hubert_soft", trust_repo=True, pretrained=False)
hifigan = torch.hub.load("bshall/hifigan:main", "hifigan_hubert_soft", trust_repo=True, pretrained=False)

# Load CPU checkpoints. NOTE(review): the paths are relative — the .pt files
# must sit in the working directory, and there is no error handling if they
# are missing.
hubert_state_dict = torch.load("hubert_cpu.pt", map_location=torch.device('cpu'))
acoustic_state_dict = torch.load("acoustic_cpu.pt", map_location=torch.device('cpu'))
hifigan_state_dict = torch.load("hifigan_cpu.pt", map_location=torch.device('cpu'))

# strict=False silently ignores missing/unexpected keys, so a mismatched
# checkpoint would load without complaint — NOTE(review): confirm the
# checkpoints actually match the hub architectures.
hubert.load_state_dict(hubert_state_dict, strict=False)
acoustic.load_state_dict(acoustic_state_dict, strict=False)
hifigan.load_state_dict(hifigan_state_dict, strict=False)

# Move everything to CPU and switch to eval mode.  eval() is required for
# correct inference: torch.inference_mode() (used in convert_speech) only
# disables autograd — it does NOT disable dropout or switch batch-norm to
# running statistics; eval() does.
hubert = hubert.to('cpu').eval()
acoustic = acoustic.to('cpu').eval()
hifigan = hifigan.to('cpu').eval()
|
|
|
|
|
|
|
|
|
def convert_speech(filename, progress=gr.Progress()):
    """Convert the speech in *filename* to the target speaker's voice.

    Runs the soft-VC pipeline (HuBERT-Soft units -> acoustic model ->
    HiFi-GAN) on CPU and reports progress to the Gradio UI.

    Returns a ``(sample_rate, waveform)`` tuple (16 kHz, numpy array)
    suitable for a ``gr.Audio(type="numpy")`` output.
    """
    progress(0, desc="Starting conversion")

    progress(0.1, desc="Loading audio")
    waveform, sample_rate = torchaudio.load(filename)

    progress(0.3, desc="Preprocessing audio")
    # Multi-channel input: keep only the first channel.
    if waveform.shape[0] > 1:
        waveform = waveform[0, :].unsqueeze(0)
    # The content encoder expects 16 kHz audio with a leading batch dim.
    waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
    waveform = waveform.unsqueeze(0).to('cpu')

    progress(0.6, desc="Converting speech")
    with torch.inference_mode():
        # Discrete-free "soft" speech units from the content encoder.
        units = hubert.units(waveform)
        # Acoustic model emits (batch, frames, mels); the vocoder wants
        # (batch, mels, frames), hence the transpose.
        mel = acoustic.generate(units).transpose(1, 2)
        converted = hifigan(mel)

    progress(0.9, desc="Postprocessing audio")
    converted = converted.squeeze().cpu().numpy()
    progress(1.0, desc="Conversion complete")
    return 16000, converted
|
|
|
| """Convert to the target speaker:"""
|
|
|
def enable_convert_button(audio):
    """Toggle the convert button: interactive only when audio is present."""
    return gr.update(interactive=audio is not None)
|
|
|
|
|
def clear_components():
    """Return blank values for the two components wired to ``.clear()``."""
    # One None per output: (audio_input, converted_audio).
    return (None, None)
|
|
|
|
|
| js = """
|
| document.addEventListener('DOMContentLoaded', function() {
|
| const audioInput = document.querySelector('input[type="file"]');
|
| const convertButton = document.querySelector('button');
|
|
|
| function updateButtonText() {
|
| if (audioInput.files.length > 0) {
|
| convertButton.innerText = "Uploading audio, please wait ...";
|
| }
|
| }
|
|
|
| audioInput.addEventListener('change', updateButtonText);
|
| });
|
| """
|
|
|
|
|
|
|
|
|
# --- Gradio UI -------------------------------------------------------------
with gr.Blocks() as interface:
    gr.Markdown("# Soft Speech Units for Improved Voice Conversion")
    gr.Markdown("Upload an audio file to convert it to the target speaker's voice using soft speech units. Or use your microphone.")

    with gr.Row():
        with gr.Column():
            # Input side: file upload or microphone recording, as a file path.
            audio_input = gr.Audio(type="filepath", label="Upload Audio", sources=["upload", "microphone"])
            # Starts disabled; enable_convert_button turns it on when audio arrives.
            convert_button = gr.Button("Convert Speech", interactive=False)
        with gr.Column():
            # Output side: convert_speech returns a (sample_rate, numpy array) pair.
            converted_audio = gr.Audio(type="numpy", label="Converted Speech")

    # Re-evaluate the button's interactivity whenever the input changes.
    audio_input.change(enable_convert_button, inputs=[audio_input], outputs=[convert_button])

    # Run the conversion pipeline on click.
    convert_button.click(convert_speech, inputs=[audio_input], outputs=[converted_audio])

    # Clearing the input also blanks both audio widgets.
    audio_input.clear(clear_components, inputs=None, outputs=[audio_input, converted_audio])

# Start the app (blocking call; debug output disabled).
interface.launch(debug=False)
|
|
|
|
|