|
|
import torch, torchaudio
|
|
|
import requests
|
|
|
import IPython.display as display
|
|
|
import gradio as gr
|
|
|
import os
|
|
|
import sys
|
|
|
|
|
|
|
|
|
|
|
|
# Download (and import) the upstream repos for the three pipeline stages:
# content encoder (HuBERT-soft), acoustic model, and HiFi-GAN vocoder.
# pretrained=False: no checkpoint weights are fetched here — the actual
# models come from the local torch.load calls below. NOTE(review): these
# hub loads presumably exist so the repo modules are importable when the
# pickled full-model files are deserialized — confirm before removing.
hubert = torch.hub.load("bshall/hubert:main", "hubert_soft", trust_repo=True, pretrained=False)

acoustic = torch.hub.load("bshall/acoustic-model:main", "hubert_soft", trust_repo=True, pretrained=False)

hifigan = torch.hub.load("bshall/hifigan:main", "hifigan_hubert_soft", trust_repo=True, pretrained=False)
|
|
|
|
|
|
# Load the three pipeline stages as full pickled model objects, pinned to CPU.
# weights_only=False is required because these files contain whole nn.Module
# objects, not plain state_dicts; since torch 2.6 the default flipped to
# weights_only=True, which would make these loads fail. Making it explicit
# keeps the script working across torch versions (the parameter exists since
# torch 1.13). SECURITY NOTE: weights_only=False unpickles arbitrary code —
# only load checkpoint files from a trusted source.
hubert_loaded = torch.load("hubert_cpu.pt", map_location=torch.device('cpu'), weights_only=False)

acoustic_loaded = torch.load("acoustic_cpu.pt", map_location=torch.device('cpu'), weights_only=False)

hifigan_loaded = torch.load("hifigan_cpu.pt", map_location=torch.device('cpu'), weights_only=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def convert_speech(filename, progress=gr.Progress()):
    """Run the any-to-one voice-conversion pipeline on one audio file.

    Pipeline: load audio -> mono 16 kHz -> HuBERT soft units ->
    acoustic model mel-spectrogram -> HiFi-GAN waveform.

    Args:
        filename: Path to the input audio file (falsy values are rejected).
        progress: Gradio progress tracker, injected by Gradio at call time.

    Returns:
        A ``(16000, samples)`` tuple where ``samples`` is a 1-D numpy array,
        matching what ``gr.Audio(type="numpy")`` expects.

    Raises:
        ValueError: If no audio file was provided.
    """
    if not filename:
        raise ValueError("Please provide an audio")

    progress(0, desc="Starting conversion")

    progress(0.1, desc="Loading audio")
    waveform, sample_rate = torchaudio.load(filename)

    progress(0.3, desc="Preprocessing audio")
    # Down-mix to mono by keeping only the first channel.
    if waveform.shape[0] > 1:
        waveform = waveform[0, :].unsqueeze(0)
    # The models expect 16 kHz input with a leading batch dimension on CPU.
    waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
    waveform = waveform.unsqueeze(0).to('cpu')

    progress(0.6, desc="Converting speech")
    with torch.inference_mode():
        # Content encoder: extract speaker-independent soft speech units.
        units = hubert_loaded.units(waveform)

        progress(0.7, desc="Generating target spectrogram")
        # Acoustic model emits (batch, time, mels); the vocoder wants
        # (batch, mels, time), hence the transpose.
        mel = acoustic_loaded.generate(units).transpose(1, 2)

        progress(0.8, desc="Generating audio waveform")
        target = hifigan_loaded(mel)

    progress(0.9, desc="Postprocessing audio")
    target = target.squeeze().cpu().numpy()

    progress(1.0, desc="Conversion complete")
    return 16000, target
|
|
|
|
|
|
"""Convert to the target speaker:"""
|
|
|
|
|
|
def enable_convert_button(audio):
    """Enable the convert button only when some audio is present.

    Returns a pair of updates for ``(convert_button, info)``: the button
    becomes clickable when *audio* is not None, and the info banner is
    cleared and hidden at the same time; otherwise the button is disabled
    and the info output is left untouched (None).
    """
    has_audio = audio is not None
    if not has_audio:
        return gr.update(interactive=False), None
    return gr.update(interactive=True), gr.update(value="", visible=False)
|
|
|
|
|
|
def clear_components():
    """Reset both the audio input and the converted-audio output."""
    return (None,) * 2
|
|
|
|
|
|
def stop_recording_info(audio):
    """Show a "please wait" banner while a recording is still uploading.

    While *audio* is None (recording not yet delivered) a yellow waiting
    message is made visible; once audio arrives the banner is hidden.
    """
    if audio is not None:
        return gr.update(value="", visible=False)
    return gr.update(value="### <i style='color:yellow'>Recording and uploading, please wait ...</i>", visible=True)
|
|
|
|
|
|
def stop():
    """Debug hook: confirm on stdout that the callback fired."""
    message = "this is working"
    print(message)
|
|
|
|
|
|
|
|
|
def gui():
    """Build the Gradio Blocks interface for the voice-conversion demo.

    Layout: a header, then one row with the input column (audio source,
    convert button, status banner) and the output column (converted audio).
    Event wiring connects the input component to the conversion pipeline.

    Returns:
        The assembled ``gr.Blocks`` app; the caller queues and launches it.
    """
    with gr.Blocks() as interface:
        gr.Markdown("# Soft Speech Units for Improved Voice Conversion")
        gr.Markdown("Upload an audio file to convert it to the target speaker's voice using soft speech units. Or use your microphone.")

        with gr.Row():
            with gr.Column():
                # Input side: file upload or microphone; the convert button
                # starts disabled until audio is available.
                audio_input = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Upload Audio", editable=False)

                convert_button = gr.Button("Convert Speech", interactive=False)

                # Hidden status banner, toggled by the callbacks below.
                info = gr.Markdown("", visible=False)

            with gr.Column():
                converted_audio = gr.Audio(type="numpy", label="Converted Speech", show_share_button=False)

        # Show the "uploading, please wait" banner as soon as recording starts.
        audio_input.start_recording(stop_recording_info, inputs=[audio_input], outputs=[info])

        # Any change to the input enables/disables the convert button and
        # clears the banner.
        audio_input.change(enable_convert_button, inputs=[audio_input], outputs=[convert_button, info])

        # Run the conversion pipeline and put the result in the output player.
        convert_button.click(convert_speech, inputs=[audio_input], outputs=[converted_audio])

        # Clearing the input also discards the previous conversion result.
        audio_input.clear(clear_components, inputs=None, outputs=[audio_input, converted_audio])

    return interface
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
|
|
|
app = gui()
|
|
|
|
|
|
app.queue(default_concurrency_limit=40)
|
|
|
|
|
|
app.launch(
|
|
|
max_threads=40,
|
|
|
share=True,
|
|
|
show_error=True,
|
|
|
quiet=False,
|
|
|
debug=False,
|
|
|
)
|
|
|
|
|
|
|