Spaces:

qgyd2021
/

cc_denoise

Paused

App Files Files Community

cc_denoise / toolbox /torchaudio /models /dfnet /inference_dfnet.py

HoneyTian

first commit

1af34cd 9 months ago

raw

history blame contribute delete

4.06 kB

	#!/usr/bin/python3
	# -- coding: utf-8 --
	import logging
	from pathlib import Path
	import shutil
	import tempfile, time
	import zipfile

	import librosa
	import numpy as np
	import torch
	import torchaudio

	torch.set_num_threads(1)

	from project_settings import project_path
	from toolbox.torchaudio.models.dfnet.configuration_dfnet import DfNetConfig
	from toolbox.torchaudio.models.dfnet.modeling_dfnet import DfNetPretrainedModel, MODEL_FILE

	logger = logging.getLogger("toolbox")


	class InferenceDfNet(object):
	def __init__(self, pretrained_model_path_or_zip_file: str, device: str = "cpu"):
	self.pretrained_model_path_or_zip_file = pretrained_model_path_or_zip_file
	self.device = torch.device(device)

	logger.info(f"loading model; model_file: {self.pretrained_model_path_or_zip_file}")
	config, model = self.load_models(self.pretrained_model_path_or_zip_file)
	logger.info(f"model loading completed; model_file: {self.pretrained_model_path_or_zip_file}")

	self.config = config
	self.model = model
	self.model.to(device)
	self.model.eval()

	def load_models(self, model_path: str):
	model_path = Path(model_path)
	if model_path.name.endswith(".zip"):
	with zipfile.ZipFile(model_path.as_posix(), "r") as f_zip:
	out_root = Path(tempfile.gettempdir()) / "nx_denoise"
	out_root.mkdir(parents=True, exist_ok=True)
	f_zip.extractall(path=out_root)
	model_path = out_root / model_path.stem

	config = DfNetConfig.from_pretrained(
	pretrained_model_name_or_path=model_path.as_posix(),
	)
	model = DfNetPretrainedModel.from_pretrained(
	pretrained_model_name_or_path=model_path.as_posix(),
	)
	model.to(self.device)
	model.eval()

	shutil.rmtree(model_path)
	return config, model

	def enhancement_by_ndarray(self, noisy_audio: np.ndarray) -> np.ndarray:
	noisy_audio = torch.tensor(noisy_audio, dtype=torch.float32)
	noisy_audio = noisy_audio.unsqueeze(dim=0)

	# noisy_audio shape: [batch_size, n_samples]
	enhanced_audio = self.enhancement_by_tensor(noisy_audio)
	# enhanced_audio shape: [channels, num_samples]
	enhanced_audio = enhanced_audio[0]
	# enhanced_audio shape: [num_samples]
	return enhanced_audio.cpu().numpy()

	def enhancement_by_tensor(self, noisy_audio: torch.Tensor) -> torch.Tensor:
	if torch.max(noisy_audio) > 1 or torch.min(noisy_audio) < -1:
	raise AssertionError(f"The value range of audio samples should be between -1 and 1.")

	# noisy_audio shape: [batch_size, num_samples]
	noisy_audios = noisy_audio.to(self.device)

	with torch.no_grad():
	est_spec, est_wav, est_mask, lsnr = self.model.forward(noisy_audios)

	# shape: [batch_size, 1, num_samples]
	denoise = est_wav[0]
	# shape: [channels, num_samples]
	return denoise


	def main():
	model_zip_file = project_path / "trained_models/dfnet-nx-dns3.zip"
	infer_model = InferenceDfNet(model_zip_file)

	sample_rate = 8000
	noisy_audio_file = project_path / "data/examples/ai_agent/chinese-3.wav"
	noisy_audio, sample_rate = librosa.load(
	noisy_audio_file.as_posix(),
	sr=sample_rate,
	)
	duration = librosa.get_duration(y=noisy_audio, sr=sample_rate)
	# noisy_audio = noisy_audio[int(7sample_rate):int(9sample_rate)]
	noisy_audio = torch.tensor(noisy_audio, dtype=torch.float32)
	noisy_audio = noisy_audio.unsqueeze(dim=0)

	begin = time.time()
	enhanced_audio = infer_model.enhancement_by_tensor(noisy_audio)
	time_cost = time.time() - begin
	print(f"enhanced_audio.shape: {enhanced_audio.shape}, time_cost: {time_cost:.4f}, audio_duration: {duration:.4f}, fpr: {time_cost / duration:.4f}")

	filename = "enhanced_audio.wav"
	torchaudio.save(filename, enhanced_audio.detach().cpu(), sample_rate)

	return


	if __name__ == "__main__":
	main()