DTLN / real_time_processing.py
alekya's picture
model files
34d6c18 verified
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jun 23 16:23:15 2020
@author: nils
"""
import soundfile as sf
import numpy as np
import tensorflow as tf
##########################
# the values are fixed, if you need other values, you have to retrain.
# The sampling rate of 16k is also fix.
block_len = 512
block_shift = 128
# load model
model = tf.saved_model.load('./pretrained_model/dtln_saved_model')
infer = model.signatures["serving_default"]
# load audio file at 16k fs (please change)
audio,fs = sf.read('path_to_your_favorite_audio.wav')
# check for sampling rate
if fs != 16000:
raise ValueError('This model only supports 16k sampling rate.')
# preallocate output audio
out_file = np.zeros((len(audio)))
# create buffer
in_buffer = np.zeros((block_len))
out_buffer = np.zeros((block_len))
# calculate number of blocks
num_blocks = (audio.shape[0] - (block_len-block_shift)) // block_shift
# iterate over the number of blcoks
for idx in range(num_blocks):
# shift values and write to buffer
in_buffer[:-block_shift] = in_buffer[block_shift:]
in_buffer[-block_shift:] = audio[idx*block_shift:(idx*block_shift)+block_shift]
# create a batch dimension of one
in_block = np.expand_dims(in_buffer, axis=0).astype('float32')
# process one block
out_block= infer(tf.constant(in_block))['conv1d_1']
# shift values and write to buffer
out_buffer[:-block_shift] = out_buffer[block_shift:]
out_buffer[-block_shift:] = np.zeros((block_shift))
out_buffer += np.squeeze(out_block)
# write block to output file
out_file[idx*block_shift:(idx*block_shift)+block_shift] = out_buffer[:block_shift]
# write to .wav file
sf.write('out.wav', out_file, fs)
print('Processing finished.')