# SSRL_Artikulation / generate_whole.py
# Author: Berker Banar
# Repository snapshot: branch "model", commit 7b3714a
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
from tensorflow.keras import layers
tf.keras.backend.clear_session() # For easy reset of notebook state.
import random
import numpy as np
import os, sys, argparse, time
from pathlib import Path
import librosa
import configparser
import random
import json
import matplotlib.pyplot as plt
import soundfile as sf
import cv2
# os.environ["CUDA_VISIBLE_DEVICES"]="2"
def synthesize_audio():
    """Regenerate audio and visual-score reconstructions from a trained multimodal VAE.

    Reads hyper-parameters from an .ini config file (``--config``), loads the
    saved Keras model from the run directory, then for every manually designed
    visual-score ``.npy`` file:

    * runs the VAE to reconstruct CQT magnitudes and the visual score,
    * writes the visual-score reconstruction as a ``.jpeg``,
    * inverts the CQT magnitudes with Griffin-Lim and writes a ``.wav``.

    Side effects only (console prints and file I/O under ``./Whole/``);
    returns ``None``.  Exits via ``sys.exit()`` if the config file is missing.
    """
    workdir = Path(os.getcwd() + '/Whole/func-timbre-vae/run-010/')

    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', type=str, default='./default_whole.ini',
                        help='path to the config file')
    args = parser.parse_args()

    # Get configs.
    # BUG FIX: configparser.read() never raises FileNotFoundError for a
    # missing file -- it returns the list of files it successfully read -- so
    # the previous try/except was dead code and a missing config only surfaced
    # later as a confusing KeyError.  Check the returned list instead.
    config_path = args.config
    config = configparser.ConfigParser(allow_no_value=True)
    if not config.read(config_path):
        print('Config File Not Found at {}'.format(config_path))
        sys.exit()

    # Audio configs
    sample_rate = config['audio'].getint('sample_rate')
    hop_length = config['audio'].getint('hop_length')
    bins_per_octave = config['audio'].getint('bins_per_octave')
    num_octaves = config['audio'].getint('num_octaves')
    n_bins = int(num_octaves * bins_per_octave)
    n_iter = config['audio'].getint('n_iter')

    # Dataset
    dataset = Path(config['dataset'].get('datapath'))
    if not dataset.exists():
        raise FileNotFoundError(dataset.resolve())
    cqt_dataset = config['dataset'].get('cqt_dataset')
    if config['dataset'].get('workspace') is not None:
        workspace = Path(config['dataset'].get('workspace'))
    run_number = config['dataset'].getint('run_number')
    my_audio = dataset / 'audio_external_test'

    # Training configs (only batch_size is used below; the remaining keys are
    # read to mirror the training script's config schema and are currently
    # unused in generation)
    epochs = config['training'].getint('epochs')
    learning_rate = config['training'].getfloat('learning_rate')
    batch_size = config['training'].getint('batch_size')
    train_buf = config['training'].getint('buffer_size')
    buffer_size_dataset = config['training'].getboolean('buffer_size_dataset')
    max_to_keep = config['training'].getint('max_ckpts_to_keep')
    ckpt_epochs = config['training'].getint('checkpoint_epochs')
    continue_training = config['training'].getboolean('continue_training')
    learning_schedule = config['training'].getboolean('learning_schedule')
    save_best_only = config['training'].getboolean('save_best_only')
    early_patience_epoch = config['training'].getint('early_patience_epoch')
    early_delta = config['training'].getfloat('early_delta')
    adam_beta_1 = config['training'].getfloat('adam_beta_1')
    adam_beta_2 = config['training'].getfloat('adam_beta_2')

    # Model configs (unused here; kept for schema parity with training)
    latent_dim_audio = config['VAE'].getint('latent_dim_audio')
    latent_dim_visual_score = config['VAE'].getint('latent_dim_visual_score')
    n_units_audio = config['VAE'].getint('n_units_audio')
    n_units_visual_score = config['VAE'].getint('n_units_visual_score')
    kl_beta = config['VAE'].getfloat('kl_beta')
    batch_normalization = config['VAE'].getboolean('batch_norm')
    VAE_output_activation = config['VAE'].get('output_activation')

    # Extra configs
    example_length = config['extra'].getint('example_length')
    normalize_examples = config['extra'].getboolean('normalize_examples')
    plot_model = config['extra'].getboolean('plot_model')
    desc = config['extra'].get('description')

    start_time = time.time()
    config['extra']['start'] = time.asctime(time.localtime(start_time))

    AUTOTUNE = tf.data.experimental.AUTOTUNE

    # Reparameterization-trick sampling layer.  Defined here so it can be
    # registered as a custom object when deserializing the saved model.
    class Sampling(layers.Layer):
        """Uses (z_mean, z_log_var) to sample z, the vector encoding a digit."""

        def call(self, inputs):
            z_mean, z_log_var = inputs
            batch = tf.shape(z_mean)[0]
            dim = tf.shape(z_mean)[1]
            epsilon = tf.keras.backend.random_normal(shape=(batch, dim))
            return z_mean + tf.exp(0.5 * z_log_var) * epsilon

    # LOAD MODEL **********************
    my_model_path = workdir.joinpath('model', 'mymodel_last.h5')
    print(my_model_path)
    with tf.keras.utils.CustomObjectScope({'Sampling': Sampling}):
        vae = tf.keras.models.load_model(my_model_path)
    vae.summary()

    # Rebuild the encoder sub-models from the loaded graph (summaries are
    # printed for inspection; the sub-models themselves are not used below).
    encoder_audio = tf.keras.Model(
        inputs=vae.get_layer("encoder_input_audio").input,
        outputs=[vae.get_layer("z_mean_audio").output,
                 vae.get_layer("z_log_var_audio").output],
        name='encoder_audio')
    encoder_audio.summary()
    encoder_visual_score = tf.keras.Model(
        inputs=vae.get_layer("encoder_input_visual_score").input,
        outputs=[vae.get_layer("z_mean_visual_score").output,
                 vae.get_layer("z_log_var_visual_score").output],
        name='encoder_visual_score')
    encoder_visual_score.summary()

    # Rebuild the decoder sub-models likewise.
    decoder_audio = tf.keras.Model(
        inputs=vae.get_layer('decoder_audio').input,
        outputs=vae.get_layer('decoder_audio').output,
        name='decoder_audio')
    decoder_audio.summary()
    decoder_visual_score = tf.keras.Model(
        inputs=vae.get_layer('decoder_visual_score').input,
        outputs=vae.get_layer('decoder_visual_score').output,
        name='decoder_visual_score')
    decoder_visual_score.summary()

    workdir2 = Path(os.getcwd() + '/Whole/')

    # Generate examples
    print("Generating examples...")
    my_examples_folder_audio = workdir2.joinpath('Designed_Audio')
    my_examples_folder_visual_score_source = workdir2.joinpath('Manually_Designed_Visual_Scores_npy')
    my_examples_folder_visual_score_recons = workdir2.joinpath('Manually_Designed_Visual_Scores_recons')

    for f in os.listdir(my_examples_folder_visual_score_source):
        example_name = os.path.splitext(f)[0]  # safer than f[:-4] for any extension length
        print("Examples for {}".format(example_name))
        file_path = my_examples_folder_visual_score_source / f
        my_array = np.load(file_path)
        test_dataset = tf.data.Dataset.from_tensor_slices(my_array).batch(batch_size).prefetch(AUTOTUNE)

        # Accumulators are seeded with a dummy all-zeros row so tf.concat has
        # something to append to; the dummy row is stripped before output.
        output_audio = tf.constant(0., dtype='float32', shape=(1, n_bins))
        output_visual_score = tf.constant(0., dtype='float32', shape=(1, 249, 3))

        print("Working on regenerating cqt magnitudes with the DL model")
        for step, x_batch_train in enumerate(test_dataset):
            print(x_batch_train.shape)
            # random uniform is just a dull tensor for the other modality.
            # NOTE(review): this assumes the audio encoder input is 384-dim
            # and that a batch of 1 is acceptable alongside x_batch_train's
            # batch dimension -- confirm against the training script.
            reconstructed_whole = vae([tf.random.uniform(shape=(1, 384)), x_batch_train])
            reconstructed_audio = reconstructed_whole[0]
            reconstructed_visual_score = reconstructed_whole[1]
            output_audio = tf.concat([output_audio, reconstructed_audio], 0)
            output_visual_score = tf.concat([output_visual_score, reconstructed_visual_score], 0)

        # Reconstructed visual score.
        # BUG FIX: drop the dummy zero seed row (index 0) before writing --
        # previously it was baked into the saved image.
        os.makedirs(my_examples_folder_visual_score_recons, exist_ok=True)
        image = output_visual_score.numpy()[1:]
        cv2.imwrite(str(my_examples_folder_visual_score_recons) + '/' + example_name + '.jpeg',
                    image * 256)

        # Reconstructed audio.
        # BUG FIX: the dummy zero row must be removed BEFORE transposing.
        # The old code did np.transpose(...)[1:], which slices the frequency
        # axis after the transpose -- discarding the lowest CQT bin while
        # keeping the dummy zero frame.  Slice the time axis first, then
        # transpose into the (n_bins, n_frames) layout griffinlim_cqt expects.
        fs = sample_rate
        output_np = np.transpose(output_audio.numpy()[1:])
        output_inv_32 = librosa.griffinlim_cqt(output_np,
                                               sr=fs, n_iter=n_iter, hop_length=hop_length,
                                               bins_per_octave=bins_per_octave, dtype=np.float32)
        if normalize_examples:
            output_inv_32 = librosa.util.normalize(output_inv_32)

        print("Saving audio files...")
        my_audio_out_fold = my_examples_folder_audio.joinpath(example_name)
        os.makedirs(my_audio_out_fold, exist_ok=True)
        sf.write(my_audio_out_fold.joinpath(example_name + '.wav'),
                 output_inv_32, sample_rate)