Spaces: Running on Zero
Nithya committed
Commit 98eb218 · 1 Parent(s): 3752793
updated parent repo and restructured things
Files changed:
- .gitattributes +0 -35
- .gitignore +0 -1
- app.py +65 -150
- models/diffusion_pitch/config.gin +36 -35
- models/pitch_to_audio/config.gin +39 -36
- requirements.txt +1 -19
- src/dataset.py +0 -312
- src/generate_utils.py +0 -88
- src/model.py +0 -1130
- src/pitch_to_audio_utils.py +0 -121
- src/preprocess_utils.py +0 -127
- src/process_encodec.py +0 -22
- src/utils.py +0 -65
.gitattributes
DELETED
@@ -1,35 +0,0 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore
DELETED
@@ -1 +0,0 @@
-src/__pycache__/
app.py
CHANGED
@@ -1,91 +1,28 @@
 import spaces
-from gradio import Interface, Audio
 import gradio as gr
 import numpy as np
 import torch
-import subprocess
 import librosa
 import matplotlib.pyplot as plt
 import pandas as pd
 import os
 from functools import partial
 import gin
-import
-
-from
-import src.pitch_to_audio_utils as p2a
+from gamadhani.utils.generate_utils import load_pitch_fns, load_audio_fns
+import gamadhani.utils.pitch_to_audio_utils as p2a
+from gamadhani.utils.utils import get_device
 import torchaudio
 from absl import app
 from torch.nn.functional import interpolate
-import pdb
 import logging
 import crepe
 from hmmlearn import hmm
-import time
 import soundfile as sf
+import pdb

 pitch_path = 'models/diffusion_pitch/'
-# pitch_path = '/network/scratch/n/nithya.shikarpur/checkpoints/pitch-diffusion/corrected-attention-v3/4833583'
 audio_path = 'models/pitch_to_audio/'
-
-# db_path_audio = '/home/mila/n/nithya.shikarpur/scratch/pitch-diffusion/data/merged_data-finalest/cached-audio-pitch-16k'
-
-device = 'cuda'
+device = get_device()
-
-global_ind = -1
-global_audios = np.array([0.0])
-global_pitches = np.array([0])
-singer = 3
-audio_components = []
-preprocessed_primes = []
-selected_prime = None
-
-
-
-def make_prime_npz(prime):
-    np.savez('./temp/prime.npz', concatenated_array=[[prime]])
-
-def load_pitch_fns():
-    pitch_model, pitch_qt, _, pitch_task_fn = load_pitch_model(
-        os.path.join(pitch_path, 'config.gin'),
-        os.path.join(pitch_path, 'last.ckpt'),
-        os.path.join(pitch_path, 'qt.joblib'),
-        device=device
-    )
-    invert_pitch_fn = partial(
-        invert_pitch_read,
-        min_norm_pitch=gin.query_parameter('dataset.pitch_read_w_downsample.min_norm_pitch'),
-        time_downsample=gin.query_parameter('dataset.pitch_read_w_downsample.time_downsample'),
-        pitch_downsample=gin.query_parameter('dataset.pitch_read_w_downsample.pitch_downsample'),
-        qt_transform=pitch_qt,
-        min_clip=gin.query_parameter('dataset.pitch_read_w_downsample.min_clip'),
-        max_clip=gin.query_parameter('dataset.pitch_read_w_downsample.max_clip')
-    )
-    return pitch_model, pitch_qt, pitch_task_fn, invert_pitch_fn
-
-def interpolate_pitch(pitch, audio_seq_len):
-    pitch = interpolate(pitch, size=audio_seq_len, mode='linear')
-    # plt.plot(pitch[0].squeeze(0).detach().cpu().numpy())
-    # plt.savefig(f"./temp/interpolated_pitch.png")
-    # plt.close()
-    return pitch
-
-def load_audio_fns():
-    ckpt = os.path.join(audio_path, 'last.ckpt')
-    config = os.path.join(audio_path, 'config.gin')
-    qt = os.path.join(audio_path, 'qt.joblib')
-    # qt = '/home/mila/n/nithya.shikarpur/scratch/pitch-diffusion/data/merged_data-finalest/cached-audio-pitch-16k/qt.joblib'
-
-    audio_model, audio_qt = load_audio_model(config, ckpt, qt, device=device)
-    audio_seq_len = gin.query_parameter('%AUDIO_SEQ_LEN')
-
-    invert_audio_fn = partial(
-        p2a.normalized_mels_to_audio,
-        qt=audio_qt,
-        n_iter=200
-    )
-
-    return audio_model, audio_qt, audio_seq_len, invert_audio_fn

 def predict_voicing(confidence):
     # https://github.com/marl/crepe/pull/26
@@ -136,73 +73,67 @@ def extract_pitch(audio, unvoice=True, sr=16000, frame_shift_ms=10, log=True):

     return time, f0, confidence

-def
-
-
+def generate_pitch_reinterp(pitch, pitch_model, invert_pitch_fn, num_samples, num_steps, noise_std=0.4):
+    '''Generate pitch values for the melodic reinterpretation task'''
+    # hardcoding the amount of noise to be added
+    noisy_pitch = torch.Tensor(pitch[:, :, -1200:]).to(pitch_model.device) + (torch.normal(mean=0.0, std=noise_std*torch.ones((1200)))).to(pitch_model.device)
+    noisy_pitch = torch.clamp(noisy_pitch, -5.19, 5.19) # clipping the pitch values to be within the range of the model
     samples = pitch_model.sample_sdedit(noisy_pitch, num_samples, num_steps)
-    inverted_pitches = [invert_pitch_fn(samples.detach().cpu().numpy()[0])[0]]
+    inverted_pitches = [invert_pitch_fn(f0=samples.detach().cpu().numpy()[0])[0]] # pitch values in Hz

-    if outfolder is not None:
-        os.makedirs(outfolder, exist_ok=True)
-        # pdb.set_trace()
-        for i, pitch in enumerate(inverted_pitches):
-            flattened_pitch = pitch.flatten()
-            pd.DataFrame({'f0': flattened_pitch}).to_csv(f"{outfolder}/{i}.csv", index=False)
-            plt.plot(np.where(flattened_pitch == 0, np.nan, flattened_pitch))
-            plt.savefig(f"{outfolder}/{i}.png")
-            plt.close()
     return samples, inverted_pitches

-def generate_audio(audio_model, f0s, invert_audio_fn,
+def generate_audio(audio_model, f0s, invert_audio_fn, singers=[3], num_steps=100):
+    '''Generate audio given pitch values'''
     singer_tensor = torch.tensor(np.repeat(singers, repeats=f0s.shape[0])).to(audio_model.device)
     samples, _, singers = audio_model.sample_cfg(f0s.shape[0], f0=f0s, num_steps=num_steps, singer=singer_tensor, strength=3)
     audio = invert_audio_fn(samples)
-
-    if outfolder is not None:
-        os.makedirs(outfolder, exist_ok=True)
-        for i, a in enumerate(audio):
-            logging.log(logging.INFO, f"Saving audio {i}")
-            torchaudio.save(f"{outfolder}/{i}.wav", torch.tensor(a).detach().unsqueeze(0).cpu(), 16000)
+
     return audio

 @spaces.GPU(duration=120)
-def generate(pitch, num_samples=
-
-    global preprocessed_primes
-    # pdb.set_trace()
+def generate(pitch, num_samples=1, num_steps=100, singers=[3], outfolder='temp', audio_seq_len=750, pitch_qt=None ):
+
     logging.log(logging.INFO, 'Generate function')
-    pitch, inverted_pitch =
+    pitch, inverted_pitch = generate_pitch_reinterp(pitch, pitch_model, invert_pitch_fn, num_samples=num_samples, num_steps=100)
     if pitch_qt is not None:
+        # if there is not pitch quantile transformer, undo the default quantile transformation that occurs
         def undo_qt(x, min_clip=200):
             pitch = pitch_qt.inverse_transform(x.reshape(-1, 1)).reshape(1, -1)
             pitch = np.around(pitch) # round to nearest integer, done in preprocessing of pitch contour fed into model
             pitch[pitch < 200] = np.nan
             return pitch
         pitch = torch.tensor(np.array([undo_qt(x) for x in pitch.detach().cpu().numpy()])).to(pitch_model.device)
-    interpolated_pitch = interpolate_pitch(pitch=pitch, audio_seq_len=audio_seq_len)
-    interpolated_pitch = torch.nan_to_num(interpolated_pitch, nan=196)
+    interpolated_pitch = p2a.interpolate_pitch(pitch=pitch, audio_seq_len=audio_seq_len) # interpolate pitch values to match the audio model's input size
+    interpolated_pitch = torch.nan_to_num(interpolated_pitch, nan=196) # replace nan values with silent token
     interpolated_pitch = interpolated_pitch.squeeze(1) # to match input size by removing the extra dimension
-    audio = generate_audio(audio_model, interpolated_pitch, invert_audio_fn, singers=singers, num_steps=100
-
-    audio = audio.detach().cpu().numpy()[:, :]
+    audio = generate_audio(audio_model, interpolated_pitch, invert_audio_fn, singers=singers, num_steps=100)
+    audio = audio.detach().cpu().numpy()
     pitch = pitch.detach().cpu().numpy()
-    # state = [(16000, audio[0]), (16000, audio[1])]
-    # pdb.set_trace()
     pitch_vals = np.where(pitch[0][:, 0] == 0, np.nan, pitch[0].flatten())
-    fig1 = plt.figure()
-    # plt.plot(np.arange(0, 400), pitch_vals[:400], figure=fig1, label='User Input')
-    plt.plot(pitch_vals, figure=fig1, label='Pitch')
-    # plt.legend(fig1)
-    # state.append(fig1)
-    plt.close(fig1)
-    return (16000, audio[0]), fig1, pitch_vals
-
-
-
+
+    # generate plot of model output to display on interface
+    model_output_plot = plt.figure()
+    plt.plot(pitch_vals, figure=model_output_plot, label='Model Output')
+    plt.close(model_output_plot)
+    return (16000, audio[0]), model_output_plot, pitch_vals
+
+# pdb.set_trace()
+pitch_model, pitch_qt, pitch_task_fn, invert_pitch_fn, _ = load_pitch_fns(
+    os.path.join(pitch_path, 'last.ckpt'), \
+    model_type = 'diffusion', \
+    config_path = os.path.join(pitch_path, 'config.gin'), \
+    qt_path = os.path.join(pitch_path, 'qt.joblib'), \
+    )
+audio_model, audio_qt, audio_seq_len, invert_audio_fn = load_audio_fns(
+    os.path.join(audio_path, 'last.ckpt'),
+    qt_path = os.path.join(audio_path, 'qt.joblib'),
+    config_path = os.path.join(audio_path, 'config.gin')
+)
+partial_generate = partial(generate, num_samples=1, num_steps=100, singers=[3], outfolder=None, pitch_qt=pitch_qt) # generate function with default arguments

-@spaces.GPU(duration=
-def
+@spaces.GPU(duration=120)
+def set_guide_and_generate(audio):
     global selected_prime, pitch_task_fn

     if audio is None:
@@ -215,40 +146,32 @@ def set_prime_and_generate(audio, full_pitch, full_audio, full_user):
     audio /= np.max(np.abs(audio))
     audio = librosa.resample(audio, orig_sr=sr, target_sr=16000) # convert only last 4 s
     mic_audio = audio.copy()
-    audio = audio[-12*16000:]
+    audio = audio[-12*16000:] # consider only last 12 s
     _, f0, _ = extract_pitch(audio)
-    mic_f0 = f0.copy()
-    f0 = pitch_task_fn({
-        '
-        '
-
-
+    mic_f0 = f0.copy() # save the user input pitch values
+    f0 = pitch_task_fn(**{
+        'inputs': {
+            'pitch': {
+                'data': torch.Tensor(f0), # task function expects a tensor
+                'sampling_rate': 100
+            }
+        },
+        'qt_transform': pitch_qt,
+        'time_downsample': 1, # pitch will be extracted at 100 Hz, thus no downsampling
+        'seq_len': None,
+    })['sampled_sequence']
+    # pdb.set_trace()
     f0 = f0.reshape(1, 1, -1)
     f0 = torch.tensor(f0).to(pitch_model.device).float()
-    audio, pitch,
-
-
-
-
-
-
-    plt.plot(np.arange(0, len(mic_f0)), mic_f0, label='User Input', figure=fig)
-    plt.close(fig)
-    return audio, full_pitch, full_audio, full_user, pitch
-
-def save_session(full_pitch, full_audio, full_user):
-    pass
-    # os.makedirs(output_folder, exist_ok=True)
-    # filename = f'session-{time.time()}'
-    # logging.log(logging.INFO, f"Saving session to {filename}")
-    # pd.DataFrame({'pitch': full_pitch, 'time': np.arange(0, len(full_pitch)/100, 0.01), 'user': full_user}).to_csv(os.path.join(output_folder, filename + '.csv'), index=False)
-    # sf.write(os.path.join(output_folder, filename + '.wav'), full_audio[1], 16000)
+    audio, pitch, _ = partial_generate(f0)
+    mic_f0 = np.where(mic_f0 == 0, np.nan, mic_f0)
+    # plot user input
+    user_input_plot = plt.figure()
+    plt.plot(np.arange(0, len(mic_f0)), mic_f0, label='User Input', figure=user_input_plot)
+    plt.close(user_input_plot)
+    return audio, user_input_plot, pitch

 with gr.Blocks() as demo:
-    full_audio = gr.State((16000, np.array([])))
-    full_pitch = gr.State(np.array([]))
-    full_user = gr.State(np.array([]))
     with gr.Row():
         with gr.Column():
             audio = gr.Audio(label="Input")
@@ -257,17 +180,9 @@ with gr.Blocks() as demo:
         with gr.Column():
             generated_audio = gr.Audio(label="Generated Audio")
             generated_pitch = gr.Plot(label="Generated Pitch")
-        sbmt.click(
-        save = gr.Button("Save Session")
-        save.click(save_session, inputs=[full_pitch, full_audio, full_user])
-
-
+        sbmt.click(set_guide_and_generate, inputs=[audio], outputs=[generated_audio, user_input, generated_pitch])

 def main(argv):
-    # audio = np.random.randint(0, high=128, size=(44100*5), dtype=np.int16)
-    # sr = 44100
-    # pdb.set_trace()
-    # p, a = set_prime_and_generate((sr, audio))

     demo.launch(share=True)

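Note: the net effect of this app.py change is that model loading moves out of local helper functions and into the GaMaDHaNi package. A minimal standalone sketch of the loading path, using only calls and keyword names that appear in the diff above (treat it as illustrative of this commit, not a stable API):

import os
from gamadhani.utils.generate_utils import load_pitch_fns, load_audio_fns

pitch_path = 'models/diffusion_pitch/'
audio_path = 'models/pitch_to_audio/'

# Pitch diffusion model, its quantile transform, task function, and inverse.
pitch_model, pitch_qt, pitch_task_fn, invert_pitch_fn, _ = load_pitch_fns(
    os.path.join(pitch_path, 'last.ckpt'),
    model_type='diffusion',
    config_path=os.path.join(pitch_path, 'config.gin'),
    qt_path=os.path.join(pitch_path, 'qt.joblib'),
)

# Pitch-to-audio model plus the mel-inversion function used to render audio.
audio_model, audio_qt, audio_seq_len, invert_audio_fn = load_audio_fns(
    os.path.join(audio_path, 'last.ckpt'),
    qt_path=os.path.join(audio_path, 'qt.joblib'),
    config_path=os.path.join(audio_path, 'config.gin'),
)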
models/diffusion_pitch/config.gin
CHANGED
@@ -1,7 +1,9 @@
 from __gin__ import dynamic_registration
-from
-from src import
-from src import
+from gamadhani import src
+from gamadhani.src import dataset
+from gamadhani.src import model_diffusion
+from gamadhani.src import task_functions
+from gamadhani.utils import utils
 import torch

 # Macros:
@@ -23,47 +25,46 @@ utils.build_warmed_exponential_lr_scheduler.eta_min = 0.1
 utils.build_warmed_exponential_lr_scheduler.peak_iteration = 10000
 utils.build_warmed_exponential_lr_scheduler.start_factor = 0.01

-# Parameters for
+# Parameters for model_diffusion.UNetBase.configure_optimizers:
 # ==============================================================================
-
-
+model_diffusion.UNetBase.configure_optimizers.optimizer_cls = @torch.optim.AdamW
+model_diffusion.UNetBase.configure_optimizers.scheduler_cls = \
 @utils.build_warmed_exponential_lr_scheduler

-# Parameters for dataset.
+# Parameters for dataset.Task:
 # ==============================================================================
-dataset.
-
-
-
-
-
-
+src.dataset.Task.kwargs = {
+    "decoder_key" : 'pitch',
+    "max_clip" : 600,
+    "min_clip" : 200,
+    "min_norm_pitch" : -4915,
+    "pitch_downsample" : 10,
+    "seq_len" : %SEQ_LEN,
+    "time_downsample" : 2}
+

 # Parameters for train/dataset.pitch_read_w_downsample:
 # ==============================================================================
-train/dataset.
+# train/dataset.Task.kwargs = {"transpose_pitch": %TRANSPOSE_VALUE}

-# Parameters for train/dataset.
+# Parameters for train/dataset.Task:
 # ==============================================================================
-
+src.dataset.Task.read_fn = @src.task_functions.pitch_read_downsample_diff
+src.dataset.Task.invert_fn = @src.task_functions.invert_pitch_read_downsample_diff

-# Parameters for val/dataset.SequenceDataset:
-# ==============================================================================
-val/dataset.SequenceDataset.task_fn = @dataset.pitch_read_w_downsample

-# Parameters for
+# Parameters for model_diffusion.UNet:
 # ==============================================================================
-
-
-
-
-
-
-
-
-
-
-
-
+model_diffusion.UNet.dropout = 0.3
+model_diffusion.UNet.features = [512, 640, 1024]
+model_diffusion.UNet.inp_dim = 1
+model_diffusion.UNet.kernel_size = 5
+model_diffusion.UNet.nonlinearity = 'mish'
+model_diffusion.UNet.norm = True
+model_diffusion.UNet.num_attns = 4
+model_diffusion.UNet.num_convs = 4
+model_diffusion.UNet.num_heads = 8
+model_diffusion.UNet.project_dim = 256
+model_diffusion.UNet.seq_len = %SEQ_LEN
+model_diffusion.UNet.strides = [4, 2, 2]
+model_diffusion.UNet.time_dim = 128
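Note: this config is consumed through gin's dynamic registration, just as the old src/generate_utils.py (deleted below) did with gin.parse_config_file. A minimal sketch of how the bindings take effect; the module path is assumed from the imports at the top of the config:

import gin

# Parsing registers every `module.Class.param = value` binding, so a later
# zero-argument constructor call picks up dropout, features, strides, etc.
gin.parse_config_file('models/diffusion_pitch/config.gin')

from gamadhani.src.model_diffusion import UNet
model = UNet()  # configured entirely by the gin file above

# Macros such as %SEQ_LEN can also be read back directly:
seq_len = gin.query_parameter('%SEQ_LEN')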
models/pitch_to_audio/config.gin
CHANGED
@@ -1,8 +1,9 @@
 from __gin__ import dynamic_registration
-from
-from src import
-from src import
-from
+from gamadhani import src
+from gamadhani.src import dataset
+from gamadhani.src import model_diffusion
+from gamadhani.utils import pitch_to_audio_utils
+from gamadhani.utils import utils
 import torch

 # Macros:
@@ -27,10 +28,10 @@ utils.build_warmed_exponential_lr_scheduler.eta_min = 0.1
 utils.build_warmed_exponential_lr_scheduler.peak_iteration = 10000
 utils.build_warmed_exponential_lr_scheduler.start_factor = 0.01

-# Parameters for
+# Parameters for model_diffusion.UNetPitchConditioned.configure_optimizers:
 # ==============================================================================
-
-
+model_diffusion.UNetPitchConditioned.configure_optimizers.optimizer_cls = @torch.optim.AdamW
+model_diffusion.UNetPitchConditioned.configure_optimizers.scheduler_cls = \
 @utils.build_warmed_exponential_lr_scheduler

 # Parameters for pitch_to_audio_utils.from_mels:
@@ -39,11 +40,6 @@ pitch_to_audio_utils.from_mels.nfft = %NFFT
 pitch_to_audio_utils.from_mels.num_mels = %NUM_MELS
 pitch_to_audio_utils.from_mels.sr = %SR

-# Parameters for dataset.load_cached_dataset:
-# ==============================================================================
-dataset.load_cached_dataset.audio_len = %AUDIO_SEQ_LEN
-dataset.load_cached_dataset.return_singer = %SINGER_CONDITIONING
-
 # Parameters for pitch_to_audio_utils.normalized_mels_to_audio:
 # ==============================================================================
 pitch_to_audio_utils.normalized_mels_to_audio.n_iter = 100
@@ -53,7 +49,13 @@ pitch_to_audio_utils.normalized_mels_to_audio.sr = %SR

 # Parameters for dataset.SequenceDataset:
 # ==============================================================================
-dataset.SequenceDataset.
+dataset.SequenceDataset.task = @dataset.Task()
+
+# Parameters for dataset.Task:
+# ==============================================================================
+dataset.Task.read_fn = @dataset.load_cached_dataset
+dataset.Task.kwargs = {"audio_len": %AUDIO_SEQ_LEN,
+    "return_singer": %SINGER_CONDITIONING}

 # Parameters for pitch_to_audio_utils.torch_gl:
 # ==============================================================================
@@ -65,27 +67,28 @@ pitch_to_audio_utils.torch_gl.sr = %SR
 # ==============================================================================
 pitch_to_audio_utils.torch_istft.nfft = %NFFT

-# Parameters for
+# Parameters for model_diffusion.UNetPitchConditioned:
 # ==============================================================================
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+model_diffusion.UNetPitchConditioned.audio_seq_len = %AUDIO_SEQ_LEN
+model_diffusion.UNetPitchConditioned.cfg = True
+model_diffusion.UNetPitchConditioned.cond_drop_prob = 0.2
+model_diffusion.UNetPitchConditioned.dropout = 0.3
+model_diffusion.UNetPitchConditioned.f0_dim = 128
+model_diffusion.UNetPitchConditioned.features = [512, 640, 1024]
+model_diffusion.UNetPitchConditioned.inp_dim = %NUM_MELS
+model_diffusion.UNetPitchConditioned.kernel_size = 5
+model_diffusion.UNetPitchConditioned.log_samples_every = 10
+model_diffusion.UNetPitchConditioned.log_wandb_samples_every = 50
+model_diffusion.UNetPitchConditioned.loss_w_padding = True
+model_diffusion.UNetPitchConditioned.nonlinearity = 'mish'
+model_diffusion.UNetPitchConditioned.norm = False
+model_diffusion.UNetPitchConditioned.num_attns = 4
+model_diffusion.UNetPitchConditioned.num_convs = 4
+model_diffusion.UNetPitchConditioned.num_heads = 8
+model_diffusion.UNetPitchConditioned.project_dim = 256
+model_diffusion.UNetPitchConditioned.singer_conditioning = %SINGER_CONDITIONING
+model_diffusion.UNetPitchConditioned.singer_dim = 128
+model_diffusion.UNetPitchConditioned.singer_vocab = 55
+model_diffusion.UNetPitchConditioned.sr = %SR
+model_diffusion.UNetPitchConditioned.strides = [4, 2, 2]
+model_diffusion.UNetPitchConditioned.time_dim = 128
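Note: the `dataset.SequenceDataset.task = @dataset.Task()` binding uses gin's `@...()` form, which constructs a Task instance (with the read_fn and kwargs bound below it) and passes it in as the `task` argument. A rough hand-written equivalent, assuming the gamadhani constructors mirror the deleted src/dataset.py shown further down; the macro values and db_path here are placeholders, not the repo's actual settings:

from gamadhani.src import dataset

AUDIO_SEQ_LEN = 16000       # placeholder for the %AUDIO_SEQ_LEN macro
SINGER_CONDITIONING = True  # placeholder for the %SINGER_CONDITIONING macro

# What gin assembles from the bindings above, written out by hand.
task = dataset.Task(
    read_fn=dataset.load_cached_dataset,
    kwargs={"audio_len": AUDIO_SEQ_LEN, "return_singer": SINGER_CONDITIONING},
)
ds = dataset.SequenceDataset(db_path='path/to/lmdb', task=task)  # db_path assumed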
requirements.txt
CHANGED
@@ -1,22 +1,4 @@
-absl_py==1.4.0
-einops==0.8.0
-gin_config==0.5.0
-joblib==1.2.0
-librosa==0.10.0
-lmdb==1.4.1
-matplotlib==3.9.2
-numpy==1.24.4
-pandas==2.0.3
-protobuf==3.20.3
-pytorch_lightning==1.9.0
-scikit_learn==1.2.0
-setuptools==67.8.0
-torch==2.4.0
-torchaudio==2.4.0
-tqdm==4.65.0
-wandb==0.15.4
-x_transformers==1.30.2
 crepe==0.0.15
 hmmlearn==0.3.2
 tensorflow==2.17.0
-
+GaMaDHaNi @ git+https://github.com/snnithya/GaMaDHaNi.git@782dde8f48ff15a50394bcc7506df1ece0e0310e
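Note: the dependency list collapses because the vendored src/ tree (deleted below) now ships inside the parent repo, installed via the `name @ git+URL@sha` form, a standard PEP 508 direct reference pinned to one commit. Reproducing the environment is just:

pip install -r requirements.txt
# or, equivalently, installing the parent repo at the exact pinned commit:
pip install "GaMaDHaNi @ git+https://github.com/snnithya/GaMaDHaNi.git@782dde8f48ff15a50394bcc7506df1ece0e0310e"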
src/dataset.py
DELETED
@@ -1,312 +0,0 @@
-from typing import Callable, Dict, Optional, Tuple
-import lmdb
-import torch
-import pdb
-import numpy as np
-from torch.utils.data import Dataset
-from random import randint
-from sklearn.preprocessing import QuantileTransformer
-# from protobuf.data_example import AudioExample
-import gin
-import sys
-import src.pitch_to_audio_utils as p2a
-
-TensorDict = Dict[str, torch.Tensor]
-
-@gin.configurable
-class SequenceDataset(Dataset):
-
-    def __init__(
-        self,
-        db_path: str,
-        task_fn: Optional[Callable[[TensorDict], TensorDict]] = None,
-        device: Optional[torch.device] = None
-    ) -> None:
-        super().__init__()
-        self._env = None
-        self._keys = None
-        self._db_path = db_path
-        self.task_fn = task_fn
-        self.device = device
-
-    def __len__(self):
-        return len(self.keys)
-
-    def __getitem__(self, index):
-        # pdb.set_trace()
-        with self.env.begin() as txn:
-            ae = AudioExample(txn.get(self.keys[index]))
-        ae = ae.as_dict()
-        if self.task_fn is not None:
-            ae = self.task_fn(ae)
-        if self.device is not None:
-            ae = {k: torch.tensor(v, device=self.device) for k, v in ae.items()}
-        return ae
-
-    @property
-    def env(self):
-        if self._env is None:
-            self._env = lmdb.open(
-                self._db_path,
-                lock=False,
-                readahead=False,
-            )
-        return self._env
-
-    @property
-    def keys(self):
-        if self._keys is None:
-            with self.env.begin(write=False) as txn:
-                self._keys = list(txn.cursor().iternext(values=False))
-            self._keys = self._keys
-        return self._keys
-
-class MelPitchDataLoader(torch.utils.data.DataLoader):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
-    def __iter__(self):
-        for batch in super().__iter__():
-            # Apply online transform to each sample in the batch
-            audio, f0 = batch
-
-            # generate mel spectrogram
-            mel = p2a.audio_to_normalized_mels(audio) # doing mel conversion here since it is done in a batch and thus presumably faster
-
-            yield zip(mel, f0)
-
-@gin.configurable
-def pitch_read_w_downsample(
-    inputs: TensorDict,
-    seq_len: int,
-    decoder_key: str,
-    min_norm_pitch: int,
-    transpose_pitch: Optional[int] = None,
-    time_downsample: int = 1,
-    pitch_downsample: int = 1,
-    qt_transform: Optional[QuantileTransformer] = None,
-    min_clip: int = 200,
-    max_clip: int = 600,
-    add_noise_to_silence: bool = False
-):
-    # pdb.set_trace()
-    # print(min_norm_pitch, seq_len, transpose_pitch, qt_transform)
-    data = inputs[decoder_key]["data"]
-    if seq_len is not None:
-        start = randint(0, data.shape[0] - seq_len*time_downsample - 1)
-        end = start + seq_len*time_downsample
-        f0 = inputs[decoder_key]['data'][start:end:time_downsample].copy()
-    else:
-        f0 = data.copy()
-
-    # normalize pitch
-    f0[f0 == 0] = np.nan
-    norm_f0 = f0.copy()
-    norm_f0[~np.isnan(norm_f0)] = (1200) * np.log2(norm_f0[~np.isnan(norm_f0)] / 440)
-    del f0
-
-    # descretize pitch
-    norm_f0[~np.isnan(norm_f0)] = np.around(norm_f0[~np.isnan(norm_f0)])
-    norm_f0[~np.isnan(norm_f0)] = norm_f0[~np.isnan(norm_f0)] - (min_norm_pitch)
-
-    norm_f0[~np.isnan(norm_f0)] = norm_f0[~np.isnan(norm_f0)] // pitch_downsample + 1 # reserve 0 for silence
-    # data augmentation
-    if transpose_pitch:
-        transpose_amt = randint(-transpose_pitch, transpose_pitch) # in cents
-        transposed_values = norm_f0[~np.isnan(norm_f0)] + (transpose_amt//pitch_downsample)
-        norm_f0[~np.isnan(norm_f0)] = transposed_values
-
-    # clip values HACK to change
-    norm_f0[~np.isnan(norm_f0)] = np.clip(norm_f0[~np.isnan(norm_f0)], min_clip, max_clip)
-
-    # add silence token of min_clip - 4
-    if add_noise_to_silence:
-        norm_f0[np.isnan(norm_f0)] = min_clip - 4 + np.clip(np.random.normal(size=norm_f0[np.isnan(norm_f0)].shape), -3, 3) # making sure noise is between -3 and 3 and thus won't spill into pitched values
-    else:
-        norm_f0[np.isnan(norm_f0)] = min_clip - 4
-
-    if qt_transform:
-        qt_inp = norm_f0.reshape(-1, 1)
-        norm_f0 = qt_transform.transform(qt_inp).reshape(-1)
-
-    return norm_f0.reshape(1, -1)
-
-def hz_to_cents(f0, ref=440, min_norm_pitch=0, pitch_downsample=1, min_clip=200, max_clip=600, silence_token=None):
-    # pdb.set_trace()
-    f0[f0 == 0] = np.nan
-    norm_f0 = f0.copy()
-    norm_f0[~np.isnan(norm_f0)] = (1200) * np.log2(norm_f0[~np.isnan(norm_f0)] / ref)
-    # descretize pitch
-    norm_f0[~np.isnan(norm_f0)] = np.around(norm_f0[~np.isnan(norm_f0)])
-    norm_f0[~np.isnan(norm_f0)] = norm_f0[~np.isnan(norm_f0)] - (min_norm_pitch)
-    norm_f0[~np.isnan(norm_f0)] = norm_f0[~np.isnan(norm_f0)] // pitch_downsample + 1 # reserve 0 for silence
-    norm_f0[~np.isnan(norm_f0)] = np.clip(norm_f0[~np.isnan(norm_f0)], min_clip, max_clip) #HACK
-    if silence_token is not None:
-        norm_f0[np.isnan(norm_f0)] = silence_token
-
-
-
-    return norm_f0
-
-@gin.configurable
-def mel_pitch(
-    inputs: TensorDict,
-    min_norm_pitch: int,
-    audio_seq_len: int=None,
-    pitch_downsample: int = 1,
-    qt_transform: Optional[QuantileTransformer] = None,
-    min_clip: int = 200,
-    max_clip: int = 600,
-    nfft: int = 2048,
-    convert_audio_to_mel: bool = False
-):
-    hop_size = nfft // 4
-    audio_data = inputs['audio']['data']
-    audio_sr = inputs['audio']['sampling_rate']
-    pitch_data = inputs['pitch']['data']
-    pitch_sr = inputs['pitch']['sampling_rate']
-    # pdb.set_trace()
-    if audio_seq_len is not None:
-        # if audio_seq_len is given, cuts audio/pitch else returns the entire chunk
-        pitch_seq_len = np.around((audio_seq_len/audio_sr) * pitch_sr ).astype(int)
-        pitch_start = randint(0, pitch_data.shape[0] - pitch_seq_len - 1)
-        pitch_end = pitch_start + pitch_seq_len
-        pitch_data = pitch_data[pitch_start:pitch_end]
-        audio_start = np.around(pitch_start * audio_sr // pitch_sr).astype(int)
-        audio_end = np.around(audio_start + audio_seq_len).astype(int)
-        # pdb.set_trace()
-        audio_data = audio_data[audio_start:audio_end]
-    else:
-        pitch_seq_len = np.around((audio_data.shape[0]/audio_sr) * pitch_sr ).astype(int)
-    audio_data = p2a.audio_to_normalized_mels(torch.Tensor(audio_data).unsqueeze(0), qt=qt_transform).numpy()[0]
-
-    pitch_data = hz_to_cents(pitch_data, min_norm_pitch=min_norm_pitch, pitch_downsample=pitch_downsample, min_clip=min_clip, max_clip=max_clip)
-
-    if audio_seq_len is not None:
-        # linearly interpolate pitch data to match audio sequence length, if audio_seq_len is given
-        pitch_inds = np.linspace(0, pitch_data.shape[0], num=audio_seq_len//hop_size, endpoint=False) #check here
-        pitch_data = np.interp(pitch_inds, np.arange(0, pitch_data.shape[0]), pitch_data)
-
-    # replace nan (aka silences) with min_clip - 4
-    pitch_data[np.isnan(pitch_data)] = min_clip - 4
-
-    return audio_data, pitch_data
-def running_average(signal, window_size):
-
-    weights = np.ones(int(window_size)) / window_size
-    pad_width = len(weights) // 2
-    padded_signal = np.pad(signal, pad_width, mode='symmetric')
-    # Perform the convolution
-    smoothed_signal = np.convolve(padded_signal, weights, mode='valid')
-    if window_size % 2 == 0:
-        smoothed_signal = smoothed_signal[:-1]
-    return smoothed_signal
-
-@gin.configurable
-def pitch_coarse_condition(
-    inputs: TensorDict,
-    min_norm_pitch: int,
-    pitch_seq_len: int=None,
-    pitch_downsample: int = 1,
-    time_downsample: int = 1,
-    qt_transform: Optional[QuantileTransformer] = None,
-    min_clip: int = 200,
-    max_clip: int = 600,
-    add_noise: bool = True,
-    avg_window_size: float = 1 # window size in seconds
-):
-
-    pitch_data = inputs['pitch']['data']
-    if pitch_seq_len is not None:
-        pitch_start = randint(0, pitch_data.shape[0] - pitch_seq_len*time_downsample - 1)
-        pitch_end = pitch_start + pitch_seq_len*time_downsample
-        pitch_data = pitch_data[pitch_start:pitch_end:time_downsample]
-    pitch_data = hz_to_cents(pitch_data, min_norm_pitch=min_norm_pitch, pitch_downsample=pitch_downsample, min_clip=min_clip, max_clip=max_clip)
-
-    # extract coarse pitch condition
-    pitch_sr = inputs['pitch']['sampling_rate'] // time_downsample
-    avg_pitch = running_average(pitch_data, np.around(pitch_sr * avg_window_size).astype(int))
-    # replace nan (aka silences) with min_clip - 4
-    if add_noise:
-        pitch_data[np.isnan(pitch_data)] = min_clip - 4 + np.clip(np.random.normal(size=pitch_data[np.isnan(pitch_data)].shape), -3, 3) # making sure noise is between -3 and 3 and thus won't spill into pitched values
-        avg_pitch[np.isnan(avg_pitch)] = min_clip - 4 + np.clip(np.random.normal(size=avg_pitch[np.isnan(avg_pitch)].shape), -3, 3) # making sure noise is between -3 and 3 and thus won't spill into pitched values
-    else:
-        pitch_data[np.isnan(pitch_data)] = min_clip - 4
-
-    if qt_transform:
-        # apply qt transform
-        qt_inp = pitch_data.reshape(-1, 1)
-        pitch_data = qt_transform.transform(qt_inp).reshape(-1)
-        avg_qt_inp = avg_pitch.reshape(-1, 1)
-        avg_pitch = qt_transform.transform(avg_qt_inp).reshape(-1)
-    # pdb.set_trace()
-    return pitch_data, avg_pitch
-
-@gin.configurable
-def mel_pitch_coarse_condition(
-    inputs: TensorDict,
-    min_norm_pitch: int,
-    audio_seq_len: int=None,
-    pitch_downsample: int = 1,
-    qt_transform: Optional[QuantileTransformer] = None,
-    min_clip: int = 200,
-    max_clip: int = 600,
-    nfft: int = 2048,
-    avg_window_size: float = 1 # duration of avg window in seconds
-):
-    mel, pitch = mel_pitch(inputs, min_norm_pitch, audio_seq_len, pitch_downsample, qt_transform, min_clip, max_clip, nfft)
-    silence_token = min_clip - 4
-    avg_pitch = pitch.copy()
-    avg_pitch[pitch == silence_token] = np.nan
-
-    time = mel.shape[1]/inputs['audio']['sampling_rate']
-    pitch_sr = pitch.shape[0]/time
-
-    avg_pitch = running_average(avg_pitch, np.around(pitch_sr*avg_window_size))
-    avg_pitch[np.isnan(avg_pitch)] = silence_token
-
-    return mel, pitch, avg_pitch
-
-def load_cached_audio(
-    inputs: TensorDict,
-    audio_len: Optional[float] = None,
-) -> torch.Tensor:
-
-    audio_data = inputs['audio']['data']
-    if audio_len is not None:
-        audio_start = randint(0, audio_data.shape[1] - audio_len - 1)
-        audio_end = audio_start + audio_len
-        audio_data = audio_data[:, audio_start:audio_end]
-    return torch.Tensor(audio_data)
-
-# need to add a silence token / range, calculate pitch avg
-def load_cached_dataset(
-    inputs: TensorDict,
-    audio_len: float,
-    return_singer: bool = False
-) -> Tuple[torch.Tensor, torch.Tensor]:
-    # pdb.set_trace()
-    audio_sr = inputs['audio']['sampling_rate']
-    audio_data = inputs['audio']['data']
-    audio_start = randint(0, audio_data.shape[1] - audio_len - 1)
-    audio_end = audio_start + audio_len
-    audio_data = audio_data[:, audio_start:audio_end]
-
-    pitch_sr = inputs['pitch']['sampling_rate']
-    pitch_len = np.floor(audio_len / audio_sr * pitch_sr).astype(int)
-    pitch_data = inputs['pitch']['data']
-    pitch_start = np.floor(audio_start * pitch_sr / audio_sr).astype(int)
-    pitch_end = pitch_start + pitch_len
-    pitch_data = pitch_data[pitch_start:pitch_end]
-
-    # interpolate data to match audio length
-    pitch_inds = np.linspace(0, pitch_data.shape[0], num=audio_len, endpoint=False) #check here
-    pitch_data = np.interp(pitch_inds, np.arange(0, pitch_data.shape[0]), pitch_data)
-
-    if return_singer:
-        singer = torch.Tensor([inputs['global_conditions']['singer']])
-    else:
-        singer = None
-
-    # print(audio_data.shape, pitch_data.shape, singer.shape if singer is not None else None)
-    return torch.Tensor(audio_data), torch.Tensor(pitch_data), singer
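Note: the core transform in the deleted pitch_read_w_downsample / hz_to_cents is a cents-relative-to-A4 encoding followed by discretization. A worked single-value example using the constants the new config.gin binds (min_norm_pitch=-4915, pitch_downsample=10, min_clip=200, max_clip=600), which also explains the `nan=196` silent token used in app.py above:

import numpy as np

f0 = 440.0                          # Hz (A4)
cents = 1200 * np.log2(f0 / 440)    # 0.0 cents relative to 440 Hz
tok = np.around(cents) - (-4915)    # shift by min_norm_pitch -> 4915.0
tok = tok // 10 + 1                 # pitch_downsample=10; 0 reserved for silence -> 492.0
tok = np.clip(tok, 200, 600)        # min_clip/max_clip -> 492.0
silence = 200 - 4                   # unvoiced (nan) frames become min_clip - 4 = 196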
src/generate_utils.py
DELETED
@@ -1,88 +0,0 @@
-import numpy as np
-from typing import Optional
-from sklearn.preprocessing import QuantileTransformer
-import sys
-import pdb
-sys.path.append('../pitch-diffusion')
-import torch
-import gin
-from src.model import UNet, UNetPitchConditioned
-from functools import partial
-import joblib
-from src.dataset import hz_to_cents, pitch_read_w_downsample
-
-def invert_pitch_read(pitch,
-                      min_norm_pitch: int,
-                      time_downsample: int,
-                      pitch_downsample: int,
-                      qt_transform: Optional[QuantileTransformer],
-                      min_clip: int,
-                      max_clip: int):
-    try:
-        pitch = pitch.detach().cpu().numpy()
-    except:
-        pass
-    if qt_transform is not None:
-        pitch = qt_transform.inverse_transform(pitch.reshape(-1, 1))
-        pitch.reshape(1, -1)
-    pitch[pitch < min_clip] = np.nan
-    pitch[~np.isnan(pitch)] = (pitch[~np.isnan(pitch)] - 1) * pitch_downsample
-    pitch[~np.isnan(pitch)] = pitch[~np.isnan(pitch)] + min_norm_pitch
-    pitch[~np.isnan(pitch)] = 440 * 2**(pitch[~np.isnan(pitch)] / 1200)
-    pitch[np.isnan(pitch)] = 0
-
-    return pitch, 200//time_downsample
-
-def invert_tonic(tonic: Optional[int] = None,
-                 min_norm_pitch: int = 0,
-                 min_clip: int = 200,
-                 pitch_downsample: int = 1,
-                 ):
-    tonic += min_clip
-    tonic = pitch_downsample * (tonic - 1)
-    tonic += min_norm_pitch
-    tonic = 440 * 2**(tonic / 1200)
-
-    return tonic
-
-def load_processed_pitch(pitch,
-                         audio_seq_len: int,
-                         min_norm_pitch: int,
-                         pitch_downsample: int,
-                         min_clip: int,
-                         max_clip: int,
-                         ):
-    # pdb.set_trace()
-    pitch = hz_to_cents(pitch, min_norm_pitch=min_norm_pitch, min_clip=min_clip, max_clip=max_clip, pitch_downsample=pitch_downsample, silence_token=min_clip-4)
-    pitch_inds = np.linspace(0, pitch.shape[0], num=audio_seq_len, endpoint=False)
-    pitch = np.interp(pitch_inds, np.arange(0, pitch.shape[0]), pitch)
-    return pitch
-
-def load_pitch_model(config, ckpt, qt = None, prime_file=None, device='cuda'):
-    gin.parse_config_file(config)
-    model = UNet()
-    model.load_state_dict(torch.load(ckpt, map_location='cuda')['state_dict'])
-    model.to(device)
-    if qt is not None:
-        qt = joblib.load(qt)
-    if prime_file is not None:
-        with gin.config_scope('val'): # probably have to change this
-            with gin.unlock_config():
-                gin.bind_parameter('dataset.pitch_read_w_downsample.qt_transform', qt)
-        primes = np.load(prime_file, allow_pickle=True)['concatenated_array'][:, 0]
-    else:
-        primes = None
-    task_fn = None
-    task_fn = partial(pitch_read_w_downsample,
-                      seq_len=None)
-    return model, qt, primes, task_fn
-
-def load_audio_model(config, ckpt, qt = None, device='cuda'):
-    gin.parse_config_file(config)
-    model = UNetPitchConditioned() # there are no gin parameters for some reason
-    model.load_state_dict(torch.load(ckpt, map_location='cuda')['state_dict'])
-    model.to(device)
-    if qt is not None:
-        qt = joblib.load(qt)
-
-    return model, qt
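Note: invert_pitch_read above is the inverse of that token encoding, and the 10-cent bins bound the round-trip error. Following one token back to Hz with the same constants:

tok = 492.0
cents = (tok - 1) * 10 + (-4915)    # undo downsample and min_norm_pitch shift -> -5.0
hz = 440 * 2 ** (cents / 1200)      # about 438.73 Hz; within 5 cents of the original 440 Hz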
src/model.py
DELETED
|
@@ -1,1130 +0,0 @@
|
|
| 1 |
-
import torch
|
| 2 |
-
import torch.nn as nn
|
| 3 |
-
import torch.optim as optim
|
| 4 |
-
import pytorch_lightning as pl
|
| 5 |
-
import torch.nn.functional as F
|
| 6 |
-
import math
|
| 7 |
-
from typing import Optional, Union
|
| 8 |
-
import numpy as np
|
| 9 |
-
import wandb
|
| 10 |
-
import matplotlib.pyplot as plt
|
| 11 |
-
import gin
|
| 12 |
-
import os
|
| 13 |
-
import pandas as pd
|
| 14 |
-
import src.pitch_to_audio_utils as p2a
|
| 15 |
-
import torchaudio
|
| 16 |
-
from typing import Callable
|
| 17 |
-
from pytorch_lightning.utilities import grad_norm
|
| 18 |
-
|
| 19 |
-
import sys
|
| 20 |
-
sys.path.append('..')
|
| 21 |
-
sys.path.append('../x-transformers/')
|
| 22 |
-
from src.utils import prob_mask_like
|
| 23 |
-
from x_transformers.x_transformers import AttentionLayers
|
| 24 |
-
import pdb
|
| 25 |
-
|
| 26 |
-
def get_activation(act: str = 'mish'):
|
| 27 |
-
act = act.lower()
|
| 28 |
-
if act == 'mish':
|
| 29 |
-
return nn.Mish()
|
| 30 |
-
elif act == 'relu':
|
| 31 |
-
return nn.ReLU()
|
| 32 |
-
elif act == 'leaky_relu':
|
| 33 |
-
return nn.LeakyReLU()
|
| 34 |
-
elif act == 'gelu':
|
| 35 |
-
return nn.GELU()
|
| 36 |
-
elif act == 'swish':
|
| 37 |
-
return nn.SiLU()
|
| 38 |
-
else:
|
| 39 |
-
raise ValueError(f'Activation {act} not supported')
|
| 40 |
-
|
| 41 |
-
def get_weight_norm(layer):
|
| 42 |
-
return torch.nn.utils.parametrizations.weight_norm(layer)
|
| 43 |
-
|
| 44 |
-
def get_layer(layer, norm: bool):
|
| 45 |
-
if norm:
|
| 46 |
-
return get_weight_norm(layer)
|
| 47 |
-
else:
|
| 48 |
-
return layer
|
| 49 |
-
|
| 50 |
-
class PositionalEncoding(nn.Module):
|
| 51 |
-
def __init__(self, dim):
|
| 52 |
-
super(PositionalEncoding, self).__init__()
|
| 53 |
-
self.dim = dim
|
| 54 |
-
|
| 55 |
-
def forward(self, x):
|
| 56 |
-
shape = x.shape
|
| 57 |
-
x = x * 100
|
| 58 |
-
w = torch.pow(10000, (2 * torch.arange(self.dim // 2).float() / self.dim)).to(x)
|
| 59 |
-
x = x.unsqueeze(-1) / w
|
| 60 |
-
embed = torch.cat([torch.cos(x), torch.sin(x)], -1)
|
| 61 |
-
embed = embed.reshape(*shape, -1)
|
| 62 |
-
if len(shape) == 2: # f0 embedding, else time embedding
|
| 63 |
-
embed = embed.permute(0, 2, 1)
|
| 64 |
-
return embed
|
| 65 |
-
|
| 66 |
-
class ConvBlock(nn.Module):
    def __init__(self,
                 inp_dim,
                 out_dim,
                 kernel_size: int = 3,
                 stride: int = 1,
                 padding: Union[str, int] = "same",
                 norm: bool = True,
                 nonlinearity: Optional[str] = None,
                 up: bool = False,
                 dropout: float = 0.0,
                 ):
        super(ConvBlock, self).__init__()
        self.inp_dim = inp_dim
        self.out_dim = out_dim
        if nonlinearity is not None:
            self.nonlinearity = get_activation(nonlinearity)
        else:
            self.nonlinearity = None
        if up:
            self.conv = get_layer(nn.ConvTranspose1d(inp_dim, out_dim, kernel_size=kernel_size, stride=stride, padding=padding), norm)
        else:
            self.conv = get_layer(nn.Conv1d(inp_dim, out_dim, kernel_size=kernel_size, stride=stride, padding=padding), norm)

        self.layers = nn.ModuleList()
        if self.nonlinearity is not None:
            self.layers.append(self.nonlinearity)
        if dropout > 0:
            self.layers.append(nn.Dropout(dropout))
        self.layers.append(self.conv)

    def forward(self, x):
        for layer in self.layers:
            x = layer(x)
        return x

class UpSampleLayer(nn.Module):
    def __init__(self,
                 inp_dim,
                 out_dim,
                 kernel_size: int = 3,
                 stride: int = 1,
                 padding: Union[str, int] = "same",
                 num_convs: int = 2,
                 norm: bool = True,
                 nonlinearity: Optional[str] = None,
                 dropout: float = 0.0,
                 ):
        super(UpSampleLayer, self).__init__()
        assert num_convs > 0, "Number of convolutions must be greater than 0"
        self.num_convs = num_convs

        self.convs = nn.ModuleList([])

        self.convs.append(ConvBlock(inp_dim, out_dim, kernel_size=stride*2, stride=stride, padding=padding, norm=norm, nonlinearity=nonlinearity, up=True))  # first convolutional layer upsamples
        for ind in range(1, num_convs):
            self.convs.append(ConvBlock(out_dim, out_dim, kernel_size=kernel_size, stride=1, padding="same", norm=norm, nonlinearity=nonlinearity, up=False, dropout=dropout if ind == num_convs-1 else 0))

    def forward(self, x):
        for conv in self.convs:
            x = conv(x)
        return x

class DownSampleLayer(nn.Module):
    def __init__(self,
                 inp_dim,
                 out_dim,
                 kernel_size: int = 3,
                 stride: int = 1,
                 padding: Union[str, int] = "same",
                 num_convs: int = 2,
                 norm: bool = True,
                 nonlinearity: Optional[str] = None,
                 dropout: float = 0.0,
                 ):
        super(DownSampleLayer, self).__init__()
        assert num_convs > 0, "Number of convolutions must be greater than 0"
        self.num_convs = num_convs

        self.convs = nn.ModuleList([])

        self.convs.append(ConvBlock(inp_dim, out_dim, kernel_size=stride*2, stride=stride, padding=padding, norm=norm, nonlinearity=nonlinearity, up=False))  # first convolutional layer downsamples
        for ind in range(1, num_convs):
            self.convs.append(ConvBlock(out_dim, out_dim, kernel_size=kernel_size, stride=1, padding="same", norm=norm, nonlinearity=nonlinearity, up=False, dropout=dropout if ind == num_convs-1 else 0))

    def forward(self, x):
        for conv in self.convs:
            x = conv(x)
        return x
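
The kernel_size=stride*2, padding=stride//2 pairing used in both layers is what makes sequence lengths divide and multiply exactly by the stride (for even strides): the strided Conv1d gives L_out = L/s and the ConvTranspose1d gives L_out = L*s. A quick check with hypothetical sizes:

import torch
import torch.nn as nn

s, L = 4, 64
down = nn.Conv1d(8, 8, kernel_size=2 * s, stride=s, padding=s // 2)
up = nn.ConvTranspose1d(8, 8, kernel_size=2 * s, stride=s, padding=s // 2)
x = torch.randn(1, 8, L)
print(down(x).shape)       # torch.Size([1, 8, 16]): 64 / 4
print(up(down(x)).shape)   # torch.Size([1, 8, 64]): 16 * 4, back to the input length
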
# Hand-rolled multi-head attention, kept for reference; the UNets below use AttentionLayers instead.
# class Attention(nn.Module):
#     def __init__(self,
#                  num_heads,
#                  num_channels,
#                  dropout=0.0):
#         super(Attention, self).__init__()
#         self.num_heads = num_heads
#         self.num_channels = num_channels
#         self.layer_norm1 = nn.LayerNorm(self.num_channels)
#         self.layer_norm2 = nn.LayerNorm(self.num_channels)
#         self.qkv_proj = nn.Linear(self.num_channels, self.num_channels * 3, bias=False)
#         self.head_dim = self.num_channels // self.num_heads
#         self.final_proj = nn.Linear(self.num_channels, self.num_channels)
#         self.dropout = nn.Dropout(dropout)
#
#     def split_heads(self, x):
#         # input shape bs, time, channels
#         x = x.view(x.shape[0], x.shape[1], self.num_heads, self.head_dim)
#         return x.permute(0, 2, 1, 3)  # bs, num_heads, time, head_dim
#
#     def forward(self, x):
#         x = torch.permute(x, (0, 2, 1))  # bs, time, channels
#         residual = x
#         x = self.layer_norm1(x)
#         x = self.qkv_proj(x)
#         q, k, v = x.chunk(3, dim=-1)
#
#         # split heads
#         q = self.split_heads(q)
#         k = self.split_heads(k)
#         v = self.split_heads(v)
#
#         # calculate attention
#         x = torch.einsum("...td,...sd->...ts", q, k) / math.sqrt(self.head_dim)
#         x = self.dropout(x)
#         x = torch.einsum("...ts,...sd->...td", F.softmax(x, dim=-1), v)  # bs, num_heads, time, head_dim
#
#         # combine heads
#         x = torch.permute(x, (0, 2, 1, 3))  # bs, time, num_heads, head_dim
#         x = x.reshape(x.shape[0], x.shape[1], self.num_heads * self.head_dim)
#
#         # final projection
#         x = self.final_proj(x)
#         x = self.layer_norm2(x + residual)
#         return torch.permute(x, (0, 2, 1))  # bs, channels, time
class ResNetBlock(nn.Module):
    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 dropout: float = 0.0,
                 nonlinearity: Optional[str] = None,
                 kernel_size: int = 3,
                 stride: int = 1,
                 norm: bool = True,
                 up: bool = False,
                 num_convs: int = 2,
                 ):
        super(ResNetBlock, self).__init__()

        self.input_layers = nn.ModuleList([])
        if nonlinearity is not None:
            self.input_layers.append(get_activation(nonlinearity))

        if up:
            self.input_layers.append(get_layer(nn.ConvTranspose1d(in_channels, out_channels, kernel_size=stride*2, stride=stride, padding=stride//2), norm))
        else:
            if in_channels != out_channels:
                self.input_layers.append(get_layer(nn.Conv1d(in_channels, out_channels, kernel_size=stride*2, stride=stride, padding=stride//2), norm))
            elif stride > 1:
                self.input_layers.append(nn.AvgPool1d(stride*2, stride=stride, padding=stride//2))
            else:
                self.input_layers.append(nn.Identity())

        if up:
            self.process_layer = UpSampleLayer(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=stride//2, num_convs=num_convs, norm=norm, nonlinearity=nonlinearity, dropout=dropout)
        else:
            self.process_layer = DownSampleLayer(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=stride//2, num_convs=num_convs, norm=norm, nonlinearity=nonlinearity, dropout=dropout)

    def forward(self, x):
        inputs = x.clone()
        for layer in self.input_layers:
            inputs = layer(inputs)
        x = self.process_layer(x)
        return x + inputs
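
The input_layers branch is the residual shortcut: a single strided conv (or pooling/identity when channels and length already match) reshapes the input so that the final `x + inputs` lines up with whatever the multi-conv process_layer produced. A sketch with assumed sizes:

import torch

block = ResNetBlock(in_channels=32, out_channels=64, stride=2, nonlinearity='gelu')
x = torch.randn(2, 32, 128)
print(block(x).shape)  # torch.Size([2, 64, 64]): both paths halve the length and double the channels
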
@gin.configurable
class UNetBase(pl.LightningModule):
    def __init__(self, log_grad_norms_every=10):
        super(UNetBase, self).__init__()
        self.log_grad_norms_every = log_grad_norms_every

    @gin.configurable
    def configure_optimizers(self, optimizer_cls: Callable[[], torch.optim.Optimizer],
                             scheduler_cls: Callable[[],
                                                     torch.optim.lr_scheduler._LRScheduler]):
        optimizer = optimizer_cls(self.parameters())
        scheduler = scheduler_cls(optimizer)

        return [optimizer], [{'scheduler': scheduler, 'interval': 'step'}]
@gin.configurable
class UNet(UNetBase):
    def __init__(self,
                 inp_dim,
                 time_dim,
                 features,
                 strides,
                 kernel_size,
                 seq_len,
                 project_dim=None,
                 dropout=0.0,
                 nonlinearity=None,
                 norm=True,
                 num_convs=2,
                 num_attns=2,
                 num_heads=8,
                 log_samples_every=10,
                 ckpt=None,
                 loss_w_padding=False,
                 groups=None,
                 nfft=None,
                 log_grad_norms_every=10
                 ):
        super(UNet, self).__init__()
        self.time_dim = time_dim
        self.features = features
        self.strides = strides
        self.kernel_size = kernel_size
        self.seq_len = seq_len
        self.log_samples_every = log_samples_every
        self.ckpt = ckpt
        self.strides_prod = np.prod(strides)
        self.loss_w_padding = loss_w_padding

        if log_grad_norms_every is not None:
            assert log_grad_norms_every > 0, "log_grad_norms_every must be greater than 0"
            self.log_grad_norms_every = log_grad_norms_every

        if project_dim is None:
            project_dim = features[0]
        self.initial_projection = nn.Conv1d(inp_dim, project_dim, kernel_size=1)
        self.positional_encoding = PositionalEncoding(time_dim)

        features = [project_dim] + features
        strides = [None] + strides

        self.downsample_layers = nn.ModuleList([
            ResNetBlock(features[ind-1] + time_dim,
                        features[ind],
                        kernel_size=kernel_size,
                        stride=strides[ind],
                        dropout=dropout,
                        nonlinearity=nonlinearity,
                        norm=norm,
                        num_convs=num_convs,
                        ) for ind in range(1, len(features))
        ])

        self.attention_layers = AttentionLayers(
            dim=features[-1],
            heads=num_heads,
            depth=num_attns,
        )

        self.upsample_layers = nn.ModuleList([
            ResNetBlock(features[ind] * 2 + time_dim,  # input size defined by features + skip dimension + time dimension
                        features[ind-1],
                        kernel_size=kernel_size,
                        stride=strides[ind],
                        dropout=dropout,
                        nonlinearity=nonlinearity,
                        norm=norm,
                        num_convs=num_convs,
                        up=True
                        ) for ind in range(len(features) - 1, 0, -1)
        ])
        self.final_projection = nn.Conv1d(2*project_dim, inp_dim, kernel_size=1)

    def pad_to(self, x, strides):
        # modified from: https://stackoverflow.com/questions/66028743/how-to-handle-odd-resolutions-in-unet-architecture-pytorch
        l = x.shape[-1]

        if l % strides > 0:
            new_l = l + strides - l % strides
        else:
            new_l = l

        ll, ul = int((new_l-l) / 2), int(new_l-l) - int((new_l-l) / 2)
        pads = (ll, ul)

        out = F.pad(x, pads, "reflect").to(x)

        return out, pads

    def unpad(self, x, pad):
        # modified from: https://stackoverflow.com/questions/66028743/how-to-handle-odd-resolutions-in-unet-architecture-pytorch
        if pad[0]+pad[1] > 0:
            x = x[:, :, pad[0]:-pad[1]]
        return x

    def forward(self, x, time):
        # INITIAL PROJECTION
        x = self.initial_projection(x)

        # TIME CONDITIONING
        time = self.positional_encoding(time)

        def _concat_time(x, time):
            time = time.unsqueeze(2).expand(-1, -1, x.shape[-1])
            x = torch.cat([x, time], -2)
            return x

        skips = []

        # DOWNSAMPLING
        for ind, downsample_layer in enumerate(self.downsample_layers):
            skips.append(x)
            x = _concat_time(x, time)
            x = downsample_layer(x)
            skips.append(x)

        # BOTTLENECK ATTENTION
        x = torch.permute(x, (0, 2, 1))
        x = self.attention_layers(x)
        x = torch.permute(x, (0, 2, 1))

        # UPSAMPLING
        for ind, upsample_layer in enumerate(self.upsample_layers):
            x = _concat_time(x, time)
            x = torch.cat([x, skips.pop(-1)], 1)
            x = upsample_layer(x)
            x = torch.cat([x, skips.pop(-1)], 1)

        # FINAL PROJECTION
        x = self.final_projection(x)
        return x

    def loss(self, x):
        padded_x, padding = self.pad_to(x, self.strides_prod)
        t = torch.rand((padded_x.shape[0],)).to(padded_x)
        noise = torch.normal(0, 1, padded_x.shape).to(padded_x)
        x_t = t[:, None, None] * padded_x + (1 - t[:, None, None]) * noise
        padded_y = self.forward(x_t, t)
        unpadded_y = self.unpad(padded_y, padding)

        if self.loss_w_padding:
            target = padded_x - noise
            return torch.mean((padded_y - target) ** 2)
        else:
            target = x - self.unpad(noise, padding)  # x1 - x0
            return torch.mean((unpadded_y - target) ** 2)

    def on_before_optimizer_step(self, optimizer, *_):
        def calculate_grad_norm(module_list, norm_type=2):
            total_norm = 0
            if isinstance(module_list, nn.Module):
                module_list = [module_list]
            for module in module_list:
                for name, param in module.named_parameters():
                    if param.requires_grad:
                        param_norm = torch.norm(param.grad.detach(), p=norm_type)
                        total_norm += param_norm**2
            total_norm = torch.sqrt(total_norm)
            return total_norm

        if self.log_grad_norms_every is not None and self.global_step % self.log_grad_norms_every == 0:
            self.log('Grad Norm/Downsample Layers', calculate_grad_norm(self.downsample_layers))
            self.log('Grad Norm/Attention Layers', calculate_grad_norm(self.attention_layers))
            self.log('Grad Norm/Upsample Layers', calculate_grad_norm(self.upsample_layers))

    def training_step(self, batch, batch_idx):
        x = batch
        loss = self.loss(x)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        x = batch
        loss = self.loss(x)
        self.log('val_loss', loss)
        return loss

    def sample_fn(self, batch_size: int, num_steps: int, prime: Optional[torch.Tensor] = None):
        # CREATE INITIAL NOISE
        if prime is not None:
            prime = prime.to(self.device)
        noise = torch.normal(mean=0, std=1, size=(batch_size, 1, self.seq_len)).to(self.device)
        x_alpha_t = noise.clone()
        t_array = torch.ones((batch_size,)).to(self.device)
        with torch.no_grad():
            # SAMPLE FROM MODEL
            for t in np.linspace(0, 1, num_steps + 1)[:-1]:
                t_tensor = torch.tensor(t)
                alpha_t = t_tensor * t_array
                alpha_t = alpha_t.unsqueeze(1).unsqueeze(2).to(self.device)
                if prime is not None:
                    x_alpha_t[:, :, :prime.shape[-1]] = ((1 - alpha_t) * noise[:, :, :prime.shape[-1]]) + (alpha_t * prime)  # fill in the prime at the beginning of each x_t
                diff = self.forward(x_alpha_t, t_tensor * t_array)
                x_alpha_t = x_alpha_t + 1 / num_steps * diff
        return x_alpha_t

    def sample_sdedit(self, cond, batch_size, num_steps, t0=0.5):
        t0_steps = int(t0*num_steps)
        # iterate backwards to get x0
        t_array = torch.ones((batch_size,)).to(self.device)
        x_alpha_t = cond.clone()
        with torch.no_grad():
            for t in np.linspace(t0, 0, t0_steps + 1)[:-1]:
                t_tensor = torch.tensor(t)
                x_alpha_t = x_alpha_t - (1 / num_steps) * self.forward(x_alpha_t, t_tensor * t_array)
            # x_alpha_t is x0 now; iterate forwards to get x1
            for t in np.linspace(0, 1, num_steps + 1)[:-1]:
                t_tensor = torch.tensor(t)
                x_alpha_t = x_alpha_t + 1 / num_steps * self.forward(x_alpha_t, t_tensor * t_array)

        return x_alpha_t

    def on_validation_epoch_end(self) -> None:
        if self.current_epoch % self.log_samples_every == 0:
            samples = self.sample_fn(16, 100).detach().cpu().numpy()
            if self.ckpt is not None:
                os.makedirs(os.path.join(self.ckpt, 'samples', str(self.current_epoch)), exist_ok=True)
            fig, axs = plt.subplots(4, 4, figsize=(16, 16))
            for i in range(4):
                for j in range(4):
                    axs[i, j].plot(samples[i*4+j].squeeze())
                    pd.DataFrame(samples[i*4+j].squeeze(), columns=['normalized_pitch']).to_csv(os.path.join(self.ckpt, 'samples', str(self.current_epoch), f'sample_{i*4+j}.csv'))
            if self.logger:
                wandb.log({"samples": [wandb.Image(fig, caption="Samples")]})
            else:
                fig.savefig(os.path.join(self.ckpt, 'samples', str(self.current_epoch), 'samples.png'))
            plt.close(fig)
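
Taken together, `loss` and `sample_fn` are a rectified-flow (flow matching) setup: the network regresses the constant velocity x1 - x0 of the straight path x_t = t*x1 + (1-t)*x0 from noise to data, and sampling is plain Euler integration of that learned velocity field from t=0 to t=1. A stripped-down sketch of the same recipe, with a stand-in `model(x, t)`:

import torch

def flow_matching_loss(model, x1):
    # straight-line interpolant between noise x0 and data x1; the regression target is x1 - x0
    t = torch.rand(x1.shape[0], device=x1.device)
    x0 = torch.randn_like(x1)
    x_t = t[:, None, None] * x1 + (1 - t[:, None, None]) * x0
    return torch.mean((model(x_t, t) - (x1 - x0)) ** 2)

@torch.no_grad()
def euler_sample(model, shape, num_steps, device='cpu'):
    x = torch.randn(shape, device=device)
    for t in torch.linspace(0, 1, num_steps + 1)[:-1]:
        x = x + model(x, t.expand(shape[0])) / num_steps  # one Euler step along the velocity field
    return x
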
@gin.configurable
class UNetAudio(UNetBase):
    def __init__(self,
                 inp_dim,
                 time_dim,
                 features,
                 strides,
                 kernel_size,
                 seq_len,
                 project_dim=None,
                 dropout=0.0,
                 nonlinearity=None,
                 norm=True,
                 num_convs=2,
                 num_attns=2,
                 num_heads=8,
                 ckpt=None,
                 qt=None,
                 log_samples_every=10,
                 log_wandb_samples_every=50,
                 sr=16000,
                 loss_w_padding=False,
                 log_grad_norms_every=10
                 ):
        super(UNetAudio, self).__init__()
        self.inp_dim = inp_dim
        self.time_dim = time_dim
        self.features = features
        self.strides = strides
        self.kernel_size = kernel_size
        self.seq_len = seq_len
        self.log_samples_every = log_samples_every
        self.log_wandb_samples_every = log_wandb_samples_every
        self.ckpt = ckpt
        self.qt = qt
        self.sr = sr
        self.strides_prod = np.prod(strides)
        self.loss_w_padding = loss_w_padding
        self.log_grad_norms_every = log_grad_norms_every

        if project_dim is None:
            project_dim = features[0]
        self.initial_projection = nn.Conv1d(inp_dim, project_dim, kernel_size=1)
        self.positional_encoding = PositionalEncoding(time_dim)

        features = [project_dim] + features
        strides = [None] + strides

        self.downsample_layers = nn.ModuleList([
            ResNetBlock(features[ind-1] + time_dim,
                        features[ind],
                        kernel_size=kernel_size,
                        stride=strides[ind],
                        dropout=dropout,
                        nonlinearity=nonlinearity,
                        norm=norm,
                        num_convs=num_convs,
                        ) for ind in range(1, len(features))
        ])

        self.attention_layers = AttentionLayers(
            dim=features[-1],
            heads=num_heads,
            depth=num_attns,
        )

        self.upsample_layers = nn.ModuleList([
            ResNetBlock(features[ind] * 2 + time_dim,  # input size defined by features + skip dimension + time dimension
                        features[ind-1],
                        kernel_size=kernel_size,
                        stride=strides[ind],
                        dropout=dropout,
                        nonlinearity=nonlinearity,
                        norm=norm,
                        num_convs=num_convs,
                        up=True
                        ) for ind in range(len(features) - 1, 0, -1)
        ])
        self.final_projection = nn.Conv1d(2*project_dim, inp_dim, kernel_size=1)
        self.losses = []

    def forward(self, x, time):
        # INITIAL PROJECTION
        x = self.initial_projection(x)

        # TIME CONDITIONING
        time = self.positional_encoding(time)

        def _concat_time(x, time):
            time = time.unsqueeze(2).expand(-1, -1, x.shape[-1])
            x = torch.cat([x, time], -2)
            return x

        skips = []

        # DOWNSAMPLING
        for ind, downsample_layer in enumerate(self.downsample_layers):
            skips.append(x)
            x = _concat_time(x, time)
            x = downsample_layer(x)
            skips.append(x)

        # BOTTLENECK ATTENTION
        x = torch.permute(x, (0, 2, 1))
        x = self.attention_layers(x)
        x = torch.permute(x, (0, 2, 1))

        # UPSAMPLING
        for ind, upsample_layer in enumerate(self.upsample_layers):
            x = _concat_time(x, time)
            x = torch.cat([x, skips.pop(-1)], 1)
            x = upsample_layer(x)
            x = torch.cat([x, skips.pop(-1)], 1)

        # FINAL PROJECTION
        x = self.final_projection(x)
        return x

    def pad_to(self, x, strides):
        # modified from: https://stackoverflow.com/questions/66028743/how-to-handle-odd-resolutions-in-unet-architecture-pytorch
        l = x.shape[-1]

        if l % strides > 0:
            new_l = l + strides - l % strides
        else:
            new_l = l

        ll, ul = int((new_l-l) / 2), int(new_l-l) - int((new_l-l) / 2)
        pads = (ll, ul)

        out = F.pad(x, pads, "reflect").to(x)

        return out, pads

    def unpad(self, x, pad):
        # modified from: https://stackoverflow.com/questions/66028743/how-to-handle-odd-resolutions-in-unet-architecture-pytorch
        if pad[0]+pad[1] > 0:
            x = x[:, :, pad[0]:-pad[1]]
        return x

    def loss(self, x):
        padded_x, padding = self.pad_to(x, self.strides_prod)
        t = torch.rand((padded_x.shape[0],)).to(padded_x)
        noise = torch.normal(0, 1, padded_x.shape).to(padded_x)
        x_t = t[:, None, None] * padded_x + (1 - t[:, None, None]) * noise
        padded_y = self.forward(x_t, t)
        unpadded_y = self.unpad(padded_y, padding)

        if self.loss_w_padding:
            target = padded_x - noise
            return torch.mean((padded_y - target) ** 2)
        else:
            target = x - self.unpad(noise, padding)  # x1 - x0
            return torch.mean((unpadded_y - target) ** 2)

    def training_step(self, batch, batch_idx):
        x = batch
        loss = self.loss(x)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        x = batch
        loss = self.loss(x)
        self.log('val_loss', loss)
        return loss

    def sample_fn(self, batch_size: int, num_steps: int, prime=None):
        if prime is not None:
            prime = prime.to(self.device)
        # CREATE INITIAL NOISE
        noise = torch.normal(mean=0, std=1, size=(batch_size, self.inp_dim, self.seq_len)).to(self.device)
        padded_noise, padding = self.pad_to(noise, self.strides_prod)
        x_alpha_t = padded_noise.clone()
        t_array = torch.ones((batch_size,)).to(self.device)
        with torch.no_grad():
            # SAMPLE FROM MODEL
            for t in np.linspace(0, 1, num_steps + 1)[:-1]:
                t_tensor = torch.tensor(t)
                alpha_t = t_tensor * t_array
                alpha_t = alpha_t.unsqueeze(1).unsqueeze(2).to(self.device)
                if prime is not None:
                    x_alpha_t[:, :, :prime.shape[-1]] = ((1 - alpha_t) * noise[:, :, :prime.shape[-1]]) + (alpha_t * prime)  # fill in the prime at the beginning of each x_t
                diff = self.forward(x_alpha_t, t_tensor * t_array)
                x_alpha_t = x_alpha_t + 1 / num_steps * diff

        padded_y = x_alpha_t
        unpadded_y = self.unpad(padded_y, padding)

        return unpadded_y

    def on_validation_epoch_end(self) -> None:
        if self.current_epoch % self.log_samples_every == 0:
            if self.ckpt is not None:
                os.makedirs(os.path.join(self.ckpt, 'samples', str(self.current_epoch)), exist_ok=True)
            samples = self.sample_fn(16, 100)
            audio = p2a.normalized_mels_to_audio(samples, qt=self.qt)
            beep = torch.sin(2 * torch.pi * 220 * torch.arange(0, 0.1 * self.sr) / self.sr).to(audio)
            concat_audios = []
            for sample in audio:
                concat_audios.append(torch.cat([sample, beep]))
            concat_audio = torch.cat(concat_audios, dim=-1).reshape(1, -1).to('cpu')
            output_file = os.path.join(self.ckpt, 'samples', f'samples_{self.current_epoch}.wav')
            torchaudio.save(output_file, concat_audio, self.sr)
            if self.current_epoch % self.log_wandb_samples_every == 0:
                if self.logger:
                    wandb.log({
                        "samples": [wandb.Audio(output_file, self.sr, caption="Samples")]})

    def on_before_optimizer_step(self, optimizer, *_):
        def calculate_grad_norm(module_list, norm_type=2):
            total_norm = 0
            if isinstance(module_list, nn.Module):
                module_list = [module_list]
            for module in module_list:
                for name, param in module.named_parameters():
                    if param.requires_grad:
                        param_norm = torch.norm(param.grad.detach(), p=norm_type)
                        total_norm += param_norm**2
            total_norm = torch.sqrt(total_norm)
            return total_norm

        if self.log_grad_norms_every is not None and self.global_step % self.log_grad_norms_every == 0:
            self.log('Grad Norm/Downsample Layers', calculate_grad_norm(self.downsample_layers))
            self.log('Grad Norm/Attention Layers', calculate_grad_norm(self.attention_layers))
            self.log('Grad Norm/Upsample Layers', calculate_grad_norm(self.upsample_layers))

    # def configure_optimizers(self):
    #     return optim.Adam(self.parameters(), lr=1e-4)
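
`pad_to`/`unpad`, shared by all three UNets, reflect-pad the last axis up to the next multiple of np.prod(strides) so that every down/upsampling stage divides evenly, then crop the prediction back; `loss_w_padding` only decides whether the MSE also counts those padded frames. The arithmetic, worked through for a hypothetical length:

prod = 16                        # e.g. strides [4, 2, 2] -> np.prod(strides) == 16
L = 100
new_L = L + prod - L % prod      # 112, the next multiple of 16
pad = ((new_L - L) // 2, (new_L - L) - (new_L - L) // 2)
print(new_L, pad)                # 112 (6, 6): six reflected frames on each side
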
@gin.configurable
class UNetPitchConditioned(UNetBase):
    def __init__(self,
                 inp_dim,
                 time_dim,
                 f0_dim,
                 features,
                 strides,
                 kernel_size,
                 audio_seq_len,
                 project_dim=None,
                 dropout=0.0,
                 nonlinearity=None,
                 norm=True,
                 num_convs=2,
                 num_attns=2,
                 num_heads=8,
                 log_samples_every=10,
                 log_wandb_samples_every=10,
                 ckpt=None,
                 val_data=None,
                 qt=None,
                 singer_conditioning=False,
                 singer_dim=128,
                 singer_vocab=56,
                 sr=44100,
                 cfg=False,
                 f0_mask=0,
                 cond_drop_prob=0.0,
                 groups=None,
                 nfft=None,
                 loss_w_padding=False,
                 log_grad_norms_every=10
                 ):
        super(UNetPitchConditioned, self).__init__()
        self.inp_dim = inp_dim
        self.time_dim = time_dim
        self.features = features
        self.strides = strides
        self.kernel_size = kernel_size
        self.seq_len = audio_seq_len
        self.log_samples_every = log_samples_every
        self.log_wandb_samples_every = log_wandb_samples_every
        self.ckpt = ckpt
        self.qt = qt
        self.singer_conditioning = singer_conditioning
        self.sr = sr  # used for logging audio to wandb
        self.cond_drop_prob = cond_drop_prob
        self.f0_masked_token = f0_mask
        self.cfg = cfg
        self.strides_prod = np.prod(strides)
        self.loss_w_padding = loss_w_padding
        self.log_grad_norms_every = log_grad_norms_every

        conditioning_dim = time_dim
        if singer_conditioning:
            conditioning_dim += singer_dim

        if project_dim is None:
            project_dim = features[0]
        self.initial_projection = nn.Conv1d(inp_dim, project_dim, kernel_size=1)
        self.time_positional_encoding = PositionalEncoding(time_dim)
        self.f0_positional_encoding = PositionalEncoding(f0_dim)

        if singer_conditioning:
            self.singer_embedding = nn.Embedding(singer_vocab + 1*self.cfg, singer_dim)  # if cfg, add 1 to the singer vocabulary for the mask token
            self.singer_masked_token = singer_vocab
        else:
            self.singer_embedding = None

        features = [project_dim] + features
        f0_features = features.copy()
        f0_features[0] = f0_dim  # first layer should be the f0 dimension
        strides = [None] + strides

        self.downsample_layers = nn.ModuleList([
            ResNetBlock(features[ind-1] + conditioning_dim,
                        features[ind],
                        kernel_size=kernel_size,
                        stride=strides[ind],
                        dropout=dropout,
                        nonlinearity=nonlinearity,
                        norm=norm,
                        num_convs=num_convs,
                        ) for ind in range(1, len(features))
        ])

        self.f0_conv_layers = nn.ModuleList([
            nn.Conv1d(
                f0_dim,
                f0_dim,
                kernel_size=2 * strides[ind],
                stride=strides[ind],
                padding=strides[ind]//2,
            ) for ind in range(1, len(features))
        ])

        self.attention_layers = AttentionLayers(
            dim=features[-1],
            heads=num_heads,
            depth=num_attns,
        )

        self.upsample_layers = nn.ModuleList([
            ResNetBlock((features[ind] * 2) + conditioning_dim + f0_dim,  # input size defined by features + skip dimension + conditioning dimension
                        features[ind-1],
                        kernel_size=kernel_size,
                        stride=strides[ind],
                        dropout=dropout,
                        nonlinearity=nonlinearity,
                        norm=norm,
                        num_convs=num_convs,
                        up=True
                        ) for ind in range(len(features) - 1, 0, -1)
        ])
        self.final_projection = nn.Conv1d(2*project_dim + f0_dim, inp_dim, kernel_size=1)
        # save 16 f0 values to sample on
        if val_data is not None:
            val_ids = np.random.choice(len(val_data), 16)
            val_samples = [val_data[i] for i in val_ids]
            self.val_f0 = torch.stack([v[1] for v in val_samples], 0).to(self.device)
            if self.singer_conditioning:
                self.val_singer = torch.tensor([v[2] for v in val_samples]).long().to(self.device)
            else:
                self.val_singer = None
            val_audio = torch.stack([v[0] for v in val_samples], 0).to(self.device)
            if self.ckpt is not None:
                # log the f0 and audio to wandb
                os.makedirs(os.path.join(self.ckpt, 'samples'), exist_ok=True)
                concat_audios = []
                beep = torch.sin(2 * torch.pi * 220 * torch.arange(0, 0.1 * self.sr) / self.sr).to(val_audio)
                recon_audios = p2a.normalized_mels_to_audio(val_audio, qt=self.qt)
                fig, axs = plt.subplots(4, 4, figsize=(16, 16))
                for i in range(4):
                    for j in range(4):
                        axs[i, j].plot(self.val_f0[i*4+j].squeeze())
                        if self.singer_conditioning:
                            axs[i, j].set_title(f'Singer {self.val_singer[i*4+j].item()}')
                        concat_audios.append(torch.cat((recon_audios[i*4+j].squeeze(), beep)))
                concat_audios = torch.cat(concat_audios, dim=-1).reshape(1, -1).to('cpu')
                output_file = os.path.join(self.ckpt, 'samples', 'gt_samples.wav')
                torchaudio.save(output_file, concat_audios, self.sr)

                try:
                    wandb.log({"sample f0 input": [wandb.Image(fig, caption="f0 conditioning on samples")]})
                    wandb.log({
                        "sample audio ground truth": [wandb.Audio(output_file, self.sr, caption="Samples")]})
                except Exception:
                    pass

                fig.savefig(os.path.join(self.ckpt, 'samples', 'f0_inputs.png'))

    def pad_to(self, x, strides):
        # modified from: https://stackoverflow.com/questions/66028743/how-to-handle-odd-resolutions-in-unet-architecture-pytorch
        l = x.shape[-1]

        if l % strides > 0:
            new_l = l + strides - l % strides
        else:
            new_l = l

        ll, ul = int((new_l-l) / 2), int(new_l-l) - int((new_l-l) / 2)
        pads = (ll, ul)

        out = F.pad(x, pads, "reflect").to(x)

        return out, pads

    def unpad(self, x, pad):
        # modified from: https://stackoverflow.com/questions/66028743/how-to-handle-odd-resolutions-in-unet-architecture-pytorch
        if pad[0]+pad[1] > 0:
            x = x[:, :, pad[0]:-pad[1]]
        return x

    def forward(self, x, time, f0, singer, drop_tokens=True, drop_all=False):
        # INITIAL PROJECTION
        x = self.initial_projection(x)

        bs = x.shape[0]
        if self.cfg:
            if drop_all:
                prob_keep_mask_pitch = torch.zeros((bs)).unsqueeze(1).repeat(1, f0.shape[1]).to(self.device).bool()
                prob_keep_mask_singer = torch.zeros((bs)).to(self.device).bool()
            elif drop_tokens:
                prob_keep_mask_pitch = prob_mask_like((bs), 1. - self.cond_drop_prob, device=self.device).unsqueeze(1).repeat(1, f0.shape[1])
                prob_keep_mask_singer = prob_mask_like((bs), 1. - self.cond_drop_prob, device=self.device)
            else:
                prob_keep_mask_pitch = torch.ones((bs)).unsqueeze(1).repeat(1, f0.shape[1]).to(self.device).bool()
                prob_keep_mask_singer = torch.ones((bs)).to(self.device).bool()
            f0 = torch.where(prob_keep_mask_pitch, f0, torch.empty((f0.shape[0], f0.shape[1])).fill_(self.f0_masked_token).to(self.device).long())
            if self.singer_conditioning:
                singer = torch.where(prob_keep_mask_singer, singer, torch.empty((bs)).fill_(self.singer_masked_token).to(self.device).long())

        # TIME and F0 CONDITIONING
        conditions = [self.time_positional_encoding(time)]
        if self.singer_conditioning:
            conditions.append(self.singer_embedding(singer))
        f0 = self.f0_positional_encoding(f0)

        def _concat_condition(x, condition):
            condition = condition.unsqueeze(2).expand(-1, -1, x.shape[-1])
            x = torch.cat([x, condition], -2)
            return x

        skips = []

        # DOWNSAMPLING
        for ind, downsample_layer in enumerate(self.downsample_layers):
            skips.append(torch.cat([x, f0], -2))
            for cond in conditions:
                x = _concat_condition(x, cond)
            x = downsample_layer(x)
            f0 = self.f0_conv_layers[ind](f0)
            skips.append(torch.cat([x, f0], -2))

        # BOTTLENECK ATTENTION
        x = torch.permute(x, (0, 2, 1))
        x = self.attention_layers(x)
        x = torch.permute(x, (0, 2, 1))

        # UPSAMPLING
        for ind, upsample_layer in enumerate(self.upsample_layers):
            for cond in conditions:
                x = _concat_condition(x, cond)
            x = torch.cat([x, skips.pop(-1)], 1)
            x = upsample_layer(x)
            x = torch.cat([x, skips.pop(-1)], 1)

        # FINAL PROJECTION
        x = self.final_projection(x)
        return x

    def loss(self, x, f0, singer, drop_tokens):
        padded_x, padding = self.pad_to(x, self.strides_prod)
        padded_f0, _ = self.pad_to(f0, self.strides_prod)
        t = torch.rand((padded_x.shape[0],)).to(padded_x)
        noise = torch.normal(0, 1, padded_x.shape).to(padded_x)
        x_t = t[:, None, None] * padded_x + (1 - t[:, None, None]) * noise
        padded_y = self.forward(x_t, t, padded_f0, singer, drop_tokens)
        unpadded_y = self.unpad(padded_y, padding)

        if self.loss_w_padding:
            target = padded_x - noise
            return torch.mean((padded_y - target) ** 2)
        else:
            target = x - self.unpad(noise, padding)  # x1 - x0
            return torch.mean((unpadded_y - target) ** 2)

    def training_step(self, batch, batch_idx):
        x, f0, singer = batch
        x = x.to(self.device)
        f0 = f0.to(self.device)
        singer = singer.reshape(-1).long().to(self.device) if self.singer_conditioning else None
        loss = self.loss(x, f0, singer, drop_tokens=True)
        self.log('train_loss', loss, batch_size=x.shape[0])
        return loss

    def validation_step(self, batch, batch_idx):
        x, f0, singer = batch
        x = x.to(self.device)
        f0 = f0.to(self.device)
        singer = singer.reshape(-1).long().to(self.device) if self.singer_conditioning else None
        loss = self.loss(x, f0, singer, drop_tokens=False)
        self.log('val_loss', loss, batch_size=x.shape[0])
        return loss

    def sample_fn(self, f0, singer, batch_size: int, num_steps: int):
        # CREATE INITIAL NOISE
        noise = torch.normal(mean=0, std=1, size=(batch_size, self.inp_dim, self.seq_len)).to(self.device)
        padded_noise, padding = self.pad_to(noise, self.strides_prod)
        t_array = torch.ones((batch_size,)).to(self.device)
        f0 = f0.to(self.device)
        padded_f0, _ = self.pad_to(f0, self.strides_prod)
        singer = singer.to(self.device)
        with torch.no_grad():
            # SAMPLE FROM MODEL
            for t in np.linspace(0, 1, num_steps + 1)[:-1]:
                t_tensor = torch.tensor(t)
                padded_noise = padded_noise + 1 / num_steps * self.forward(padded_noise, t_tensor * t_array, padded_f0, singer, drop_tokens=False)
        noise = self.unpad(padded_noise, padding)
        return noise

    def sample_cfg(self, batch_size: int, num_steps: int, f0=None, singer=[4, 25, 45, 32], strength=1):
        # CREATE INITIAL NOISE
        noise = torch.normal(mean=0, std=1, size=(batch_size, self.inp_dim, self.seq_len)).to(self.device)
        padded_noise, padding = self.pad_to(noise, self.strides_prod)
        t_array = torch.ones((batch_size,)).to(self.device)
        if f0 is None:
            val_idx = np.random.choice(len(self.val_dataloader), batch_size)
            val_samples = [self.val_dataloader[i][1] for i in val_idx]
            f0 = torch.stack([sample for sample in val_samples]).to(self.device)
        else:
            assert len(f0) == batch_size
            f0 = f0.to(self.device)
        singer = singer.to(self.device)
        padded_f0, _ = self.pad_to(f0, self.strides_prod)
        with torch.no_grad():
            # SAMPLE FROM MODEL
            for t in np.linspace(0, 1, num_steps + 1)[:-1]:
                t_tensor = torch.tensor(t)
                unconditioned_logits = self.forward(padded_noise, t_tensor * t_array, padded_f0, singer, drop_tokens=False, drop_all=True)
                conditioned_logits = self.forward(padded_noise, t_tensor * t_array, padded_f0, singer, drop_tokens=False, drop_all=False)
                total_logits = strength * conditioned_logits + (1 - strength) * unconditioned_logits
                padded_noise = padded_noise + 1 / num_steps * total_logits

        noise = self.unpad(padded_noise, padding)
        return noise, f0, singer

    def on_validation_epoch_end(self) -> None:
        with torch.no_grad():
            if self.current_epoch % self.log_samples_every == 0:
                samples = self.sample_fn(self.val_f0, self.val_singer, 16, 100)
                if self.ckpt is not None:
                    audio = p2a.normalized_mels_to_audio(samples, qt=self.qt)
                    beep = torch.sin(2 * torch.pi * 220 * torch.arange(0, 0.1 * self.sr) / self.sr).to(audio)
                    concat_audio = []
                    for sample in audio:
                        concat_audio.append(torch.cat([sample, beep]))
                    concat_audio = torch.cat(concat_audio, dim=-1).reshape(1, -1).to('cpu')
                    output_file = os.path.join(self.ckpt, 'samples', f'samples_{self.current_epoch}.wav')
                    torchaudio.save(output_file, concat_audio, self.sr)
                    if self.current_epoch % self.log_wandb_samples_every == 0:
                        if self.logger:
                            wandb.log({
                                "samples": [wandb.Audio(output_file, self.sr, caption="Samples")]},
                                step=self.global_step)

    def on_before_optimizer_step(self, optimizer, *_):
        def calculate_grad_norm(module_list, norm_type=2):
            total_norm = 0
            if isinstance(module_list, nn.Module):
                module_list = [module_list]
            for module in module_list:
                for name, param in module.named_parameters():
                    if param.requires_grad:
                        param_norm = torch.norm(param.grad.detach(), p=norm_type)
                        total_norm += param_norm**2
            total_norm = torch.sqrt(total_norm)
            return total_norm

        if self.log_grad_norms_every is not None and self.global_step % self.log_grad_norms_every == 0:
            self.log('Grad Norm/Downsample Layers', calculate_grad_norm(self.downsample_layers))
            self.log('Grad Norm/Attention Layers', calculate_grad_norm(self.attention_layers))
            self.log('Grad Norm/Upsample Layers', calculate_grad_norm(self.upsample_layers))

    # @gin.configurable
    # def configure_optimizers(self, optimizer_cls: Callable[[], torch.optim.Optimizer],
    #                          scheduler_cls: Callable[[],
    #                                                  torch.optim.lr_scheduler._LRScheduler]):
    #     optimizer = optimizer_cls(self.parameters())
    #     scheduler = scheduler_cls(optimizer)
    #
    #     return [optimizer], [{'scheduler': scheduler, 'interval': 'step'}]
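
`sample_cfg` is classifier-free guidance adapted to the flow setting: during training, `prob_mask_like` randomly swaps the f0/singer conditions for mask tokens (with probability `cond_drop_prob`), so at sampling time the model can be queried both with and without conditions and the two velocities blended before each Euler step. The guidance step in isolation, as a sketch (`model` stands in for a trained UNetPitchConditioned):

import torch

def cfg_euler_step(model, x_t, t, f0, singer, num_steps, strength=1.0):
    # strength == 1 is purely conditional; strength > 1 extrapolates away
    # from the unconditional prediction, tightening adherence to f0/singer
    with torch.no_grad():
        v_uncond = model(x_t, t, f0, singer, drop_tokens=False, drop_all=True)
        v_cond = model(x_t, t, f0, singer, drop_tokens=False, drop_all=False)
    v = strength * v_cond + (1 - strength) * v_uncond
    return x_t + v / num_steps
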
src/pitch_to_audio_utils.py
DELETED
@@ -1,121 +0,0 @@
import math
import librosa as li
import torch
from tqdm import tqdm
import numpy as np
import gin
import logging

@gin.configurable
def torch_stft(x, nfft):
    window = torch.hann_window(nfft).to(x)
    x = torch.stft(
        x,
        n_fft=nfft,
        hop_length=nfft // 4,
        win_length=nfft,
        window=window,
        center=True,
        return_complex=True,
    )
    x = 2 * x / torch.mean(window)
    return x

@gin.configurable
def torch_istft(x, nfft):
    window = torch.hann_window(nfft).to(x.device)
    x = x / 2 * torch.mean(window)
    return torch.istft(
        x,
        n_fft=nfft,
        hop_length=nfft // 4,
        win_length=nfft,
        window=window,
        center=True,
    )

@gin.configurable
def to_mels(stft, nfft, num_mels, sr, eps=1e-2):
    mels = li.filters.mel(
        sr=sr,
        n_fft=nfft,
        n_mels=num_mels,
        fmin=40,
    )
    mels = torch.from_numpy(mels).to(stft)
    mel_stft = torch.einsum("mf,bft->bmt", mels, stft)
    mel_stft = torch.log(mel_stft + eps)
    return mel_stft

@gin.configurable
def from_mels(mel_stft, nfft, num_mels, sr, eps=1e-2):
    mels = li.filters.mel(
        sr=sr,
        n_fft=nfft,
        n_mels=num_mels,
        fmin=40,
    )
    mels = torch.from_numpy(mels).to(mel_stft)
    mels = torch.pinverse(mels)
    mel_stft = torch.exp(mel_stft) - eps
    stft = torch.einsum("fm,bmt->bft", mels, mel_stft)
    return stft

@gin.configurable
def torch_gl(stft, nfft, sr, n_iter):

    def _gl_iter(phase, xs, stft):
        del xs
        c_stft = stft * torch.exp(1j * phase)
        rec = torch_istft(c_stft, nfft)
        r_stft = torch_stft(rec, nfft)
        phase = torch.angle(r_stft)
        return phase, None

    phase = torch.rand_like(stft) * 2 * torch.pi

    for _ in tqdm(range(n_iter)):
        phase, _ = _gl_iter(phase, None, stft)

    c_stft = stft * torch.exp(1j * phase)
    audio = torch_istft(c_stft, nfft)

    return audio

@gin.configurable
def normalize(x, qt=None):
    x_flat = x.reshape(-1, 1)
    if qt is None:
        logging.warning('No quantile transformer found, returning input')
        return x
    if isinstance(x_flat, torch.Tensor):
        x_flat = x_flat.detach().cpu().numpy()  # sklearn transformers expect numpy input
    return torch.Tensor(qt.transform(x_flat).reshape(x.shape))

@gin.configurable
def unnormalize(x, qt=None):
    x_flat = x.reshape(-1, 1)
    if qt is None:
        logging.warning('No quantile transformer found, returning input')
        return x
    if isinstance(x_flat, torch.Tensor):
        x_flat = x_flat.detach().cpu().numpy()
    return torch.Tensor(qt.inverse_transform(x_flat).reshape(x.shape))

@gin.configurable
def audio_to_normalized_mels(x, nfft, num_mels, sr, qt):
    stfts = torch_stft(x, nfft=nfft).abs()[..., :-1]
    mel_stfts = to_mels(stfts, nfft, num_mels, sr)
    return normalize(mel_stfts, qt).to(x)

@gin.configurable
def normalized_mels_to_audio(x, nfft, num_mels, sr, qt, n_iter=20):
    x = unnormalize(x, qt).to(x)
    x = from_mels(x, nfft, num_mels, sr)
    x = torch.clamp(x, 0, nfft)
    x = torch_gl(x, nfft, sr, n_iter=n_iter)
    return x
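
End to end, these helpers form the (approximately invertible) feature pipeline around the audio UNets: STFT -> log-mel -> quantile normalization on the way in, and unnormalize -> mel pseudo-inverse -> Griffin-Lim phase recovery on the way out. A usage sketch; the parameter values are illustrative rather than the repo's gin config, and `qt` is assumed to be an already-fitted sklearn QuantileTransformer:

import torch

sr, nfft, num_mels = 16000, 1024, 192        # illustrative values
audio = torch.randn(1, 4 * sr)               # stand-in for 4 s of audio
mels = audio_to_normalized_mels(audio, nfft=nfft, num_mels=num_mels, sr=sr, qt=qt)
recon = normalized_mels_to_audio(mels, nfft=nfft, num_mels=num_mels, sr=sr, qt=qt, n_iter=20)
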
src/preprocess_utils.py
DELETED
|
@@ -1,127 +0,0 @@
import subprocess
import numpy as np
import pandas as pd
from typing import Iterable, Tuple, Callable
import multiprocessing
import functools
from itertools import repeat
from protobuf.data_example import AudioExample, DTYPE_TO_PRECISION
import librosa


def load_chunk(
    row: pd.Series,
    n_signal_audio: int,
    n_signal_pitch: int,
    sr_audio: int,
    sr_pitch: int,
    error_path: str = None,
) -> Iterable[np.ndarray]:
    audio_path = row['audio_path']
    csv_path = row['pitch_path']
    try:
        chunk_csv = pd.read_csv(csv_path, chunksize=n_signal_pitch)
    except:
        if error_path is not None:
            with open(error_path, 'a') as f:
                f.write(f'Error reading {csv_path}\n')
        return
    chunk_iter = iter(chunk_csv)

    chunk_pitch = next(chunk_iter)
    f0 = chunk_pitch['filtered_f0'].fillna(0).to_numpy()

    while len(f0) == n_signal_pitch:
        start_time = chunk_pitch['time'].values[0]
        # check that no time stamps were skipped
        assert abs(start_time - (chunk_pitch['time'].values[-1] - ((n_signal_pitch - 1)/sr_pitch))) < 1e-6
        chunk_audio = librosa.load(audio_path, sr=sr_audio, offset=start_time, duration=n_signal_audio/sr_audio, dtype=np.float32)[0]
        assert chunk_audio.shape[0] == n_signal_audio
        yield chunk_audio, f0, row, start_time
        try:
            chunk_pitch = next(chunk_iter)
            f0 = chunk_pitch['filtered_f0'].fillna(0).to_numpy()
        except StopIteration:
            return


def flatmap(
    pool: multiprocessing.Pool,
    func: Callable,
    iterable: Iterable,
    queue_size: int,
    chunksize=None,
):
    queue = multiprocessing.Manager().Queue(maxsize=queue_size)
    pool.map_async(
        functools.partial(flat_mappper, func),
        zip(iterable, repeat(queue)),
        chunksize,
        lambda _: queue.put(None),
        lambda *e: print(e),
    )

    item = queue.get()
    while item is not None:
        yield item
        item = queue.get()

def flat_mappper(func, arg):
    data, queue = arg
    for item in func(data):
        queue.put(item)

def batch(iterator: Iterable, batch_size: int):
    batch = []
    for elm in iterator:
        batch.append(elm)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if len(batch):
        yield batch

def preprocess_batch(
    preprocessed_array,
    sr_audio: int,
    sr_pitch: int,
):
    dtype = np.float32
    data_examples = [AudioExample() for _ in range(len(preprocessed_array))]
    for ae, data in zip(data_examples, preprocessed_array):
        audio_data, csv_data, row, start_time = data
        buffer_audio = ae.ae.buffers['audio']
        buffer_audio.data = audio_data.astype(dtype).tobytes()
        buffer_audio.shape.extend(audio_data.shape)
        buffer_audio.precision = DTYPE_TO_PRECISION[dtype]
        buffer_audio.sampling_rate = sr_audio
        buffer_audio.data_path = row['audio_path']
        buffer_audio.start_time = start_time

        buffer_csv = ae.ae.buffers['pitch']
        buffer_csv.data = csv_data.astype(dtype).tobytes()
        buffer_csv.shape.extend(csv_data.shape)
        buffer_csv.precision = DTYPE_TO_PRECISION[dtype]
        buffer_csv.sampling_rate = sr_pitch
        buffer_csv.data_path = row['pitch_path']
        buffer_csv.start_time = start_time

        ae.ae.global_conditions.tonic = row['tonic']
        ae.ae.global_conditions.raga = row['raga']
        ae.ae.global_conditions.singer = row['singer']

    return data_examples
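Editor's note — load_chunk, flatmap, batch, and preprocess_batch compose into a streaming pipeline: worker processes slice aligned audio/pitch windows, push them through a bounded queue (None is the end-of-stream sentinel flatmap waits for), and the main process packs fixed-size batches into AudioExample protobufs. A driver sketch under assumed sample rates and window lengths; the CSV path and all numbers are placeholders:

    import functools
    import multiprocessing
    import pandas as pd

    df = pd.read_csv('metadata.csv')  # hypothetical index with audio_path, pitch_path, tonic, raga, singer columns
    loader = functools.partial(load_chunk, n_signal_audio=64000, n_signal_pitch=400,
                               sr_audio=16000, sr_pitch=100)  # assumed rates and window sizes

    with multiprocessing.Pool(4) as pool:
        chunks = flatmap(pool, loader, (row for _, row in df.iterrows()), queue_size=64)
        for chunk_batch in batch(chunks, batch_size=32):
            examples = preprocess_batch(chunk_batch, sr_audio=16000, sr_pitch=100)
            # ... write each AudioExample to the destination dataset here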
src/process_encodec.py
DELETED
@@ -1,22 +0,0 @@
import gin
from sklearn.preprocessing import QuantileTransformer
from transformers import EncodecModel, AutoProcessor
import librosa as li

@gin.configurable
def read_tokens(
    inputs,
    encodec_model: EncodecModel,
    encodec_processor: AutoProcessor,
    target_bandwidth: int = 3
):
    audio = inputs['audio']['data']
    audio = li.resample(y=audio, orig_sr=inputs['audio']['sampling_rate'], target_sr=encodec_processor.sampling_rate)

    encodec_inputs = encodec_processor(raw_audio=audio, sampling_rate=encodec_processor.sampling_rate, return_tensors='pt')
    encodec_tokens = encodec_model.encode(encodec_inputs['input_values'], bandwidth=target_bandwidth).audio_codes

    return encodec_tokens.detach().cpu().numpy()
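Editor's note — read_tokens expects the Encodec model and processor to be supplied by the caller (presumably wired up via the gin config in this repo). A minimal sketch using the public 24 kHz checkpoint from transformers; the input dict just mirrors the keys the function reads, and the audio is a random stand-in:

    import numpy as np
    from transformers import EncodecModel, AutoProcessor

    model = EncodecModel.from_pretrained('facebook/encodec_24khz')
    processor = AutoProcessor.from_pretrained('facebook/encodec_24khz')

    inputs = {'audio': {'data': np.random.randn(16000).astype(np.float32),  # 1 s of noise as a placeholder
                        'sampling_rate': 16000}}
    tokens = read_tokens(inputs, encodec_model=model, encodec_processor=processor, target_bandwidth=3)
    print(tokens.shape)  # numpy array of discrete Encodec code indices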
src/utils.py
DELETED
@@ -1,65 +0,0 @@
from pathlib import Path
import os
import random
import torch
import numpy as np
import gin

def search_for_run(run_path, mode="last"):
    if run_path is None: return None
    if ".ckpt" in run_path: return run_path
    ckpts = map(str, Path(run_path).rglob("*.ckpt"))
    ckpts = filter(lambda e: mode in os.path.basename(str(e)), ckpts)
    ckpts = sorted(ckpts)
    if len(ckpts):
        if len(ckpts) > 1 and 'last.ckpt' in ckpts:
            return ckpts[-2]  # last.ckpt is always at the end, so we take the second last
        else:
            return ckpts[-1]
    else: return None

def set_seed(seed: int):
    """Set seed"""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
    os.environ["PYTHONHASHSEED"] = str(seed)

@gin.configurable
def build_warmed_exponential_lr_scheduler(
        optim: torch.optim.Optimizer, start_factor: float, peak_iteration: int,
        decay_factor: float = None, cycle_length: int = None, eta_min: float = None, eta_max: float = None) -> torch.optim.lr_scheduler._LRScheduler:
    linear = torch.optim.lr_scheduler.LinearLR(
        optim,
        start_factor=start_factor,
        end_factor=1.,
        total_iters=peak_iteration,
    )
    if decay_factor:
        exp = torch.optim.lr_scheduler.ExponentialLR(
            optim,
            gamma=decay_factor,
        )
        return torch.optim.lr_scheduler.SequentialLR(optim, [linear, exp],
                                                     milestones=[peak_iteration])
    if cycle_length:
        cosine = torch.optim.lr_scheduler.CosineAnnealingLR(
            optim,
            T_max=cycle_length,
            eta_min=eta_min * eta_max,
        )
        return torch.optim.lr_scheduler.SequentialLR(optim, [linear, cosine],
                                                     milestones=[peak_iteration])

def prob_mask_like(shape, prob, device):
    if prob == 1:
        return torch.ones(shape, device=device, dtype=torch.bool)
    elif prob == 0:
        return torch.zeros(shape, device=device, dtype=torch.bool)
    else:
        return torch.zeros(shape, device=device).float().uniform_(0, 1) < prob
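Editor's note — build_warmed_exponential_lr_scheduler chains a linear warm-up with either an exponential decay or a cosine cycle via SequentialLR, and prob_mask_like draws the Bernoulli keep-masks commonly used for classifier-free-guidance condition dropout. A usage sketch with illustrative hyperparameters, not the repo's configured values:

    import torch

    model = torch.nn.Linear(8, 8)  # stand-in module
    optim = torch.optim.Adam(model.parameters(), lr=1e-4)
    sched = build_warmed_exponential_lr_scheduler(
        optim, start_factor=0.01, peak_iteration=1000, decay_factor=0.999995)

    for _ in range(10):
        optim.step()
        sched.step()

    # Keep conditioning for ~80% of a batch of 16 examples (True = keep):
    mask = prob_mask_like((16,), prob=0.8, device='cpu')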