Spaces:
Sleeping
Sleeping
Nithya committed on
Commit ·
01188ff
1
Parent(s): 7dc7036
rearranged model files and added config
Browse files
- app.py +8 -9
- models/diffusion_pitch/config.gin +69 -0
- diffusion_pitch_model-model.ckpt → models/diffusion_pitch/last.ckpt +0 -0
- diffusion_pitch_model-qt.joblib → models/diffusion_pitch/qt.joblib +0 -0
- models/pitch_to_audio/config.gin +91 -0
- pitch_to_audio_model-model.ckpt → models/pitch_to_audio/last.ckpt +0 -0
- pitch_to_audio_model-qt.joblib → models/pitch_to_audio/qt.joblib +0 -0
- requirements.txt +2 -0
app.py
CHANGED
|
@@ -23,10 +23,8 @@ from hmmlearn import hmm
|
|
| 23 |
import time
|
| 24 |
import soundfile as sf
|
| 25 |
|
| 26 |
-
pitch_path = '/
|
| 27 |
-
audio_path = '/
|
| 28 |
-
pitch_primes = '/network/scratch/n/nithya.shikarpur/pitch-diffusion/data/merged_data-final/listening_study_primes.npz'
|
| 29 |
-
output_folder = '/network/scratch/n/nithya.shikarpur/pitch-diffusion/user-studies/listening-study-2/task-3'
|
| 30 |
device = 'cpu'
|
| 31 |
|
| 32 |
global_ind = -1
|
|
@@ -232,11 +230,12 @@ def set_prime_and_generate(audio, full_pitch, full_audio, full_user):
|
|
| 232 |
return audio, pitch, full_pitch, full_audio, full_user, fig
|
| 233 |
|
| 234 |
def save_session(full_pitch, full_audio, full_user):
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
|
|
|
| 240 |
|
| 241 |
with gr.Blocks() as demo:
|
| 242 |
full_audio = gr.State((16000, np.array([])))
|
|
|
|
| 23 |
import time
|
| 24 |
import soundfile as sf
|
| 25 |
|
| 26 |
+
pitch_path = 'models/diffusion_pitch/'
|
| 27 |
+
audio_path = 'models/pitch_to_audio/'
|
|
|
|
|
|
|
| 28 |
device = 'cpu'
|
| 29 |
|
| 30 |
global_ind = -1
|
|
|
|
| 230 |
return audio, pitch, full_pitch, full_audio, full_user, fig
|
| 231 |
|
| 232 |
def save_session(full_pitch, full_audio, full_user):
|
| 233 |
+
pass
|
| 234 |
+
# os.makedirs(output_folder, exist_ok=True)
|
| 235 |
+
# filename = f'session-{time.time()}'
|
| 236 |
+
# logging.log(logging.INFO, f"Saving session to {filename}")
|
| 237 |
+
# pd.DataFrame({'pitch': full_pitch, 'time': np.arange(0, len(full_pitch)/100, 0.01), 'user': full_user}).to_csv(os.path.join(output_folder, filename + '.csv'), index=False)
|
| 238 |
+
# sf.write(os.path.join(output_folder, filename + '.wav'), full_audio[1], 16000)
|
| 239 |
|
| 240 |
with gr.Blocks() as demo:
|
| 241 |
full_audio = gr.State((16000, np.array([])))
|
models/diffusion_pitch/config.gin
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __gin__ import dynamic_registration
|
| 2 |
+
from src import dataset
|
| 3 |
+
from src import model
|
| 4 |
+
from src import utils
|
| 5 |
+
import torch
|
| 6 |
+
|
| 7 |
+
# Macros:
|
| 8 |
+
# ==============================================================================
|
| 9 |
+
LR = 0.0001
|
| 10 |
+
SEQ_LEN = 1200
|
| 11 |
+
TRANSPOSE_VALUE = 400
|
| 12 |
+
|
| 13 |
+
# Parameters for torch.optim.AdamW:
|
| 14 |
+
# ==============================================================================
|
| 15 |
+
torch.optim.AdamW.betas = (0.9, 0.99)
|
| 16 |
+
torch.optim.AdamW.lr = %LR
|
| 17 |
+
|
| 18 |
+
# Parameters for utils.build_warmed_exponential_lr_scheduler:
|
| 19 |
+
# ==============================================================================
|
| 20 |
+
utils.build_warmed_exponential_lr_scheduler.cycle_length = 200000
|
| 21 |
+
utils.build_warmed_exponential_lr_scheduler.eta_max = %LR
|
| 22 |
+
utils.build_warmed_exponential_lr_scheduler.eta_min = 0.1
|
| 23 |
+
utils.build_warmed_exponential_lr_scheduler.peak_iteration = 10000
|
| 24 |
+
utils.build_warmed_exponential_lr_scheduler.start_factor = 0.01
|
| 25 |
+
|
| 26 |
+
# Parameters for model.UNetBase.configure_optimizers:
|
| 27 |
+
# ==============================================================================
|
| 28 |
+
model.UNetBase.configure_optimizers.optimizer_cls = @torch.optim.AdamW
|
| 29 |
+
model.UNetBase.configure_optimizers.scheduler_cls = \
|
| 30 |
+
@utils.build_warmed_exponential_lr_scheduler
|
| 31 |
+
|
| 32 |
+
# Parameters for dataset.pitch_read_w_downsample:
|
| 33 |
+
# ==============================================================================
|
| 34 |
+
dataset.pitch_read_w_downsample.add_noise_to_silence = True
|
| 35 |
+
dataset.pitch_read_w_downsample.decoder_key = 'pitch'
|
| 36 |
+
dataset.pitch_read_w_downsample.max_clip = 600
|
| 37 |
+
dataset.pitch_read_w_downsample.min_clip = 200
|
| 38 |
+
dataset.pitch_read_w_downsample.min_norm_pitch = -4915
|
| 39 |
+
dataset.pitch_read_w_downsample.pitch_downsample = 10
|
| 40 |
+
dataset.pitch_read_w_downsample.seq_len = %SEQ_LEN
|
| 41 |
+
dataset.pitch_read_w_downsample.time_downsample = 2
|
| 42 |
+
|
| 43 |
+
# Parameters for train/dataset.pitch_read_w_downsample:
|
| 44 |
+
# ==============================================================================
|
| 45 |
+
train/dataset.pitch_read_w_downsample.transpose_pitch = %TRANSPOSE_VALUE
|
| 46 |
+
|
| 47 |
+
# Parameters for train/dataset.SequenceDataset:
|
| 48 |
+
# ==============================================================================
|
| 49 |
+
train/dataset.SequenceDataset.task_fn = @train/dataset.pitch_read_w_downsample
|
| 50 |
+
|
| 51 |
+
# Parameters for val/dataset.SequenceDataset:
|
| 52 |
+
# ==============================================================================
|
| 53 |
+
val/dataset.SequenceDataset.task_fn = @dataset.pitch_read_w_downsample
|
| 54 |
+
|
| 55 |
+
# Parameters for model.UNet:
|
| 56 |
+
# ==============================================================================
|
| 57 |
+
model.UNet.dropout = 0.3
|
| 58 |
+
model.UNet.features = [512, 640, 1024]
|
| 59 |
+
model.UNet.inp_dim = 1
|
| 60 |
+
model.UNet.kernel_size = 5
|
| 61 |
+
model.UNet.nonlinearity = 'mish'
|
| 62 |
+
model.UNet.norm = True
|
| 63 |
+
model.UNet.num_attns = 4
|
| 64 |
+
model.UNet.num_convs = 4
|
| 65 |
+
model.UNet.num_heads = 8
|
| 66 |
+
model.UNet.project_dim = 256
|
| 67 |
+
model.UNet.seq_len = %SEQ_LEN
|
| 68 |
+
model.UNet.strides = [4, 2, 2]
|
| 69 |
+
model.UNet.time_dim = 128
|
diffusion_pitch_model-model.ckpt → models/diffusion_pitch/last.ckpt
RENAMED
|
File without changes
|
diffusion_pitch_model-qt.joblib → models/diffusion_pitch/qt.joblib
RENAMED
|
File without changes
|
models/pitch_to_audio/config.gin
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __gin__ import dynamic_registration
|
| 2 |
+
from src import dataset
|
| 3 |
+
from src import model
|
| 4 |
+
from src import pitch_to_audio_utils
|
| 5 |
+
from src import utils
|
| 6 |
+
import torch
|
| 7 |
+
|
| 8 |
+
# Macros:
|
| 9 |
+
# ==============================================================================
|
| 10 |
+
AUDIO_SEQ_LEN = 750
|
| 11 |
+
LR = 0.0001
|
| 12 |
+
NFFT = 1024
|
| 13 |
+
NUM_MELS = 192
|
| 14 |
+
SINGER_CONDITIONING = True
|
| 15 |
+
SR = 16000
|
| 16 |
+
|
| 17 |
+
# Parameters for torch.optim.AdamW:
|
| 18 |
+
# ==============================================================================
|
| 19 |
+
torch.optim.AdamW.betas = (0.9, 0.99)
|
| 20 |
+
torch.optim.AdamW.lr = 0.0001
|
| 21 |
+
|
| 22 |
+
# Parameters for utils.build_warmed_exponential_lr_scheduler:
|
| 23 |
+
# ==============================================================================
|
| 24 |
+
utils.build_warmed_exponential_lr_scheduler.cycle_length = 480000
|
| 25 |
+
utils.build_warmed_exponential_lr_scheduler.eta_max = %LR
|
| 26 |
+
utils.build_warmed_exponential_lr_scheduler.eta_min = 0.1
|
| 27 |
+
utils.build_warmed_exponential_lr_scheduler.peak_iteration = 10000
|
| 28 |
+
utils.build_warmed_exponential_lr_scheduler.start_factor = 0.01
|
| 29 |
+
|
| 30 |
+
# Parameters for model.UNetBase.configure_optimizers:
|
| 31 |
+
# ==============================================================================
|
| 32 |
+
model.UNetBase.configure_optimizers.optimizer_cls = @torch.optim.AdamW
|
| 33 |
+
model.UNetBase.configure_optimizers.scheduler_cls = \
|
| 34 |
+
@utils.build_warmed_exponential_lr_scheduler
|
| 35 |
+
|
| 36 |
+
# Parameters for pitch_to_audio_utils.from_mels:
|
| 37 |
+
# ==============================================================================
|
| 38 |
+
pitch_to_audio_utils.from_mels.nfft = %NFFT
|
| 39 |
+
pitch_to_audio_utils.from_mels.num_mels = %NUM_MELS
|
| 40 |
+
pitch_to_audio_utils.from_mels.sr = %SR
|
| 41 |
+
|
| 42 |
+
# Parameters for dataset.load_cached_dataset:
|
| 43 |
+
# ==============================================================================
|
| 44 |
+
dataset.load_cached_dataset.audio_len = %AUDIO_SEQ_LEN
|
| 45 |
+
dataset.load_cached_dataset.return_singer = %SINGER_CONDITIONING
|
| 46 |
+
|
| 47 |
+
# Parameters for pitch_to_audio_utils.normalized_mels_to_audio:
|
| 48 |
+
# ==============================================================================
|
| 49 |
+
pitch_to_audio_utils.normalized_mels_to_audio.n_iter = 100
|
| 50 |
+
pitch_to_audio_utils.normalized_mels_to_audio.nfft = %NFFT
|
| 51 |
+
pitch_to_audio_utils.normalized_mels_to_audio.num_mels = %NUM_MELS
|
| 52 |
+
pitch_to_audio_utils.normalized_mels_to_audio.sr = %SR
|
| 53 |
+
|
| 54 |
+
# Parameters for dataset.SequenceDataset:
|
| 55 |
+
# ==============================================================================
|
| 56 |
+
dataset.SequenceDataset.task_fn = @dataset.load_cached_dataset
|
| 57 |
+
|
| 58 |
+
# Parameters for pitch_to_audio_utils.torch_gl:
|
| 59 |
+
# ==============================================================================
|
| 60 |
+
pitch_to_audio_utils.torch_gl.n_iter = 200
|
| 61 |
+
pitch_to_audio_utils.torch_gl.nfft = %NFFT
|
| 62 |
+
pitch_to_audio_utils.torch_gl.sr = %SR
|
| 63 |
+
|
| 64 |
+
# Parameters for pitch_to_audio_utils.torch_istft:
|
| 65 |
+
# ==============================================================================
|
| 66 |
+
pitch_to_audio_utils.torch_istft.nfft = %NFFT
|
| 67 |
+
|
| 68 |
+
# Parameters for model.UNetPitchConditioned:
|
| 69 |
+
# ==============================================================================
|
| 70 |
+
model.UNetPitchConditioned.audio_seq_len = %AUDIO_SEQ_LEN
|
| 71 |
+
model.UNetPitchConditioned.cfg = True
|
| 72 |
+
model.UNetPitchConditioned.cond_drop_prob = 0.2
|
| 73 |
+
model.UNetPitchConditioned.dropout = 0.3
|
| 74 |
+
model.UNetPitchConditioned.f0_dim = 128
|
| 75 |
+
model.UNetPitchConditioned.features = [512, 640, 1024]
|
| 76 |
+
model.UNetPitchConditioned.inp_dim = %NUM_MELS
|
| 77 |
+
model.UNetPitchConditioned.kernel_size = 5
|
| 78 |
+
model.UNetPitchConditioned.log_samples_every = 10
|
| 79 |
+
model.UNetPitchConditioned.log_wandb_samples_every = 50
|
| 80 |
+
model.UNetPitchConditioned.nonlinearity = 'mish'
|
| 81 |
+
model.UNetPitchConditioned.norm = False
|
| 82 |
+
model.UNetPitchConditioned.num_attns = 4
|
| 83 |
+
model.UNetPitchConditioned.num_convs = 4
|
| 84 |
+
model.UNetPitchConditioned.num_heads = 8
|
| 85 |
+
model.UNetPitchConditioned.project_dim = 256
|
| 86 |
+
model.UNetPitchConditioned.singer_conditioning = %SINGER_CONDITIONING
|
| 87 |
+
model.UNetPitchConditioned.singer_dim = 128
|
| 88 |
+
model.UNetPitchConditioned.singer_vocab = 55
|
| 89 |
+
model.UNetPitchConditioned.sr = %SR
|
| 90 |
+
model.UNetPitchConditioned.strides = [4, 2, 2]
|
| 91 |
+
model.UNetPitchConditioned.time_dim = 128
|
pitch_to_audio_model-model.ckpt → models/pitch_to_audio/last.ckpt
RENAMED
|
File without changes
|
pitch_to_audio_model-qt.joblib → models/pitch_to_audio/qt.joblib
RENAMED
|
File without changes
|
requirements.txt
CHANGED
|
@@ -16,4 +16,6 @@ torchaudio==2.4.0
|
|
| 16 |
tqdm==4.65.0
|
| 17 |
wandb==0.15.4
|
| 18 |
x_transformers==1.32.15
|
|
|
|
|
|
|
| 19 |
|
|
|
|
| 16 |
tqdm==4.65.0
|
| 17 |
wandb==0.15.4
|
| 18 |
x_transformers==1.32.15
|
| 19 |
+
crepe==0.0.15
|
| 20 |
+
hmmlearn==0.3.2
|
| 21 |
|