Nithya committed
Commit 01188ff · 1 Parent(s): 7dc7036

rearranged model files and added config

app.py CHANGED
@@ -23,10 +23,8 @@ from hmmlearn import hmm
 import time
 import soundfile as sf
 
-pitch_path = '/network/scratch/n/nithya.shikarpur/checkpoints/pitch-diffusion/corrected-attention-v3/4833583'
-audio_path = '/network/scratch/n/nithya.shikarpur/checkpoints/pitch-diffusion/corrected-attention-v3/4835364'
-pitch_primes = '/network/scratch/n/nithya.shikarpur/pitch-diffusion/data/merged_data-final/listening_study_primes.npz'
-output_folder = '/network/scratch/n/nithya.shikarpur/pitch-diffusion/user-studies/listening-study-2/task-3'
+pitch_path = 'models/diffusion_pitch/'
+audio_path = 'models/pitch_to_audio/'
 device = 'cpu'
 
 global_ind = -1
@@ -232,11 +230,12 @@ def set_prime_and_generate(audio, full_pitch, full_audio, full_user):
     return audio, pitch, full_pitch, full_audio, full_user, fig
 
 def save_session(full_pitch, full_audio, full_user):
-    os.makedirs(output_folder, exist_ok=True)
-    filename = f'session-{time.time()}'
-    logging.log(logging.INFO, f"Saving session to {filename}")
-    pd.DataFrame({'pitch': full_pitch, 'time': np.arange(0, len(full_pitch)/100, 0.01), 'user': full_user}).to_csv(os.path.join(output_folder, filename + '.csv'), index=False)
-    sf.write(os.path.join(output_folder, filename + '.wav'), full_audio[1], 16000)
+    pass
+    # os.makedirs(output_folder, exist_ok=True)
+    # filename = f'session-{time.time()}'
+    # logging.log(logging.INFO, f"Saving session to {filename}")
+    # pd.DataFrame({'pitch': full_pitch, 'time': np.arange(0, len(full_pitch)/100, 0.01), 'user': full_user}).to_csv(os.path.join(output_folder, filename + '.csv'), index=False)
+    # sf.write(os.path.join(output_folder, filename + '.wav'), full_audio[1], 16000)
 
 with gr.Blocks() as demo:
     full_audio = gr.State((16000, np.array([])))
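The hard-coded cluster scratch paths (including the prime and output locations used for the listening study) give way to repo-relative model folders, and save_session is stubbed out, presumably for the public demo. A hypothetical sketch of the layout this commit establishes; the loading code itself is illustrative and not taken from app.py:

```python
# Hypothetical sketch of consuming the new repo-relative layout; the diff
# only establishes the folder structure, not this loading code.
import os
import torch

pitch_path = 'models/diffusion_pitch/'
audio_path = 'models/pitch_to_audio/'

# Each model folder now bundles a gin config, a checkpoint, and a fitted
# quantile transformer (see the ADDED/RENAMED entries below).
for folder in (pitch_path, audio_path):
    for name in ('config.gin', 'last.ckpt', 'qt.joblib'):
        assert os.path.isfile(os.path.join(folder, name)), name

ckpt = torch.load(os.path.join(pitch_path, 'last.ckpt'), map_location='cpu')
```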
models/diffusion_pitch/config.gin ADDED
@@ -0,0 +1,69 @@
+from __gin__ import dynamic_registration
+from src import dataset
+from src import model
+from src import utils
+import torch
+
+# Macros:
+# ==============================================================================
+LR = 0.0001
+SEQ_LEN = 1200
+TRANSPOSE_VALUE = 400
+
+# Parameters for torch.optim.AdamW:
+# ==============================================================================
+torch.optim.AdamW.betas = (0.9, 0.99)
+torch.optim.AdamW.lr = %LR
+
+# Parameters for utils.build_warmed_exponential_lr_scheduler:
+# ==============================================================================
+utils.build_warmed_exponential_lr_scheduler.cycle_length = 200000
+utils.build_warmed_exponential_lr_scheduler.eta_max = %LR
+utils.build_warmed_exponential_lr_scheduler.eta_min = 0.1
+utils.build_warmed_exponential_lr_scheduler.peak_iteration = 10000
+utils.build_warmed_exponential_lr_scheduler.start_factor = 0.01
+
+# Parameters for model.UNetBase.configure_optimizers:
+# ==============================================================================
+model.UNetBase.configure_optimizers.optimizer_cls = @torch.optim.AdamW
+model.UNetBase.configure_optimizers.scheduler_cls = \
+    @utils.build_warmed_exponential_lr_scheduler
+
+# Parameters for dataset.pitch_read_w_downsample:
+# ==============================================================================
+dataset.pitch_read_w_downsample.add_noise_to_silence = True
+dataset.pitch_read_w_downsample.decoder_key = 'pitch'
+dataset.pitch_read_w_downsample.max_clip = 600
+dataset.pitch_read_w_downsample.min_clip = 200
+dataset.pitch_read_w_downsample.min_norm_pitch = -4915
+dataset.pitch_read_w_downsample.pitch_downsample = 10
+dataset.pitch_read_w_downsample.seq_len = %SEQ_LEN
+dataset.pitch_read_w_downsample.time_downsample = 2
+
+# Parameters for train/dataset.pitch_read_w_downsample:
+# ==============================================================================
+train/dataset.pitch_read_w_downsample.transpose_pitch = %TRANSPOSE_VALUE
+
+# Parameters for train/dataset.SequenceDataset:
+# ==============================================================================
+train/dataset.SequenceDataset.task_fn = @train/dataset.pitch_read_w_downsample
+
+# Parameters for val/dataset.SequenceDataset:
+# ==============================================================================
+val/dataset.SequenceDataset.task_fn = @dataset.pitch_read_w_downsample
+
+# Parameters for model.UNet:
+# ==============================================================================
+model.UNet.dropout = 0.3
+model.UNet.features = [512, 640, 1024]
+model.UNet.inp_dim = 1
+model.UNet.kernel_size = 5
+model.UNet.nonlinearity = 'mish'
+model.UNet.norm = True
+model.UNet.num_attns = 4
+model.UNet.num_convs = 4
+model.UNet.num_heads = 8
+model.UNet.project_dim = 256
+model.UNet.seq_len = %SEQ_LEN
+model.UNet.strides = [4, 2, 2]
+model.UNet.time_dim = 128
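The added config freezes the pitch-diffusion hyperparameters (a 1200-step pitch sequence, a warmed exponential LR schedule, and an attention UNet). A minimal sketch of how such a gin file is loaded, assuming the repo's `src` package is importable (the config's `dynamic_registration` imports it at parse time):

```python
# Minimal sketch, assuming gin-config is installed and the repo's `src`
# package is on PYTHONPATH; with dynamic_registration the config itself
# imports and binds src.dataset / src.model / src.utils and torch.
import gin

gin.parse_config_file('models/diffusion_pitch/config.gin')

# Constructors decorated with @gin.configurable now receive these bindings,
# e.g. a gin-configured UNet is built with seq_len=1200 and strides=[4, 2, 2].
```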
diffusion_pitch_model-model.ckpt → models/diffusion_pitch/last.ckpt RENAMED
File without changes
diffusion_pitch_model-qt.joblib → models/diffusion_pitch/qt.joblib RENAMED
File without changes
models/pitch_to_audio/config.gin ADDED
@@ -0,0 +1,91 @@
+from __gin__ import dynamic_registration
+from src import dataset
+from src import model
+from src import pitch_to_audio_utils
+from src import utils
+import torch
+
+# Macros:
+# ==============================================================================
+AUDIO_SEQ_LEN = 750
+LR = 0.0001
+NFFT = 1024
+NUM_MELS = 192
+SINGER_CONDITIONING = True
+SR = 16000
+
+# Parameters for torch.optim.AdamW:
+# ==============================================================================
+torch.optim.AdamW.betas = (0.9, 0.99)
+torch.optim.AdamW.lr = 0.0001
+
+# Parameters for utils.build_warmed_exponential_lr_scheduler:
+# ==============================================================================
+utils.build_warmed_exponential_lr_scheduler.cycle_length = 480000
+utils.build_warmed_exponential_lr_scheduler.eta_max = %LR
+utils.build_warmed_exponential_lr_scheduler.eta_min = 0.1
+utils.build_warmed_exponential_lr_scheduler.peak_iteration = 10000
+utils.build_warmed_exponential_lr_scheduler.start_factor = 0.01
+
+# Parameters for model.UNetBase.configure_optimizers:
+# ==============================================================================
+model.UNetBase.configure_optimizers.optimizer_cls = @torch.optim.AdamW
+model.UNetBase.configure_optimizers.scheduler_cls = \
+    @utils.build_warmed_exponential_lr_scheduler
+
+# Parameters for pitch_to_audio_utils.from_mels:
+# ==============================================================================
+pitch_to_audio_utils.from_mels.nfft = %NFFT
+pitch_to_audio_utils.from_mels.num_mels = %NUM_MELS
+pitch_to_audio_utils.from_mels.sr = %SR
+
+# Parameters for dataset.load_cached_dataset:
+# ==============================================================================
+dataset.load_cached_dataset.audio_len = %AUDIO_SEQ_LEN
+dataset.load_cached_dataset.return_singer = %SINGER_CONDITIONING
+
+# Parameters for pitch_to_audio_utils.normalized_mels_to_audio:
+# ==============================================================================
+pitch_to_audio_utils.normalized_mels_to_audio.n_iter = 100
+pitch_to_audio_utils.normalized_mels_to_audio.nfft = %NFFT
+pitch_to_audio_utils.normalized_mels_to_audio.num_mels = %NUM_MELS
+pitch_to_audio_utils.normalized_mels_to_audio.sr = %SR
+
+# Parameters for dataset.SequenceDataset:
+# ==============================================================================
+dataset.SequenceDataset.task_fn = @dataset.load_cached_dataset
+
+# Parameters for pitch_to_audio_utils.torch_gl:
+# ==============================================================================
+pitch_to_audio_utils.torch_gl.n_iter = 200
+pitch_to_audio_utils.torch_gl.nfft = %NFFT
+pitch_to_audio_utils.torch_gl.sr = %SR
+
+# Parameters for pitch_to_audio_utils.torch_istft:
+# ==============================================================================
+pitch_to_audio_utils.torch_istft.nfft = %NFFT
+
+# Parameters for model.UNetPitchConditioned:
+# ==============================================================================
+model.UNetPitchConditioned.audio_seq_len = %AUDIO_SEQ_LEN
+model.UNetPitchConditioned.cfg = True
+model.UNetPitchConditioned.cond_drop_prob = 0.2
+model.UNetPitchConditioned.dropout = 0.3
+model.UNetPitchConditioned.f0_dim = 128
+model.UNetPitchConditioned.features = [512, 640, 1024]
+model.UNetPitchConditioned.inp_dim = %NUM_MELS
+model.UNetPitchConditioned.kernel_size = 5
+model.UNetPitchConditioned.log_samples_every = 10
+model.UNetPitchConditioned.log_wandb_samples_every = 50
+model.UNetPitchConditioned.nonlinearity = 'mish'
+model.UNetPitchConditioned.norm = False
+model.UNetPitchConditioned.num_attns = 4
+model.UNetPitchConditioned.num_convs = 4
+model.UNetPitchConditioned.num_heads = 8
+model.UNetPitchConditioned.project_dim = 256
+model.UNetPitchConditioned.singer_conditioning = %SINGER_CONDITIONING
+model.UNetPitchConditioned.singer_dim = 128
+model.UNetPitchConditioned.singer_vocab = 55
+model.UNetPitchConditioned.sr = %SR
+model.UNetPitchConditioned.strides = [4, 2, 2]
+model.UNetPitchConditioned.time_dim = 128
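The macros above (SR = 16000, NFFT = 1024, NUM_MELS = 192) plus torch_gl's n_iter = 200 describe a standard mel-spectrogram front end with Griffin-Lim inversion. As an illustration only, here is a torchaudio equivalent of those settings; the repo's actual implementation lives in src.pitch_to_audio_utils:

```python
# Illustrative torchaudio sketch mirroring the NFFT/NUM_MELS/SR/n_iter
# values above; not the repo's pitch_to_audio_utils implementation.
import torch
import torchaudio

SR, NFFT, NUM_MELS = 16000, 1024, 192

to_mels = torchaudio.transforms.MelSpectrogram(
    sample_rate=SR, n_fft=NFFT, n_mels=NUM_MELS)
griffin_lim = torchaudio.transforms.GriffinLim(n_fft=NFFT, n_iter=200)

audio = torch.randn(1, SR)                       # 1 s of dummy audio
mels = to_mels(audio)                            # shape (1, 192, frames)
recon = griffin_lim(to_mels.spectrogram(audio))  # waveform from the linear spectrogram
```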
pitch_to_audio_model-model.ckpt → models/pitch_to_audio/last.ckpt RENAMED
File without changes
pitch_to_audio_model-qt.joblib → models/pitch_to_audio/qt.joblib RENAMED
File without changes
requirements.txt CHANGED
@@ -16,4 +16,6 @@ torchaudio==2.4.0
 tqdm==4.65.0
 wandb==0.15.4
 x_transformers==1.32.15
+crepe==0.0.15
+hmmlearn==0.3.2
 
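The two new pins line up with app.py's `from hmmlearn import hmm` import; hmmlearn is presumably also what backs CREPE's Viterbi smoothing. A minimal sketch of crepe's documented predict() API; the input file name is a placeholder:

```python
# Minimal sketch of the newly pinned dependencies; 'input.wav' is a
# placeholder file name. viterbi=True enables HMM-based smoothing of
# the pitch track (the hmmlearn pin presumably supports this path).
import crepe
from scipy.io import wavfile

sr, audio = wavfile.read('input.wav')
time, frequency, confidence, activation = crepe.predict(audio, sr, viterbi=True)
```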