ak36 committed · Commit bf65828 · 1 Parent(s): c32984a

second_stage_v1

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. .ipynb_checkpoints/train-checkpoint.log +342 -0
  2. .ipynb_checkpoints/train_second-checkpoint.py +879 -0
  3. logs/pod_90h_30k/config_ft_single.yml → Configs/.ipynb_checkpoints/config_ft_single-checkpoint.yml +19 -19
  4. Configs/.ipynb_checkpoints/config_libritts-checkpoint.yml +113 -0
  5. Configs/config_ft_single.yml +16 -16
  6. Demo/.ipynb_checkpoints/Inference_LibriTTS-checkpoint.ipynb +1155 -0
  7. Demo/.ipynb_checkpoints/Inference_pod_90h_30k-checkpoint.ipynb +1155 -0
  8. Demo/Inference_pod_90h_30k.ipynb +1360 -0
  9. Modules/.ipynb_checkpoints/slmadv-checkpoint.py +177 -0
  10. Modules/slmadv.py +126 -144
  11. __pycache__/losses.cpython-310.pyc +0 -0
  12. __pycache__/meldataset.cpython-310.pyc +0 -0
  13. __pycache__/models.cpython-310.pyc +0 -0
  14. __pycache__/optimizers.cpython-310.pyc +0 -0
  15. __pycache__/utils.cpython-310.pyc +0 -0
  16. events.out.tfevents.1749451143.164-152-17-237.47710.0 +0 -3
  17. events.out.tfevents.1749451143.164-152-17-237.47712.0 +0 -3
  18. events.out.tfevents.1749451144.164-152-17-237.47706.0 +0 -3
  19. events.out.tfevents.1749451144.164-152-17-237.47708.0 +0 -3
  20. events.out.tfevents.1749451144.164-152-17-237.47709.0 +0 -3
  21. events.out.tfevents.1749451144.164-152-17-237.47711.0 +0 -3
  22. events.out.tfevents.1749451220.164-152-17-237.48862.0 +0 -3
  23. events.out.tfevents.1749451220.164-152-17-237.48863.0 +0 -3
  24. events.out.tfevents.1749451220.164-152-17-237.48864.0 +0 -3
  25. events.out.tfevents.1749451220.164-152-17-237.48865.0 +0 -3
  26. events.out.tfevents.1749451220.164-152-17-237.48868.0 +0 -3
  27. events.out.tfevents.1749451221.164-152-17-237.48861.0 +0 -3
  28. events.out.tfevents.1749451221.164-152-17-237.48867.0 +0 -3
  29. events.out.tfevents.1749451222.164-152-17-237.48866.0 +0 -3
  30. events.out.tfevents.1749453792.164-152-17-237.51057.0 +0 -3
  31. events.out.tfevents.1749453792.164-152-17-237.51059.0 +0 -3
  32. events.out.tfevents.1749453792.164-152-17-237.51061.0 +0 -3
  33. events.out.tfevents.1749453792.164-152-17-237.51063.0 +0 -3
  34. events.out.tfevents.1749453793.164-152-17-237.51056.0 +0 -3
  35. events.out.tfevents.1749453793.164-152-17-237.51058.0 +0 -3
  36. events.out.tfevents.1749453793.164-152-17-237.51060.0 +0 -3
  37. events.out.tfevents.1749453794.164-152-17-237.51062.0 +0 -3
  38. events.out.tfevents.1749453905.164-152-17-237.52357.0 +0 -3
  39. events.out.tfevents.1749453905.164-152-17-237.52358.0 +0 -3
  40. events.out.tfevents.1749453905.164-152-17-237.52360.0 +0 -3
  41. events.out.tfevents.1749453905.164-152-17-237.52361.0 +0 -3
  42. events.out.tfevents.1749453906.164-152-17-237.52355.0 +0 -3
  43. events.out.tfevents.1749453906.164-152-17-237.52356.0 +0 -3
  44. events.out.tfevents.1749453906.164-152-17-237.52359.0 +0 -3
  45. events.out.tfevents.1749453906.164-152-17-237.52362.0 +0 -3
  46. events.out.tfevents.1749453977.164-152-17-237.53096.0 +0 -3
  47. events.out.tfevents.1749453977.164-152-17-237.53097.0 +0 -3
  48. events.out.tfevents.1749453977.164-152-17-237.53098.0 +0 -3
  49. events.out.tfevents.1749453977.164-152-17-237.53099.0 +0 -3
  50. events.out.tfevents.1749453977.164-152-17-237.53100.0 +0 -3
.ipynb_checkpoints/train-checkpoint.log ADDED
@@ -0,0 +1,342 @@
1
+ INFO:2025-06-09 01:00:12,153: Epoch [3/25], Step [50/3970], Mel Loss: 0.63656, Gen Loss: 0.00000, Disc Loss: 0.00000, Mono Loss: 0.00000, S2S Loss: 0.00000, SLM Loss: 0.00000
2
+ INFO:2025-06-09 01:00:55,689: Epoch [3/25], Step [100/3970], Mel Loss: 0.63674, Gen Loss: 0.00000, Disc Loss: 0.00000, Mono Loss: 0.00000, S2S Loss: 0.00000, SLM Loss: 0.00000
3
+ INFO:2025-06-09 01:01:45,433: Epoch [3/25], Step [150/3970], Mel Loss: 0.63203, Gen Loss: 0.00000, Disc Loss: 0.00000, Mono Loss: 0.00000, S2S Loss: 0.00000, SLM Loss: 0.00000
4
+ INFO:2025-06-09 01:02:37,587: Epoch [3/25], Step [200/3970], Mel Loss: 0.62929, Gen Loss: 0.00000, Disc Loss: 0.00000, Mono Loss: 0.00000, S2S Loss: 0.00000, SLM Loss: 0.00000
5
+ INFO:2025-06-09 01:03:28,825: Epoch [3/25], Step [250/3970], Mel Loss: 0.63209, Gen Loss: 0.00000, Disc Loss: 0.00000, Mono Loss: 0.00000, S2S Loss: 0.00000, SLM Loss: 0.00000
6
+ INFO:2025-06-09 01:04:18,272: Epoch [3/25], Step [300/3970], Mel Loss: 0.62710, Gen Loss: 0.00000, Disc Loss: 0.00000, Mono Loss: 0.00000, S2S Loss: 0.00000, SLM Loss: 0.00000
7
+ INFO:2025-06-09 01:05:09,751: Epoch [3/25], Step [350/3970], Mel Loss: 0.62325, Gen Loss: 0.00000, Disc Loss: 0.00000, Mono Loss: 0.00000, S2S Loss: 0.00000, SLM Loss: 0.00000
8
+ INFO:2025-06-09 01:06:00,396: Epoch [3/25], Step [400/3970], Mel Loss: 0.62540, Gen Loss: 0.00000, Disc Loss: 0.00000, Mono Loss: 0.00000, S2S Loss: 0.00000, SLM Loss: 0.00000
9
+ INFO:2025-06-09 01:06:51,713: Epoch [3/25], Step [450/3970], Mel Loss: 0.61673, Gen Loss: 0.00000, Disc Loss: 0.00000, Mono Loss: 0.00000, S2S Loss: 0.00000, SLM Loss: 0.00000
10
+ INFO:2025-06-09 01:08:07,720: Validation loss: 0.568
11
+
12
+
13
+
14
+
15
+ INFO:2025-06-09 01:09:07,489: Epoch [4/25], Step [50/3970], Mel Loss: 0.62133, Gen Loss: 0.00000, Disc Loss: 0.00000, Mono Loss: 0.00000, S2S Loss: 0.00000, SLM Loss: 0.00000
16
+ INFO:2025-06-09 01:09:59,049: Epoch [4/25], Step [100/3970], Mel Loss: 0.61368, Gen Loss: 0.00000, Disc Loss: 0.00000, Mono Loss: 0.00000, S2S Loss: 0.00000, SLM Loss: 0.00000
17
+ INFO:2025-06-09 01:10:46,786: Epoch [4/25], Step [150/3970], Mel Loss: 0.61887, Gen Loss: 0.00000, Disc Loss: 0.00000, Mono Loss: 0.00000, S2S Loss: 0.00000, SLM Loss: 0.00000
18
+ INFO:2025-06-09 01:11:36,393: Epoch [4/25], Step [200/3970], Mel Loss: 0.61688, Gen Loss: 0.00000, Disc Loss: 0.00000, Mono Loss: 0.00000, S2S Loss: 0.00000, SLM Loss: 0.00000
19
+ INFO:2025-06-09 01:12:21,624: Epoch [4/25], Step [250/3970], Mel Loss: 0.61630, Gen Loss: 0.00000, Disc Loss: 0.00000, Mono Loss: 0.00000, S2S Loss: 0.00000, SLM Loss: 0.00000
20
+ INFO:2025-06-09 01:13:08,125: Epoch [4/25], Step [300/3970], Mel Loss: 0.61238, Gen Loss: 0.00000, Disc Loss: 0.00000, Mono Loss: 0.00000, S2S Loss: 0.00000, SLM Loss: 0.00000
21
+ INFO:2025-06-09 01:13:53,747: Epoch [4/25], Step [350/3970], Mel Loss: 0.61566, Gen Loss: 0.00000, Disc Loss: 0.00000, Mono Loss: 0.00000, S2S Loss: 0.00000, SLM Loss: 0.00000
22
+ INFO:2025-06-09 01:14:42,113: Epoch [4/25], Step [400/3970], Mel Loss: 0.61601, Gen Loss: 0.00000, Disc Loss: 0.00000, Mono Loss: 0.00000, S2S Loss: 0.00000, SLM Loss: 0.00000
23
+ INFO:2025-06-09 01:15:29,167: Epoch [4/25], Step [450/3970], Mel Loss: 0.61588, Gen Loss: 0.00000, Disc Loss: 0.00000, Mono Loss: 0.00000, S2S Loss: 0.00000, SLM Loss: 0.00000
24
+ INFO:2025-06-09 01:16:42,289: Validation loss: 0.559
25
+
26
+
27
+
28
+
29
+ INFO:2025-06-09 01:17:33,371: Epoch [5/25], Step [50/3970], Mel Loss: 0.61289, Gen Loss: 0.00000, Disc Loss: 0.00000, Mono Loss: 0.00000, S2S Loss: 0.00000, SLM Loss: 0.00000
30
+ INFO:2025-06-09 01:18:21,238: Epoch [5/25], Step [100/3970], Mel Loss: 0.61256, Gen Loss: 0.00000, Disc Loss: 0.00000, Mono Loss: 0.00000, S2S Loss: 0.00000, SLM Loss: 0.00000
31
+ INFO:2025-06-09 01:19:08,129: Epoch [5/25], Step [150/3970], Mel Loss: 0.60756, Gen Loss: 0.00000, Disc Loss: 0.00000, Mono Loss: 0.00000, S2S Loss: 0.00000, SLM Loss: 0.00000
32
+ INFO:2025-06-09 01:19:55,306: Epoch [5/25], Step [200/3970], Mel Loss: 0.60886, Gen Loss: 0.00000, Disc Loss: 0.00000, Mono Loss: 0.00000, S2S Loss: 0.00000, SLM Loss: 0.00000
33
+ INFO:2025-06-09 01:20:38,852: Epoch [5/25], Step [250/3970], Mel Loss: 0.61364, Gen Loss: 0.00000, Disc Loss: 0.00000, Mono Loss: 0.00000, S2S Loss: 0.00000, SLM Loss: 0.00000
34
+ INFO:2025-06-09 01:21:23,920: Epoch [5/25], Step [300/3970], Mel Loss: 0.60994, Gen Loss: 0.00000, Disc Loss: 0.00000, Mono Loss: 0.00000, S2S Loss: 0.00000, SLM Loss: 0.00000
35
+ INFO:2025-06-09 01:22:13,541: Epoch [5/25], Step [350/3970], Mel Loss: 0.59860, Gen Loss: 0.00000, Disc Loss: 0.00000, Mono Loss: 0.00000, S2S Loss: 0.00000, SLM Loss: 0.00000
36
+ INFO:2025-06-09 01:22:59,673: Epoch [5/25], Step [400/3970], Mel Loss: 0.61045, Gen Loss: 0.00000, Disc Loss: 0.00000, Mono Loss: 0.00000, S2S Loss: 0.00000, SLM Loss: 0.00000
37
+ INFO:2025-06-09 01:23:48,982: Epoch [5/25], Step [450/3970], Mel Loss: 0.59750, Gen Loss: 0.00000, Disc Loss: 0.00000, Mono Loss: 0.00000, S2S Loss: 0.00000, SLM Loss: 0.00000
38
+ INFO:2025-06-09 01:25:03,693: Validation loss: 0.554
39
+
40
+
41
+
42
+
43
+ INFO:2025-06-09 01:26:38,581: Epoch [6/25], Step [50/3970], Mel Loss: 0.97984, Gen Loss: 23.17753, Disc Loss: 2.41794, Mono Loss: 0.03876, S2S Loss: 2.62273, SLM Loss: 2.94417
44
+ INFO:2025-06-09 01:28:02,840: Epoch [6/25], Step [100/3970], Mel Loss: 1.07962, Gen Loss: 9.85569, Disc Loss: 3.64044, Mono Loss: 0.04134, S2S Loss: 2.85493, SLM Loss: 2.91264
45
+ INFO:2025-06-09 01:29:28,320: Epoch [6/25], Step [150/3970], Mel Loss: 0.86465, Gen Loss: 7.72572, Disc Loss: 3.54385, Mono Loss: 0.03521, S2S Loss: 2.93994, SLM Loss: 2.88020
46
+ INFO:2025-06-09 01:30:52,824: Epoch [6/25], Step [200/3970], Mel Loss: 0.79431, Gen Loss: 6.71339, Disc Loss: 3.60161, Mono Loss: 0.03136, S2S Loss: 2.63649, SLM Loss: 2.84435
47
+ INFO:2025-06-09 01:32:17,827: Epoch [6/25], Step [250/3970], Mel Loss: 0.69364, Gen Loss: 7.94329, Disc Loss: 3.65054, Mono Loss: 0.02880, S2S Loss: 2.69598, SLM Loss: 2.74597
48
+ INFO:2025-06-09 01:33:40,309: Epoch [6/25], Step [300/3970], Mel Loss: 0.64917, Gen Loss: 6.08137, Disc Loss: 3.72214, Mono Loss: 0.02712, S2S Loss: 2.67866, SLM Loss: 2.67792
49
+ INFO:2025-06-09 01:35:00,338: Epoch [6/25], Step [350/3970], Mel Loss: 0.61861, Gen Loss: 5.89171, Disc Loss: 3.76209, Mono Loss: 0.03497, S2S Loss: 2.41264, SLM Loss: 2.63655
50
+ INFO:2025-06-09 01:36:24,903: Epoch [6/25], Step [400/3970], Mel Loss: 0.60979, Gen Loss: 7.66665, Disc Loss: 3.91850, Mono Loss: 0.03728, S2S Loss: 2.28397, SLM Loss: 2.58636
51
+ INFO:2025-06-09 01:37:48,840: Epoch [6/25], Step [450/3970], Mel Loss: 0.58291, Gen Loss: 6.77857, Disc Loss: 3.68198, Mono Loss: 0.03211, S2S Loss: 2.31591, SLM Loss: 2.60305
52
+ INFO:2025-06-09 01:39:34,991: Validation loss: 0.537
53
+
54
+
55
+
56
+
57
+ INFO:2025-06-09 01:41:06,553: Epoch [7/25], Step [50/3970], Mel Loss: 0.56623, Gen Loss: 9.29693, Disc Loss: 3.64386, Mono Loss: 0.03481, S2S Loss: 2.34774, SLM Loss: 2.54822
58
+ INFO:2025-06-09 01:42:30,275: Epoch [7/25], Step [100/3970], Mel Loss: 0.56485, Gen Loss: 7.86536, Disc Loss: 3.62585, Mono Loss: 0.02787, S2S Loss: 2.28482, SLM Loss: 2.35773
59
+ INFO:2025-06-09 01:43:57,072: Epoch [7/25], Step [150/3970], Mel Loss: 0.55288, Gen Loss: 6.11425, Disc Loss: 3.71416, Mono Loss: 0.03559, S2S Loss: 2.12334, SLM Loss: 2.29561
60
+ INFO:2025-06-09 01:45:19,320: Epoch [7/25], Step [200/3970], Mel Loss: 0.54845, Gen Loss: 8.49720, Disc Loss: 3.60499, Mono Loss: 0.03008, S2S Loss: 2.01824, SLM Loss: 2.54976
61
+ INFO:2025-06-09 01:46:43,392: Epoch [7/25], Step [250/3970], Mel Loss: 0.54677, Gen Loss: 8.23377, Disc Loss: 3.75765, Mono Loss: 0.03607, S2S Loss: 1.90671, SLM Loss: 2.42518
62
+ INFO:2025-06-09 01:48:06,193: Epoch [7/25], Step [300/3970], Mel Loss: 0.53016, Gen Loss: 6.69302, Disc Loss: 3.70597, Mono Loss: 0.02783, S2S Loss: 1.66317, SLM Loss: 2.40360
63
+ INFO:2025-06-09 01:49:31,131: Epoch [7/25], Step [350/3970], Mel Loss: 0.53427, Gen Loss: 9.37859, Disc Loss: 3.71469, Mono Loss: 0.03775, S2S Loss: 1.82596, SLM Loss: 2.32477
64
+ INFO:2025-06-09 01:50:55,050: Epoch [7/25], Step [400/3970], Mel Loss: 0.52960, Gen Loss: 7.75816, Disc Loss: 3.68042, Mono Loss: 0.03405, S2S Loss: 1.99194, SLM Loss: 2.36643
65
+ INFO:2025-06-09 01:52:17,195: Epoch [7/25], Step [450/3970], Mel Loss: 0.54899, Gen Loss: 6.99428, Disc Loss: 3.58416, Mono Loss: 0.02682, S2S Loss: 1.87868, SLM Loss: 2.30144
66
+ INFO:2025-06-09 01:54:03,014: Validation loss: 0.483
67
+
68
+
69
+
70
+
71
+ INFO:2025-06-09 01:55:38,965: Epoch [8/25], Step [50/3970], Mel Loss: 0.52470, Gen Loss: 10.28693, Disc Loss: 3.63978, Mono Loss: 0.03907, S2S Loss: 1.83708, SLM Loss: 2.34067
72
+ INFO:2025-06-09 01:57:02,183: Epoch [8/25], Step [100/3970], Mel Loss: 0.53876, Gen Loss: 7.51378, Disc Loss: 3.64213, Mono Loss: 0.03625, S2S Loss: 1.72809, SLM Loss: 2.16873
73
+ INFO:2025-06-09 01:58:25,854: Epoch [8/25], Step [150/3970], Mel Loss: 0.52859, Gen Loss: 7.03971, Disc Loss: 3.77774, Mono Loss: 0.03803, S2S Loss: 2.06151, SLM Loss: 2.42842
74
+ INFO:2025-06-09 01:59:48,325: Epoch [8/25], Step [200/3970], Mel Loss: 0.52193, Gen Loss: 8.06612, Disc Loss: 3.57948, Mono Loss: 0.02800, S2S Loss: 1.74277, SLM Loss: 2.34817
75
+ INFO:2025-06-09 02:01:13,028: Epoch [8/25], Step [250/3970], Mel Loss: 0.51478, Gen Loss: 8.66409, Disc Loss: 3.49825, Mono Loss: 0.04338, S2S Loss: 1.61835, SLM Loss: 2.15467
76
+ INFO:2025-06-09 02:02:38,248: Epoch [8/25], Step [300/3970], Mel Loss: 0.52305, Gen Loss: 6.17309, Disc Loss: 3.58619, Mono Loss: 0.02633, S2S Loss: 1.72876, SLM Loss: 2.30287
77
+ INFO:2025-06-09 02:03:59,711: Epoch [8/25], Step [350/3970], Mel Loss: 0.52308, Gen Loss: 9.16025, Disc Loss: 3.61594, Mono Loss: 0.03729, S2S Loss: 1.62949, SLM Loss: 2.24222
78
+ INFO:2025-06-09 02:05:25,200: Epoch [8/25], Step [400/3970], Mel Loss: 0.52249, Gen Loss: 10.42249, Disc Loss: 3.28414, Mono Loss: 0.03355, S2S Loss: 1.73984, SLM Loss: 2.41117
79
+ INFO:2025-06-09 02:06:52,106: Epoch [8/25], Step [450/3970], Mel Loss: 0.53768, Gen Loss: 9.69683, Disc Loss: 3.94595, Mono Loss: 0.03386, S2S Loss: 1.51221, SLM Loss: 2.22714
80
+ INFO:2025-06-09 02:08:43,461: Validation loss: 0.576
81
+
82
+
83
+
84
+
85
+ INFO:2025-06-09 02:10:15,610: Epoch [9/25], Step [50/3970], Mel Loss: 0.54530, Gen Loss: 8.99131, Disc Loss: 3.51233, Mono Loss: 0.04022, S2S Loss: 1.83184, SLM Loss: 2.27819
86
+ INFO:2025-06-09 02:11:40,932: Epoch [9/25], Step [100/3970], Mel Loss: 0.52112, Gen Loss: 10.46535, Disc Loss: 3.27846, Mono Loss: 0.04066, S2S Loss: 1.40977, SLM Loss: 2.23133
87
+ INFO:2025-06-09 02:13:05,924: Epoch [9/25], Step [150/3970], Mel Loss: 0.52145, Gen Loss: 7.49124, Disc Loss: 3.61879, Mono Loss: 0.03863, S2S Loss: 1.37945, SLM Loss: 1.97726
88
+ INFO:2025-06-09 02:14:33,751: Epoch [9/25], Step [200/3970], Mel Loss: 0.52140, Gen Loss: 9.70458, Disc Loss: 3.41580, Mono Loss: 0.02937, S2S Loss: 1.11212, SLM Loss: 2.06116
89
+ INFO:2025-06-09 02:16:00,853: Epoch [9/25], Step [250/3970], Mel Loss: 0.51334, Gen Loss: 9.93914, Disc Loss: 3.31054, Mono Loss: 0.03239, S2S Loss: 1.65795, SLM Loss: 2.17712
90
+ INFO:2025-06-09 02:17:34,691: Epoch [9/25], Step [300/3970], Mel Loss: 0.51978, Gen Loss: 8.67425, Disc Loss: 3.50655, Mono Loss: 0.03035, S2S Loss: 1.59474, SLM Loss: 2.07509
91
+ INFO:2025-06-09 02:19:00,242: Epoch [9/25], Step [350/3970], Mel Loss: 0.53707, Gen Loss: 9.08727, Disc Loss: 3.54805, Mono Loss: 0.02839, S2S Loss: 1.37795, SLM Loss: 2.33604
92
+ INFO:2025-06-09 02:20:26,229: Epoch [9/25], Step [400/3970], Mel Loss: 0.53049, Gen Loss: 10.96715, Disc Loss: 3.46047, Mono Loss: 0.01226, S2S Loss: 1.65820, SLM Loss: 2.27651
93
+ INFO:2025-06-09 02:21:51,865: Epoch [9/25], Step [450/3970], Mel Loss: 0.54155, Gen Loss: 12.13123, Disc Loss: 3.59190, Mono Loss: 0.03659, S2S Loss: 1.53226, SLM Loss: 2.27701
94
+ INFO:2025-06-09 02:23:43,778: Validation loss: 0.570
95
+
96
+
97
+
98
+
99
+ INFO:2025-06-09 02:25:19,386: Epoch [10/25], Step [50/3970], Mel Loss: 0.51787, Gen Loss: 8.53281, Disc Loss: 3.29783, Mono Loss: 0.03446, S2S Loss: 1.45484, SLM Loss: 2.05988
100
+ INFO:2025-06-09 02:26:47,838: Epoch [10/25], Step [100/3970], Mel Loss: 0.50870, Gen Loss: 10.23766, Disc Loss: 3.65731, Mono Loss: 0.02026, S2S Loss: 1.49496, SLM Loss: 2.13103
101
+ INFO:2025-06-09 02:28:14,726: Epoch [10/25], Step [150/3970], Mel Loss: 0.54335, Gen Loss: 10.79815, Disc Loss: 3.13370, Mono Loss: 0.03967, S2S Loss: 1.45980, SLM Loss: 2.08032
102
+ INFO:2025-06-09 02:29:40,057: Epoch [10/25], Step [200/3970], Mel Loss: 0.54070, Gen Loss: 16.04053, Disc Loss: 2.25289, Mono Loss: 0.03900, S2S Loss: 1.27599, SLM Loss: 1.92492
103
+ INFO:2025-06-09 02:31:03,537: Epoch [10/25], Step [250/3970], Mel Loss: 0.52524, Gen Loss: 10.16288, Disc Loss: 3.41181, Mono Loss: 0.03559, S2S Loss: 1.43502, SLM Loss: 2.11398
104
+ INFO:2025-06-09 02:32:27,638: Epoch [10/25], Step [300/3970], Mel Loss: 0.50069, Gen Loss: 10.37155, Disc Loss: 3.37344, Mono Loss: 0.02960, S2S Loss: 1.31690, SLM Loss: 2.10520
105
+ INFO:2025-06-09 02:33:55,403: Epoch [10/25], Step [350/3970], Mel Loss: 0.51798, Gen Loss: 8.71367, Disc Loss: 3.56202, Mono Loss: 0.02943, S2S Loss: 1.46521, SLM Loss: 1.94630
106
+ INFO:2025-06-09 02:35:20,742: Epoch [10/25], Step [400/3970], Mel Loss: 0.50403, Gen Loss: 10.07516, Disc Loss: 3.33516, Mono Loss: 0.03924, S2S Loss: 1.33150, SLM Loss: 2.00906
107
+ INFO:2025-06-09 02:36:48,130: Epoch [10/25], Step [450/3970], Mel Loss: 0.51103, Gen Loss: 13.04639, Disc Loss: 2.76208, Mono Loss: 0.03206, S2S Loss: 1.34592, SLM Loss: 1.97846
108
+ INFO:2025-06-09 02:38:35,829: Validation loss: 0.494
109
+
110
+
111
+
112
+
113
+ INFO:2025-06-09 02:40:09,941: Epoch [11/25], Step [50/3970], Mel Loss: 0.50742, Gen Loss: 14.54983, Disc Loss: 3.16779, Mono Loss: 0.03301, S2S Loss: 1.24001, SLM Loss: 2.09207
114
+ INFO:2025-06-09 02:41:31,696: Epoch [11/25], Step [100/3970], Mel Loss: 0.51317, Gen Loss: 10.19775, Disc Loss: 3.39758, Mono Loss: 0.03635, S2S Loss: 1.18542, SLM Loss: 1.87807
115
+ INFO:2025-06-09 02:43:00,303: Epoch [11/25], Step [150/3970], Mel Loss: 0.49535, Gen Loss: 10.78111, Disc Loss: 3.42311, Mono Loss: 0.02923, S2S Loss: 1.30470, SLM Loss: 2.03685
116
+ INFO:2025-06-09 02:44:25,379: Epoch [11/25], Step [200/3970], Mel Loss: 0.52437, Gen Loss: 12.13915, Disc Loss: 3.48997, Mono Loss: 0.03481, S2S Loss: 1.40437, SLM Loss: 2.22972
117
+ INFO:2025-06-09 02:45:48,875: Epoch [11/25], Step [250/3970], Mel Loss: 0.52986, Gen Loss: 10.21078, Disc Loss: 3.29919, Mono Loss: 0.03813, S2S Loss: 1.31922, SLM Loss: 2.12596
118
+ INFO:2025-06-09 02:47:12,758: Epoch [11/25], Step [300/3970], Mel Loss: 0.50854, Gen Loss: 11.45082, Disc Loss: 3.70037, Mono Loss: 0.02635, S2S Loss: 1.30175, SLM Loss: 2.10208
119
+ INFO:2025-06-09 02:48:37,380: Epoch [11/25], Step [350/3970], Mel Loss: 0.51107, Gen Loss: 12.95888, Disc Loss: 2.67743, Mono Loss: 0.03419, S2S Loss: 1.20725, SLM Loss: 1.90229
120
+ INFO:2025-06-09 02:50:00,179: Epoch [11/25], Step [400/3970], Mel Loss: 0.50464, Gen Loss: 14.90269, Disc Loss: 2.48575, Mono Loss: 0.02608, S2S Loss: 1.25147, SLM Loss: 2.09807
121
+ INFO:2025-06-09 02:51:23,892: Epoch [11/25], Step [450/3970], Mel Loss: 0.50813, Gen Loss: 11.16796, Disc Loss: 3.33927, Mono Loss: 0.02247, S2S Loss: 1.23355, SLM Loss: 2.03762
122
+ INFO:2025-06-09 02:53:11,784: Validation loss: 0.500
123
+
124
+
125
+
126
+
127
+ INFO:2025-06-09 02:54:42,588: Epoch [12/25], Step [50/3970], Mel Loss: 0.51559, Gen Loss: 12.73433, Disc Loss: 3.10313, Mono Loss: 0.02916, S2S Loss: 1.15304, SLM Loss: 2.04169
128
+ INFO:2025-06-09 02:56:06,203: Epoch [12/25], Step [100/3970], Mel Loss: 0.53166, Gen Loss: 15.15074, Disc Loss: 2.44015, Mono Loss: 0.02971, S2S Loss: 1.16761, SLM Loss: 2.05299
129
+ INFO:2025-06-09 02:57:34,086: Epoch [12/25], Step [150/3970], Mel Loss: 0.52280, Gen Loss: 13.14780, Disc Loss: 2.62888, Mono Loss: 0.02973, S2S Loss: 1.36648, SLM Loss: 2.20558
130
+ INFO:2025-06-09 02:59:02,556: Epoch [12/25], Step [200/3970], Mel Loss: 0.51851, Gen Loss: 11.34044, Disc Loss: 3.37104, Mono Loss: 0.02984, S2S Loss: 1.27682, SLM Loss: 2.09497
131
+ INFO:2025-06-09 03:00:26,363: Epoch [12/25], Step [250/3970], Mel Loss: 0.50483, Gen Loss: 9.85288, Disc Loss: 3.57650, Mono Loss: 0.02718, S2S Loss: 1.23974, SLM Loss: 2.03210
132
+ INFO:2025-06-09 03:01:51,385: Epoch [12/25], Step [300/3970], Mel Loss: 0.49458, Gen Loss: 11.01429, Disc Loss: 3.23517, Mono Loss: 0.03900, S2S Loss: 1.20380, SLM Loss: 2.05801
133
+ INFO:2025-06-09 03:03:16,092: Epoch [12/25], Step [350/3970], Mel Loss: 0.52576, Gen Loss: 9.24854, Disc Loss: 3.31379, Mono Loss: 0.03915, S2S Loss: 1.13775, SLM Loss: 2.09123
134
+ INFO:2025-06-09 03:04:46,606: Epoch [12/25], Step [400/3970], Mel Loss: 0.50221, Gen Loss: 10.17654, Disc Loss: 3.53851, Mono Loss: 0.03372, S2S Loss: 1.20534, SLM Loss: 2.00393
135
+ INFO:2025-06-09 03:06:09,986: Epoch [12/25], Step [450/3970], Mel Loss: 0.52733, Gen Loss: 14.04951, Disc Loss: 2.97020, Mono Loss: 0.02758, S2S Loss: 1.06250, SLM Loss: 1.99839
136
+ INFO:2025-06-09 03:07:59,553: Validation loss: 0.471
137
+
138
+
139
+
140
+
141
+ INFO:2025-06-09 03:09:33,778: Epoch [13/25], Step [50/3970], Mel Loss: 0.49267, Gen Loss: 11.16026, Disc Loss: 3.05907, Mono Loss: 0.02503, S2S Loss: 1.00237, SLM Loss: 1.82247
142
+ INFO:2025-06-09 03:11:01,558: Epoch [13/25], Step [100/3970], Mel Loss: 0.49195, Gen Loss: 9.90175, Disc Loss: 3.69180, Mono Loss: 0.02045, S2S Loss: 1.30512, SLM Loss: 1.89232
143
+ INFO:2025-06-09 03:12:28,265: Epoch [13/25], Step [150/3970], Mel Loss: 0.49494, Gen Loss: 12.47827, Disc Loss: 2.95689, Mono Loss: 0.02328, S2S Loss: 1.05090, SLM Loss: 1.93738
144
+ INFO:2025-06-09 03:13:50,919: Epoch [13/25], Step [200/3970], Mel Loss: 0.51474, Gen Loss: 11.96166, Disc Loss: 3.02671, Mono Loss: 0.03436, S2S Loss: 1.09315, SLM Loss: 1.92749
145
+ INFO:2025-06-09 03:15:17,888: Epoch [13/25], Step [250/3970], Mel Loss: 0.51118, Gen Loss: 17.41326, Disc Loss: 2.00980, Mono Loss: 0.02579, S2S Loss: 1.27793, SLM Loss: 2.11468
146
+ INFO:2025-06-09 03:16:42,783: Epoch [13/25], Step [300/3970], Mel Loss: 0.52890, Gen Loss: 10.31864, Disc Loss: 3.39953, Mono Loss: 0.03986, S2S Loss: 1.07551, SLM Loss: 1.85846
147
+ INFO:2025-06-09 03:18:10,773: Epoch [13/25], Step [350/3970], Mel Loss: 0.49484, Gen Loss: 9.64177, Disc Loss: 3.54298, Mono Loss: 0.03428, S2S Loss: 1.03875, SLM Loss: 1.94741
148
+ INFO:2025-06-09 03:19:34,116: Epoch [13/25], Step [400/3970], Mel Loss: 0.49129, Gen Loss: 15.92753, Disc Loss: 2.42741, Mono Loss: 0.03079, S2S Loss: 1.08931, SLM Loss: 1.93091
149
+ INFO:2025-06-09 03:20:57,906: Epoch [13/25], Step [450/3970], Mel Loss: 0.50837, Gen Loss: 11.88165, Disc Loss: 3.05828, Mono Loss: 0.02700, S2S Loss: 1.02226, SLM Loss: 2.08510
150
+ INFO:2025-06-09 03:22:45,754: Validation loss: 0.481
151
+
152
+
153
+
154
+
155
+ INFO:2025-06-09 03:24:21,819: Epoch [14/25], Step [50/3970], Mel Loss: 0.48985, Gen Loss: 12.91563, Disc Loss: 2.76847, Mono Loss: 0.04083, S2S Loss: 1.14642, SLM Loss: 1.95628
156
+ INFO:2025-06-09 03:25:47,789: Epoch [14/25], Step [100/3970], Mel Loss: 0.49605, Gen Loss: 12.36551, Disc Loss: 2.68910, Mono Loss: 0.02049, S2S Loss: 1.21105, SLM Loss: 1.98918
157
+ INFO:2025-06-09 03:27:15,432: Epoch [14/25], Step [150/3970], Mel Loss: 0.50257, Gen Loss: 11.92622, Disc Loss: 3.19436, Mono Loss: 0.02369, S2S Loss: 1.07385, SLM Loss: 1.89891
158
+ INFO:2025-06-09 03:28:39,067: Epoch [14/25], Step [200/3970], Mel Loss: 0.49241, Gen Loss: 10.28760, Disc Loss: 3.14552, Mono Loss: 0.02857, S2S Loss: 1.18241, SLM Loss: 1.92434
159
+ INFO:2025-06-09 03:30:02,661: Epoch [14/25], Step [250/3970], Mel Loss: 0.50479, Gen Loss: 14.76222, Disc Loss: 2.38417, Mono Loss: 0.03464, S2S Loss: 1.11464, SLM Loss: 1.90056
160
+ INFO:2025-06-09 03:31:25,232: Epoch [14/25], Step [300/3970], Mel Loss: 0.51008, Gen Loss: 10.93404, Disc Loss: 3.09978, Mono Loss: 0.02636, S2S Loss: 1.06983, SLM Loss: 2.01580
161
+ INFO:2025-06-09 03:32:47,763: Epoch [14/25], Step [350/3970], Mel Loss: 0.49603, Gen Loss: 13.14640, Disc Loss: 2.93398, Mono Loss: 0.03515, S2S Loss: 1.13013, SLM Loss: 1.87904
162
+ INFO:2025-06-09 03:34:09,946: Epoch [14/25], Step [400/3970], Mel Loss: 0.49311, Gen Loss: 15.16063, Disc Loss: 2.26509, Mono Loss: 0.02470, S2S Loss: 1.13779, SLM Loss: 1.98486
163
+ INFO:2025-06-09 03:35:35,944: Epoch [14/25], Step [450/3970], Mel Loss: 0.49960, Gen Loss: 14.25554, Disc Loss: 2.94801, Mono Loss: 0.03173, S2S Loss: 1.06616, SLM Loss: 1.87259
164
+ INFO:2025-06-09 03:37:23,594: Validation loss: 0.474
165
+
166
+
167
+
168
+
169
+ INFO:2025-06-09 03:38:59,186: Epoch [15/25], Step [50/3970], Mel Loss: 0.49733, Gen Loss: 15.70522, Disc Loss: 2.52600, Mono Loss: 0.02580, S2S Loss: 1.16987, SLM Loss: 1.96256
170
+ INFO:2025-06-09 03:40:28,804: Epoch [15/25], Step [100/3970], Mel Loss: 0.50295, Gen Loss: 14.09082, Disc Loss: 2.55250, Mono Loss: 0.02887, S2S Loss: 0.98603, SLM Loss: 1.95487
171
+ INFO:2025-06-09 03:41:51,594: Epoch [15/25], Step [150/3970], Mel Loss: 0.50385, Gen Loss: 9.58022, Disc Loss: 3.57700, Mono Loss: 0.02735, S2S Loss: 1.30176, SLM Loss: 1.89019
172
+ INFO:2025-06-09 03:43:21,769: Epoch [15/25], Step [200/3970], Mel Loss: 0.50567, Gen Loss: 13.83009, Disc Loss: 2.59786, Mono Loss: 0.02999, S2S Loss: 0.97563, SLM Loss: 1.98239
173
+ INFO:2025-06-09 03:44:48,877: Epoch [15/25], Step [250/3970], Mel Loss: 0.50670, Gen Loss: 15.52833, Disc Loss: 2.77497, Mono Loss: 0.03401, S2S Loss: 1.08127, SLM Loss: 1.98682
174
+ INFO:2025-06-09 03:46:14,023: Epoch [15/25], Step [300/3970], Mel Loss: 0.48451, Gen Loss: 10.18036, Disc Loss: 3.13374, Mono Loss: 0.03035, S2S Loss: 1.07857, SLM Loss: 1.76348
175
+ INFO:2025-06-09 03:47:40,871: Epoch [15/25], Step [350/3970], Mel Loss: 0.48213, Gen Loss: 14.40100, Disc Loss: 2.57544, Mono Loss: 0.03224, S2S Loss: 1.27353, SLM Loss: 2.20880
176
+ INFO:2025-06-09 03:49:02,418: Epoch [15/25], Step [400/3970], Mel Loss: 0.49212, Gen Loss: 12.04808, Disc Loss: 3.22365, Mono Loss: 0.04031, S2S Loss: 1.16138, SLM Loss: 1.93176
177
+ INFO:2025-06-09 03:50:29,288: Epoch [15/25], Step [450/3970], Mel Loss: 0.49498, Gen Loss: 15.64781, Disc Loss: 2.42535, Mono Loss: 0.02771, S2S Loss: 1.12834, SLM Loss: 1.92330
178
+ INFO:2025-06-09 03:52:16,555: Validation loss: 0.484
179
+
180
+
181
+
182
+
183
+ INFO:2025-06-09 03:53:54,302: Epoch [16/25], Step [50/3970], Mel Loss: 0.51999, Gen Loss: 14.74118, Disc Loss: 3.12985, Mono Loss: 0.03898, S2S Loss: 1.17255, SLM Loss: 2.04765
184
+ INFO:2025-06-09 03:55:21,475: Epoch [16/25], Step [100/3970], Mel Loss: 0.50182, Gen Loss: 12.08678, Disc Loss: 2.86320, Mono Loss: 0.03554, S2S Loss: 1.07750, SLM Loss: 1.93926
185
+ INFO:2025-06-09 03:56:47,475: Epoch [16/25], Step [150/3970], Mel Loss: 0.50050, Gen Loss: 9.98121, Disc Loss: 3.66234, Mono Loss: 0.04147, S2S Loss: 1.05245, SLM Loss: 1.79418
186
+ INFO:2025-06-09 03:58:18,119: Epoch [16/25], Step [200/3970], Mel Loss: 0.48289, Gen Loss: 11.81459, Disc Loss: 3.18255, Mono Loss: 0.02753, S2S Loss: 1.00464, SLM Loss: 2.07043
187
+ INFO:2025-06-09 03:59:52,641: Epoch [16/25], Step [250/3970], Mel Loss: 0.50084, Gen Loss: 12.98343, Disc Loss: 2.73330, Mono Loss: 0.02163, S2S Loss: 1.01344, SLM Loss: 2.09205
188
+ INFO:2025-06-09 04:01:18,443: Epoch [16/25], Step [300/3970], Mel Loss: 0.48698, Gen Loss: 10.43984, Disc Loss: 3.65968, Mono Loss: 0.02921, S2S Loss: 1.09594, SLM Loss: 1.76368
189
+ INFO:2025-06-09 04:02:43,362: Epoch [16/25], Step [350/3970], Mel Loss: 0.49274, Gen Loss: 14.27684, Disc Loss: 2.40941, Mono Loss: 0.02796, S2S Loss: 1.00817, SLM Loss: 1.93564
190
+ INFO:2025-06-09 04:04:11,908: Epoch [16/25], Step [400/3970], Mel Loss: 0.48388, Gen Loss: 12.83471, Disc Loss: 2.84156, Mono Loss: 0.02278, S2S Loss: 1.02146, SLM Loss: 1.84777
191
+ INFO:2025-06-09 04:05:42,332: Epoch [16/25], Step [450/3970], Mel Loss: 0.47624, Gen Loss: 11.16790, Disc Loss: 3.20130, Mono Loss: 0.03327, S2S Loss: 1.30728, SLM Loss: 2.05759
192
+ INFO:2025-06-09 04:07:30,779: Validation loss: 0.441
193
+
194
+
195
+
196
+
197
+ INFO:2025-06-09 04:09:06,393: Epoch [17/25], Step [50/3970], Mel Loss: 0.47322, Gen Loss: 10.21182, Disc Loss: 3.17271, Mono Loss: 0.03857, S2S Loss: 1.05606, SLM Loss: 1.89386
198
+ INFO:2025-06-09 04:10:36,875: Epoch [17/25], Step [100/3970], Mel Loss: 0.49200, Gen Loss: 11.82951, Disc Loss: 2.96425, Mono Loss: 0.03267, S2S Loss: 0.93129, SLM Loss: 1.86663
199
+ INFO:2025-06-09 04:12:04,591: Epoch [17/25], Step [150/3970], Mel Loss: 0.47137, Gen Loss: 8.62027, Disc Loss: 3.83087, Mono Loss: 0.03580, S2S Loss: 0.96079, SLM Loss: 1.77226
200
+ INFO:2025-06-09 04:13:32,190: Epoch [17/25], Step [200/3970], Mel Loss: 0.47893, Gen Loss: 10.65770, Disc Loss: 3.56187, Mono Loss: 0.02644, S2S Loss: 1.01706, SLM Loss: 2.04362
201
+ INFO:2025-06-09 04:14:56,130: Epoch [17/25], Step [250/3970], Mel Loss: 0.48653, Gen Loss: 8.75337, Disc Loss: 3.41021, Mono Loss: 0.03018, S2S Loss: 1.21360, SLM Loss: 1.96406
202
+ INFO:2025-06-09 04:16:22,783: Epoch [17/25], Step [300/3970], Mel Loss: 0.47912, Gen Loss: 13.35522, Disc Loss: 2.50100, Mono Loss: 0.03506, S2S Loss: 0.97357, SLM Loss: 1.86547
203
+ INFO:2025-06-09 04:17:48,667: Epoch [17/25], Step [350/3970], Mel Loss: 0.48305, Gen Loss: 11.61836, Disc Loss: 2.81784, Mono Loss: 0.03363, S2S Loss: 1.08002, SLM Loss: 2.06560
204
+ INFO:2025-06-09 04:19:14,326: Epoch [17/25], Step [400/3970], Mel Loss: 0.47853, Gen Loss: 12.34376, Disc Loss: 2.69610, Mono Loss: 0.02721, S2S Loss: 1.04334, SLM Loss: 1.70590
205
+ INFO:2025-06-09 04:20:35,887: Epoch [17/25], Step [450/3970], Mel Loss: 0.49008, Gen Loss: 13.10175, Disc Loss: 2.95339, Mono Loss: 0.02577, S2S Loss: 0.94490, SLM Loss: 1.85765
206
+ INFO:2025-06-09 04:22:27,589: Validation loss: 0.452
207
+
208
+
209
+
210
+
211
+ INFO:2025-06-09 04:24:06,620: Epoch [18/25], Step [50/3970], Mel Loss: 0.47998, Gen Loss: 11.54100, Disc Loss: 2.96952, Mono Loss: 0.03144, S2S Loss: 1.16552, SLM Loss: 2.03827
212
+ INFO:2025-06-09 04:25:34,179: Epoch [18/25], Step [100/3970], Mel Loss: 0.47805, Gen Loss: 13.14331, Disc Loss: 2.78030, Mono Loss: 0.03480, S2S Loss: 1.11621, SLM Loss: 1.77017
213
+ INFO:2025-06-09 04:26:56,768: Epoch [18/25], Step [150/3970], Mel Loss: 0.47885, Gen Loss: 13.77032, Disc Loss: 2.75735, Mono Loss: 0.03735, S2S Loss: 1.01647, SLM Loss: 1.84030
214
+ INFO:2025-06-09 04:28:22,414: Epoch [18/25], Step [200/3970], Mel Loss: 0.48841, Gen Loss: 9.09012, Disc Loss: 4.03486, Mono Loss: 0.02713, S2S Loss: 0.98568, SLM Loss: 2.00300
215
+ INFO:2025-06-09 04:29:49,349: Epoch [18/25], Step [250/3970], Mel Loss: 0.47842, Gen Loss: 11.78235, Disc Loss: 3.07922, Mono Loss: 0.03915, S2S Loss: 1.13136, SLM Loss: 1.98326
216
+ INFO:2025-06-09 04:31:14,759: Epoch [18/25], Step [300/3970], Mel Loss: 0.47979, Gen Loss: 11.35575, Disc Loss: 3.51251, Mono Loss: 0.02926, S2S Loss: 0.97950, SLM Loss: 1.76771
217
+ INFO:2025-06-09 04:32:36,423: Epoch [18/25], Step [350/3970], Mel Loss: 0.47505, Gen Loss: 11.21230, Disc Loss: 3.30019, Mono Loss: 0.04272, S2S Loss: 0.82646, SLM Loss: 1.81805
218
+ INFO:2025-06-09 04:34:01,703: Epoch [18/25], Step [400/3970], Mel Loss: 0.47216, Gen Loss: 12.97506, Disc Loss: 2.76475, Mono Loss: 0.02976, S2S Loss: 0.98103, SLM Loss: 1.91380
219
+ INFO:2025-06-09 04:35:26,512: Epoch [18/25], Step [450/3970], Mel Loss: 0.47869, Gen Loss: 14.93690, Disc Loss: 2.70676, Mono Loss: 0.02386, S2S Loss: 0.86181, SLM Loss: 1.86082
220
+ INFO:2025-06-09 04:37:13,388: Validation loss: 0.472
221
+
222
+
223
+
224
+
225
+ INFO:2025-06-09 04:38:47,409: Epoch [19/25], Step [50/3970], Mel Loss: 0.50106, Gen Loss: 10.75373, Disc Loss: 3.20791, Mono Loss: 0.03283, S2S Loss: 1.05680, SLM Loss: 2.04265
226
+ INFO:2025-06-09 04:40:11,614: Epoch [19/25], Step [100/3970], Mel Loss: 0.49229, Gen Loss: 12.93675, Disc Loss: 2.80900, Mono Loss: 0.02687, S2S Loss: 1.00646, SLM Loss: 1.95273
227
+ INFO:2025-06-09 04:41:38,257: Epoch [19/25], Step [150/3970], Mel Loss: 0.48437, Gen Loss: 9.62462, Disc Loss: 3.65687, Mono Loss: 0.03190, S2S Loss: 1.10174, SLM Loss: 1.83353
228
+ INFO:2025-06-09 04:43:10,165: Epoch [19/25], Step [200/3970], Mel Loss: 0.47472, Gen Loss: 14.23377, Disc Loss: 2.64775, Mono Loss: 0.03328, S2S Loss: 0.93959, SLM Loss: 1.89750
229
+ INFO:2025-06-09 04:44:38,022: Epoch [19/25], Step [250/3970], Mel Loss: 0.47779, Gen Loss: 14.43170, Disc Loss: 2.78379, Mono Loss: 0.01906, S2S Loss: 0.99975, SLM Loss: 1.75941
230
+ INFO:2025-06-09 04:46:03,339: Epoch [19/25], Step [300/3970], Mel Loss: 0.48885, Gen Loss: 13.73142, Disc Loss: 2.71020, Mono Loss: 0.03351, S2S Loss: 0.97011, SLM Loss: 1.83246
231
+ INFO:2025-06-09 04:47:31,942: Epoch [19/25], Step [350/3970], Mel Loss: 0.49685, Gen Loss: 11.76912, Disc Loss: 3.36635, Mono Loss: 0.03053, S2S Loss: 1.02552, SLM Loss: 2.14654
232
+ INFO:2025-06-09 04:49:03,189: Epoch [19/25], Step [400/3970], Mel Loss: 0.50997, Gen Loss: 12.43883, Disc Loss: 3.06039, Mono Loss: 0.02553, S2S Loss: 1.06563, SLM Loss: 2.07121
233
+ INFO:2025-06-09 04:50:29,105: Epoch [19/25], Step [450/3970], Mel Loss: 0.47820, Gen Loss: 14.01424, Disc Loss: 2.55081, Mono Loss: 0.02975, S2S Loss: 0.90710, SLM Loss: 1.83058
234
+ INFO:2025-06-09 04:52:19,425: Validation loss: 0.458
235
+
236
+
237
+
238
+
239
+ INFO:2025-06-09 04:53:52,052: Epoch [20/25], Step [50/3970], Mel Loss: 0.46500, Gen Loss: 10.54346, Disc Loss: 3.31023, Mono Loss: 0.02908, S2S Loss: 0.90407, SLM Loss: 1.78864
240
+ INFO:2025-06-09 04:55:17,797: Epoch [20/25], Step [100/3970], Mel Loss: 0.47071, Gen Loss: 12.65996, Disc Loss: 2.62974, Mono Loss: 0.03045, S2S Loss: 0.92329, SLM Loss: 1.80409
241
+ INFO:2025-06-09 04:56:43,658: Epoch [20/25], Step [150/3970], Mel Loss: 0.50137, Gen Loss: 10.70967, Disc Loss: 3.02257, Mono Loss: 0.03576, S2S Loss: 1.06377, SLM Loss: 1.94652
242
+ INFO:2025-06-09 04:58:10,885: Epoch [20/25], Step [200/3970], Mel Loss: 0.48577, Gen Loss: 13.93800, Disc Loss: 2.42582, Mono Loss: 0.03042, S2S Loss: 1.00638, SLM Loss: 1.94606
243
+ INFO:2025-06-09 04:59:36,042: Epoch [20/25], Step [250/3970], Mel Loss: 0.47842, Gen Loss: 11.98050, Disc Loss: 2.85478, Mono Loss: 0.03991, S2S Loss: 0.88393, SLM Loss: 1.78679
244
+ INFO:2025-06-09 05:01:05,055: Epoch [20/25], Step [300/3970], Mel Loss: 0.47698, Gen Loss: 14.17083, Disc Loss: 2.71589, Mono Loss: 0.03090, S2S Loss: 0.95709, SLM Loss: 1.76501
245
+ INFO:2025-06-09 05:02:31,554: Epoch [20/25], Step [350/3970], Mel Loss: 0.47476, Gen Loss: 12.08669, Disc Loss: 2.79200, Mono Loss: 0.03368, S2S Loss: 1.02864, SLM Loss: 1.86428
246
+ INFO:2025-06-09 05:03:58,164: Epoch [20/25], Step [400/3970], Mel Loss: 0.47778, Gen Loss: 16.65605, Disc Loss: 1.92339, Mono Loss: 0.02666, S2S Loss: 1.15662, SLM Loss: 1.86039
247
+ INFO:2025-06-09 05:05:20,371: Epoch [20/25], Step [450/3970], Mel Loss: 0.47590, Gen Loss: 10.66579, Disc Loss: 2.86761, Mono Loss: 0.02992, S2S Loss: 0.91479, SLM Loss: 1.80008
248
+ INFO:2025-06-09 05:07:11,411: Validation loss: 0.443
249
+
250
+
251
+
252
+
253
+ INFO:2025-06-09 05:08:44,874: Epoch [21/25], Step [50/3970], Mel Loss: 0.46999, Gen Loss: 13.12624, Disc Loss: 2.57343, Mono Loss: 0.03491, S2S Loss: 1.04637, SLM Loss: 1.88191
254
+ INFO:2025-06-09 05:10:12,520: Epoch [21/25], Step [100/3970], Mel Loss: 0.48069, Gen Loss: 13.96414, Disc Loss: 2.44672, Mono Loss: 0.02962, S2S Loss: 1.05849, SLM Loss: 1.87176
255
+ INFO:2025-06-09 05:11:34,572: Epoch [21/25], Step [150/3970], Mel Loss: 0.47060, Gen Loss: 9.93725, Disc Loss: 3.41884, Mono Loss: 0.03497, S2S Loss: 0.67804, SLM Loss: 1.80349
256
+ INFO:2025-06-09 05:13:00,537: Epoch [21/25], Step [200/3970], Mel Loss: 0.46755, Gen Loss: 13.78485, Disc Loss: 2.85819, Mono Loss: 0.02449, S2S Loss: 1.11917, SLM Loss: 1.90021
257
+ INFO:2025-06-09 05:14:25,136: Epoch [21/25], Step [250/3970], Mel Loss: 0.47410, Gen Loss: 15.36125, Disc Loss: 2.36199, Mono Loss: 0.02162, S2S Loss: 0.80983, SLM Loss: 1.83971
258
+ INFO:2025-06-09 05:15:52,212: Epoch [21/25], Step [300/3970], Mel Loss: 0.48966, Gen Loss: 10.06612, Disc Loss: 3.59549, Mono Loss: 0.02996, S2S Loss: 0.83736, SLM Loss: 1.80890
259
+ INFO:2025-06-09 05:17:15,903: Epoch [21/25], Step [350/3970], Mel Loss: 0.46942, Gen Loss: 13.45973, Disc Loss: 2.75646, Mono Loss: 0.02494, S2S Loss: 0.99167, SLM Loss: 1.77071
260
+ INFO:2025-06-09 05:18:39,464: Epoch [21/25], Step [400/3970], Mel Loss: 0.50879, Gen Loss: 12.60575, Disc Loss: 2.79759, Mono Loss: 0.02696, S2S Loss: 0.86569, SLM Loss: 1.84605
261
+ INFO:2025-06-09 05:20:04,744: Epoch [21/25], Step [450/3970], Mel Loss: 0.46598, Gen Loss: 11.56731, Disc Loss: 3.28558, Mono Loss: 0.03384, S2S Loss: 1.14885, SLM Loss: 1.89640
262
+ INFO:2025-06-09 05:21:56,848: Validation loss: 0.531
263
+
264
+
265
+
266
+
267
+ INFO:2025-06-09 05:23:29,412: Epoch [22/25], Step [50/3970], Mel Loss: 0.48972, Gen Loss: 9.84740, Disc Loss: 3.27302, Mono Loss: 0.03283, S2S Loss: 0.90118, SLM Loss: 1.72501
268
+ INFO:2025-06-09 05:24:56,674: Epoch [22/25], Step [100/3970], Mel Loss: 0.46670, Gen Loss: 12.74649, Disc Loss: 3.23306, Mono Loss: 0.02487, S2S Loss: 0.86857, SLM Loss: 1.82290
269
+ INFO:2025-06-09 05:26:22,931: Epoch [22/25], Step [150/3970], Mel Loss: 0.47883, Gen Loss: 11.48265, Disc Loss: 2.94219, Mono Loss: 0.02303, S2S Loss: 0.74939, SLM Loss: 1.80694
270
+ INFO:2025-06-09 05:27:48,330: Epoch [22/25], Step [200/3970], Mel Loss: 0.47217, Gen Loss: 13.39269, Disc Loss: 2.95101, Mono Loss: 0.02749, S2S Loss: 1.05043, SLM Loss: 1.96215
271
+ INFO:2025-06-09 05:29:13,642: Epoch [22/25], Step [250/3970], Mel Loss: 0.46282, Gen Loss: 11.55498, Disc Loss: 3.11340, Mono Loss: 0.02747, S2S Loss: 1.03113, SLM Loss: 1.80026
272
+ INFO:2025-06-09 05:30:37,418: Epoch [22/25], Step [300/3970], Mel Loss: 0.46544, Gen Loss: 11.89728, Disc Loss: 3.54280, Mono Loss: 0.02800, S2S Loss: 0.78287, SLM Loss: 1.71039
273
+ INFO:2025-06-09 05:32:00,981: Epoch [22/25], Step [350/3970], Mel Loss: 0.47136, Gen Loss: 10.04337, Disc Loss: 3.48481, Mono Loss: 0.02932, S2S Loss: 0.85002, SLM Loss: 1.70120
274
+ INFO:2025-06-09 05:33:23,173: Epoch [22/25], Step [400/3970], Mel Loss: 0.47503, Gen Loss: 13.56164, Disc Loss: 2.41358, Mono Loss: 0.03078, S2S Loss: 1.02305, SLM Loss: 1.93645
275
+ INFO:2025-06-09 05:34:50,016: Epoch [22/25], Step [450/3970], Mel Loss: 0.46342, Gen Loss: 10.59329, Disc Loss: 3.17809, Mono Loss: 0.02502, S2S Loss: 0.97440, SLM Loss: 1.87193
276
+ INFO:2025-06-09 05:36:40,602: Validation loss: 0.433
277
+
278
+
279
+
280
+
281
+ INFO:2025-06-09 05:38:17,194: Epoch [23/25], Step [50/3970], Mel Loss: 0.46734, Gen Loss: 6.01506, Disc Loss: 3.87277, Mono Loss: 0.03631, S2S Loss: 0.86797, SLM Loss: 1.78308
282
+ INFO:2025-06-09 05:39:45,134: Epoch [23/25], Step [100/3970], Mel Loss: 0.45456, Gen Loss: 10.32937, Disc Loss: 3.33413, Mono Loss: 0.03403, S2S Loss: 0.69475, SLM Loss: 1.64199
283
+ INFO:2025-06-09 05:41:10,925: Epoch [23/25], Step [150/3970], Mel Loss: 0.47284, Gen Loss: 10.40523, Disc Loss: 3.22554, Mono Loss: 0.03001, S2S Loss: 1.01316, SLM Loss: 1.79790
284
+ INFO:2025-06-09 05:42:38,603: Epoch [23/25], Step [200/3970], Mel Loss: 0.46535, Gen Loss: 13.34508, Disc Loss: 3.19096, Mono Loss: 0.03172, S2S Loss: 0.90094, SLM Loss: 1.90261
285
+ INFO:2025-06-09 05:44:04,344: Epoch [23/25], Step [250/3970], Mel Loss: 0.46423, Gen Loss: 13.47969, Disc Loss: 2.72158, Mono Loss: 0.02347, S2S Loss: 0.84805, SLM Loss: 1.84319
286
+ INFO:2025-06-09 05:45:29,913: Epoch [23/25], Step [300/3970], Mel Loss: 0.47467, Gen Loss: 13.43347, Disc Loss: 2.60238, Mono Loss: 0.02359, S2S Loss: 0.73610, SLM Loss: 1.73985
287
+ INFO:2025-06-09 05:46:57,724: Epoch [23/25], Step [350/3970], Mel Loss: 0.47841, Gen Loss: 14.26015, Disc Loss: 2.87291, Mono Loss: 0.02988, S2S Loss: 0.85957, SLM Loss: 1.76982
288
+ INFO:2025-06-09 05:48:22,973: Epoch [23/25], Step [400/3970], Mel Loss: 0.46632, Gen Loss: 11.81628, Disc Loss: 2.90987, Mono Loss: 0.02264, S2S Loss: 0.88753, SLM Loss: 1.71842
289
+ INFO:2025-06-09 05:49:48,069: Epoch [23/25], Step [450/3970], Mel Loss: 0.45516, Gen Loss: 12.15128, Disc Loss: 2.99541, Mono Loss: 0.02988, S2S Loss: 0.89353, SLM Loss: 1.90833
290
+ INFO:2025-06-09 05:51:36,052: Validation loss: 0.414
291
+
292
+
293
+
294
+
295
+ INFO:2025-06-09 05:53:10,255: Epoch [24/25], Step [50/3970], Mel Loss: 0.46737, Gen Loss: 10.23576, Disc Loss: 3.82643, Mono Loss: 0.02651, S2S Loss: 1.06037, SLM Loss: 1.99568
296
+ INFO:2025-06-09 05:54:37,468: Epoch [24/25], Step [100/3970], Mel Loss: 0.47177, Gen Loss: 12.41867, Disc Loss: 2.91510, Mono Loss: 0.02961, S2S Loss: 0.85282, SLM Loss: 1.90385
297
+ INFO:2025-06-09 05:56:04,209: Epoch [24/25], Step [150/3970], Mel Loss: 0.46322, Gen Loss: 11.11626, Disc Loss: 3.14571, Mono Loss: 0.02543, S2S Loss: 1.12388, SLM Loss: 1.97046
298
+ INFO:2025-06-09 05:57:29,491: Epoch [24/25], Step [200/3970], Mel Loss: 0.46526, Gen Loss: 15.59772, Disc Loss: 2.67776, Mono Loss: 0.03296, S2S Loss: 0.79784, SLM Loss: 1.69807
299
+ INFO:2025-06-09 05:58:52,805: Epoch [24/25], Step [250/3970], Mel Loss: 0.46915, Gen Loss: 11.35859, Disc Loss: 2.94053, Mono Loss: 0.03207, S2S Loss: 0.66795, SLM Loss: 1.76326
300
+ INFO:2025-06-09 06:00:20,548: Epoch [24/25], Step [300/3970], Mel Loss: 0.46567, Gen Loss: 11.95895, Disc Loss: 3.04587, Mono Loss: 0.03336, S2S Loss: 1.00292, SLM Loss: 1.68971
301
+ INFO:2025-06-09 06:01:46,336: Epoch [24/25], Step [350/3970], Mel Loss: 0.46672, Gen Loss: 12.67813, Disc Loss: 3.09165, Mono Loss: 0.02409, S2S Loss: 1.00270, SLM Loss: 1.76963
302
+ INFO:2025-06-09 06:03:11,223: Epoch [24/25], Step [400/3970], Mel Loss: 0.47140, Gen Loss: 12.71134, Disc Loss: 2.75087, Mono Loss: 0.03305, S2S Loss: 0.78916, SLM Loss: 1.78080
303
+ INFO:2025-06-09 06:04:34,537: Epoch [24/25], Step [450/3970], Mel Loss: 0.47079, Gen Loss: 8.86274, Disc Loss: 3.47894, Mono Loss: 0.03888, S2S Loss: 0.75467, SLM Loss: 1.68494
304
+ INFO:2025-06-09 06:06:21,549: Validation loss: 0.421
305
+
306
+
307
+
308
+
309
+ INFO:2025-06-09 06:07:56,930: Epoch [25/25], Step [50/3970], Mel Loss: 0.46430, Gen Loss: 10.24589, Disc Loss: 3.36352, Mono Loss: 0.03448, S2S Loss: 0.82375, SLM Loss: 1.89295
310
+ INFO:2025-06-09 06:09:24,068: Epoch [25/25], Step [100/3970], Mel Loss: 0.47254, Gen Loss: 9.77706, Disc Loss: 3.08066, Mono Loss: 0.02397, S2S Loss: 0.90991, SLM Loss: 1.88774
311
+ INFO:2025-06-09 06:10:48,092: Epoch [25/25], Step [150/3970], Mel Loss: 0.46583, Gen Loss: 11.92317, Disc Loss: 3.11952, Mono Loss: 0.02979, S2S Loss: 0.85678, SLM Loss: 1.75618
312
+ INFO:2025-06-09 06:12:13,142: Epoch [25/25], Step [200/3970], Mel Loss: 0.47014, Gen Loss: 11.47270, Disc Loss: 3.22507, Mono Loss: 0.02897, S2S Loss: 1.13971, SLM Loss: 1.97860
313
+ INFO:2025-06-09 06:13:37,846: Epoch [25/25], Step [250/3970], Mel Loss: 0.46148, Gen Loss: 10.36515, Disc Loss: 3.29522, Mono Loss: 0.02770, S2S Loss: 0.90695, SLM Loss: 1.83152
314
+ INFO:2025-06-09 06:15:03,448: Epoch [25/25], Step [300/3970], Mel Loss: 0.47940, Gen Loss: 14.23678, Disc Loss: 2.69026, Mono Loss: 0.03347, S2S Loss: 0.90296, SLM Loss: 1.91125
315
+ INFO:2025-06-09 06:16:31,839: Epoch [25/25], Step [350/3970], Mel Loss: 0.46467, Gen Loss: 12.48178, Disc Loss: 2.69238, Mono Loss: 0.03317, S2S Loss: 0.71102, SLM Loss: 1.83116
316
+ INFO:2025-06-09 06:17:56,540: Epoch [25/25], Step [400/3970], Mel Loss: 0.47247, Gen Loss: 11.85046, Disc Loss: 3.40187, Mono Loss: 0.03009, S2S Loss: 0.67559, SLM Loss: 1.67930
317
+ INFO:2025-06-09 06:19:22,839: Epoch [25/25], Step [450/3970], Mel Loss: 0.46736, Gen Loss: 11.12021, Disc Loss: 2.89506, Mono Loss: 0.03046, S2S Loss: 0.86089, SLM Loss: 1.69023
318
+ INFO:2025-06-09 06:21:10,431: Validation loss: 0.427
319
+
320
+
321
+
322
+
323
+ INFO:2025-06-09 07:59:08,543: Epoch [1/15], Step [50/15883], Loss: 0.59463, Disc Loss: 0.00000, Dur Loss: 1.86396, CE Loss: 0.14515, Norm Loss: 1.16517, F0 Loss: 7.99762, LM Loss: 1.75907, Gen Loss: 0.00000, Sty Loss: 0.00000, Diff Loss: 0.00000, DiscLM Loss: 0.00000, GenLM Loss: 0.00000
324
+ INFO:2025-06-09 08:00:16,205: Epoch [1/15], Step [100/15883], Loss: 0.58628, Disc Loss: 0.00000, Dur Loss: 1.41872, CE Loss: 0.10803, Norm Loss: 3.45058, F0 Loss: 8.65071, LM Loss: 2.24374, Gen Loss: 0.00000, Sty Loss: 0.00000, Diff Loss: 0.00000, DiscLM Loss: 0.00000, GenLM Loss: 0.00000
325
+ INFO:2025-06-09 08:03:19,805: Epoch [1/15], Step [50/15883], Loss: 0.62156, Disc Loss: 0.00000, Dur Loss: 1.35592, CE Loss: 0.12068, Norm Loss: 4.53912, F0 Loss: 12.41762, LM Loss: 2.12398, Gen Loss: 0.00000, Sty Loss: 0.00000, Diff Loss: 0.00000, DiscLM Loss: 0.00000, GenLM Loss: 0.00000
326
+ INFO:2025-06-09 08:04:26,944: Epoch [1/15], Step [100/15883], Loss: 0.57647, Disc Loss: 0.00000, Dur Loss: 1.64274, CE Loss: 0.13532, Norm Loss: 2.14458, F0 Loss: 10.61257, LM Loss: 2.36555, Gen Loss: 0.00000, Sty Loss: 0.00000, Diff Loss: 0.00000, DiscLM Loss: 0.00000, GenLM Loss: 0.00000
327
+ INFO:2025-06-09 08:05:34,608: Epoch [1/15], Step [150/15883], Loss: 0.57745, Disc Loss: 0.00000, Dur Loss: 1.28145, CE Loss: 0.09792, Norm Loss: 2.62505, F0 Loss: 7.29611, LM Loss: 1.92034, Gen Loss: 0.00000, Sty Loss: 0.00000, Diff Loss: 0.00000, DiscLM Loss: 0.00000, GenLM Loss: 0.00000
328
+ INFO:2025-06-09 08:06:41,940: Epoch [1/15], Step [200/15883], Loss: 0.56741, Disc Loss: 0.00000, Dur Loss: 1.18127, CE Loss: 0.08711, Norm Loss: 1.55673, F0 Loss: 9.18722, LM Loss: 1.94882, Gen Loss: 0.00000, Sty Loss: 0.00000, Diff Loss: 0.00000, DiscLM Loss: 0.00000, GenLM Loss: 0.00000
329
+ INFO:2025-06-09 08:07:51,378: Epoch [1/15], Step [250/15883], Loss: 0.58637, Disc Loss: 0.00000, Dur Loss: 1.34848, CE Loss: 0.10076, Norm Loss: 2.22754, F0 Loss: 3.84831, LM Loss: 1.77836, Gen Loss: 0.00000, Sty Loss: 0.00000, Diff Loss: 0.00000, DiscLM Loss: 0.00000, GenLM Loss: 0.00000
330
+ INFO:2025-06-09 08:09:00,842: Epoch [1/15], Step [300/15883], Loss: 0.52509, Disc Loss: 0.00000, Dur Loss: 2.02432, CE Loss: 0.15954, Norm Loss: 2.11042, F0 Loss: 6.38905, LM Loss: 2.07231, Gen Loss: 0.00000, Sty Loss: 0.00000, Diff Loss: 0.00000, DiscLM Loss: 0.00000, GenLM Loss: 0.00000
331
+ INFO:2025-06-09 08:10:11,156: Epoch [1/15], Step [350/15883], Loss: 0.52738, Disc Loss: 0.00000, Dur Loss: 1.24347, CE Loss: 0.07718, Norm Loss: 2.49583, F0 Loss: 6.60764, LM Loss: 2.14540, Gen Loss: 0.00000, Sty Loss: 0.00000, Diff Loss: 0.00000, DiscLM Loss: 0.00000, GenLM Loss: 0.00000
332
+ INFO:2025-06-09 08:11:21,010: Epoch [1/15], Step [400/15883], Loss: 0.53797, Disc Loss: 0.00000, Dur Loss: 1.27058, CE Loss: 0.08413, Norm Loss: 2.08075, F0 Loss: 4.33628, LM Loss: 2.04978, Gen Loss: 0.00000, Sty Loss: 0.00000, Diff Loss: 0.00000, DiscLM Loss: 0.00000, GenLM Loss: 0.00000
333
+ INFO:2025-06-09 08:12:31,649: Epoch [1/15], Step [450/15883], Loss: 0.53851, Disc Loss: 0.00000, Dur Loss: 1.21631, CE Loss: 0.08424, Norm Loss: 2.76793, F0 Loss: 5.36540, LM Loss: 1.88584, Gen Loss: 0.00000, Sty Loss: 0.00000, Diff Loss: 0.00000, DiscLM Loss: 0.00000, GenLM Loss: 0.00000
334
+ INFO:2025-06-09 08:13:39,582: Epoch [1/15], Step [500/15883], Loss: 0.53331, Disc Loss: 0.00000, Dur Loss: 1.04547, CE Loss: 0.07928, Norm Loss: 1.58679, F0 Loss: 3.28320, LM Loss: 1.99279, Gen Loss: 0.00000, Sty Loss: 0.00000, Diff Loss: 0.00000, DiscLM Loss: 0.00000, GenLM Loss: 0.00000
335
+ INFO:2025-06-09 08:14:47,552: Epoch [1/15], Step [550/15883], Loss: 0.53619, Disc Loss: 0.00000, Dur Loss: 1.58742, CE Loss: 0.11445, Norm Loss: 3.74290, F0 Loss: 4.74286, LM Loss: 1.91775, Gen Loss: 0.00000, Sty Loss: 0.00000, Diff Loss: 0.00000, DiscLM Loss: 0.00000, GenLM Loss: 0.00000
336
+ INFO:2025-06-09 08:15:55,037: Epoch [1/15], Step [600/15883], Loss: 0.54803, Disc Loss: 0.00000, Dur Loss: 1.44735, CE Loss: 0.09927, Norm Loss: 2.30749, F0 Loss: 6.71221, LM Loss: 1.95221, Gen Loss: 0.00000, Sty Loss: 0.00000, Diff Loss: 0.00000, DiscLM Loss: 0.00000, GenLM Loss: 0.00000
337
+ INFO:2025-06-09 08:17:02,005: Epoch [1/15], Step [650/15883], Loss: 0.53016, Disc Loss: 0.00000, Dur Loss: 1.47494, CE Loss: 0.12974, Norm Loss: 2.32919, F0 Loss: 7.98697, LM Loss: 2.14263, Gen Loss: 0.00000, Sty Loss: 0.00000, Diff Loss: 0.00000, DiscLM Loss: 0.00000, GenLM Loss: 0.00000
338
+ INFO:2025-06-09 08:18:09,942: Epoch [1/15], Step [700/15883], Loss: 0.53085, Disc Loss: 0.00000, Dur Loss: 1.08772, CE Loss: 0.05854, Norm Loss: 1.36569, F0 Loss: 3.09399, LM Loss: 1.99569, Gen Loss: 0.00000, Sty Loss: 0.00000, Diff Loss: 0.00000, DiscLM Loss: 0.00000, GenLM Loss: 0.00000
339
+ INFO:2025-06-09 08:19:15,731: Epoch [1/15], Step [750/15883], Loss: 0.53651, Disc Loss: 0.00000, Dur Loss: 0.83910, CE Loss: 0.05060, Norm Loss: 2.25991, F0 Loss: 11.18594, LM Loss: 1.93452, Gen Loss: 0.00000, Sty Loss: 0.00000, Diff Loss: 0.00000, DiscLM Loss: 0.00000, GenLM Loss: 0.00000
340
+ INFO:2025-06-09 08:20:24,249: Epoch [1/15], Step [800/15883], Loss: 0.52069, Disc Loss: 0.00000, Dur Loss: 1.07455, CE Loss: 0.06327, Norm Loss: 5.22029, F0 Loss: 7.52101, LM Loss: 2.07087, Gen Loss: 0.00000, Sty Loss: 0.00000, Diff Loss: 0.00000, DiscLM Loss: 0.00000, GenLM Loss: 0.00000
341
+ INFO:2025-06-09 08:21:34,678: Epoch [1/15], Step [850/15883], Loss: 0.51518, Disc Loss: 0.00000, Dur Loss: 1.63769, CE Loss: 0.10630, Norm Loss: 2.12592, F0 Loss: 2.67696, LM Loss: 1.75673, Gen Loss: 0.00000, Sty Loss: 0.00000, Diff Loss: 0.00000, DiscLM Loss: 0.00000, GenLM Loss: 0.00000
342
+ INFO:2025-06-09 08:22:43,786: Epoch [1/15], Step [900/15883], Loss: 0.50269, Disc Loss: 0.00000, Dur Loss: 1.29939, CE Loss: 0.07080, Norm Loss: 1.26741, F0 Loss: 2.30947, LM Loss: 1.68987, Gen Loss: 0.00000, Sty Loss: 0.00000, Diff Loss: 0.00000, DiscLM Loss: 0.00000, GenLM Loss: 0.00000
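The training lines above follow a fixed key-value layout (an INFO prefix with a timestamp, then Epoch, Step, and a list of named losses), so they can be pulled into a table for plotting. A minimal parsing sketch, assuming the log is saved as train.log and the field names match the lines shown (parse_train_log is a hypothetical helper, not part of this repo):

import re

# A training line looks like:
#   INFO:2025-06-09 01:00:12,153: Epoch [3/25], Step [50/3970], Mel Loss: 0.63656, Gen Loss: 0.00000, ...
STEP_RE = re.compile(r"Epoch \[(\d+)/\d+\], Step \[(\d+)/\d+\], (.+)")
LOSS_RE = re.compile(r"([A-Za-z][A-Za-z0-9 ]*?): ([0-9]*\.?[0-9]+)")

def parse_train_log(path="train.log"):  # file name is an assumption
    """Return one dict per logged step, e.g. {'epoch': 3, 'step': 50, 'Mel Loss': 0.63656, ...}."""
    rows = []
    with open(path) as f:
        for line in f:
            m = STEP_RE.search(line)
            if not m:
                continue  # skip blank lines and the "Validation loss" summaries
            losses = {k.strip(): float(v) for k, v in LOSS_RE.findall(m.group(3))}
            rows.append({"epoch": int(m.group(1)), "step": int(m.group(2)), **losses})
    return rows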
.ipynb_checkpoints/train_second-checkpoint.py ADDED
@@ -0,0 +1,879 @@
1
+ # load packages
2
+ import random
3
+ import yaml
4
+ import time
5
+ from munch import Munch
6
+ import numpy as np
7
+ import torch
8
+ from torch import nn
9
+ import torch.nn.functional as F
10
+ import torchaudio
11
+ import librosa
12
+ import click
13
+ import shutil
14
+ import traceback
15
+ import warnings
16
+ warnings.simplefilter('ignore')
17
+ from torch.utils.tensorboard import SummaryWriter
18
+
19
+ from meldataset import build_dataloader
20
+
21
+ from Utils.ASR.models import ASRCNN
22
+ from Utils.JDC.model import JDCNet
23
+ from Utils.PLBERT.util import load_plbert
24
+
25
+ from models import *
26
+ from losses import *
27
+ from utils import *
28
+
29
+ from Modules.slmadv import SLMAdversarialLoss
30
+ from Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule
31
+
32
+ from optimizers import build_optimizer
33
+
34
+ def clip_to_bert(texts, mask, max_len: int = 510):
35
+ """
36
+ Hard-clip batch to ≤ max_len tokens and return
37
+ (texts_clipped, **fresh full-width mask**, new_lengths).
38
+ """
39
+ if texts.size(1) > max_len:
40
+ texts = texts[:, :max_len]
41
+ lengths = (texts != 0).sum(dim=1) # PAD id = 0
42
+ seq_len = texts.size(1)
43
+ mask = torch.arange(seq_len, device=texts.device).unsqueeze(0) >= \
44
+ lengths.unsqueeze(1) # shape [B, seq_len]
45
+ return texts, mask, lengths
46
+
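A quick illustration of what clip_to_bert above does, as a sketch with made-up shapes (it assumes the function exactly as defined in this file): sequences longer than 510 tokens are hard-clipped, and the lengths and padding mask are rebuilt from the surviving non-PAD (id 0) tokens.

import torch

texts = torch.zeros(2, 600, dtype=torch.long)   # batch of 2, PAD id 0, padded to length 600
texts[0, :550] = 1                              # sequence 0: 550 real tokens (over the 510 cap)
texts[1, :120] = 1                              # sequence 1: 120 real tokens
mask = texts == 0                               # original padding mask

texts, mask, lengths = clip_to_bert(texts, mask, max_len=510)
print(texts.shape)       # torch.Size([2, 510]) -- clipped to PL-BERT's 510-token limit
print(lengths.tolist())  # [510, 120]           -- recomputed after clipping
print(mask.shape)        # torch.Size([2, 510]) -- True where position >= new length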
47
+ # simple fix for dataparallel that allows access to class attributes
48
+ class MyDataParallel(torch.nn.DataParallel):
49
+ def __getattr__(self, name):
50
+ try:
51
+ return super().__getattr__(name)
52
+ except AttributeError:
53
+ return getattr(self.module, name)
54
+
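MyDataParallel above exists because plain torch.nn.DataParallel does not forward attribute access to the wrapped model, so custom attributes (for example n_down, read later as model.text_aligner.n_down) would raise AttributeError; the except branch forwards the lookup to .module. A small sketch, assuming the class as defined above:

from torch import nn

class Dummy(nn.Module):
    def __init__(self):
        super().__init__()
        self.n_down = 3   # plain attribute, not a parameter/buffer/submodule

wrapped = MyDataParallel(Dummy())
print(wrapped.n_down)     # 3 -- the lookup falls through to the wrapped module
# nn.DataParallel(Dummy()).n_down would raise AttributeError instead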
55
+ import logging
56
+ from logging import StreamHandler
57
+ logger = logging.getLogger(__name__)
58
+ logger.setLevel(logging.DEBUG)
59
+ handler = StreamHandler()
60
+ handler.setLevel(logging.DEBUG)
61
+ logger.addHandler(handler)
62
+
63
+
64
+ @click.command()
65
+ @click.option('-p', '--config_path', default='Configs/config.yml', type=str)
66
+ def main(config_path):
67
+ config = yaml.safe_load(open(config_path))
68
+
69
+ log_dir = config['log_dir']
70
+ if not osp.exists(log_dir): os.makedirs(log_dir, exist_ok=True)
71
+ shutil.copy(config_path, osp.join(log_dir, osp.basename(config_path)))
72
+ writer = SummaryWriter(log_dir + "/tensorboard")
73
+
74
+ # write logs
75
+ file_handler = logging.FileHandler(osp.join(log_dir, 'train.log'))
76
+ file_handler.setLevel(logging.DEBUG)
77
+ file_handler.setFormatter(logging.Formatter('%(levelname)s:%(asctime)s: %(message)s'))
78
+ logger.addHandler(file_handler)
79
+
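For reference, the '%(levelname)s:%(asctime)s: %(message)s' format configured here is what produces the 'INFO:2025-06-09 01:00:12,153: ...' prefixes seen in the train-checkpoint.log diff above. A standalone reproduction of the formatting:

import logging

logging.basicConfig(format='%(levelname)s:%(asctime)s: %(message)s', level=logging.INFO)
logging.info('Epoch [3/25], Step [50/3970], Mel Loss: 0.63656')
# prints e.g.: INFO:2025-06-09 01:00:12,153: Epoch [3/25], Step [50/3970], Mel Loss: 0.63656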
80
+
81
+ batch_size = config.get('batch_size', 10)
82
+
83
+ epochs = config.get('epochs_2nd', 200)
84
+ save_freq = config.get('save_freq', 2)
85
+ log_interval = config.get('log_interval', 10)
86
+ saving_epoch = config.get('save_freq', 2)
87
+
88
+ data_params = config.get('data_params', None)
89
+ sr = config['preprocess_params'].get('sr', 24000)
90
+ train_path = data_params['train_data']
91
+ val_path = data_params['val_data']
92
+ root_path = data_params['root_path']
93
+ min_length = data_params['min_length']
94
+ OOD_data = data_params['OOD_data']
95
+
96
+ max_len = config.get('max_len', 200)
97
+
98
+ loss_params = Munch(config['loss_params'])
99
+ diff_epoch = loss_params.diff_epoch
100
+ joint_epoch = loss_params.joint_epoch
101
+
102
+ optimizer_params = Munch(config['optimizer_params'])
103
+
104
+ train_list, val_list = get_data_path_list(train_path, val_path)
105
+ device = 'cuda'
106
+
107
+ train_dataloader = build_dataloader(train_list,
108
+ root_path,
109
+ OOD_data=OOD_data,
110
+ min_length=min_length,
111
+ batch_size=batch_size,
112
+ num_workers=2,
113
+ dataset_config={},
114
+ device=device)
115
+
116
+ val_dataloader = build_dataloader(val_list,
117
+ root_path,
118
+ OOD_data=OOD_data,
119
+ min_length=min_length,
120
+ batch_size=batch_size,
121
+ validation=True,
122
+ num_workers=0,
123
+ device=device,
124
+ dataset_config={})
125
+
126
+ # load pretrained ASR model
127
+ ASR_config = config.get('ASR_config', False)
128
+ ASR_path = config.get('ASR_path', False)
129
+ text_aligner = load_ASR_models(ASR_path, ASR_config)
130
+
131
+ # load pretrained F0 model
132
+ F0_path = config.get('F0_path', False)
133
+ pitch_extractor = load_F0_models(F0_path)
134
+
135
+ # load PL-BERT model
136
+ BERT_path = config.get('PLBERT_dir', False)
137
+ plbert = load_plbert(BERT_path)
138
+
139
+ # build model
140
+ model_params = recursive_munch(config['model_params'])
141
+ multispeaker = model_params.multispeaker
142
+ model = build_model(model_params, text_aligner, pitch_extractor, plbert)
143
+ _ = [model[key].to(device) for key in model]
144
+
145
+ # DP
146
+ for key in model:
147
+ if key != "mpd" and key != "msd" and key != "wd":
148
+ model[key] = MyDataParallel(model[key])
149
+
150
+ start_epoch = 0
151
+ iters = 0
152
+
153
+ load_pretrained = config.get('pretrained_model', '') != '' and config.get('second_stage_load_pretrained', False)
154
+
155
+ if not load_pretrained:
156
+ if config.get('first_stage_path', '') != '':
157
+ first_stage_path = osp.join(log_dir, config.get('first_stage_path', 'first_stage.pth'))
158
+ print('Loading the first stage model at %s ...' % first_stage_path)
159
+ model, _, start_epoch, iters = load_checkpoint(model,
160
+ None,
161
+ first_stage_path,
162
+ load_only_params=True,
163
+ ignore_modules=['bert', 'bert_encoder', 'predictor', 'predictor_encoder', 'msd', 'mpd', 'wd', 'diffusion']) # keep starting epoch for tensorboard log
164
+
165
+ # these epochs should be counted from the start epoch
166
+ diff_epoch += start_epoch
167
+ joint_epoch += start_epoch
168
+ epochs += start_epoch
169
+
170
+ model.predictor_encoder = copy.deepcopy(model.style_encoder)
171
+ else:
172
+ raise ValueError('You need to specify the path to the first stage model.')
173
+
174
+ gl = GeneratorLoss(model.mpd, model.msd).to(device)
175
+ dl = DiscriminatorLoss(model.mpd, model.msd).to(device)
176
+ wl = WavLMLoss(model_params.slm.model,
177
+ model.wd,
178
+ sr,
179
+ model_params.slm.sr).to(device)
180
+
181
+ gl = MyDataParallel(gl)
182
+ dl = MyDataParallel(dl)
183
+ wl = MyDataParallel(wl)
184
+
185
+ sampler = DiffusionSampler(
186
+ model.diffusion.diffusion,
187
+ sampler=ADPM2Sampler(),
188
+ sigma_schedule=KarrasSchedule(sigma_min=0.0001, sigma_max=3.0, rho=9.0), # empirical parameters
189
+ clamp=False
190
+ )
191
+
192
+ scheduler_params = {
193
+ "max_lr": optimizer_params.lr,
194
+ "pct_start": float(0),
195
+ "epochs": epochs,
196
+ "steps_per_epoch": len(train_dataloader),
197
+ }
198
+ scheduler_params_dict= {key: scheduler_params.copy() for key in model}
199
+ scheduler_params_dict['bert']['max_lr'] = optimizer_params.bert_lr * 2
200
+ scheduler_params_dict['decoder']['max_lr'] = optimizer_params.ft_lr * 2
201
+ scheduler_params_dict['style_encoder']['max_lr'] = optimizer_params.ft_lr * 2
202
+
203
+ optimizer = build_optimizer({key: model[key].parameters() for key in model},
204
+ scheduler_params_dict=scheduler_params_dict, lr=optimizer_params.lr)
205
+
206
+ # adjust BERT learning rate
207
+ for g in optimizer.optimizers['bert'].param_groups:
208
+ g['betas'] = (0.9, 0.99)
209
+ g['lr'] = optimizer_params.bert_lr
210
+ g['initial_lr'] = optimizer_params.bert_lr
211
+ g['min_lr'] = 0
212
+ g['weight_decay'] = 0.01
213
+
214
+ # adjust acoustic module learning rate
215
+ for module in ["decoder", "style_encoder"]:
216
+ for g in optimizer.optimizers[module].param_groups:
217
+ g['betas'] = (0.0, 0.99)
218
+ g['lr'] = optimizer_params.ft_lr
219
+ g['initial_lr'] = optimizer_params.ft_lr
220
+ g['min_lr'] = 0
221
+ g['weight_decay'] = 1e-4
222
+
223
+ # load models if there is a model
224
+ if load_pretrained:
225
+ model, optimizer, start_epoch, iters = load_checkpoint(model, optimizer, config['pretrained_model'],
226
+ load_only_params=config.get('load_only_params', True))
227
+
228
+ n_down = model.text_aligner.n_down
229
+
230
+ best_loss = float('inf') # best test loss
231
+ loss_train_record = list([])
232
+ loss_test_record = list([])
233
+ iters = 0
234
+
235
+ criterion = nn.L1Loss() # F0 loss (regression)
236
+ torch.cuda.empty_cache()
237
+
238
+ stft_loss = MultiResolutionSTFTLoss().to(device)
239
+
240
+ print('BERT', optimizer.optimizers['bert'])
241
+ print('decoder', optimizer.optimizers['decoder'])
242
+
243
+ start_ds = False
244
+
245
+ running_std = []
246
+
247
+ slmadv_params = Munch(config['slmadv_params'])
248
+ slmadv = SLMAdversarialLoss(model, wl, sampler,
249
+ slmadv_params.min_len,
250
+ slmadv_params.max_len,
251
+ batch_percentage=slmadv_params.batch_percentage,
252
+ skip_update=slmadv_params.iter,
253
+ sig=slmadv_params.sig
254
+ )
255
+
256
+
257
+ for epoch in range(start_epoch, epochs):
258
+ running_loss = 0
259
+ start_time = time.time()
260
+
261
+ _ = [model[key].eval() for key in model]
262
+
263
+ model.predictor.train()
264
+ model.bert_encoder.train()
265
+ model.bert.train()
266
+ model.msd.train()
267
+ model.mpd.train()
268
+
269
+
270
+ if epoch >= diff_epoch:
271
+ start_ds = True
272
+
273
+ for i, batch in enumerate(train_dataloader):
274
+ waves = batch[0]
275
+ batch = [b.to(device) for b in batch[1:]]
276
+ texts, input_lengths, ref_texts, ref_lengths, mels, mel_input_length, ref_mels = batch
277
+
278
+ # --------------- CLIP TEXTS *ONCE* -----------------
279
+ text_mask = length_to_mask(input_lengths).to(texts.device)
280
+ texts, text_mask, input_lengths = clip_to_bert(texts, text_mask)
281
+ # ── drop rows that became all-PAD after clipping ───────────
282
+ keep = (input_lengths > 0).nonzero(as_tuple=True)[0]
283
+ if keep.numel() != texts.size(0):
284
+ texts, text_mask, input_lengths = texts[keep], text_mask[keep], input_lengths[keep]
285
+ ref_texts, ref_lengths = ref_texts[keep], ref_lengths[keep]
286
+ mels, mel_input_length, ref_mels = mels[keep], mel_input_length[keep], ref_mels[keep]
287
+ waves = [waves[i] for i in keep.tolist()]
288
+ # ----------------------------------------------------
289
+
290
+ with torch.no_grad():
291
+ mask = length_to_mask(mel_input_length // (2 ** n_down)).to(device)
292
+ mel_mask = length_to_mask(mel_input_length).to(device)
293
+
294
+ try:
295
+ _, _, s2s_attn = model.text_aligner(mels, mask, texts)
296
+ s2s_attn = s2s_attn.transpose(-1, -2)
297
+ s2s_attn = s2s_attn[..., 1:]
298
+ s2s_attn = s2s_attn.transpose(-1, -2)
299
+ except:
300
+ continue
301
+
302
+ mask_ST = mask_from_lens(s2s_attn, input_lengths, mel_input_length // (2 ** n_down))
303
+ s2s_attn_mono = maximum_path(s2s_attn, mask_ST)
304
+
305
+ # encode
306
+ t_en = model.text_encoder(texts, input_lengths, text_mask)
307
+ asr = (t_en @ s2s_attn_mono)
308
+
309
+ d_gt = s2s_attn_mono.sum(axis=-1).detach()
310
+
311
+ # compute reference styles
312
+ if multispeaker and epoch >= diff_epoch:
313
+ ref_ss = model.style_encoder(ref_mels.unsqueeze(1))
314
+ ref_sp = model.predictor_encoder(ref_mels.unsqueeze(1))
315
+ ref = torch.cat([ref_ss, ref_sp], dim=1)
316
+
317
+ # compute the style of the entire utterance
318
+ # this operation cannot be done in batch because of the avgpool layer (may need to work on masked avgpool)
319
+ ss = []
320
+ gs = []
321
+ for bib in range(len(mel_input_length)):
322
+ mel_length = int(mel_input_length[bib].item())
323
+ mel = mels[bib, :, :mel_input_length[bib]]
324
+ s = model.predictor_encoder(mel.unsqueeze(0).unsqueeze(1))
325
+ ss.append(s)
326
+ s = model.style_encoder(mel.unsqueeze(0).unsqueeze(1))
327
+ gs.append(s)
328
+
329
+ s_dur = torch.stack(ss).squeeze() # global prosodic styles
330
+ gs = torch.stack(gs).squeeze() # global acoustic styles
331
+ s_trg = torch.cat([gs, s_dur], dim=-1).detach() # ground truth for denoiser
332
+
333
+ # texts, input_lengths, ref_texts, ref_lengths, mels, mel_input_length, ref_mels = batch
334
+
335
+ # # ────── PATCH: keep PL-BERT below 512 tokens ─────────
336
+ # MAX_BERT_LEN = 510 # leave room for [CLS] and [SEP]
337
+ # if texts.size(1) > MAX_BERT_LEN: # truncate batch-wise
338
+ # texts = texts[:, :MAX_BERT_LEN]
339
+ # seq_len = texts.size(1) # current padded width
340
+ # input_lengths = (texts != 0).sum(1) # 0 is PAD
341
+ # arange_row = torch.arange(seq_len, device=texts.device) # shape [L]
342
+ # text_mask = arange_row.unsqueeze(0) >= input_lengths.unsqueeze(1)
343
+ # # shape [B, L]
344
+
345
+ # # keep only rows that still have at least one real token
346
+ # keep = (input_lengths > 0).nonzero(as_tuple=True)[0]
347
+ # if keep.numel() != texts.size(0): # a row was truncated to length 0
348
+ # texts, text_mask, input_lengths = texts[keep], text_mask[keep], input_lengths[keep]
349
+ # ref_texts, ref_lengths = ref_texts[keep], ref_lengths[keep]
350
+ # mels, mel_input_length, ref_mels = mels[keep], mel_input_length[keep], ref_mels[keep]
351
+ # waves = [waves[i] for i in keep.tolist()]
352
+
353
+ # # clip alignments to the *current* width (seq_len)
354
+ # s2s_attn_mono = s2s_attn_mono[:, :seq_len, :]
355
+ # d_gt = d_gt[:, :seq_len]
356
+ # # ─────────────────────────────────────────────────────
357
+
358
+ # -------------------------------------------------------------
359
+ # Now build *everything* that depends on token count
360
+ with torch.no_grad():
361
+ t_en = model.text_encoder(texts, input_lengths, text_mask)
362
+
363
+ _, _, s2s_attn = model.text_aligner(mels, mask, texts)
364
+ s2s_attn = s2s_attn.transpose(-1, -2)[..., 1:].transpose(-1, -2)
365
+
366
+ mask_ST = mask_from_lens(s2s_attn, input_lengths,
367
+ mel_input_length // 2**n_down)
368
+ s2s_attn_mono = maximum_path(s2s_attn, mask_ST)
369
+
370
+ asr = t_en @ s2s_attn_mono
371
+ d_gt = s2s_attn_mono.sum(dim=-1)
372
+
373
+ bert_dur = model.bert(texts, attention_mask=(~text_mask).int())
374
+ d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
375
+
376
+ # denoiser training
377
+ if epoch >= diff_epoch:
378
+ num_steps = np.random.randint(3, 5)
379
+
380
+ if model_params.diffusion.dist.estimate_sigma_data:
381
+ model.diffusion.module.diffusion.sigma_data = s_trg.std(axis=-1).mean().item() # batch-wise std estimation
382
+ running_std.append(model.diffusion.module.diffusion.sigma_data)
383
+
384
+ if multispeaker:
385
+ s_preds = sampler(noise = torch.randn_like(s_trg).unsqueeze(1).to(device),
386
+ embedding=bert_dur,
387
+ embedding_scale=1,
388
+ features=ref, # reference from the same speaker as the embedding
389
+ embedding_mask_proba=0.1,
390
+ num_steps=num_steps).squeeze(1)
391
+ loss_diff = model.diffusion(s_trg.unsqueeze(1), embedding=bert_dur, features=ref).mean() # EDM loss
392
+ loss_sty = F.l1_loss(s_preds, s_trg.detach()) # style reconstruction loss
393
+ else:
394
+ s_preds = sampler(noise = torch.randn_like(s_trg).unsqueeze(1).to(device),
395
+ embedding=bert_dur,
396
+ embedding_scale=1,
397
+ embedding_mask_proba=0.1,
398
+ num_steps=num_steps).squeeze(1)
399
+ loss_diff = model.diffusion.module.diffusion(s_trg.unsqueeze(1), embedding=bert_dur).mean() # EDM loss
400
+ loss_sty = F.l1_loss(s_preds, s_trg.detach()) # style reconstruction loss
401
+ else:
402
+ loss_sty = 0
403
+ loss_diff = 0
404
+
405
+ d, p = model.predictor(d_en, s_dur,
406
+ input_lengths,
407
+ s2s_attn_mono,
408
+ text_mask)
409
+
410
+ mel_len = min(int(mel_input_length.min().item() / 2 - 1), max_len // 2)
411
+ mel_len_st = int(mel_input_length.min().item() / 2 - 1)
412
+ en = []
413
+ gt = []
414
+ st = []
415
+ p_en = []
416
+ wav = []
417
+
418
+ for bib in range(len(mel_input_length)):
419
+ mel_length = int(mel_input_length[bib].item() / 2)
420
+
421
+ random_start = np.random.randint(0, mel_length - mel_len)
422
+ en.append(asr[bib, :, random_start:random_start+mel_len])
423
+ p_en.append(p[bib, :, random_start:random_start+mel_len])
424
+ gt.append(mels[bib, :, (random_start * 2):((random_start+mel_len) * 2)])
425
+
426
+ y = waves[bib][(random_start * 2) * 300:((random_start+mel_len) * 2) * 300]
427
+ wav.append(torch.from_numpy(y).to(device))
428
+
429
+ # style reference (better to be different from the GT)
430
+ random_start = np.random.randint(0, mel_length - mel_len_st)
431
+ st.append(mels[bib, :, (random_start * 2):((random_start+mel_len_st) * 2)])
432
+
433
+ wav = torch.stack(wav).float().detach()
434
+
435
+ en = torch.stack(en)
436
+ p_en = torch.stack(p_en)
437
+ gt = torch.stack(gt).detach()
438
+ st = torch.stack(st).detach()
439
+
440
+ if gt.size(-1) < 80:
441
+ continue
442
+
443
+ s_dur = model.predictor_encoder(st.unsqueeze(1) if multispeaker else gt.unsqueeze(1))
444
+ s = model.style_encoder(st.unsqueeze(1) if multispeaker else gt.unsqueeze(1))
445
+
446
+ with torch.no_grad():
447
+ F0_real, _, F0 = model.pitch_extractor(gt.unsqueeze(1))
448
+ F0 = F0.reshape(F0.shape[0], F0.shape[1] * 2, F0.shape[2], 1).squeeze()
449
+
450
+ asr_real = model.text_aligner.get_feature(gt)
451
+
452
+ N_real = log_norm(gt.unsqueeze(1)).squeeze(1)
453
+
454
+ y_rec_gt = wav.unsqueeze(1)
455
+ y_rec_gt_pred = model.decoder(en, F0_real, N_real, s)
456
+
457
+ if epoch >= joint_epoch:
458
+ # ground truth from recording
459
+ wav = y_rec_gt # use recording since decoder is tuned
460
+ else:
461
+ # ground truth from reconstruction
462
+ wav = y_rec_gt_pred # use reconstruction since decoder is fixed
463
+
464
+ F0_fake, N_fake = model.predictor.F0Ntrain(p_en, s_dur)
465
+
466
+ y_rec = model.decoder(en, F0_fake, N_fake, s)
467
+
468
+ loss_F0_rec = (F.smooth_l1_loss(F0_real, F0_fake)) / 10
469
+ loss_norm_rec = F.smooth_l1_loss(N_real, N_fake)
470
+
471
+ if start_ds:
472
+ optimizer.zero_grad()
473
+ d_loss = dl(wav.detach(), y_rec.detach()).mean()
474
+ d_loss.backward()
475
+ optimizer.step('msd')
476
+ optimizer.step('mpd')
477
+ else:
478
+ d_loss = 0
479
+
480
+ # generator loss
481
+ optimizer.zero_grad()
482
+
483
+ loss_mel = stft_loss(y_rec, wav)
484
+ if start_ds:
485
+ loss_gen_all = gl(wav, y_rec).mean()
486
+ else:
487
+ loss_gen_all = 0
488
+ loss_lm = wl(wav.detach().squeeze(), y_rec.squeeze()).mean()
489
+
490
+ loss_ce = 0
491
+ loss_dur = 0
492
+ for _s2s_pred, _text_input, _text_length in zip(d, (d_gt), input_lengths):
493
+ _s2s_pred = _s2s_pred[:_text_length, :]
494
+ _text_input = _text_input[:_text_length].long()
495
+ _s2s_trg = torch.zeros_like(_s2s_pred)
496
+ for p in range(_s2s_trg.shape[0]):
497
+ _s2s_trg[p, :_text_input[p]] = 1
498
+ _dur_pred = torch.sigmoid(_s2s_pred).sum(axis=1)
499
+
500
+ loss_dur += F.l1_loss(_dur_pred[1:_text_length-1],
501
+ _text_input[1:_text_length-1])
502
+ loss_ce += F.binary_cross_entropy_with_logits(_s2s_pred.flatten(), _s2s_trg.flatten())
503
+
504
+ loss_ce /= texts.size(0)
505
+ loss_dur /= texts.size(0)
506
+
507
+ g_loss = loss_params.lambda_mel * loss_mel + \
508
+ loss_params.lambda_F0 * loss_F0_rec + \
509
+ loss_params.lambda_ce * loss_ce + \
510
+ loss_params.lambda_norm * loss_norm_rec + \
511
+ loss_params.lambda_dur * loss_dur + \
512
+ loss_params.lambda_gen * loss_gen_all + \
513
+ loss_params.lambda_slm * loss_lm + \
514
+ loss_params.lambda_sty * loss_sty + \
515
+ loss_params.lambda_diff * loss_diff
516
+
517
+ running_loss += loss_mel.item()
518
+ g_loss.backward()
519
+ if torch.isnan(g_loss):
520
+ from IPython.core.debugger import set_trace
521
+ set_trace()
522
+
523
+ optimizer.step('bert_encoder')
524
+ optimizer.step('bert')
525
+ optimizer.step('predictor')
526
+ optimizer.step('predictor_encoder')
527
+
528
+ if epoch >= diff_epoch:
529
+ optimizer.step('diffusion')
530
+
531
+ if epoch >= joint_epoch:
532
+ optimizer.step('style_encoder')
533
+ optimizer.step('decoder')
534
+
535
+ # randomly pick whether to use in-distribution text
536
+ if np.random.rand() < 0.5:
537
+ use_ind = True
538
+ else:
539
+ use_ind = False
540
+
541
+ if use_ind:
542
+ ref_lengths = input_lengths
543
+ ref_texts = texts
544
+
545
+ # ---- clip reference text exactly the same way ----
546
+ ref_mask = length_to_mask(ref_lengths).to(ref_texts.device)
547
+ ref_texts, ref_mask, ref_lengths = clip_to_bert(ref_texts, ref_mask)
548
+
549
+ slm_out = slmadv(i,
550
+ y_rec_gt,
551
+ y_rec_gt_pred,
552
+ waves,
553
+ mel_input_length,
554
+ ref_texts,
555
+ ref_lengths, use_ind, s_trg.detach(), ref if multispeaker else None)
556
+
557
+ if slm_out is None:
558
+ continue
559
+
560
+ d_loss_slm, loss_gen_lm, y_pred = slm_out
561
+
562
+ # SLM generator loss
563
+ optimizer.zero_grad()
564
+ loss_gen_lm.backward()
565
+
566
+ # compute the gradient norm
567
+ total_norm = {}
568
+ for key in model.keys():
569
+ total_norm[key] = 0
570
+ parameters = [p for p in model[key].parameters() if p.grad is not None and p.requires_grad]
571
+ for p in parameters:
572
+ param_norm = p.grad.detach().data.norm(2)
573
+ total_norm[key] += param_norm.item() ** 2
574
+ total_norm[key] = total_norm[key] ** 0.5
575
+
576
+ # gradient scaling
577
+ if total_norm['predictor'] > slmadv_params.thresh:
578
+ for key in model.keys():
579
+ for p in model[key].parameters():
580
+ if p.grad is not None:
581
+ p.grad *= (1 / total_norm['predictor'])
582
+
583
+ for p in model.predictor.duration_proj.parameters():
584
+ if p.grad is not None:
585
+ p.grad *= slmadv_params.scale
586
+
587
+ for p in model.predictor.lstm.parameters():
588
+ if p.grad is not None:
589
+ p.grad *= slmadv_params.scale
590
+
591
+ for p in model.diffusion.parameters():
592
+ if p.grad is not None:
593
+ p.grad *= slmadv_params.scale
594
+
595
+ optimizer.step('bert_encoder')
596
+ optimizer.step('bert')
597
+ optimizer.step('predictor')
598
+ optimizer.step('diffusion')
599
+
600
+ # SLM discriminator loss
601
+ if d_loss_slm != 0:
602
+ optimizer.zero_grad()
603
+ d_loss_slm.backward(retain_graph=True)
604
+ optimizer.step('wd')
605
+
606
+ else:
607
+ d_loss_slm, loss_gen_lm = 0, 0
608
+
609
+ iters = iters + 1
610
+
611
+ if (i+1)%log_interval == 0:
612
+ logger.info ('Epoch [%d/%d], Step [%d/%d], Loss: %.5f, Disc Loss: %.5f, Dur Loss: %.5f, CE Loss: %.5f, Norm Loss: %.5f, F0 Loss: %.5f, LM Loss: %.5f, Gen Loss: %.5f, Sty Loss: %.5f, Diff Loss: %.5f, DiscLM Loss: %.5f, GenLM Loss: %.5f'
613
+ %(epoch+1, epochs, i+1, len(train_list)//batch_size, running_loss / log_interval, d_loss, loss_dur, loss_ce, loss_norm_rec, loss_F0_rec, loss_lm, loss_gen_all, loss_sty, loss_diff, d_loss_slm, loss_gen_lm))
614
+
615
+ writer.add_scalar('train/mel_loss', running_loss / log_interval, iters)
616
+ writer.add_scalar('train/gen_loss', loss_gen_all, iters)
617
+ writer.add_scalar('train/d_loss', d_loss, iters)
618
+ writer.add_scalar('train/ce_loss', loss_ce, iters)
619
+ writer.add_scalar('train/dur_loss', loss_dur, iters)
620
+ writer.add_scalar('train/slm_loss', loss_lm, iters)
621
+ writer.add_scalar('train/norm_loss', loss_norm_rec, iters)
622
+ writer.add_scalar('train/F0_loss', loss_F0_rec, iters)
623
+ writer.add_scalar('train/sty_loss', loss_sty, iters)
624
+ writer.add_scalar('train/diff_loss', loss_diff, iters)
625
+ writer.add_scalar('train/d_loss_slm', d_loss_slm, iters)
626
+ writer.add_scalar('train/gen_loss_slm', loss_gen_lm, iters)
627
+
628
+ running_loss = 0
629
+
630
+ print('Time elapsed:', time.time()-start_time)
631
+
632
+ loss_test = 0
633
+ loss_align = 0
634
+ loss_f = 0
635
+ _ = [model[key].eval() for key in model]
636
+
637
+ with torch.no_grad():
638
+ iters_test = 0
639
+ for batch_idx, batch in enumerate(val_dataloader):
640
+ optimizer.zero_grad()
641
+
642
+ try:
643
+ waves = batch[0]
644
+ batch = [b.to(device) for b in batch[1:]]
645
+ texts, input_lengths, ref_texts, ref_lengths, mels, mel_input_length, ref_mels = batch
646
+
647
+ text_mask = length_to_mask(input_lengths).to(texts.device) # build the mask here; it is not defined yet in the validation scope
+ texts, text_mask, input_lengths = clip_to_bert(texts, text_mask)
648
+ keep = (input_lengths > 0).nonzero(as_tuple=True)[0]
649
+ if keep.numel() != texts.size(0):
650
+ texts, text_mask, input_lengths = texts[keep], text_mask[keep], input_lengths[keep]
651
+ ref_texts, ref_lengths = ref_texts[keep], ref_lengths[keep]
652
+ mels, mel_input_length, ref_mels = mels[keep], mel_input_length[keep], ref_mels[keep]
653
+ waves = [waves[i] for i in keep.tolist()]
654
+
655
+ with torch.no_grad():
656
+ mask = length_to_mask(mel_input_length // (2 ** n_down)).to(texts.device)
657
+ # mask = length_to_mask(mel_input_length // (2 ** n_down)).to('cuda')
658
+
659
+ # _, _, s2s_attn = model.text_aligner(mels, mask, texts)
660
+ # s2s_attn = s2s_attn.transpose(-1, -2)
661
+ # s2s_attn = s2s_attn[..., 1:]
662
+ # s2s_attn = s2s_attn.transpose(-1, -2)
663
+
664
+ # mask_ST = mask_from_lens(s2s_attn, input_lengths, mel_input_length // (2 ** n_down))
665
+ # s2s_attn_mono = maximum_path(s2s_attn, mask_ST)
666
+
667
+ # # encode
668
+ # t_en = model.text_encoder(texts, input_lengths, text_mask)
669
+ # asr = (t_en @ s2s_attn_mono)
670
+
671
+ # d_gt = s2s_attn_mono.sum(axis=-1).detach()
672
+
673
+ _, _, s2s_attn = model.text_aligner(mels, mask, texts)
674
+ s2s_attn = s2s_attn.transpose(-1, -2)[..., 1:].transpose(-1, -2)
675
+ mask_ST = mask_from_lens(s2s_attn, input_lengths,
676
+ mel_input_length // 2 ** n_down)
677
+ s2s_attn_mono = maximum_path(s2s_attn, mask_ST)
678
+
679
+ t_en = model.text_encoder(texts, input_lengths, text_mask)
680
+ asr = t_en @ s2s_attn_mono
681
+ d_gt = s2s_attn_mono.sum(dim=-1).detach()
682
+
683
+ ss = []
684
+ gs = []
685
+
686
+ for bib in range(len(mel_input_length)):
687
+ mel_length = int(mel_input_length[bib].item())
688
+ mel = mels[bib, :, :mel_input_length[bib]]
689
+ s = model.predictor_encoder(mel.unsqueeze(0).unsqueeze(1))
690
+ ss.append(s)
691
+ s = model.style_encoder(mel.unsqueeze(0).unsqueeze(1))
692
+ gs.append(s)
693
+
694
+ s = torch.stack(ss).squeeze()
695
+ gs = torch.stack(gs).squeeze()
696
+ s_trg = torch.cat([s, gs], dim=-1).detach()
697
+
698
+ bert_dur = model.bert(texts, attention_mask=(~text_mask).int())
699
+ d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
700
+ d, p = model.predictor(d_en, s,
701
+ input_lengths,
702
+ s2s_attn_mono,
703
+ text_mask)
704
+ # get clips
705
+ mel_len = int(mel_input_length.min().item() / 2 - 1)
706
+ en = []
707
+ gt = []
708
+ p_en = []
709
+ wav = []
710
+
711
+ for bib in range(len(mel_input_length)):
712
+ mel_length = int(mel_input_length[bib].item() / 2)
713
+
714
+ random_start = np.random.randint(0, mel_length - mel_len)
715
+ en.append(asr[bib, :, random_start:random_start+mel_len])
716
+ p_en.append(p[bib, :, random_start:random_start+mel_len])
717
+
718
+ gt.append(mels[bib, :, (random_start * 2):((random_start+mel_len) * 2)])
719
+
720
+ y = waves[bib][(random_start * 2) * 300:((random_start+mel_len) * 2) * 300]
721
+ wav.append(torch.from_numpy(y).to(device))
722
+
723
+ wav = torch.stack(wav).float().detach()
724
+
725
+ en = torch.stack(en)
726
+ p_en = torch.stack(p_en)
727
+ gt = torch.stack(gt).detach()
728
+
729
+ s = model.predictor_encoder(gt.unsqueeze(1))
730
+
731
+ F0_fake, N_fake = model.predictor.F0Ntrain(p_en, s)
732
+
733
+ loss_dur = 0
734
+ for _s2s_pred, _text_input, _text_length in zip(d, (d_gt), input_lengths):
735
+ _s2s_pred = _s2s_pred[:_text_length, :]
736
+ _text_input = _text_input[:_text_length].long()
737
+ _s2s_trg = torch.zeros_like(_s2s_pred)
738
+ for bib in range(_s2s_trg.shape[0]):
739
+ _s2s_trg[bib, :_text_input[bib]] = 1
740
+ _dur_pred = torch.sigmoid(_s2s_pred).sum(axis=1)
741
+ loss_dur += F.l1_loss(_dur_pred[1:_text_length-1],
742
+ _text_input[1:_text_length-1])
743
+
744
+ loss_dur /= texts.size(0)
745
+
746
+ s = model.style_encoder(gt.unsqueeze(1))
747
+
748
+ y_rec = model.decoder(en, F0_fake, N_fake, s)
749
+ loss_mel = stft_loss(y_rec.squeeze(), wav.detach())
750
+
751
+ F0_real, _, F0 = model.pitch_extractor(gt.unsqueeze(1))
752
+
753
+ loss_F0 = F.l1_loss(F0_real, F0_fake) / 10
754
+
755
+ loss_test += (loss_mel).mean()
756
+ loss_align += (loss_dur).mean()
757
+ loss_f += (loss_F0).mean()
758
+
759
+ iters_test += 1
760
+ except Exception as e:
761
+ print(f"run into exception", e)
762
+ traceback.print_exc()
763
+ continue
764
+
765
+ print('Epochs:', epoch + 1)
766
+ logger.info('Validation loss: %.3f, Dur loss: %.3f, F0 loss: %.3f' % (loss_test / iters_test, loss_align / iters_test, loss_f / iters_test) + '\n\n\n')
767
+ print('\n\n\n')
768
+ writer.add_scalar('eval/mel_loss', loss_test / iters_test, epoch + 1)
769
+ writer.add_scalar('eval/dur_loss', loss_align / iters_test, epoch + 1)
770
+ writer.add_scalar('eval/F0_loss', loss_f / iters_test, epoch + 1)
771
+
772
+ if epoch < joint_epoch:
773
+ # generating reconstruction examples with GT duration
774
+
775
+ with torch.no_grad():
776
+ for bib in range(len(asr)):
777
+ mel_length = int(mel_input_length[bib].item())
778
+ gt = mels[bib, :, :mel_length].unsqueeze(0)
779
+ en = asr[bib, :, :mel_length // 2].unsqueeze(0)
780
+
781
+ F0_real, _, _ = model.pitch_extractor(gt.unsqueeze(1))
782
+ F0_real = F0_real.unsqueeze(0)
783
+ s = model.style_encoder(gt.unsqueeze(1))
784
+ real_norm = log_norm(gt.unsqueeze(1)).squeeze(1)
785
+
786
+ y_rec = model.decoder(en, F0_real, real_norm, s)
787
+
788
+ writer.add_audio('eval/y' + str(bib), y_rec.cpu().numpy().squeeze(), epoch, sample_rate=sr)
789
+
790
+ s_dur = model.predictor_encoder(gt.unsqueeze(1))
791
+ p_en = p[bib, :, :mel_length // 2].unsqueeze(0)
792
+
793
+ F0_fake, N_fake = model.predictor.F0Ntrain(p_en, s_dur)
794
+
795
+ y_pred = model.decoder(en, F0_fake, N_fake, s)
796
+
797
+ writer.add_audio('pred/y' + str(bib), y_pred.cpu().numpy().squeeze(), epoch, sample_rate=sr)
798
+
799
+ if epoch == 0:
800
+ writer.add_audio('gt/y' + str(bib), waves[bib].squeeze(), epoch, sample_rate=sr)
801
+
802
+ if bib >= 5:
803
+ break
804
+ else:
805
+ # generating sampled speech from text directly
806
+ with torch.no_grad():
807
+ # compute reference styles
808
+ if multispeaker and epoch >= diff_epoch:
809
+ ref_ss = model.style_encoder(ref_mels.unsqueeze(1))
810
+ ref_sp = model.predictor_encoder(ref_mels.unsqueeze(1))
811
+ ref_s = torch.cat([ref_ss, ref_sp], dim=1)
812
+
813
+ for bib in range(len(d_en)):
814
+ if multispeaker:
815
+ s_pred = sampler(noise = torch.randn((1, 256)).unsqueeze(1).to(texts.device),
816
+ embedding=bert_dur[bib].unsqueeze(0),
817
+ embedding_scale=1,
818
+ features=ref_s[bib].unsqueeze(0), # reference from the same speaker as the embedding
819
+ num_steps=5).squeeze(1)
820
+ else:
821
+ s_pred = sampler(noise = torch.randn((1, 256)).unsqueeze(1).to(texts.device),
822
+ embedding=bert_dur[bib].unsqueeze(0),
823
+ embedding_scale=1,
824
+ num_steps=5).squeeze(1)
825
+
826
+ s = s_pred[:, 128:]
827
+ ref = s_pred[:, :128]
828
+
829
+ d = model.predictor.text_encoder(d_en[bib, :, :input_lengths[bib]].unsqueeze(0),
830
+ s, input_lengths[bib, ...].unsqueeze(0), text_mask[bib, :input_lengths[bib]].unsqueeze(0))
831
+
832
+ x, _ = model.predictor.lstm(d)
833
+ duration = model.predictor.duration_proj(x)
834
+
835
+ duration = torch.sigmoid(duration).sum(axis=-1)
836
+ pred_dur = torch.round(duration.squeeze()).clamp(min=1)
837
+
838
+ pred_dur[-1] += 5
839
+
840
+ pred_aln_trg = torch.zeros(input_lengths[bib], int(pred_dur.sum().data))
841
+ c_frame = 0
842
+ for i in range(pred_aln_trg.size(0)):
843
+ pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1
844
+ c_frame += int(pred_dur[i].data)
845
+
846
+ # encode prosody
847
+ en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(texts.device))
848
+ F0_pred, N_pred = model.predictor.F0Ntrain(en, s)
849
+ out = model.decoder((t_en[bib, :, :input_lengths[bib]].unsqueeze(0) @ pred_aln_trg.unsqueeze(0).to(texts.device)),
850
+ F0_pred, N_pred, ref.squeeze().unsqueeze(0))
851
+
852
+ writer.add_audio('pred/y' + str(bib), out.cpu().numpy().squeeze(), epoch, sample_rate=sr)
853
+
854
+ if bib >= 5:
855
+ break
856
+
857
+ if epoch % saving_epoch == 0:
858
+ if (loss_test / iters_test) < best_loss:
859
+ best_loss = loss_test / iters_test
860
+ print('Saving..')
861
+ state = {
862
+ 'net': {key: model[key].state_dict() for key in model},
863
+ 'optimizer': optimizer.state_dict(),
864
+ 'iters': iters,
865
+ 'val_loss': loss_test / iters_test,
866
+ 'epoch': epoch,
867
+ }
868
+ save_path = osp.join(log_dir, 'epoch_2nd_%05d.pth' % epoch)
869
+ torch.save(state, save_path)
870
+
871
+ # if estimating sigma, save the estimated sigma
872
+ if model_params.diffusion.dist.estimate_sigma_data:
873
+ config['model_params']['diffusion']['dist']['sigma_data'] = float(np.mean(running_std))
874
+
875
+ with open(osp.join(log_dir, osp.basename(config_path)), 'w') as outfile:
876
+ yaml.dump(config, outfile, default_flow_style=True)
877
+
878
+ if __name__=="__main__":
879
+ main()
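Note: `clip_to_bert` is called throughout this script but its definition is not part of this diff. A minimal sketch of what it is assumed to do, based on the commented-out patch above (the 510-token limit, the PAD id of 0, and the exact signature are assumptions taken from that patch):

import torch

MAX_BERT_LEN = 510  # assumed: leaves room for [CLS] and [SEP] in PL-BERT's 512-token window

def clip_to_bert(texts, text_mask):
    # Truncate the padded token batch so PL-BERT never sees more than MAX_BERT_LEN positions.
    if texts.size(1) > MAX_BERT_LEN:
        texts = texts[:, :MAX_BERT_LEN]
    # Recompute per-row lengths after truncation (assumes 0 is the PAD id).
    input_lengths = (texts != 0).sum(dim=1)
    # Rebuild the padding mask: True marks padded positions, matching length_to_mask.
    positions = torch.arange(texts.size(1), device=texts.device)
    text_mask = positions.unsqueeze(0) >= input_lengths.unsqueeze(1)
    return texts, text_mask, input_lengths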
logs/pod_90h_30k/config_ft_single.yml → Configs/.ipynb_checkpoints/config_ft_single-checkpoint.yml RENAMED
@@ -1,18 +1,18 @@
1
  # ─── GLOBAL ──────────────────────────────────────────────────────────
2
- log_dir: logs/pod_90h_30k
3
  device: "cuda"
4
 
5
- batch_size: 8 # 40 GB A100, fp16
6
- max_len: 160 # ≈ 8 s (200 × 40 ms)
7
 
8
- epochs_1st: 13 # first-stage schedule
9
- epochs_2nd: 13 # second-stage schedule (later)
10
- save_freq: 2
11
  log_interval: 50
12
 
13
  # leave blank on first run
14
- pretrained_model: ""
15
- second_stage_load_pretrained: false
16
  load_only_params: false
17
 
18
  # ─── PRE-PROCESS ─────────────────────────────────────────────────────
@@ -25,11 +25,11 @@ preprocess_params:
25
 
26
  # ─── DATA ────────────────────────────────────────────────────────────
27
  data_params:
28
- root_path: /home/ubuntu/styletts2-ft/data/wavs
29
- train_data: /home/ubuntu/styletts2-ft/data/train_list.txt
30
- val_data: /home/ubuntu/styletts2-ft/data/val_list.txt
31
  min_length: 50 # sample until texts with this size are obtained for OOD texts
32
- OOD_data: /home/ubuntu/styletts2-ft/data/OOD_texts.txt
33
 
34
  # ─── LOSS SCHEDULE ──────────────────────────────────────────────────
35
  loss_params:
@@ -39,7 +39,7 @@ loss_params:
39
 
40
  lambda_mono: 1. # monotonic alignment loss (1st stage, TMA)
41
  lambda_s2s: 1. # sequence-to-sequence loss (1st stage, TMA)
42
- TMA_epoch: 50 # TMA starting epoch (1st stage)
43
 
44
  lambda_F0: 1. # F0 reconstruction loss (2nd stage)
45
  lambda_norm: 1. # norm reconstruction loss (2nd stage)
@@ -48,14 +48,14 @@ loss_params:
48
  lambda_sty: 1. # style reconstruction loss (2nd stage)
49
  lambda_diff: 1. # score matching loss (2nd stage)
50
 
51
- diff_epoch: 20 # style diffusion starting epoch (2nd stage)
52
- joint_epoch: 50 # joint training starting epoch (2nd stage)
53
 
54
  # ─── OPTIMISER ──────────────────────────────────────────────────────
55
  optimizer_params:
56
- lr: 0.0008
57
- bert_lr: 0.00002
58
- ft_lr: 0.0002
59
  grad_accum_steps: 2
60
 
61
  # ─── MODEL (core network & sub-modules) ─────────────────────────────
@@ -105,7 +105,7 @@ F0_path: "Utils/JDC/bst.t7"
105
  ASR_config: "Utils/ASR/config.yml"
106
  ASR_path: "Utils/ASR/epoch_00080.pth"
107
  PLBERT_dir: 'Utils/PLBERT/'
108
- first_stage_path: "" # filled automatically after this run
109
 
110
  # ─── SLM ADVERSARIAL (ignored in stage-1, kept default) ─────────────
111
  slmadv_params:
 
1
  # ─── GLOBAL ──────────────────────────────────────────────────────────
2
+ log_dir: logs/pod_90h_30k_second_lr1
3
  device: "cuda"
4
 
5
+ batch_size: 12 # 40 GB A100, fp16
6
+ max_len: 300 # ≈ 8 s (200 × 40 ms)
7
 
8
+ epochs_1st: 25 # first-stage schedule
9
+ epochs_2nd: 20 # second-stage schedule (later)
10
+ save_freq: 1
11
  log_interval: 50
12
 
13
  # leave blank on first run
14
+ pretrained_model: "" #"/workspace/styletts2/logs/pod_90h_30k/epoch_2nd_00003.pth"
15
+ second_stage_load_pretrained: true
16
  load_only_params: false
17
 
18
  # ─── PRE-PROCESS ─────────────────────────────────────────────────────
 
25
 
26
  # ─── DATA ────────────────────────────────────────────────────────────
27
  data_params:
28
+ root_path: /workspace
29
+ train_data: /workspace/styletts2/data/train_list.txt
30
+ val_data: /workspace/styletts2/data/val_list.txt
31
  min_length: 50 # sample until texts with this size are obtained for OOD texts
32
+ OOD_data: /workspace/styletts2/data/OOD_texts.txt
33
 
34
  # ─── LOSS SCHEDULE ──────────────────────────────────────────────────
35
  loss_params:
 
39
 
40
  lambda_mono: 1. # monotonic alignment loss (1st stage, TMA)
41
  lambda_s2s: 1. # sequence-to-sequence loss (1st stage, TMA)
42
+ TMA_epoch: 14 # TMA starting epoch (1st stage)
43
 
44
  lambda_F0: 1. # F0 reconstruction loss (2nd stage)
45
  lambda_norm: 1. # norm reconstruction loss (2nd stage)
 
48
  lambda_sty: 1. # style reconstruction loss (2nd stage)
49
  lambda_diff: 1. # score matching loss (2nd stage)
50
 
51
+ diff_epoch: 0 # style diffusion starting epoch (2nd stage)
52
+ joint_epoch: 0 # joint training starting epoch (2nd stage)
53
 
54
  # ─── OPTIMISER ──────────────────────────────────────────────────────
55
  optimizer_params:
56
+ lr: 0.0001
57
+ bert_lr: 0.00001
58
+ ft_lr: 0.0001
59
  grad_accum_steps: 2
60
 
61
  # ─── MODEL (core network & sub-modules) ─────────────────────────────
 
105
  ASR_config: "Utils/ASR/config.yml"
106
  ASR_path: "Utils/ASR/epoch_00080.pth"
107
  PLBERT_dir: 'Utils/PLBERT/'
108
+ first_stage_path: "/workspace/styletts2/stage1_final.pth" # first-stage checkpoint used to initialise this second-stage run
109
 
110
  # ─── SLM ADVERSARIAL (ignored in stage-1, kept default) ─────────────
111
  slmadv_params:
Configs/.ipynb_checkpoints/config_libritts-checkpoint.yml ADDED
@@ -0,0 +1,113 @@
1
+ log_dir: "Models/LibriTTS"
2
+ first_stage_path: "first_stage.pth"
3
+ save_freq: 1
4
+ log_interval: 10
5
+ device: "cuda"
6
+ epochs_1st: 50 # number of epochs for first stage training (pre-training)
7
+ epochs_2nd: 30 # number of epochs for second stage training (joint training)
8
+ batch_size: 16
9
+ max_len: 300 # maximum number of frames
10
+ pretrained_model: ""
11
+ second_stage_load_pretrained: true # set to true if the pre-trained model is for 2nd stage
12
+ load_only_params: false # set to true if do not want to load epoch numbers and optimizer parameters
13
+
14
+ F0_path: "Utils/JDC/bst.t7"
15
+ ASR_config: "Utils/ASR/config.yml"
16
+ ASR_path: "Utils/ASR/epoch_00080.pth"
17
+ PLBERT_dir: 'Utils/PLBERT/'
18
+
19
+ data_params:
20
+ train_data: "Data/train_list.txt"
21
+ val_data: "Data/val_list.txt"
22
+ root_path: ""
23
+ OOD_data: "Data/OOD_texts.txt"
24
+ min_length: 50 # sample until texts with this size are obtained for OOD texts
25
+
26
+ preprocess_params:
27
+ sr: 24000
28
+ spect_params:
29
+ n_fft: 2048
30
+ win_length: 1200
31
+ hop_length: 300
32
+
33
+ model_params:
34
+ multispeaker: true
35
+
36
+ dim_in: 64
37
+ hidden_dim: 512
38
+ max_conv_dim: 512
39
+ n_layer: 3
40
+ n_mels: 80
41
+
42
+ n_token: 178 # number of phoneme tokens
43
+ max_dur: 50 # maximum duration of a single phoneme
44
+ style_dim: 128 # style vector size
45
+
46
+ dropout: 0.2
47
+
48
+ # config for decoder
49
+ decoder:
50
+ type: 'hifigan' # either hifigan or istftnet
51
+ resblock_kernel_sizes: [3,7,11]
52
+ upsample_rates : [10,5,3,2]
53
+ upsample_initial_channel: 512
54
+ resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
55
+ upsample_kernel_sizes: [20,10,6,4]
56
+
57
+ # speech language model config
58
+ slm:
59
+ model: 'microsoft/wavlm-base-plus'
60
+ sr: 16000 # sampling rate of SLM
61
+ hidden: 768 # hidden size of SLM
62
+ nlayers: 13 # number of layers of SLM
63
+ initial_channel: 64 # initial channels of SLM discriminator head
64
+
65
+ # style diffusion model config
66
+ diffusion:
67
+ embedding_mask_proba: 0.1
68
+ # transformer config
69
+ transformer:
70
+ num_layers: 3
71
+ num_heads: 8
72
+ head_features: 64
73
+ multiplier: 2
74
+
75
+ # diffusion distribution config
76
+ dist:
77
+ sigma_data: 0.2 # placeholder for estimate_sigma_data set to false
78
+ estimate_sigma_data: true # estimate sigma_data from the current batch if set to true
79
+ mean: -3.0
80
+ std: 1.0
81
+
82
+ loss_params:
83
+ lambda_mel: 5. # mel reconstruction loss
84
+ lambda_gen: 1. # generator loss
85
+ lambda_slm: 1. # slm feature matching loss
86
+
87
+ lambda_mono: 1. # monotonic alignment loss (1st stage, TMA)
88
+ lambda_s2s: 1. # sequence-to-sequence loss (1st stage, TMA)
89
+ TMA_epoch: 5 # TMA starting epoch (1st stage)
90
+
91
+ lambda_F0: 1. # F0 reconstruction loss (2nd stage)
92
+ lambda_norm: 1. # norm reconstruction loss (2nd stage)
93
+ lambda_dur: 1. # duration loss (2nd stage)
94
+ lambda_ce: 20. # duration predictor probability output CE loss (2nd stage)
95
+ lambda_sty: 1. # style reconstruction loss (2nd stage)
96
+ lambda_diff: 1. # score matching loss (2nd stage)
97
+
98
+ diff_epoch: 10 # style diffusion starting epoch (2nd stage)
99
+ joint_epoch: 15 # joint training starting epoch (2nd stage)
100
+
101
+ optimizer_params:
102
+ lr: 0.0001 # general learning rate
103
+ bert_lr: 0.00001 # learning rate for PLBERT
104
+ ft_lr: 0.00001 # learning rate for acoustic modules
105
+
106
+ slmadv_params:
107
+ min_len: 400 # minimum length of samples
108
+ max_len: 500 # maximum length of samples
109
+ batch_percentage: 0.5 # to prevent out of memory, only use half of the original batch size
110
+ iter: 20 # update the discriminator every this iterations of generator update
111
+ thresh: 5 # gradient norm above which the gradient is scaled
112
+ scale: 0.01 # gradient scaling factor for predictors from SLM discriminators
113
+ sig: 1.5 # sigma for differentiable duration modeling
Configs/config_ft_single.yml CHANGED
@@ -1,18 +1,18 @@
1
  # ─── GLOBAL ──────────────────────────────────────────────────────────
2
- log_dir: logs/pod_90h_30k
3
  device: "cuda"
4
 
5
- batch_size: 8 # 40 GB A100, fp16
6
  max_len: 300 # ≈ 8 s (200 × 40 ms)
7
 
8
  epochs_1st: 25 # first-stage schedule
9
- epochs_2nd: 15 # second-stage schedule (later)
10
- save_freq: 2
11
  log_interval: 50
12
 
13
  # leave blank on first run
14
- pretrained_model: /home/ubuntu/styletts2-ft/logs/pod_90h_30k/epoch_1st_0012.pth
15
- second_stage_load_pretrained: false
16
  load_only_params: false
17
 
18
  # ─── PRE-PROCESS ─────────────────────────────────────────────────────
@@ -25,11 +25,11 @@ preprocess_params:
25
 
26
  # ─── DATA ────────────────────────────────────────────────────────────
27
  data_params:
28
- root_path: /home/ubuntu/styletts2-ft/data/wavs
29
- train_data: /home/ubuntu/styletts2-ft/data/train_list.txt
30
- val_data: /home/ubuntu/styletts2-ft/data/val_list.txt
31
  min_length: 50 # sample until texts with this size are obtained for OOD texts
32
- OOD_data: /home/ubuntu/styletts2-ft/data/OOD_texts.txt
33
 
34
  # ─── LOSS SCHEDULE ──────────────────────────────────────────────────
35
  loss_params:
@@ -48,14 +48,14 @@ loss_params:
48
  lambda_sty: 1. # style reconstruction loss (2nd stage)
49
  lambda_diff: 1. # score matching loss (2nd stage)
50
 
51
- diff_epoch: 20 # style diffusion starting epoch (2nd stage)
52
- joint_epoch: 50 # joint training starting epoch (2nd stage)
53
 
54
  # ─── OPTIMISER ──────────────────────────────────────────────────────
55
  optimizer_params:
56
- lr: 0.0008
57
- bert_lr: 0.00002
58
- ft_lr: 0.0002
59
  grad_accum_steps: 2
60
 
61
  # ─── MODEL (core network & sub-modules) ─────────────────────────────
@@ -105,7 +105,7 @@ F0_path: "Utils/JDC/bst.t7"
105
  ASR_config: "Utils/ASR/config.yml"
106
  ASR_path: "Utils/ASR/epoch_00080.pth"
107
  PLBERT_dir: 'Utils/PLBERT/'
108
- first_stage_path: "" # filled automatically after this run
109
 
110
  # ─── SLM ADVERSARIAL (ignored in stage-1, kept default) ─────────────
111
  slmadv_params:
 
1
  # ─── GLOBAL ──────────────────────────────────────────────────────────
2
+ log_dir: logs/pod_90h_30k_second_lr1
3
  device: "cuda"
4
 
5
+ batch_size: 12 # 40 GB A100, fp16
6
  max_len: 300 # ≈ 8 s (200 × 40 ms)
7
 
8
  epochs_1st: 25 # first-stage schedule
9
+ epochs_2nd: 20 # second-stage schedule (later)
10
+ save_freq: 1
11
  log_interval: 50
12
 
13
  # leave blank on first run
14
+ pretrained_model: "" #"/workspace/styletts2/logs/pod_90h_30k/epoch_2nd_00003.pth"
15
+ second_stage_load_pretrained: true
16
  load_only_params: false
17
 
18
  # ─── PRE-PROCESS ─────────────────────────────────────────────────────
 
25
 
26
  # ─── DATA ────────────────────────────────────────────────────────────
27
  data_params:
28
+ root_path: /workspace
29
+ train_data: /workspace/styletts2/data/train_list.txt
30
+ val_data: /workspace/styletts2/data/val_list.txt
31
  min_length: 50 # sample until texts with this size are obtained for OOD texts
32
+ OOD_data: /workspace/styletts2/data/OOD_texts.txt
33
 
34
  # ─── LOSS SCHEDULE ──────────────────────────────────────────────────
35
  loss_params:
 
48
  lambda_sty: 1. # style reconstruction loss (2nd stage)
49
  lambda_diff: 1. # score matching loss (2nd stage)
50
 
51
+ diff_epoch: 0 # style diffusion starting epoch (2nd stage)
52
+ joint_epoch: 0 # joint training starting epoch (2nd stage)
53
 
54
  # ─── OPTIMISER ──────────────────────────────────────────────────────
55
  optimizer_params:
56
+ lr: 0.0001
57
+ bert_lr: 0.00001
58
+ ft_lr: 0.00001
59
  grad_accum_steps: 2
60
 
61
  # ─── MODEL (core network & sub-modules) ─────────────────────────────
 
105
  ASR_config: "Utils/ASR/config.yml"
106
  ASR_path: "Utils/ASR/epoch_00080.pth"
107
  PLBERT_dir: 'Utils/PLBERT/'
108
+ first_stage_path: "/workspace/styletts2/stage1_final.pth" # first-stage checkpoint used to initialise this second-stage run
109
 
110
  # ─── SLM ADVERSARIAL (ignored in stage-1, kept default) ─────────────
111
  slmadv_params:
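With the second-stage settings above, the click entry point from the training script earlier in this commit can be pointed at this config. A minimal sketch, assuming the script is importable as `train_second` (module name inferred from the checkpoint file in this commit):

# Programmatic equivalent of: python train_second.py -p Configs/config_ft_single.yml
from train_second import main  # assumed module name

main(['--config_path', 'Configs/config_ft_single.yml'], standalone_mode=False)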
Demo/.ipynb_checkpoints/Inference_LibriTTS-checkpoint.ipynb ADDED
@@ -0,0 +1,1155 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "9adb7bd1",
6
+ "metadata": {},
7
+ "source": [
8
+ "# StyleTTS 2 Demo (LibriTTS)\n",
9
+ "\n",
10
+ "Before you run the following cells, please make sure you have downloaded [reference_audio.zip](https://huggingface.co/yl4579/StyleTTS2-LibriTTS/resolve/main/reference_audio.zip) and unzipped it under the `demo` folder."
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "markdown",
15
+ "id": "6108384d",
16
+ "metadata": {},
17
+ "source": [
18
+ "### Utils"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": null,
24
+ "id": "96e173bf",
25
+ "metadata": {},
26
+ "outputs": [],
27
+ "source": [
28
+ "import torch\n",
29
+ "torch.manual_seed(0)\n",
30
+ "torch.backends.cudnn.benchmark = False\n",
31
+ "torch.backends.cudnn.deterministic = True\n",
32
+ "\n",
33
+ "import random\n",
34
+ "random.seed(0)\n",
35
+ "\n",
36
+ "import numpy as np\n",
37
+ "np.random.seed(0)"
38
+ ]
39
+ },
40
+ {
41
+ "cell_type": "code",
42
+ "execution_count": null,
43
+ "id": "da84c60f",
44
+ "metadata": {},
45
+ "outputs": [],
46
+ "source": [
47
+ "%cd .."
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "code",
52
+ "execution_count": null,
53
+ "id": "5a3ddcc8",
54
+ "metadata": {},
55
+ "outputs": [],
56
+ "source": [
57
+ "# load packages\n",
58
+ "import time\n",
59
+ "import random\n",
60
+ "import yaml\n",
61
+ "from munch import Munch\n",
62
+ "import numpy as np\n",
63
+ "import torch\n",
64
+ "from torch import nn\n",
65
+ "import torch.nn.functional as F\n",
66
+ "import torchaudio\n",
67
+ "import librosa\n",
68
+ "from nltk.tokenize import word_tokenize\n",
69
+ "\n",
70
+ "from models import *\n",
71
+ "from utils import *\n",
72
+ "from text_utils import TextCleaner\n",
73
+ "textclenaer = TextCleaner()\n",
74
+ "\n",
75
+ "%matplotlib inline"
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "code",
80
+ "execution_count": null,
81
+ "id": "00ee05e1",
82
+ "metadata": {},
83
+ "outputs": [],
84
+ "source": [
85
+ "to_mel = torchaudio.transforms.MelSpectrogram(\n",
86
+ " n_mels=80, n_fft=2048, win_length=1200, hop_length=300)\n",
87
+ "mean, std = -4, 4\n",
88
+ "\n",
89
+ "def length_to_mask(lengths):\n",
90
+ " mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)\n",
91
+ " mask = torch.gt(mask+1, lengths.unsqueeze(1))\n",
92
+ " return mask\n",
93
+ "\n",
94
+ "def preprocess(wave):\n",
95
+ " wave_tensor = torch.from_numpy(wave).float()\n",
96
+ " mel_tensor = to_mel(wave_tensor)\n",
97
+ " mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std\n",
98
+ " return mel_tensor\n",
99
+ "\n",
100
+ "def compute_style(path):\n",
101
+ " wave, sr = librosa.load(path, sr=24000)\n",
102
+ " audio, index = librosa.effects.trim(wave, top_db=30)\n",
103
+ " if sr != 24000:\n",
104
+ " audio = librosa.resample(audio, sr, 24000)\n",
105
+ " mel_tensor = preprocess(audio).to(device)\n",
106
+ "\n",
107
+ " with torch.no_grad():\n",
108
+ " ref_s = model.style_encoder(mel_tensor.unsqueeze(1))\n",
109
+ " ref_p = model.predictor_encoder(mel_tensor.unsqueeze(1))\n",
110
+ "\n",
111
+ " return torch.cat([ref_s, ref_p], dim=1)"
112
+ ]
113
+ },
114
+ {
115
+ "cell_type": "code",
116
+ "execution_count": null,
117
+ "id": "bbdc04c0",
118
+ "metadata": {},
119
+ "outputs": [],
120
+ "source": [
121
+ "device = 'cuda' if torch.cuda.is_available() else 'cpu'"
122
+ ]
123
+ },
124
+ {
125
+ "cell_type": "markdown",
126
+ "id": "7b9cecbe",
127
+ "metadata": {},
128
+ "source": [
129
+ "### Load models"
130
+ ]
131
+ },
132
+ {
133
+ "cell_type": "code",
134
+ "execution_count": null,
135
+ "id": "64fc4c0f",
136
+ "metadata": {},
137
+ "outputs": [],
138
+ "source": [
139
+ "# load phonemizer\n",
140
+ "import phonemizer\n",
141
+ "global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)"
142
+ ]
143
+ },
144
+ {
145
+ "cell_type": "code",
146
+ "execution_count": null,
147
+ "id": "48e7b644",
148
+ "metadata": {},
149
+ "outputs": [],
150
+ "source": [
151
+ "config = yaml.safe_load(open(\"Models/LibriTTS/config.yml\"))\n",
152
+ "\n",
153
+ "# load pretrained ASR model\n",
154
+ "ASR_config = config.get('ASR_config', False)\n",
155
+ "ASR_path = config.get('ASR_path', False)\n",
156
+ "text_aligner = load_ASR_models(ASR_path, ASR_config)\n",
157
+ "\n",
158
+ "# load pretrained F0 model\n",
159
+ "F0_path = config.get('F0_path', False)\n",
160
+ "pitch_extractor = load_F0_models(F0_path)\n",
161
+ "\n",
162
+ "# load BERT model\n",
163
+ "from Utils.PLBERT.util import load_plbert\n",
164
+ "BERT_path = config.get('PLBERT_dir', False)\n",
165
+ "plbert = load_plbert(BERT_path)"
166
+ ]
167
+ },
168
+ {
169
+ "cell_type": "code",
170
+ "execution_count": null,
171
+ "id": "ffc18cf7",
172
+ "metadata": {},
173
+ "outputs": [],
174
+ "source": [
175
+ "model_params = recursive_munch(config['model_params'])\n",
176
+ "model = build_model(model_params, text_aligner, pitch_extractor, plbert)\n",
177
+ "_ = [model[key].eval() for key in model]\n",
178
+ "_ = [model[key].to(device) for key in model]"
179
+ ]
180
+ },
181
+ {
182
+ "cell_type": "code",
183
+ "execution_count": null,
184
+ "id": "64529d5c",
185
+ "metadata": {},
186
+ "outputs": [],
187
+ "source": [
188
+ "params_whole = torch.load(\"Models/LibriTTS/epochs_2nd_00020.pth\", map_location='cpu')\n",
189
+ "params = params_whole['net']"
190
+ ]
191
+ },
192
+ {
193
+ "cell_type": "code",
194
+ "execution_count": null,
195
+ "id": "895d9706",
196
+ "metadata": {},
197
+ "outputs": [],
198
+ "source": [
199
+ "for key in model:\n",
200
+ " if key in params:\n",
201
+ " print('%s loaded' % key)\n",
202
+ " try:\n",
203
+ " model[key].load_state_dict(params[key])\n",
204
+ " except:\n",
205
+ " from collections import OrderedDict\n",
206
+ " state_dict = params[key]\n",
207
+ " new_state_dict = OrderedDict()\n",
208
+ " for k, v in state_dict.items():\n",
209
+ " name = k[7:] # remove `module.`\n",
210
+ " new_state_dict[name] = v\n",
211
+ " # load params\n",
212
+ " model[key].load_state_dict(new_state_dict, strict=False)\n",
213
+ "# except:\n",
214
+ "# _load(params[key], model[key])\n",
215
+ "_ = [model[key].eval() for key in model]"
216
+ ]
217
+ },
218
+ {
219
+ "cell_type": "code",
220
+ "execution_count": null,
221
+ "id": "c1a59db2",
222
+ "metadata": {},
223
+ "outputs": [],
224
+ "source": [
225
+ "from Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule"
226
+ ]
227
+ },
228
+ {
229
+ "cell_type": "code",
230
+ "execution_count": null,
231
+ "id": "e30985ab",
232
+ "metadata": {},
233
+ "outputs": [],
234
+ "source": [
235
+ "sampler = DiffusionSampler(\n",
236
+ " model.diffusion.diffusion,\n",
237
+ " sampler=ADPM2Sampler(),\n",
238
+ " sigma_schedule=KarrasSchedule(sigma_min=0.0001, sigma_max=3.0, rho=9.0), # empirical parameters\n",
239
+ " clamp=False\n",
240
+ ")"
241
+ ]
242
+ },
243
+ {
244
+ "cell_type": "markdown",
245
+ "id": "b803110e",
246
+ "metadata": {},
247
+ "source": [
248
+ "### Synthesize speech"
249
+ ]
250
+ },
251
+ {
252
+ "cell_type": "code",
253
+ "execution_count": null,
254
+ "id": "ca57469c",
255
+ "metadata": {},
256
+ "outputs": [],
257
+ "source": [
258
+ "def inference(text, ref_s, alpha = 0.3, beta = 0.7, diffusion_steps=5, embedding_scale=1):\n",
259
+ " text = text.strip()\n",
260
+ " ps = global_phonemizer.phonemize([text])\n",
261
+ " ps = word_tokenize(ps[0])\n",
262
+ " ps = ' '.join(ps)\n",
263
+ " tokens = textclenaer(ps)\n",
264
+ " tokens.insert(0, 0)\n",
265
+ " tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)\n",
266
+ " \n",
267
+ " with torch.no_grad():\n",
268
+ " input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)\n",
269
+ " text_mask = length_to_mask(input_lengths).to(device)\n",
270
+ "\n",
271
+ " t_en = model.text_encoder(tokens, input_lengths, text_mask)\n",
272
+ " bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())\n",
273
+ " d_en = model.bert_encoder(bert_dur).transpose(-1, -2) \n",
274
+ "\n",
275
+ " s_pred = sampler(noise = torch.randn((1, 256)).unsqueeze(1).to(device), \n",
276
+ " embedding=bert_dur,\n",
277
+ " embedding_scale=embedding_scale,\n",
278
+ " features=ref_s, # reference from the same speaker as the embedding\n",
279
+ " num_steps=diffusion_steps).squeeze(1)\n",
280
+ "\n",
281
+ "\n",
282
+ " s = s_pred[:, 128:]\n",
283
+ " ref = s_pred[:, :128]\n",
284
+ "\n",
285
+ " ref = alpha * ref + (1 - alpha) * ref_s[:, :128]\n",
286
+ " s = beta * s + (1 - beta) * ref_s[:, 128:]\n",
287
+ "\n",
288
+ " d = model.predictor.text_encoder(d_en, \n",
289
+ " s, input_lengths, text_mask)\n",
290
+ "\n",
291
+ " x, _ = model.predictor.lstm(d)\n",
292
+ " duration = model.predictor.duration_proj(x)\n",
293
+ "\n",
294
+ " duration = torch.sigmoid(duration).sum(axis=-1)\n",
295
+ " pred_dur = torch.round(duration.squeeze()).clamp(min=1)\n",
296
+ "\n",
297
+ "\n",
298
+ " pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))\n",
299
+ " c_frame = 0\n",
300
+ " for i in range(pred_aln_trg.size(0)):\n",
301
+ " pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1\n",
302
+ " c_frame += int(pred_dur[i].data)\n",
303
+ "\n",
304
+ " # encode prosody\n",
305
+ " en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))\n",
306
+ " if model_params.decoder.type == \"hifigan\":\n",
307
+ " asr_new = torch.zeros_like(en)\n",
308
+ " asr_new[:, :, 0] = en[:, :, 0]\n",
309
+ " asr_new[:, :, 1:] = en[:, :, 0:-1]\n",
310
+ " en = asr_new\n",
311
+ "\n",
312
+ " F0_pred, N_pred = model.predictor.F0Ntrain(en, s)\n",
313
+ "\n",
314
+ " asr = (t_en @ pred_aln_trg.unsqueeze(0).to(device))\n",
315
+ " if model_params.decoder.type == \"hifigan\":\n",
316
+ " asr_new = torch.zeros_like(asr)\n",
317
+ " asr_new[:, :, 0] = asr[:, :, 0]\n",
318
+ " asr_new[:, :, 1:] = asr[:, :, 0:-1]\n",
319
+ " asr = asr_new\n",
320
+ "\n",
321
+ " out = model.decoder(asr, \n",
322
+ " F0_pred, N_pred, ref.squeeze().unsqueeze(0))\n",
323
+ " \n",
324
+ " \n",
325
+ " return out.squeeze().cpu().numpy()[..., :-50] # weird pulse at the end of the model, need to be fixed later"
326
+ ]
327
+ },
328
+ {
329
+ "cell_type": "markdown",
330
+ "id": "d438ef4f",
331
+ "metadata": {},
332
+ "source": [
333
+ "#### Basic synthesis (5 diffusion steps, seen speakers)"
334
+ ]
335
+ },
336
+ {
337
+ "cell_type": "code",
338
+ "execution_count": null,
339
+ "id": "cace9787",
340
+ "metadata": {},
341
+ "outputs": [],
342
+ "source": [
343
+ "text = ''' StyleTTS 2 is a text to speech model that leverages style diffusion and adversarial training with large speech language models to achieve human level text to speech synthesis. '''"
344
+ ]
345
+ },
346
+ {
347
+ "cell_type": "code",
348
+ "execution_count": null,
349
+ "id": "7c88f461",
350
+ "metadata": {},
351
+ "outputs": [],
352
+ "source": [
353
+ "reference_dicts = {}\n",
354
+ "reference_dicts['696_92939'] = \"Demo/reference_audio/696_92939_000016_000006.wav\"\n",
355
+ "reference_dicts['1789_142896'] = \"Demo/reference_audio/1789_142896_000022_000005.wav\""
356
+ ]
357
+ },
358
+ {
359
+ "cell_type": "code",
360
+ "execution_count": null,
361
+ "id": "16e8ac60",
362
+ "metadata": {},
363
+ "outputs": [],
364
+ "source": [
365
+ "start = time.time()\n",
366
+ "noise = torch.randn(1,1,256).to(device)\n",
367
+ "for k, path in reference_dicts.items():\n",
368
+ " ref_s = compute_style(path)\n",
369
+ " \n",
370
+ " wav = inference(text, ref_s, alpha=0.3, beta=0.7, diffusion_steps=5, embedding_scale=1)\n",
371
+ " rtf = (time.time() - start) / (len(wav) / 24000)\n",
372
+ " print(f\"RTF = {rtf:5f}\")\n",
373
+ " import IPython.display as ipd\n",
374
+ " print(k + ' Synthesized:')\n",
375
+ " display(ipd.Audio(wav, rate=24000, normalize=False))\n",
376
+ " print('Reference:')\n",
377
+ " display(ipd.Audio(path, rate=24000, normalize=False))"
378
+ ]
379
+ },
380
+ {
381
+ "cell_type": "markdown",
382
+ "id": "14838708",
383
+ "metadata": {},
384
+ "source": [
385
+ "#### With higher diffusion steps (more diverse)\n",
386
+ "\n",
387
+ "Since the sampler is ancestral, the higher the stpes, the more diverse the samples are, with the cost of slower synthesis speed."
388
+ ]
389
+ },
390
+ {
391
+ "cell_type": "code",
392
+ "execution_count": null,
393
+ "id": "6fbff03b",
394
+ "metadata": {},
395
+ "outputs": [],
396
+ "source": [
397
+ "noise = torch.randn(1,1,256).to(device)\n",
398
+ "for k, path in reference_dicts.items():\n",
399
+ " ref_s = compute_style(path)\n",
400
+ " start = time.time()\n",
401
+ " wav = inference(text, ref_s, alpha=0.3, beta=0.7, diffusion_steps=10, embedding_scale=1)\n",
402
+ " rtf = (time.time() - start) / (len(wav) / 24000)\n",
403
+ " print(f\"RTF = {rtf:5f}\")\n",
404
+ " import IPython.display as ipd\n",
405
+ " print(k + ' Synthesized:')\n",
406
+ " display(ipd.Audio(wav, rate=24000, normalize=False))\n",
407
+ " print(k + ' Reference:')\n",
408
+ " display(ipd.Audio(path, rate=24000, normalize=False))"
409
+ ]
410
+ },
411
+ {
412
+ "cell_type": "markdown",
413
+ "id": "7e6867fd",
414
+ "metadata": {},
415
+ "source": [
416
+ "#### Basic synthesis (5 diffusion steps, umseen speakers)\n",
417
+ "The following samples are to reproduce samples in [Section 4](https://styletts2.github.io/#libri) of the demo page. All spsakers are unseen during training. You can compare the generated samples to popular zero-shot TTS models like Vall-E and NaturalSpeech 2."
418
+ ]
419
+ },
420
+ {
421
+ "cell_type": "code",
422
+ "execution_count": null,
423
+ "id": "f4e8faa0",
424
+ "metadata": {},
425
+ "outputs": [],
426
+ "source": [
427
+ "reference_dicts = {}\n",
428
+ "# format: (path, text)\n",
429
+ "reference_dicts['1221-135767'] = (\"Demo/reference_audio/1221-135767-0014.wav\", \"Yea, his honourable worship is within, but he hath a godly minister or two with him, and likewise a leech.\")\n",
430
+ "reference_dicts['5639-40744'] = (\"Demo/reference_audio/5639-40744-0020.wav\", \"Thus did this humane and right minded father comfort his unhappy daughter, and her mother embracing her again, did all she could to soothe her feelings.\")\n",
431
+ "reference_dicts['908-157963'] = (\"Demo/reference_audio/908-157963-0027.wav\", \"And lay me down in my cold bed and leave my shining lot.\")\n",
432
+ "reference_dicts['4077-13754'] = (\"Demo/reference_audio/4077-13754-0000.wav\", \"The army found the people in poverty and left them in comparative wealth.\")"
433
+ ]
434
+ },
435
+ {
436
+ "cell_type": "code",
437
+ "execution_count": null,
438
+ "id": "653f1406",
439
+ "metadata": {},
440
+ "outputs": [],
441
+ "source": [
442
+ "noise = torch.randn(1,1,256).to(device)\n",
443
+ "for k, v in reference_dicts.items():\n",
444
+ " path, text = v\n",
445
+ " ref_s = compute_style(path)\n",
446
+ " start = time.time()\n",
447
+ " wav = inference(text, ref_s, alpha=0.3, beta=0.7, diffusion_steps=5, embedding_scale=1)\n",
448
+ " rtf = (time.time() - start) / (len(wav) / 24000)\n",
449
+ " print(f\"RTF = {rtf:5f}\")\n",
450
+ " import IPython.display as ipd\n",
451
+ " print(k + ' Synthesized: ' + text)\n",
452
+ " display(ipd.Audio(wav, rate=24000, normalize=False))\n",
453
+ " print(k + ' Reference:')\n",
454
+ " display(ipd.Audio(path, rate=24000, normalize=False))"
455
+ ]
456
+ },
457
+ {
458
+ "cell_type": "markdown",
459
+ "id": "141e91b3",
460
+ "metadata": {},
461
+ "source": [
462
+ "### Speech expressiveness\n",
463
+ "\n",
464
+ "The following section recreates the samples shown in [Section 6](https://styletts2.github.io/#emo) of the demo page. The speaker reference used is `1221-135767-0014.wav`, which is unseen during training. \n",
465
+ "\n",
466
+ "#### With `embedding_scale=1`\n",
467
+ "This is the classifier-free guidance scale. The higher the scale, the more the sampled style is conditioned on the input text, and hence the more emotional the speech.\n",
468
+ "\n"
469
+ ]
470
+ },
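As a rough illustration of what `embedding_scale` does: it is the classifier-free guidance weight applied inside the style diffusion sampler. The sketch below is conceptual only; the actual combination happens inside `Modules/diffusion`, and the function name here is made up for illustration.

```python
def cfg_combine(pred_uncond, pred_cond, embedding_scale):
    # Classifier-free guidance: extrapolate from the unconditional style
    # prediction toward the text-conditional one.
    # embedding_scale = 1 keeps the plain conditional prediction;
    # larger values push the style further toward the text condition,
    # which tends to sound more expressive but less like the reference.
    return pred_uncond + embedding_scale * (pred_cond - pred_uncond)
```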
471
+ {
472
+ "cell_type": "code",
473
+ "execution_count": null,
474
+ "id": "81addda4",
475
+ "metadata": {},
476
+ "outputs": [],
477
+ "source": [
478
+ "ref_s = compute_style(\"Demo/reference_audio/1221-135767-0014.wav\")"
479
+ ]
480
+ },
481
+ {
482
+ "cell_type": "code",
483
+ "execution_count": null,
484
+ "id": "be1b2a11",
485
+ "metadata": {},
486
+ "outputs": [],
487
+ "source": [
488
+ "texts = {}\n",
489
+ "texts['Happy'] = \"We are happy to invite you to join us on a journey to the past, where we will visit the most amazing monuments ever built by human hands.\"\n",
490
+ "texts['Sad'] = \"I am sorry to say that we have suffered a severe setback in our efforts to restore prosperity and confidence.\"\n",
491
+ "texts['Angry'] = \"The field of astronomy is a joke! Its theories are based on flawed observations and biased interpretations!\"\n",
492
+ "texts['Surprised'] = \"I can't believe it! You mean to tell me that you have discovered a new species of bacteria in this pond?\"\n",
493
+ "\n",
494
+ "for k,v in texts.items():\n",
495
+ " wav = inference(v, ref_s, diffusion_steps=10, alpha=0.3, beta=0.7, embedding_scale=1)\n",
496
+ " print(k + \": \")\n",
497
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
498
+ ]
499
+ },
500
+ {
501
+ "cell_type": "markdown",
502
+ "id": "96d262b8",
503
+ "metadata": {},
504
+ "source": [
505
+ "#### With `embedding_scale=2`"
506
+ ]
507
+ },
508
+ {
509
+ "cell_type": "code",
510
+ "execution_count": null,
511
+ "id": "3e7d40b4",
512
+ "metadata": {},
513
+ "outputs": [],
514
+ "source": [
515
+ "texts = {}\n",
516
+ "texts['Happy'] = \"We are happy to invite you to join us on a journey to the past, where we will visit the most amazing monuments ever built by human hands.\"\n",
517
+ "texts['Sad'] = \"I am sorry to say that we have suffered a severe setback in our efforts to restore prosperity and confidence.\"\n",
518
+ "texts['Angry'] = \"The field of astronomy is a joke! Its theories are based on flawed observations and biased interpretations!\"\n",
519
+ "texts['Surprised'] = \"I can't believe it! You mean to tell me that you have discovered a new species of bacteria in this pond?\"\n",
520
+ "\n",
521
+ "for k,v in texts.items():\n",
522
+ " noise = torch.randn(1,1,256).to(device)\n",
523
+ " wav = inference(v, ref_s, diffusion_steps=10, alpha=0.3, beta=0.7, embedding_scale=2)\n",
524
+ " print(k + \": \")\n",
525
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
526
+ ]
527
+ },
528
+ {
529
+ "cell_type": "markdown",
530
+ "id": "402b2bd6",
531
+ "metadata": {},
532
+ "source": [
533
+ "#### With `embedding_scale=2, alpha = 0.5, beta = 0.9`\n",
534
+ "`alpha` and `beta` determine how much of the style is sampled from the text rather than taken from the reference. The higher the values of `alpha` and `beta`, the better the style suits the text, but the less similar it is to the reference. Using a higher `beta` makes the synthesized speech more emotional, at the cost of lower similarity to the reference. `alpha` controls the timbre of the speaker, while `beta` controls the prosody. "
535
+ ]
536
+ },
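Concretely, the mixing is just a convex combination of the diffusion-sampled style and the reference style, mirroring the lines already inside the `inference` function (the first 128 dimensions hold the acoustic/timbre style used by the decoder, the last 128 the prosodic style used by the predictor). A minimal sketch, with `mix_styles` being an illustrative wrapper name rather than an actual function in the repo:

```python
import torch

def mix_styles(s_pred: torch.Tensor, ref_s: torch.Tensor, alpha: float, beta: float) -> torch.Tensor:
    # s_pred: style sampled by the diffusion model from the text, shape (1, 256)
    # ref_s:  style extracted from the reference audio,            shape (1, 256)
    ref = alpha * s_pred[:, :128] + (1 - alpha) * ref_s[:, :128]  # timbre (decoder)
    s   = beta  * s_pred[:, 128:] + (1 - beta)  * ref_s[:, 128:]  # prosody (predictor)
    # alpha = beta = 0 reproduces the reference style exactly;
    # alpha = beta = 1 discards the reference and uses only the sampled style.
    return torch.cat([ref, s], dim=-1)
```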
537
+ {
538
+ "cell_type": "code",
539
+ "execution_count": null,
540
+ "id": "599de5d5",
541
+ "metadata": {},
542
+ "outputs": [],
543
+ "source": [
544
+ "texts = {}\n",
545
+ "texts['Happy'] = \"We are happy to invite you to join us on a journey to the past, where we will visit the most amazing monuments ever built by human hands.\"\n",
546
+ "texts['Sad'] = \"I am sorry to say that we have suffered a severe setback in our efforts to restore prosperity and confidence.\"\n",
547
+ "texts['Angry'] = \"The field of astronomy is a joke! Its theories are based on flawed observations and biased interpretations!\"\n",
548
+ "texts['Surprised'] = \"I can't believe it! You mean to tell me that you have discovered a new species of bacteria in this pond?\"\n",
549
+ "\n",
550
+ "for k,v in texts.items():\n",
551
+ " noise = torch.randn(1,1,256).to(device)\n",
552
+ " wav = inference(v, ref_s, diffusion_steps=10, alpha=0.5, beta=0.9, embedding_scale=2)\n",
553
+ " print(k + \": \")\n",
554
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
555
+ ]
556
+ },
557
+ {
558
+ "cell_type": "markdown",
559
+ "id": "48548866",
560
+ "metadata": {},
561
+ "source": [
562
+ "### Zero-shot speaker adaptation\n",
563
+ "This section recreates the \"Acoustic Environment Maintenance\" and \"Speaker’s Emotion Maintenance\" demos in [Section 4](https://styletts2.github.io/#libri) of the demo page. You can compare the generated samples to popular zero-shot TTS models like Vall-E. Note that the model was trained only on LibriTTS, roughly 250 times less data than was used to train Vall-E, yet achieves similar or better results on these maintenance tasks. "
564
+ ]
565
+ },
566
+ {
567
+ "cell_type": "markdown",
568
+ "id": "23e81572",
569
+ "metadata": {},
570
+ "source": [
571
+ "#### Acoustic Environment Maintenance\n",
572
+ "\n",
573
+ "Since we want to maintain the acoustic environment in the speaker (timbre), we set `alpha = 0` to keep the speaker as close to the reference as possible while only changing the prosody according to the text. "
574
+ ]
575
+ },
576
+ {
577
+ "cell_type": "code",
578
+ "execution_count": null,
579
+ "id": "8087bccb",
580
+ "metadata": {},
581
+ "outputs": [],
582
+ "source": [
583
+ "reference_dicts = {}\n",
584
+ "# format: (path, text)\n",
585
+ "reference_dicts['3'] = (\"Demo/reference_audio/3.wav\", \"As friends thing I definitely I've got more male friends.\")\n",
586
+ "reference_dicts['4'] = (\"Demo/reference_audio/4.wav\", \"Everything is run by computer but you got to know how to think before you can do a computer.\")\n",
587
+ "reference_dicts['5'] = (\"Demo/reference_audio/5.wav\", \"Then out in LA you guys got a whole another ball game within California to worry about.\")"
588
+ ]
589
+ },
590
+ {
591
+ "cell_type": "code",
592
+ "execution_count": null,
593
+ "id": "1e99c200",
594
+ "metadata": {},
595
+ "outputs": [],
596
+ "source": [
597
+ "noise = torch.randn(1,1,256).to(device)\n",
598
+ "for k, v in reference_dicts.items():\n",
599
+ " path, text = v\n",
600
+ " ref_s = compute_style(path)\n",
601
+ " start = time.time()\n",
602
+ " wav = inference(text, ref_s, alpha=0.0, beta=0.5, diffusion_steps=5, embedding_scale=1)\n",
603
+ " rtf = (time.time() - start) / (len(wav) / 24000)\n",
604
+ " print(f\"RTF = {rtf:5f}\")\n",
605
+ " import IPython.display as ipd\n",
606
+ " print('Synthesized: ' + text)\n",
607
+ " display(ipd.Audio(wav, rate=24000, normalize=False))\n",
608
+ " print('Reference:')\n",
609
+ " display(ipd.Audio(path, rate=24000, normalize=False))"
610
+ ]
611
+ },
612
+ {
613
+ "cell_type": "markdown",
614
+ "id": "7d56505d",
615
+ "metadata": {},
616
+ "source": [
617
+ "#### Speaker’s Emotion Maintenance\n",
618
+ "\n",
619
+ "Since we want to maintain the emotion in the speaker (prosody), we set `beta = 0.1` to keep the speaker as close to the reference as possible while allowing some diversity through a slight timbre change."
620
+ ]
621
+ },
622
+ {
623
+ "cell_type": "code",
624
+ "execution_count": null,
625
+ "id": "f90179e7",
626
+ "metadata": {},
627
+ "outputs": [],
628
+ "source": [
629
+ "reference_dicts = {}\n",
630
+ "# format: (path, text)\n",
631
+ "reference_dicts['Anger'] = (\"Demo/reference_audio/anger.wav\", \"We have to reduce the number of plastic bags.\")\n",
632
+ "reference_dicts['Sleepy'] = (\"Demo/reference_audio/sleepy.wav\", \"We have to reduce the number of plastic bags.\")\n",
633
+ "reference_dicts['Amused'] = (\"Demo/reference_audio/amused.wav\", \"We have to reduce the number of plastic bags.\")\n",
634
+ "reference_dicts['Disgusted'] = (\"Demo/reference_audio/disgusted.wav\", \"We have to reduce the number of plastic bags.\")"
635
+ ]
636
+ },
637
+ {
638
+ "cell_type": "code",
639
+ "execution_count": null,
640
+ "id": "2e6bdfed",
641
+ "metadata": {},
642
+ "outputs": [],
643
+ "source": [
644
+ "noise = torch.randn(1,1,256).to(device)\n",
645
+ "for k, v in reference_dicts.items():\n",
646
+ " path, text = v\n",
647
+ " ref_s = compute_style(path)\n",
648
+ " start = time.time()\n",
649
+ " wav = inference(text, ref_s, alpha=0.3, beta=0.1, diffusion_steps=10, embedding_scale=1)\n",
650
+ " rtf = (time.time() - start) / (len(wav) / 24000)\n",
651
+ " print(f\"RTF = {rtf:5f}\")\n",
652
+ " import IPython.display as ipd\n",
653
+ " print(k + ' Synthesized: ' + text)\n",
654
+ " display(ipd.Audio(wav, rate=24000, normalize=False))\n",
655
+ " print(k + ' Reference:')\n",
656
+ " display(ipd.Audio(path, rate=24000, normalize=False))"
657
+ ]
658
+ },
659
+ {
660
+ "cell_type": "markdown",
661
+ "id": "37ae3963",
662
+ "metadata": {},
663
+ "source": [
664
+ "### Longform Narration\n",
665
+ "\n",
666
+ "This section includes a basic implementation of Algorithm 1 in the paper for consistent longform audio generation. The example passage is taken from [Section 5](https://styletts2.github.io/#long) of the demo page."
667
+ ]
668
+ },
669
+ {
670
+ "cell_type": "code",
671
+ "execution_count": null,
672
+ "id": "f12a716b",
673
+ "metadata": {},
674
+ "outputs": [],
675
+ "source": [
676
+ "passage = '''If the supply of fruit is greater than the family needs, it may be made a source of income by sending the fresh fruit to the market if there is one near enough, or by preserving, canning, and making jelly for sale. To make such an enterprise a success the fruit and work must be first class. There is magic in the word \"Homemade,\" when the product appeals to the eye and the palate; but many careless and incompetent people have found to their sorrow that this word has not magic enough to float inferior goods on the market. As a rule large canning and preserving establishments are clean and have the best appliances, and they employ chemists and skilled labor. The home product must be very good to compete with the attractive goods that are sent out from such establishments. Yet for first class home made products there is a market in all large cities. All first-class grocers have customers who purchase such goods.'''"
677
+ ]
678
+ },
679
+ {
680
+ "cell_type": "code",
681
+ "execution_count": null,
682
+ "id": "a1a38079",
683
+ "metadata": {},
684
+ "outputs": [],
685
+ "source": [
686
+ "def LFinference(text, s_prev, ref_s, alpha = 0.3, beta = 0.7, t = 0.7, diffusion_steps=5, embedding_scale=1):\n",
687
+ " text = text.strip()\n",
688
+ " ps = global_phonemizer.phonemize([text])\n",
689
+ " ps = word_tokenize(ps[0])\n",
690
+ " ps = ' '.join(ps)\n",
691
+ " ps = ps.replace('``', '\"')\n",
692
+ " ps = ps.replace(\"''\", '\"')\n",
693
+ "\n",
694
+ " tokens = textclenaer(ps)\n",
695
+ " tokens.insert(0, 0)\n",
696
+ " tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)\n",
697
+ " \n",
698
+ " with torch.no_grad():\n",
699
+ " input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)\n",
700
+ " text_mask = length_to_mask(input_lengths).to(device)\n",
701
+ "\n",
702
+ " t_en = model.text_encoder(tokens, input_lengths, text_mask)\n",
703
+ " bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())\n",
704
+ " d_en = model.bert_encoder(bert_dur).transpose(-1, -2) \n",
705
+ "\n",
706
+ " s_pred = sampler(noise = torch.randn((1, 256)).unsqueeze(1).to(device), \n",
707
+ " embedding=bert_dur,\n",
708
+ " embedding_scale=embedding_scale,\n",
709
+ " features=ref_s, # reference from the same speaker as the embedding\n",
710
+ " num_steps=diffusion_steps).squeeze(1)\n",
711
+ " \n",
712
+ " if s_prev is not None:\n",
713
+ " # convex combination of previous and current style\n",
714
+ " s_pred = t * s_prev + (1 - t) * s_pred\n",
715
+ " \n",
716
+ " s = s_pred[:, 128:]\n",
717
+ " ref = s_pred[:, :128]\n",
718
+ " \n",
719
+ " ref = alpha * ref + (1 - alpha) * ref_s[:, :128]\n",
720
+ " s = beta * s + (1 - beta) * ref_s[:, 128:]\n",
721
+ "\n",
722
+ " s_pred = torch.cat([ref, s], dim=-1)\n",
723
+ "\n",
724
+ " d = model.predictor.text_encoder(d_en, \n",
725
+ " s, input_lengths, text_mask)\n",
726
+ "\n",
727
+ " x, _ = model.predictor.lstm(d)\n",
728
+ " duration = model.predictor.duration_proj(x)\n",
729
+ "\n",
730
+ " duration = torch.sigmoid(duration).sum(axis=-1)\n",
731
+ " pred_dur = torch.round(duration.squeeze()).clamp(min=1)\n",
732
+ "\n",
733
+ "\n",
734
+ " pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))\n",
735
+ " c_frame = 0\n",
736
+ " for i in range(pred_aln_trg.size(0)):\n",
737
+ " pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1\n",
738
+ " c_frame += int(pred_dur[i].data)\n",
739
+ "\n",
740
+ " # encode prosody\n",
741
+ " en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))\n",
742
+ " if model_params.decoder.type == \"hifigan\":\n",
743
+ " asr_new = torch.zeros_like(en)\n",
744
+ " asr_new[:, :, 0] = en[:, :, 0]\n",
745
+ " asr_new[:, :, 1:] = en[:, :, 0:-1]\n",
746
+ " en = asr_new\n",
747
+ "\n",
748
+ " F0_pred, N_pred = model.predictor.F0Ntrain(en, s)\n",
749
+ "\n",
750
+ " asr = (t_en @ pred_aln_trg.unsqueeze(0).to(device))\n",
751
+ " if model_params.decoder.type == \"hifigan\":\n",
752
+ " asr_new = torch.zeros_like(asr)\n",
753
+ " asr_new[:, :, 0] = asr[:, :, 0]\n",
754
+ " asr_new[:, :, 1:] = asr[:, :, 0:-1]\n",
755
+ " asr = asr_new\n",
756
+ "\n",
757
+ " out = model.decoder(asr, \n",
758
+ " F0_pred, N_pred, ref.squeeze().unsqueeze(0))\n",
759
+ " \n",
760
+ " \n",
761
+ " return out.squeeze().cpu().numpy()[..., :-100], s_pred # weird pulse at the end of the model, need to be fixed later"
762
+ ]
763
+ },
764
+ {
765
+ "cell_type": "code",
766
+ "execution_count": null,
767
+ "id": "e9088f7a",
768
+ "metadata": {},
769
+ "outputs": [],
770
+ "source": [
771
+ "# unseen speaker\n",
772
+ "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n",
773
+ "s_ref = compute_style(path)\n",
774
+ "sentences = passage.split('.') # simple split by period\n",
775
+ "wavs = []\n",
776
+ "s_prev = None\n",
777
+ "for text in sentences:\n",
778
+ " if text.strip() == \"\": continue\n",
779
+ " text += '.' # add it back\n",
780
+ " \n",
781
+ " wav, s_prev = LFinference(text, \n",
782
+ " s_prev, \n",
783
+ " s_ref, \n",
784
+ " alpha = 0.3, \n",
785
+ " beta = 0.9, # make it more suitable for the text\n",
786
+ " t = 0.7, \n",
787
+ " diffusion_steps=10, embedding_scale=1.5)\n",
788
+ " wavs.append(wav)\n",
789
+ "print('Synthesized: ')\n",
790
+ "display(ipd.Audio(np.concatenate(wavs), rate=24000, normalize=False))\n",
791
+ "print('Reference: ')\n",
792
+ "display(ipd.Audio(path, rate=24000, normalize=False))"
793
+ ]
794
+ },
795
+ {
796
+ "cell_type": "markdown",
797
+ "id": "7517b657",
798
+ "metadata": {},
799
+ "source": [
800
+ "### Style Transfer\n",
801
+ "\n",
802
+ "The following section demonstrates the style transfer capability for unseen speakers in [Section 6](https://styletts2.github.io/#emo) of the demo page. For this, we set `alpha=0.5, beta = 0.9` for the most pronounced effects (mostly using the sampled style). "
803
+ ]
804
+ },
805
+ {
806
+ "cell_type": "code",
807
+ "execution_count": null,
808
+ "id": "ed95d0f7",
809
+ "metadata": {},
810
+ "outputs": [],
811
+ "source": [
812
+ "def STinference(text, ref_s, ref_text, alpha = 0.3, beta = 0.7, diffusion_steps=5, embedding_scale=1):\n",
813
+ " text = text.strip()\n",
814
+ " ps = global_phonemizer.phonemize([text])\n",
815
+ " ps = word_tokenize(ps[0])\n",
816
+ " ps = ' '.join(ps)\n",
817
+ "\n",
818
+ " tokens = textclenaer(ps)\n",
819
+ " tokens.insert(0, 0)\n",
820
+ " tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)\n",
821
+ " \n",
822
+ " ref_text = ref_text.strip()\n",
823
+ " ps = global_phonemizer.phonemize([ref_text])\n",
824
+ " ps = word_tokenize(ps[0])\n",
825
+ " ps = ' '.join(ps)\n",
826
+ "\n",
827
+ " ref_tokens = textclenaer(ps)\n",
828
+ " ref_tokens.insert(0, 0)\n",
829
+ " ref_tokens = torch.LongTensor(ref_tokens).to(device).unsqueeze(0)\n",
830
+ " \n",
831
+ " \n",
832
+ " with torch.no_grad():\n",
833
+ " input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)\n",
834
+ " text_mask = length_to_mask(input_lengths).to(device)\n",
835
+ "\n",
836
+ " t_en = model.text_encoder(tokens, input_lengths, text_mask)\n",
837
+ " bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())\n",
838
+ " d_en = model.bert_encoder(bert_dur).transpose(-1, -2) \n",
839
+ " \n",
840
+ " ref_input_lengths = torch.LongTensor([ref_tokens.shape[-1]]).to(device)\n",
841
+ " ref_text_mask = length_to_mask(ref_input_lengths).to(device)\n",
842
+ " ref_bert_dur = model.bert(ref_tokens, attention_mask=(~ref_text_mask).int())\n",
843
+ " s_pred = sampler(noise = torch.randn((1, 256)).unsqueeze(1).to(device), \n",
844
+ " embedding=bert_dur,\n",
845
+ " embedding_scale=embedding_scale,\n",
846
+ " features=ref_s, # reference from the same speaker as the embedding\n",
847
+ " num_steps=diffusion_steps).squeeze(1)\n",
848
+ "\n",
849
+ "\n",
850
+ " s = s_pred[:, 128:]\n",
851
+ " ref = s_pred[:, :128]\n",
852
+ "\n",
853
+ " ref = alpha * ref + (1 - alpha) * ref_s[:, :128]\n",
854
+ " s = beta * s + (1 - beta) * ref_s[:, 128:]\n",
855
+ "\n",
856
+ " d = model.predictor.text_encoder(d_en, \n",
857
+ " s, input_lengths, text_mask)\n",
858
+ "\n",
859
+ " x, _ = model.predictor.lstm(d)\n",
860
+ " duration = model.predictor.duration_proj(x)\n",
861
+ "\n",
862
+ " duration = torch.sigmoid(duration).sum(axis=-1)\n",
863
+ " pred_dur = torch.round(duration.squeeze()).clamp(min=1)\n",
864
+ "\n",
865
+ "\n",
866
+ " pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))\n",
867
+ " c_frame = 0\n",
868
+ " for i in range(pred_aln_trg.size(0)):\n",
869
+ " pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1\n",
870
+ " c_frame += int(pred_dur[i].data)\n",
871
+ "\n",
872
+ " # encode prosody\n",
873
+ " en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))\n",
874
+ " if model_params.decoder.type == \"hifigan\":\n",
875
+ " asr_new = torch.zeros_like(en)\n",
876
+ " asr_new[:, :, 0] = en[:, :, 0]\n",
877
+ " asr_new[:, :, 1:] = en[:, :, 0:-1]\n",
878
+ " en = asr_new\n",
879
+ "\n",
880
+ " F0_pred, N_pred = model.predictor.F0Ntrain(en, s)\n",
881
+ "\n",
882
+ " asr = (t_en @ pred_aln_trg.unsqueeze(0).to(device))\n",
883
+ " if model_params.decoder.type == \"hifigan\":\n",
884
+ " asr_new = torch.zeros_like(asr)\n",
885
+ " asr_new[:, :, 0] = asr[:, :, 0]\n",
886
+ " asr_new[:, :, 1:] = asr[:, :, 0:-1]\n",
887
+ " asr = asr_new\n",
888
+ "\n",
889
+ " out = model.decoder(asr, \n",
890
+ " F0_pred, N_pred, ref.squeeze().unsqueeze(0))\n",
891
+ " \n",
892
+ " \n",
893
+ " return out.squeeze().cpu().numpy()[..., :-50] # weird pulse at the end of the model, need to be fixed later"
894
+ ]
895
+ },
896
+ {
897
+ "cell_type": "code",
898
+ "execution_count": null,
899
+ "id": "ec3f0da4",
900
+ "metadata": {},
901
+ "outputs": [],
902
+ "source": [
903
+ "# reference texts to sample styles\n",
904
+ "\n",
905
+ "ref_texts = {}\n",
906
+ "ref_texts['Happy'] = \"We are happy to invite you to join us on a journey to the past, where we will visit the most amazing monuments ever built by human hands.\"\n",
907
+ "ref_texts['Sad'] = \"I am sorry to say that we have suffered a severe setback in our efforts to restore prosperity and confidence.\"\n",
908
+ "ref_texts['Angry'] = \"The field of astronomy is a joke! Its theories are based on flawed observations and biased interpretations!\"\n",
909
+ "ref_texts['Surprised'] = \"I can't believe it! You mean to tell me that you have discovered a new species of bacteria in this pond?\""
910
+ ]
911
+ },
912
+ {
913
+ "cell_type": "code",
914
+ "execution_count": null,
915
+ "id": "6d0a3825",
916
+ "metadata": {
917
+ "scrolled": false
918
+ },
919
+ "outputs": [],
920
+ "source": [
921
+ "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n",
922
+ "s_ref = compute_style(path)\n",
923
+ "\n",
924
+ "text = \"Yea, his honourable worship is within, but he hath a godly minister or two with him, and likewise a leech.\"\n",
925
+ "for k,v in ref_texts.items():\n",
926
+ " wav = STinference(text, s_ref, v, diffusion_steps=10, alpha=0.5, beta=0.9, embedding_scale=1.5)\n",
927
+ " print(k + \": \")\n",
928
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
929
+ ]
930
+ },
931
+ {
932
+ "cell_type": "markdown",
933
+ "id": "6750aed9",
934
+ "metadata": {},
935
+ "source": [
936
+ "### Speech diversity\n",
937
+ "\n",
938
+ "This section reproduces samples in [Section 7](https://styletts2.github.io/#var) of the demo page. \n",
939
+ "\n",
940
+ "`alpha` and `beta` determine the diversity of the synthesized speech. There are two extreme cases:\n",
941
+ "- If `alpha = 1` and `beta = 1`, the synthesized speech sounds the most dissimilar to the reference speaker, but it is also the most diverse (each time you synthesize, the output will be totally different). \n",
942
+ "- If `alpha = 0` and `beta = 0`, the synthesized speech sounds the most similar to the reference speaker, but it is deterministic (i.e., the sampled style is not used for speech synthesis). \n"
943
+ ]
944
+ },
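To see the two extremes in practice, you could synthesize the same sentence twice at each setting. A minimal sketch reusing the `inference` and `compute_style` helpers defined above (the reference path is the same one used in the cells below):

```python
import numpy as np

ref_s = compute_style("Demo/reference_audio/1221-135767-0014.wav")
text = "How much variation is there?"

# alpha = beta = 0: the sampled style is discarded, so repeated runs
# should sound essentially identical.
a = inference(text, ref_s, diffusion_steps=10, alpha=0, beta=0, embedding_scale=1)
b = inference(text, ref_s, diffusion_steps=10, alpha=0, beta=0, embedding_scale=1)
n = min(len(a), len(b))
print("max |a - b| with alpha=beta=0:", np.abs(a[:n] - b[:n]).max())  # expected ~0

# alpha = beta = 1: a fresh style is sampled on every call, so the two
# waveforms (and even their lengths) will generally differ.
c = inference(text, ref_s, diffusion_steps=10, alpha=1, beta=1, embedding_scale=1)
d = inference(text, ref_s, diffusion_steps=10, alpha=1, beta=1, embedding_scale=1)
print("lengths with alpha=beta=1:", len(c), len(d))
```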
945
+ {
946
+ "cell_type": "markdown",
947
+ "id": "f6ae0aa5",
948
+ "metadata": {},
949
+ "source": [
950
+ "#### Default setting (`alpha = 0.3, beta=0.7`)\n",
951
+ "This setting uses 70% of the reference timbre and 30% of the reference prosody, and uses the diffusion model to sample the rest based on the text. "
952
+ ]
953
+ },
954
+ {
955
+ "cell_type": "code",
956
+ "execution_count": null,
957
+ "id": "36dc0148",
958
+ "metadata": {},
959
+ "outputs": [],
960
+ "source": [
961
+ "# unseen speaker\n",
962
+ "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n",
963
+ "ref_s = compute_style(path)\n",
964
+ "\n",
965
+ "text = \"How much variation is there?\"\n",
966
+ "for _ in range(5):\n",
967
+ " wav = inference(text, ref_s, diffusion_steps=10, alpha=0.3, beta=0.7, embedding_scale=1)\n",
968
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
969
+ ]
970
+ },
971
+ {
972
+ "cell_type": "markdown",
973
+ "id": "bf9ef421",
974
+ "metadata": {},
975
+ "source": [
976
+ "#### Less diverse setting (`alpha = 0.1, beta=0.3`)\n",
977
+ "This setting uses 90% of the reference timbre and 70% of the reference prosody. This makes it more similar to the reference speaker at the cost of less diverse samples. "
978
+ ]
979
+ },
980
+ {
981
+ "cell_type": "code",
982
+ "execution_count": null,
983
+ "id": "9ba406bd",
984
+ "metadata": {},
985
+ "outputs": [],
986
+ "source": [
987
+ "# unseen speaker\n",
988
+ "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n",
989
+ "ref_s = compute_style(path)\n",
990
+ "\n",
991
+ "text = \"How much variation is there?\"\n",
992
+ "for _ in range(5):\n",
993
+ " wav = inference(text, ref_s, diffusion_steps=10, alpha=0.1, beta=0.3, embedding_scale=1)\n",
994
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
995
+ ]
996
+ },
997
+ {
998
+ "cell_type": "markdown",
999
+ "id": "a38fe464",
1000
+ "metadata": {},
1001
+ "source": [
1002
+ "#### More diverse setting (`alpha = 0.5, beta=0.95`)\n",
1003
+ "This setting uses 50% of the reference timbre and 5% of the reference prosody (so 95% of the prosody is sampled, which makes it more diverse), but this also makes it less similar to the reference speaker. "
1004
+ ]
1005
+ },
1006
+ {
1007
+ "cell_type": "code",
1008
+ "execution_count": null,
1009
+ "id": "5f25bf94",
1010
+ "metadata": {},
1011
+ "outputs": [],
1012
+ "source": [
1013
+ "# unseen speaker\n",
1014
+ "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n",
1015
+ "ref_s = compute_style(path)\n",
1016
+ "\n",
1017
+ "text = \"How much variation is there?\"\n",
1018
+ "for _ in range(5):\n",
1019
+ " wav = inference(text, ref_s, diffusion_steps=10, alpha=0.5, beta=0.95, embedding_scale=1)\n",
1020
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
1021
+ ]
1022
+ },
1023
+ {
1024
+ "cell_type": "markdown",
1025
+ "id": "21c3a071",
1026
+ "metadata": {},
1027
+ "source": [
1028
+ "#### Extreme setting (`alpha = 1, beta=1`)\n",
1029
+ "This setting uses 0% of the reference timbre and prosody and relies entirely on the diffusion model to sample the style. This makes the output very dissimilar to the reference speaker. "
1030
+ ]
1031
+ },
1032
+ {
1033
+ "cell_type": "code",
1034
+ "execution_count": null,
1035
+ "id": "fff8bab1",
1036
+ "metadata": {},
1037
+ "outputs": [],
1038
+ "source": [
1039
+ "# unseen speaker\n",
1040
+ "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n",
1041
+ "ref_s = compute_style(path)\n",
1042
+ "\n",
1043
+ "text = \"How much variation is there?\"\n",
1044
+ "for _ in range(5):\n",
1045
+ " wav = inference(text, ref_s, diffusion_steps=10, alpha=1, beta=1, embedding_scale=1)\n",
1046
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
1047
+ ]
1048
+ },
1049
+ {
1050
+ "cell_type": "markdown",
1051
+ "id": "a8741e5a",
1052
+ "metadata": {},
1053
+ "source": [
1054
+ "#### No variation (`alpha = 0, beta=0`)\n",
1055
+ "This setting uses 100% of the reference timbre and prosody; the sampled style is not used at all. This makes the output very similar to the reference speaker, but there is no variation from run to run. "
1056
+ ]
1057
+ },
1058
+ {
1059
+ "cell_type": "code",
1060
+ "execution_count": null,
1061
+ "id": "e55dd281",
1062
+ "metadata": {},
1063
+ "outputs": [],
1064
+ "source": [
1065
+ "# unseen speaker\n",
1066
+ "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n",
1067
+ "ref_s = compute_style(path)\n",
1068
+ "\n",
1069
+ "text = \"How much variation is there?\"\n",
1070
+ "for _ in range(5):\n",
1071
+ " wav = inference(text, ref_s, diffusion_steps=10, alpha=0, beta=0, embedding_scale=1)\n",
1072
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
1073
+ ]
1074
+ },
1075
+ {
1076
+ "cell_type": "markdown",
1077
+ "id": "d5e86423",
1078
+ "metadata": {},
1079
+ "source": [
1080
+ "### Extra fun!\n",
1081
+ "\n",
1082
+ "Here we clone the voices of some of the StyleTTS 2 authors from a few seconds of recordings in the wild. None of these voices are in the dataset, and all authors agreed to have their voices cloned here."
1083
+ ]
1084
+ },
1085
+ {
1086
+ "cell_type": "code",
1087
+ "execution_count": null,
1088
+ "id": "6f558314",
1089
+ "metadata": {},
1090
+ "outputs": [],
1091
+ "source": [
1092
+ "text = ''' StyleTTS 2 is a text to speech model that leverages style diffusion and adversarial training with large speech language models to achieve human level text to speech synthesis. '''"
1093
+ ]
1094
+ },
1095
+ {
1096
+ "cell_type": "code",
1097
+ "execution_count": null,
1098
+ "id": "caa5747c",
1099
+ "metadata": {},
1100
+ "outputs": [],
1101
+ "source": [
1102
+ "reference_dicts = {}\n",
1103
+ "reference_dicts['Yinghao'] = \"Demo/reference_audio/Yinghao.wav\"\n",
1104
+ "reference_dicts['Gavin'] = \"Demo/reference_audio/Gavin.wav\"\n",
1105
+ "reference_dicts['Vinay'] = \"Demo/reference_audio/Vinay.wav\"\n",
1106
+ "reference_dicts['Nima'] = \"Demo/reference_audio/Nima.wav\""
1107
+ ]
1108
+ },
1109
+ {
1110
+ "cell_type": "code",
1111
+ "execution_count": null,
1112
+ "id": "44a4cea1",
1113
+ "metadata": {
1114
+ "scrolled": false
1115
+ },
1116
+ "outputs": [],
1117
+ "source": [
1118
+ "start = time.time()\n",
1119
+ "noise = torch.randn(1,1,256).to(device)\n",
1120
+ "for k, path in reference_dicts.items():\n",
1121
+ " ref_s = compute_style(path)\n",
1122
+ " \n",
1123
+ " wav = inference(text, ref_s, alpha=0.1, beta=0.5, diffusion_steps=5, embedding_scale=1)\n",
1124
+ " rtf = (time.time() - start) / (len(wav) / 24000)\n",
1125
+ " print('Speaker: ' + k)\n",
1126
+ " import IPython.display as ipd\n",
1127
+ " print('Synthesized:')\n",
1128
+ " display(ipd.Audio(wav, rate=24000, normalize=False))\n",
1129
+ " print('Reference:')\n",
1130
+ " display(ipd.Audio(path, rate=24000, normalize=False))"
1131
+ ]
1132
+ }
1133
+ ],
1134
+ "metadata": {
1135
+ "kernelspec": {
1136
+ "display_name": "NLP",
1137
+ "language": "python",
1138
+ "name": "nlp"
1139
+ },
1140
+ "language_info": {
1141
+ "codemirror_mode": {
1142
+ "name": "ipython",
1143
+ "version": 3
1144
+ },
1145
+ "file_extension": ".py",
1146
+ "mimetype": "text/x-python",
1147
+ "name": "python",
1148
+ "nbconvert_exporter": "python",
1149
+ "pygments_lexer": "ipython3",
1150
+ "version": "3.9.7"
1151
+ }
1152
+ },
1153
+ "nbformat": 4,
1154
+ "nbformat_minor": 5
1155
+ }
Demo/.ipynb_checkpoints/Inference_pod_90h_30k-checkpoint.ipynb ADDED
@@ -0,0 +1,1155 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "9adb7bd1",
6
+ "metadata": {},
7
+ "source": [
8
+ "# StyleTTS 2 Demo (LibriTTS)\n",
9
+ "\n",
10
+ "Before you run the following cells, please make sure you have downloaded [reference_audio.zip](https://huggingface.co/yl4579/StyleTTS2-LibriTTS/resolve/main/reference_audio.zip) and unzipped it under the `Demo` folder."
11
+ ]
12
+ },
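If you prefer to fetch the archive from within the notebook, here is a minimal sketch (the URL is the one linked above; it assumes the zip unpacks to a top-level `reference_audio/` folder, matching the `Demo/reference_audio/...` paths used later):

```python
import urllib.request, zipfile

url = "https://huggingface.co/yl4579/StyleTTS2-LibriTTS/resolve/main/reference_audio.zip"
urllib.request.urlretrieve(url, "reference_audio.zip")

# Extract into Demo/ so the files end up at Demo/reference_audio/*.wav
with zipfile.ZipFile("reference_audio.zip") as zf:
    zf.extractall("Demo")
```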
13
+ {
14
+ "cell_type": "markdown",
15
+ "id": "6108384d",
16
+ "metadata": {},
17
+ "source": [
18
+ "### Utils"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": null,
24
+ "id": "96e173bf",
25
+ "metadata": {},
26
+ "outputs": [],
27
+ "source": [
28
+ "import torch\n",
29
+ "torch.manual_seed(0)\n",
30
+ "torch.backends.cudnn.benchmark = False\n",
31
+ "torch.backends.cudnn.deterministic = True\n",
32
+ "\n",
33
+ "import random\n",
34
+ "random.seed(0)\n",
35
+ "\n",
36
+ "import numpy as np\n",
37
+ "np.random.seed(0)"
38
+ ]
39
+ },
40
+ {
41
+ "cell_type": "code",
42
+ "execution_count": null,
43
+ "id": "da84c60f",
44
+ "metadata": {},
45
+ "outputs": [],
46
+ "source": [
47
+ "%cd .."
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "code",
52
+ "execution_count": null,
53
+ "id": "5a3ddcc8",
54
+ "metadata": {},
55
+ "outputs": [],
56
+ "source": [
57
+ "# load packages\n",
58
+ "import time\n",
59
+ "import random\n",
60
+ "import yaml\n",
61
+ "from munch import Munch\n",
62
+ "import numpy as np\n",
63
+ "import torch\n",
64
+ "from torch import nn\n",
65
+ "import torch.nn.functional as F\n",
66
+ "import torchaudio\n",
67
+ "import librosa\n",
68
+ "from nltk.tokenize import word_tokenize\n",
69
+ "\n",
70
+ "from models import *\n",
71
+ "from utils import *\n",
72
+ "from text_utils import TextCleaner\n",
73
+ "textclenaer = TextCleaner()\n",
74
+ "\n",
75
+ "%matplotlib inline"
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "code",
80
+ "execution_count": null,
81
+ "id": "00ee05e1",
82
+ "metadata": {},
83
+ "outputs": [],
84
+ "source": [
85
+ "to_mel = torchaudio.transforms.MelSpectrogram(\n",
86
+ " n_mels=80, n_fft=2048, win_length=1200, hop_length=300)\n",
87
+ "mean, std = -4, 4\n",
88
+ "\n",
89
+ "def length_to_mask(lengths):\n",
90
+ " mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)\n",
91
+ " mask = torch.gt(mask+1, lengths.unsqueeze(1))\n",
92
+ " return mask\n",
93
+ "\n",
94
+ "def preprocess(wave):\n",
95
+ " wave_tensor = torch.from_numpy(wave).float()\n",
96
+ " mel_tensor = to_mel(wave_tensor)\n",
97
+ " mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std\n",
98
+ " return mel_tensor\n",
99
+ "\n",
100
+ "def compute_style(path):\n",
101
+ " wave, sr = librosa.load(path, sr=24000)\n",
102
+ " audio, index = librosa.effects.trim(wave, top_db=30)\n",
103
+ " if sr != 24000:\n",
104
+ " audio = librosa.resample(audio, sr, 24000)\n",
105
+ " mel_tensor = preprocess(audio).to(device)\n",
106
+ "\n",
107
+ " with torch.no_grad():\n",
108
+ " ref_s = model.style_encoder(mel_tensor.unsqueeze(1))\n",
109
+ " ref_p = model.predictor_encoder(mel_tensor.unsqueeze(1))\n",
110
+ "\n",
111
+ " return torch.cat([ref_s, ref_p], dim=1)"
112
+ ]
113
+ },
114
+ {
115
+ "cell_type": "code",
116
+ "execution_count": null,
117
+ "id": "bbdc04c0",
118
+ "metadata": {},
119
+ "outputs": [],
120
+ "source": [
121
+ "device = 'cuda' if torch.cuda.is_available() else 'cpu'"
122
+ ]
123
+ },
124
+ {
125
+ "cell_type": "markdown",
126
+ "id": "7b9cecbe",
127
+ "metadata": {},
128
+ "source": [
129
+ "### Load models"
130
+ ]
131
+ },
132
+ {
133
+ "cell_type": "code",
134
+ "execution_count": null,
135
+ "id": "64fc4c0f",
136
+ "metadata": {},
137
+ "outputs": [],
138
+ "source": [
139
+ "# load phonemizer\n",
140
+ "import phonemizer\n",
141
+ "global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)"
142
+ ]
143
+ },
144
+ {
145
+ "cell_type": "code",
146
+ "execution_count": null,
147
+ "id": "48e7b644",
148
+ "metadata": {},
149
+ "outputs": [],
150
+ "source": [
151
+ "config = yaml.safe_load(open(\"Models/LibriTTS/config.yml\"))\n",
152
+ "\n",
153
+ "# load pretrained ASR model\n",
154
+ "ASR_config = config.get('ASR_config', False)\n",
155
+ "ASR_path = config.get('ASR_path', False)\n",
156
+ "text_aligner = load_ASR_models(ASR_path, ASR_config)\n",
157
+ "\n",
158
+ "# load pretrained F0 model\n",
159
+ "F0_path = config.get('F0_path', False)\n",
160
+ "pitch_extractor = load_F0_models(F0_path)\n",
161
+ "\n",
162
+ "# load BERT model\n",
163
+ "from Utils.PLBERT.util import load_plbert\n",
164
+ "BERT_path = config.get('PLBERT_dir', False)\n",
165
+ "plbert = load_plbert(BERT_path)"
166
+ ]
167
+ },
168
+ {
169
+ "cell_type": "code",
170
+ "execution_count": null,
171
+ "id": "ffc18cf7",
172
+ "metadata": {},
173
+ "outputs": [],
174
+ "source": [
175
+ "model_params = recursive_munch(config['model_params'])\n",
176
+ "model = build_model(model_params, text_aligner, pitch_extractor, plbert)\n",
177
+ "_ = [model[key].eval() for key in model]\n",
178
+ "_ = [model[key].to(device) for key in model]"
179
+ ]
180
+ },
181
+ {
182
+ "cell_type": "code",
183
+ "execution_count": null,
184
+ "id": "64529d5c",
185
+ "metadata": {},
186
+ "outputs": [],
187
+ "source": [
188
+ "params_whole = torch.load(\"Models/LibriTTS/epochs_2nd_00020.pth\", map_location='cpu')\n",
189
+ "params = params_whole['net']"
190
+ ]
191
+ },
192
+ {
193
+ "cell_type": "code",
194
+ "execution_count": null,
195
+ "id": "895d9706",
196
+ "metadata": {},
197
+ "outputs": [],
198
+ "source": [
199
+ "for key in model:\n",
200
+ " if key in params:\n",
201
+ " print('%s loaded' % key)\n",
202
+ " try:\n",
203
+ " model[key].load_state_dict(params[key])\n",
204
+ " except:\n",
205
+ " from collections import OrderedDict\n",
206
+ " state_dict = params[key]\n",
207
+ " new_state_dict = OrderedDict()\n",
208
+ " for k, v in state_dict.items():\n",
209
+ " name = k[7:] # remove `module.`\n",
210
+ " new_state_dict[name] = v\n",
211
+ " # load params\n",
212
+ " model[key].load_state_dict(new_state_dict, strict=False)\n",
213
+ "# except:\n",
214
+ "# _load(params[key], model[key])\n",
215
+ "_ = [model[key].eval() for key in model]"
216
+ ]
217
+ },
218
+ {
219
+ "cell_type": "code",
220
+ "execution_count": null,
221
+ "id": "c1a59db2",
222
+ "metadata": {},
223
+ "outputs": [],
224
+ "source": [
225
+ "from Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule"
226
+ ]
227
+ },
228
+ {
229
+ "cell_type": "code",
230
+ "execution_count": null,
231
+ "id": "e30985ab",
232
+ "metadata": {},
233
+ "outputs": [],
234
+ "source": [
235
+ "sampler = DiffusionSampler(\n",
236
+ " model.diffusion.diffusion,\n",
237
+ " sampler=ADPM2Sampler(),\n",
238
+ " sigma_schedule=KarrasSchedule(sigma_min=0.0001, sigma_max=3.0, rho=9.0), # empirical parameters\n",
239
+ " clamp=False\n",
240
+ ")"
241
+ ]
242
+ },
243
+ {
244
+ "cell_type": "markdown",
245
+ "id": "b803110e",
246
+ "metadata": {},
247
+ "source": [
248
+ "### Synthesize speech"
249
+ ]
250
+ },
251
+ {
252
+ "cell_type": "code",
253
+ "execution_count": null,
254
+ "id": "ca57469c",
255
+ "metadata": {},
256
+ "outputs": [],
257
+ "source": [
258
+ "def inference(text, ref_s, alpha = 0.3, beta = 0.7, diffusion_steps=5, embedding_scale=1):\n",
259
+ " text = text.strip()\n",
260
+ " ps = global_phonemizer.phonemize([text])\n",
261
+ " ps = word_tokenize(ps[0])\n",
262
+ " ps = ' '.join(ps)\n",
263
+ " tokens = textclenaer(ps)\n",
264
+ " tokens.insert(0, 0)\n",
265
+ " tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)\n",
266
+ " \n",
267
+ " with torch.no_grad():\n",
268
+ " input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)\n",
269
+ " text_mask = length_to_mask(input_lengths).to(device)\n",
270
+ "\n",
271
+ " t_en = model.text_encoder(tokens, input_lengths, text_mask)\n",
272
+ " bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())\n",
273
+ " d_en = model.bert_encoder(bert_dur).transpose(-1, -2) \n",
274
+ "\n",
275
+ " s_pred = sampler(noise = torch.randn((1, 256)).unsqueeze(1).to(device), \n",
276
+ " embedding=bert_dur,\n",
277
+ " embedding_scale=embedding_scale,\n",
278
+ " features=ref_s, # reference from the same speaker as the embedding\n",
279
+ " num_steps=diffusion_steps).squeeze(1)\n",
280
+ "\n",
281
+ "\n",
282
+ " s = s_pred[:, 128:]\n",
283
+ " ref = s_pred[:, :128]\n",
284
+ "\n",
285
+ " ref = alpha * ref + (1 - alpha) * ref_s[:, :128]\n",
286
+ " s = beta * s + (1 - beta) * ref_s[:, 128:]\n",
287
+ "\n",
288
+ " d = model.predictor.text_encoder(d_en, \n",
289
+ " s, input_lengths, text_mask)\n",
290
+ "\n",
291
+ " x, _ = model.predictor.lstm(d)\n",
292
+ " duration = model.predictor.duration_proj(x)\n",
293
+ "\n",
294
+ " duration = torch.sigmoid(duration).sum(axis=-1)\n",
295
+ " pred_dur = torch.round(duration.squeeze()).clamp(min=1)\n",
296
+ "\n",
297
+ "\n",
298
+ " pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))\n",
299
+ " c_frame = 0\n",
300
+ " for i in range(pred_aln_trg.size(0)):\n",
301
+ " pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1\n",
302
+ " c_frame += int(pred_dur[i].data)\n",
303
+ "\n",
304
+ " # encode prosody\n",
305
+ " en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))\n",
306
+ " if model_params.decoder.type == \"hifigan\":\n",
307
+ " asr_new = torch.zeros_like(en)\n",
308
+ " asr_new[:, :, 0] = en[:, :, 0]\n",
309
+ " asr_new[:, :, 1:] = en[:, :, 0:-1]\n",
310
+ " en = asr_new\n",
311
+ "\n",
312
+ " F0_pred, N_pred = model.predictor.F0Ntrain(en, s)\n",
313
+ "\n",
314
+ " asr = (t_en @ pred_aln_trg.unsqueeze(0).to(device))\n",
315
+ " if model_params.decoder.type == \"hifigan\":\n",
316
+ " asr_new = torch.zeros_like(asr)\n",
317
+ " asr_new[:, :, 0] = asr[:, :, 0]\n",
318
+ " asr_new[:, :, 1:] = asr[:, :, 0:-1]\n",
319
+ " asr = asr_new\n",
320
+ "\n",
321
+ " out = model.decoder(asr, \n",
322
+ " F0_pred, N_pred, ref.squeeze().unsqueeze(0))\n",
323
+ " \n",
324
+ " \n",
325
+ " return out.squeeze().cpu().numpy()[..., :-50] # weird pulse at the end of the model, need to be fixed later"
326
+ ]
327
+ },
328
+ {
329
+ "cell_type": "markdown",
330
+ "id": "d438ef4f",
331
+ "metadata": {},
332
+ "source": [
333
+ "#### Basic synthesis (5 diffusion steps, seen speakers)"
334
+ ]
335
+ },
336
+ {
337
+ "cell_type": "code",
338
+ "execution_count": null,
339
+ "id": "cace9787",
340
+ "metadata": {},
341
+ "outputs": [],
342
+ "source": [
343
+ "text = ''' StyleTTS 2 is a text to speech model that leverages style diffusion and adversarial training with large speech language models to achieve human level text to speech synthesis. '''"
344
+ ]
345
+ },
346
+ {
347
+ "cell_type": "code",
348
+ "execution_count": null,
349
+ "id": "7c88f461",
350
+ "metadata": {},
351
+ "outputs": [],
352
+ "source": [
353
+ "reference_dicts = {}\n",
354
+ "reference_dicts['696_92939'] = \"Demo/reference_audio/696_92939_000016_000006.wav\"\n",
355
+ "reference_dicts['1789_142896'] = \"Demo/reference_audio/1789_142896_000022_000005.wav\""
356
+ ]
357
+ },
358
+ {
359
+ "cell_type": "code",
360
+ "execution_count": null,
361
+ "id": "16e8ac60",
362
+ "metadata": {},
363
+ "outputs": [],
364
+ "source": [
365
+ "start = time.time()\n",
366
+ "noise = torch.randn(1,1,256).to(device)\n",
367
+ "for k, path in reference_dicts.items():\n",
368
+ " ref_s = compute_style(path)\n",
369
+ "    start = time.time()\n",
370
+ " wav = inference(text, ref_s, alpha=0.3, beta=0.7, diffusion_steps=5, embedding_scale=1)\n",
371
+ " rtf = (time.time() - start) / (len(wav) / 24000)\n",
372
+ " print(f\"RTF = {rtf:5f}\")\n",
373
+ " import IPython.display as ipd\n",
374
+ " print(k + ' Synthesized:')\n",
375
+ " display(ipd.Audio(wav, rate=24000, normalize=False))\n",
376
+ " print('Reference:')\n",
377
+ " display(ipd.Audio(path, rate=24000, normalize=False))"
378
+ ]
379
+ },
380
+ {
381
+ "cell_type": "markdown",
382
+ "id": "14838708",
383
+ "metadata": {},
384
+ "source": [
385
+ "#### With higher diffusion steps (more diverse)\n",
386
+ "\n",
387
+ "Since the sampler is ancestral, the more steps you use, the more diverse the samples are, at the cost of slower synthesis speed."
388
+ ]
389
+ },
390
+ {
391
+ "cell_type": "code",
392
+ "execution_count": null,
393
+ "id": "6fbff03b",
394
+ "metadata": {},
395
+ "outputs": [],
396
+ "source": [
397
+ "noise = torch.randn(1,1,256).to(device)\n",
398
+ "for k, path in reference_dicts.items():\n",
399
+ " ref_s = compute_style(path)\n",
400
+ " start = time.time()\n",
401
+ " wav = inference(text, ref_s, alpha=0.3, beta=0.7, diffusion_steps=10, embedding_scale=1)\n",
402
+ " rtf = (time.time() - start) / (len(wav) / 24000)\n",
403
+ " print(f\"RTF = {rtf:5f}\")\n",
404
+ " import IPython.display as ipd\n",
405
+ " print(k + ' Synthesized:')\n",
406
+ " display(ipd.Audio(wav, rate=24000, normalize=False))\n",
407
+ " print(k + ' Reference:')\n",
408
+ " display(ipd.Audio(path, rate=24000, normalize=False))"
409
+ ]
410
+ },
411
+ {
412
+ "cell_type": "markdown",
413
+ "id": "7e6867fd",
414
+ "metadata": {},
415
+ "source": [
416
+ "#### Basic synthesis (5 diffusion steps, unseen speakers)\n",
417
+ "The following samples reproduce those in [Section 4](https://styletts2.github.io/#libri) of the demo page. All speakers are unseen during training. You can compare the generated samples to popular zero-shot TTS models like Vall-E and NaturalSpeech 2."
418
+ ]
419
+ },
420
+ {
421
+ "cell_type": "code",
422
+ "execution_count": null,
423
+ "id": "f4e8faa0",
424
+ "metadata": {},
425
+ "outputs": [],
426
+ "source": [
427
+ "reference_dicts = {}\n",
428
+ "# format: (path, text)\n",
429
+ "reference_dicts['1221-135767'] = (\"Demo/reference_audio/1221-135767-0014.wav\", \"Yea, his honourable worship is within, but he hath a godly minister or two with him, and likewise a leech.\")\n",
430
+ "reference_dicts['5639-40744'] = (\"Demo/reference_audio/5639-40744-0020.wav\", \"Thus did this humane and right minded father comfort his unhappy daughter, and her mother embracing her again, did all she could to soothe her feelings.\")\n",
431
+ "reference_dicts['908-157963'] = (\"Demo/reference_audio/908-157963-0027.wav\", \"And lay me down in my cold bed and leave my shining lot.\")\n",
432
+ "reference_dicts['4077-13754'] = (\"Demo/reference_audio/4077-13754-0000.wav\", \"The army found the people in poverty and left them in comparative wealth.\")"
433
+ ]
434
+ },
435
+ {
436
+ "cell_type": "code",
437
+ "execution_count": null,
438
+ "id": "653f1406",
439
+ "metadata": {},
440
+ "outputs": [],
441
+ "source": [
442
+ "noise = torch.randn(1,1,256).to(device)\n",
443
+ "for k, v in reference_dicts.items():\n",
444
+ " path, text = v\n",
445
+ " ref_s = compute_style(path)\n",
446
+ " start = time.time()\n",
447
+ " wav = inference(text, ref_s, alpha=0.3, beta=0.7, diffusion_steps=5, embedding_scale=1)\n",
448
+ " rtf = (time.time() - start) / (len(wav) / 24000)\n",
449
+ " print(f\"RTF = {rtf:5f}\")\n",
450
+ " import IPython.display as ipd\n",
451
+ " print(k + ' Synthesized: ' + text)\n",
452
+ " display(ipd.Audio(wav, rate=24000, normalize=False))\n",
453
+ " print(k + ' Reference:')\n",
454
+ " display(ipd.Audio(path, rate=24000, normalize=False))"
455
+ ]
456
+ },
457
+ {
458
+ "cell_type": "markdown",
459
+ "id": "141e91b3",
460
+ "metadata": {},
461
+ "source": [
462
+ "### Speech expressiveness\n",
463
+ "\n",
464
+ "The following section recreates the samples shown in [Section 6](https://styletts2.github.io/#emo) of the demo page. The speaker reference used is `1221-135767-0014.wav`, which is unseen during training. \n",
465
+ "\n",
466
+ "#### With `embedding_scale=1`\n",
467
+ "This is the classifier-free guidance scale. The higher the scale, the more the sampled style is conditioned on the input text, and hence the more emotional the speech.\n",
468
+ "\n"
469
+ ]
470
+ },
471
+ {
472
+ "cell_type": "code",
473
+ "execution_count": null,
474
+ "id": "81addda4",
475
+ "metadata": {},
476
+ "outputs": [],
477
+ "source": [
478
+ "ref_s = compute_style(\"Demo/reference_audio/1221-135767-0014.wav\")"
479
+ ]
480
+ },
481
+ {
482
+ "cell_type": "code",
483
+ "execution_count": null,
484
+ "id": "be1b2a11",
485
+ "metadata": {},
486
+ "outputs": [],
487
+ "source": [
488
+ "texts = {}\n",
489
+ "texts['Happy'] = \"We are happy to invite you to join us on a journey to the past, where we will visit the most amazing monuments ever built by human hands.\"\n",
490
+ "texts['Sad'] = \"I am sorry to say that we have suffered a severe setback in our efforts to restore prosperity and confidence.\"\n",
491
+ "texts['Angry'] = \"The field of astronomy is a joke! Its theories are based on flawed observations and biased interpretations!\"\n",
492
+ "texts['Surprised'] = \"I can't believe it! You mean to tell me that you have discovered a new species of bacteria in this pond?\"\n",
493
+ "\n",
494
+ "for k,v in texts.items():\n",
495
+ " wav = inference(v, ref_s, diffusion_steps=10, alpha=0.3, beta=0.7, embedding_scale=1)\n",
496
+ " print(k + \": \")\n",
497
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
498
+ ]
499
+ },
500
+ {
501
+ "cell_type": "markdown",
502
+ "id": "96d262b8",
503
+ "metadata": {},
504
+ "source": [
505
+ "#### With `embedding_scale=2`"
506
+ ]
507
+ },
508
+ {
509
+ "cell_type": "code",
510
+ "execution_count": null,
511
+ "id": "3e7d40b4",
512
+ "metadata": {},
513
+ "outputs": [],
514
+ "source": [
515
+ "texts = {}\n",
516
+ "texts['Happy'] = \"We are happy to invite you to join us on a journey to the past, where we will visit the most amazing monuments ever built by human hands.\"\n",
517
+ "texts['Sad'] = \"I am sorry to say that we have suffered a severe setback in our efforts to restore prosperity and confidence.\"\n",
518
+ "texts['Angry'] = \"The field of astronomy is a joke! Its theories are based on flawed observations and biased interpretations!\"\n",
519
+ "texts['Surprised'] = \"I can't believe it! You mean to tell me that you have discovered a new species of bacteria in this pond?\"\n",
520
+ "\n",
521
+ "for k,v in texts.items():\n",
522
+ " noise = torch.randn(1,1,256).to(device)\n",
523
+ " wav = inference(v, ref_s, diffusion_steps=10, alpha=0.3, beta=0.7, embedding_scale=2)\n",
524
+ " print(k + \": \")\n",
525
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
526
+ ]
527
+ },
528
+ {
529
+ "cell_type": "markdown",
530
+ "id": "402b2bd6",
531
+ "metadata": {},
532
+ "source": [
533
+ "#### With `embedding_scale=2, alpha = 0.5, beta = 0.9`\n",
534
+ "`alpha` and `beta` determine how much of the style is sampled from the text rather than taken from the reference. The higher the values of `alpha` and `beta`, the better the style suits the text, but the less similar it is to the reference. Using a higher `beta` makes the synthesized speech more emotional, at the cost of lower similarity to the reference. `alpha` controls the timbre of the speaker, while `beta` controls the prosody. "
535
+ ]
536
+ },
537
+ {
538
+ "cell_type": "code",
539
+ "execution_count": null,
540
+ "id": "599de5d5",
541
+ "metadata": {},
542
+ "outputs": [],
543
+ "source": [
544
+ "texts = {}\n",
545
+ "texts['Happy'] = \"We are happy to invite you to join us on a journey to the past, where we will visit the most amazing monuments ever built by human hands.\"\n",
546
+ "texts['Sad'] = \"I am sorry to say that we have suffered a severe setback in our efforts to restore prosperity and confidence.\"\n",
547
+ "texts['Angry'] = \"The field of astronomy is a joke! Its theories are based on flawed observations and biased interpretations!\"\n",
548
+ "texts['Surprised'] = \"I can't believe it! You mean to tell me that you have discovered a new species of bacteria in this pond?\"\n",
549
+ "\n",
550
+ "for k,v in texts.items():\n",
551
+ " noise = torch.randn(1,1,256).to(device)\n",
552
+ " wav = inference(v, ref_s, diffusion_steps=10, alpha=0.5, beta=0.9, embedding_scale=2)\n",
553
+ " print(k + \": \")\n",
554
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
555
+ ]
556
+ },
557
+ {
558
+ "cell_type": "markdown",
559
+ "id": "48548866",
560
+ "metadata": {},
561
+ "source": [
562
+ "### Zero-shot speaker adaptation\n",
563
+ "This section recreates the \"Acoustic Environment Maintenance\" and \"Speaker’s Emotion Maintenance\" demos in [Section 4](https://styletts2.github.io/#libri) of the demo page. You can compare the generated samples to popular zero-shot TTS models like Vall-E. Note that the model was trained only on LibriTTS, roughly 250 times less data than was used to train Vall-E, yet achieves similar or better results on these maintenance tasks. "
564
+ ]
565
+ },
566
+ {
567
+ "cell_type": "markdown",
568
+ "id": "23e81572",
569
+ "metadata": {},
570
+ "source": [
571
+ "#### Acoustic Environment Maintenance\n",
572
+ "\n",
573
+ "Since we want to maintain the acoustic environment in the speaker (timbre), we set `alpha = 0` to keep the speaker as close to the reference as possible while only changing the prosody according to the text. "
574
+ ]
575
+ },
576
+ {
577
+ "cell_type": "code",
578
+ "execution_count": null,
579
+ "id": "8087bccb",
580
+ "metadata": {},
581
+ "outputs": [],
582
+ "source": [
583
+ "reference_dicts = {}\n",
584
+ "# format: (path, text)\n",
585
+ "reference_dicts['3'] = (\"Demo/reference_audio/3.wav\", \"As friends thing I definitely I've got more male friends.\")\n",
586
+ "reference_dicts['4'] = (\"Demo/reference_audio/4.wav\", \"Everything is run by computer but you got to know how to think before you can do a computer.\")\n",
587
+ "reference_dicts['5'] = (\"Demo/reference_audio/5.wav\", \"Then out in LA you guys got a whole another ball game within California to worry about.\")"
588
+ ]
589
+ },
590
+ {
591
+ "cell_type": "code",
592
+ "execution_count": null,
593
+ "id": "1e99c200",
594
+ "metadata": {},
595
+ "outputs": [],
596
+ "source": [
597
+ "noise = torch.randn(1,1,256).to(device)\n",
598
+ "for k, v in reference_dicts.items():\n",
599
+ " path, text = v\n",
600
+ " ref_s = compute_style(path)\n",
601
+ " start = time.time()\n",
602
+ " wav = inference(text, ref_s, alpha=0.0, beta=0.5, diffusion_steps=5, embedding_scale=1)\n",
603
+ " rtf = (time.time() - start) / (len(wav) / 24000)\n",
604
+ " print(f\"RTF = {rtf:5f}\")\n",
605
+ " import IPython.display as ipd\n",
606
+ " print('Synthesized: ' + text)\n",
607
+ " display(ipd.Audio(wav, rate=24000, normalize=False))\n",
608
+ " print('Reference:')\n",
609
+ " display(ipd.Audio(path, rate=24000, normalize=False))"
610
+ ]
611
+ },
612
+ {
613
+ "cell_type": "markdown",
614
+ "id": "7d56505d",
615
+ "metadata": {},
616
+ "source": [
617
+ "#### Speaker’s Emotion Maintenance\n",
618
+ "\n",
619
+ "Since we want to maintain the emotion in the speaker (prosody), we set `beta = 0.1` to keep the speaker as close to the reference as possible while allowing some diversity through a slight timbre change."
620
+ ]
621
+ },
622
+ {
623
+ "cell_type": "code",
624
+ "execution_count": null,
625
+ "id": "f90179e7",
626
+ "metadata": {},
627
+ "outputs": [],
628
+ "source": [
629
+ "reference_dicts = {}\n",
630
+ "# format: (path, text)\n",
631
+ "reference_dicts['Anger'] = (\"Demo/reference_audio/anger.wav\", \"We have to reduce the number of plastic bags.\")\n",
632
+ "reference_dicts['Sleepy'] = (\"Demo/reference_audio/sleepy.wav\", \"We have to reduce the number of plastic bags.\")\n",
633
+ "reference_dicts['Amused'] = (\"Demo/reference_audio/amused.wav\", \"We have to reduce the number of plastic bags.\")\n",
634
+ "reference_dicts['Disgusted'] = (\"Demo/reference_audio/disgusted.wav\", \"We have to reduce the number of plastic bags.\")"
635
+ ]
636
+ },
637
+ {
638
+ "cell_type": "code",
639
+ "execution_count": null,
640
+ "id": "2e6bdfed",
641
+ "metadata": {},
642
+ "outputs": [],
643
+ "source": [
644
+ "noise = torch.randn(1,1,256).to(device)\n",
645
+ "for k, v in reference_dicts.items():\n",
646
+ " path, text = v\n",
647
+ " ref_s = compute_style(path)\n",
648
+ " start = time.time()\n",
649
+ " wav = inference(text, ref_s, alpha=0.3, beta=0.1, diffusion_steps=10, embedding_scale=1)\n",
650
+ " rtf = (time.time() - start) / (len(wav) / 24000)\n",
651
+ " print(f\"RTF = {rtf:5f}\")\n",
652
+ " import IPython.display as ipd\n",
653
+ " print(k + ' Synthesized: ' + text)\n",
654
+ " display(ipd.Audio(wav, rate=24000, normalize=False))\n",
655
+ " print(k + ' Reference:')\n",
656
+ " display(ipd.Audio(path, rate=24000, normalize=False))"
657
+ ]
658
+ },
659
+ {
660
+ "cell_type": "markdown",
661
+ "id": "37ae3963",
662
+ "metadata": {},
663
+ "source": [
664
+ "### Longform Narration\n",
665
+ "\n",
666
+ "This section includes basic implementation of Algorithm 1 in the paper for consistent longform audio generation. The example passage is taken from [Section 5](https://styletts2.github.io/#long) of the demo page."
667
+ ]
668
+ },
669
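As a rough sketch of what `LFinference` below does per sentence (the `sample_style` helper here is hypothetical, standing in for the diffusion sampler call inside `LFinference`): each sentence gets a newly sampled style, which is blended with the previous sentence's style through a convex combination controlled by `t`, so the prosody evolves smoothly across the passage instead of jumping at sentence boundaries.

```python
import torch

def sample_style(sentence: str) -> torch.Tensor:
    """Hypothetical stand-in for the diffusion sampler call inside LFinference."""
    return torch.randn(1, 256)

passage = "First sentence. Second sentence. Third sentence."
t = 0.7            # weight on the previous sentence's style
s_prev = None
for sentence in passage.split('.'):
    if not sentence.strip():
        continue
    s_new = sample_style(sentence + '.')
    # carry the style over: t * previous + (1 - t) * newly sampled
    s_prev = s_new if s_prev is None else t * s_prev + (1 - t) * s_new
```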
+ {
670
+ "cell_type": "code",
671
+ "execution_count": null,
672
+ "id": "f12a716b",
673
+ "metadata": {},
674
+ "outputs": [],
675
+ "source": [
676
+ "passage = '''If the supply of fruit is greater than the family needs, it may be made a source of income by sending the fresh fruit to the market if there is one near enough, or by preserving, canning, and making jelly for sale. To make such an enterprise a success the fruit and work must be first class. There is magic in the word \"Homemade,\" when the product appeals to the eye and the palate; but many careless and incompetent people have found to their sorrow that this word has not magic enough to float inferior goods on the market. As a rule large canning and preserving establishments are clean and have the best appliances, and they employ chemists and skilled labor. The home product must be very good to compete with the attractive goods that are sent out from such establishments. Yet for first class home made products there is a market in all large cities. All first-class grocers have customers who purchase such goods.'''"
677
+ ]
678
+ },
679
+ {
680
+ "cell_type": "code",
681
+ "execution_count": null,
682
+ "id": "a1a38079",
683
+ "metadata": {},
684
+ "outputs": [],
685
+ "source": [
686
+ "def LFinference(text, s_prev, ref_s, alpha = 0.3, beta = 0.7, t = 0.7, diffusion_steps=5, embedding_scale=1):\n",
687
+ " text = text.strip()\n",
688
+ " ps = global_phonemizer.phonemize([text])\n",
689
+ " ps = word_tokenize(ps[0])\n",
690
+ " ps = ' '.join(ps)\n",
691
+ " ps = ps.replace('``', '\"')\n",
692
+ " ps = ps.replace(\"''\", '\"')\n",
693
+ "\n",
694
+ " tokens = textclenaer(ps)\n",
695
+ " tokens.insert(0, 0)\n",
696
+ " tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)\n",
697
+ " \n",
698
+ " with torch.no_grad():\n",
699
+ " input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)\n",
700
+ " text_mask = length_to_mask(input_lengths).to(device)\n",
701
+ "\n",
702
+ " t_en = model.text_encoder(tokens, input_lengths, text_mask)\n",
703
+ " bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())\n",
704
+ " d_en = model.bert_encoder(bert_dur).transpose(-1, -2) \n",
705
+ "\n",
706
+ " s_pred = sampler(noise = torch.randn((1, 256)).unsqueeze(1).to(device), \n",
707
+ " embedding=bert_dur,\n",
708
+ " embedding_scale=embedding_scale,\n",
709
+ " features=ref_s, # reference from the same speaker as the embedding\n",
710
+ " num_steps=diffusion_steps).squeeze(1)\n",
711
+ " \n",
712
+ " if s_prev is not None:\n",
713
+ " # convex combination of previous and current style\n",
714
+ " s_pred = t * s_prev + (1 - t) * s_pred\n",
715
+ " \n",
716
+ " s = s_pred[:, 128:]\n",
717
+ " ref = s_pred[:, :128]\n",
718
+ " \n",
719
+ " ref = alpha * ref + (1 - alpha) * ref_s[:, :128]\n",
720
+ " s = beta * s + (1 - beta) * ref_s[:, 128:]\n",
721
+ "\n",
722
+ " s_pred = torch.cat([ref, s], dim=-1)\n",
723
+ "\n",
724
+ " d = model.predictor.text_encoder(d_en, \n",
725
+ " s, input_lengths, text_mask)\n",
726
+ "\n",
727
+ " x, _ = model.predictor.lstm(d)\n",
728
+ " duration = model.predictor.duration_proj(x)\n",
729
+ "\n",
730
+ " duration = torch.sigmoid(duration).sum(axis=-1)\n",
731
+ " pred_dur = torch.round(duration.squeeze()).clamp(min=1)\n",
732
+ "\n",
733
+ "\n",
734
+ " pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))\n",
735
+ " c_frame = 0\n",
736
+ " for i in range(pred_aln_trg.size(0)):\n",
737
+ " pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1\n",
738
+ " c_frame += int(pred_dur[i].data)\n",
739
+ "\n",
740
+ " # encode prosody\n",
741
+ " en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))\n",
742
+ " if model_params.decoder.type == \"hifigan\":\n",
743
+ " asr_new = torch.zeros_like(en)\n",
744
+ " asr_new[:, :, 0] = en[:, :, 0]\n",
745
+ " asr_new[:, :, 1:] = en[:, :, 0:-1]\n",
746
+ " en = asr_new\n",
747
+ "\n",
748
+ " F0_pred, N_pred = model.predictor.F0Ntrain(en, s)\n",
749
+ "\n",
750
+ " asr = (t_en @ pred_aln_trg.unsqueeze(0).to(device))\n",
751
+ " if model_params.decoder.type == \"hifigan\":\n",
752
+ " asr_new = torch.zeros_like(asr)\n",
753
+ " asr_new[:, :, 0] = asr[:, :, 0]\n",
754
+ " asr_new[:, :, 1:] = asr[:, :, 0:-1]\n",
755
+ " asr = asr_new\n",
756
+ "\n",
757
+ " out = model.decoder(asr, \n",
758
+ " F0_pred, N_pred, ref.squeeze().unsqueeze(0))\n",
759
+ " \n",
760
+ " \n",
761
+ " return out.squeeze().cpu().numpy()[..., :-100], s_pred # weird pulse at the end of the model, need to be fixed later"
762
+ ]
763
+ },
764
+ {
765
+ "cell_type": "code",
766
+ "execution_count": null,
767
+ "id": "e9088f7a",
768
+ "metadata": {},
769
+ "outputs": [],
770
+ "source": [
771
+ "# unseen speaker\n",
772
+ "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n",
773
+ "s_ref = compute_style(path)\n",
774
+ "sentences = passage.split('.') # simple split by comma\n",
775
+ "wavs = []\n",
776
+ "s_prev = None\n",
777
+ "for text in sentences:\n",
778
+ " if text.strip() == \"\": continue\n",
779
+ " text += '.' # add it back\n",
780
+ " \n",
781
+ " wav, s_prev = LFinference(text, \n",
782
+ " s_prev, \n",
783
+ " s_ref, \n",
784
+ " alpha = 0.3, \n",
785
+ " beta = 0.9, # make it more suitable for the text\n",
786
+ " t = 0.7, \n",
787
+ " diffusion_steps=10, embedding_scale=1.5)\n",
788
+ " wavs.append(wav)\n",
789
+ "print('Synthesized: ')\n",
790
+ "display(ipd.Audio(np.concatenate(wavs), rate=24000, normalize=False))\n",
791
+ "print('Reference: ')\n",
792
+ "display(ipd.Audio(path, rate=24000, normalize=False))"
793
+ ]
794
+ },
795
+ {
796
+ "cell_type": "markdown",
797
+ "id": "7517b657",
798
+ "metadata": {},
799
+ "source": [
800
+ "### Style Transfer\n",
801
+ "\n",
802
+ "The following section demostrates the style transfer capacity for unseen speakers in [Section 6](https://styletts2.github.io/#emo) of the demo page. For this, we set `alpha=0.5, beta = 0.9` for the most pronounced effects (mostly using the sampled style). "
803
+ ]
804
+ },
805
+ {
806
+ "cell_type": "code",
807
+ "execution_count": null,
808
+ "id": "ed95d0f7",
809
+ "metadata": {},
810
+ "outputs": [],
811
+ "source": [
812
+ "def STinference(text, ref_s, ref_text, alpha = 0.3, beta = 0.7, diffusion_steps=5, embedding_scale=1):\n",
813
+ " text = text.strip()\n",
814
+ " ps = global_phonemizer.phonemize([text])\n",
815
+ " ps = word_tokenize(ps[0])\n",
816
+ " ps = ' '.join(ps)\n",
817
+ "\n",
818
+ " tokens = textclenaer(ps)\n",
819
+ " tokens.insert(0, 0)\n",
820
+ " tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)\n",
821
+ " \n",
822
+ " ref_text = ref_text.strip()\n",
823
+ " ps = global_phonemizer.phonemize([ref_text])\n",
824
+ " ps = word_tokenize(ps[0])\n",
825
+ " ps = ' '.join(ps)\n",
826
+ "\n",
827
+ " ref_tokens = textclenaer(ps)\n",
828
+ " ref_tokens.insert(0, 0)\n",
829
+ " ref_tokens = torch.LongTensor(ref_tokens).to(device).unsqueeze(0)\n",
830
+ " \n",
831
+ " \n",
832
+ " with torch.no_grad():\n",
833
+ " input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)\n",
834
+ " text_mask = length_to_mask(input_lengths).to(device)\n",
835
+ "\n",
836
+ " t_en = model.text_encoder(tokens, input_lengths, text_mask)\n",
837
+ " bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())\n",
838
+ " d_en = model.bert_encoder(bert_dur).transpose(-1, -2) \n",
839
+ " \n",
840
+ " ref_input_lengths = torch.LongTensor([ref_tokens.shape[-1]]).to(device)\n",
841
+ " ref_text_mask = length_to_mask(ref_input_lengths).to(device)\n",
842
+ " ref_bert_dur = model.bert(ref_tokens, attention_mask=(~ref_text_mask).int())\n",
843
+ " s_pred = sampler(noise = torch.randn((1, 256)).unsqueeze(1).to(device), \n",
844
+ " embedding=bert_dur,\n",
845
+ " embedding_scale=embedding_scale,\n",
846
+ " features=ref_s, # reference from the same speaker as the embedding\n",
847
+ " num_steps=diffusion_steps).squeeze(1)\n",
848
+ "\n",
849
+ "\n",
850
+ " s = s_pred[:, 128:]\n",
851
+ " ref = s_pred[:, :128]\n",
852
+ "\n",
853
+ " ref = alpha * ref + (1 - alpha) * ref_s[:, :128]\n",
854
+ " s = beta * s + (1 - beta) * ref_s[:, 128:]\n",
855
+ "\n",
856
+ " d = model.predictor.text_encoder(d_en, \n",
857
+ " s, input_lengths, text_mask)\n",
858
+ "\n",
859
+ " x, _ = model.predictor.lstm(d)\n",
860
+ " duration = model.predictor.duration_proj(x)\n",
861
+ "\n",
862
+ " duration = torch.sigmoid(duration).sum(axis=-1)\n",
863
+ " pred_dur = torch.round(duration.squeeze()).clamp(min=1)\n",
864
+ "\n",
865
+ "\n",
866
+ " pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))\n",
867
+ " c_frame = 0\n",
868
+ " for i in range(pred_aln_trg.size(0)):\n",
869
+ " pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1\n",
870
+ " c_frame += int(pred_dur[i].data)\n",
871
+ "\n",
872
+ " # encode prosody\n",
873
+ " en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))\n",
874
+ " if model_params.decoder.type == \"hifigan\":\n",
875
+ " asr_new = torch.zeros_like(en)\n",
876
+ " asr_new[:, :, 0] = en[:, :, 0]\n",
877
+ " asr_new[:, :, 1:] = en[:, :, 0:-1]\n",
878
+ " en = asr_new\n",
879
+ "\n",
880
+ " F0_pred, N_pred = model.predictor.F0Ntrain(en, s)\n",
881
+ "\n",
882
+ " asr = (t_en @ pred_aln_trg.unsqueeze(0).to(device))\n",
883
+ " if model_params.decoder.type == \"hifigan\":\n",
884
+ " asr_new = torch.zeros_like(asr)\n",
885
+ " asr_new[:, :, 0] = asr[:, :, 0]\n",
886
+ " asr_new[:, :, 1:] = asr[:, :, 0:-1]\n",
887
+ " asr = asr_new\n",
888
+ "\n",
889
+ " out = model.decoder(asr, \n",
890
+ " F0_pred, N_pred, ref.squeeze().unsqueeze(0))\n",
891
+ " \n",
892
+ " \n",
893
+ " return out.squeeze().cpu().numpy()[..., :-50] # weird pulse at the end of the model, need to be fixed later"
894
+ ]
895
+ },
896
+ {
897
+ "cell_type": "code",
898
+ "execution_count": null,
899
+ "id": "ec3f0da4",
900
+ "metadata": {},
901
+ "outputs": [],
902
+ "source": [
903
+ "# reference texts to sample styles\n",
904
+ "\n",
905
+ "ref_texts = {}\n",
906
+ "ref_texts['Happy'] = \"We are happy to invite you to join us on a journey to the past, where we will visit the most amazing monuments ever built by human hands.\"\n",
907
+ "ref_texts['Sad'] = \"I am sorry to say that we have suffered a severe setback in our efforts to restore prosperity and confidence.\"\n",
908
+ "ref_texts['Angry'] = \"The field of astronomy is a joke! Its theories are based on flawed observations and biased interpretations!\"\n",
909
+ "ref_texts['Surprised'] = \"I can't believe it! You mean to tell me that you have discovered a new species of bacteria in this pond?\""
910
+ ]
911
+ },
912
+ {
913
+ "cell_type": "code",
914
+ "execution_count": null,
915
+ "id": "6d0a3825",
916
+ "metadata": {
917
+ "scrolled": false
918
+ },
919
+ "outputs": [],
920
+ "source": [
921
+ "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n",
922
+ "s_ref = compute_style(path)\n",
923
+ "\n",
924
+ "text = \"Yea, his honourable worship is within, but he hath a godly minister or two with him, and likewise a leech.\"\n",
925
+ "for k,v in ref_texts.items():\n",
926
+ " wav = STinference(text, s_ref, v, diffusion_steps=10, alpha=0.5, beta=0.9, embedding_scale=1.5)\n",
927
+ " print(k + \": \")\n",
928
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
929
+ ]
930
+ },
931
+ {
932
+ "cell_type": "markdown",
933
+ "id": "6750aed9",
934
+ "metadata": {},
935
+ "source": [
936
+ "### Speech diversity\n",
937
+ "\n",
938
+ "This section reproduces samples in [Section 7](https://styletts2.github.io/#var) of the demo page. \n",
939
+ "\n",
940
+ "`alpha` and `beta` determine the diversity of the synthesized speech. There are two extreme cases:\n",
941
+ "- If `alpha = 1` and `beta = 1`, the synthesized speech sounds the most dissimilar to the reference speaker, but it is also the most diverse (each time you synthesize a speech it will be totally different). \n",
942
+ "- If `alpha = 0` and `beta = 0`, the synthesized speech sounds the most siimlar to the reference speaker, but it is deterministic (i.e., the sampled style is not used for speech synthesis). \n"
943
+ ]
944
+ },
945
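For reference, the blending that `alpha` and `beta` control is the convex combination used inside `inference` and `LFinference`, applied separately to the two 128-dimensional halves of the 256-dimensional style vector (a condensed restatement of the lines in the code above):

```python
# s_pred is the style sampled by the diffusion model; ref_s comes from compute_style().
# First 128 dims: acoustic style (timbre); last 128 dims: prosodic style.
ref = alpha * s_pred[:, :128] + (1 - alpha) * ref_s[:, :128]   # timbre half
s   = beta  * s_pred[:, 128:] + (1 - beta)  * ref_s[:, 128:]   # prosody half
```

So `alpha` and `beta` are simply the fractions of the *sampled* style used for the timbre and the prosody, respectively.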
+ {
946
+ "cell_type": "markdown",
947
+ "id": "f6ae0aa5",
948
+ "metadata": {},
949
+ "source": [
950
+ "#### Default setting (`alpha = 0.3, beta=0.7`)\n",
951
+ "This setting uses 70% of the reference timbre and 30% of the reference prosody and use the diffusion model to sample them based on the text. "
952
+ ]
953
+ },
954
+ {
955
+ "cell_type": "code",
956
+ "execution_count": null,
957
+ "id": "36dc0148",
958
+ "metadata": {},
959
+ "outputs": [],
960
+ "source": [
961
+ "# unseen speaker\n",
962
+ "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n",
963
+ "ref_s = compute_style(path)\n",
964
+ "\n",
965
+ "text = \"How much variation is there?\"\n",
966
+ "for _ in range(5):\n",
967
+ " wav = inference(text, ref_s, diffusion_steps=10, alpha=0.3, beta=0.7, embedding_scale=1)\n",
968
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
969
+ ]
970
+ },
971
+ {
972
+ "cell_type": "markdown",
973
+ "id": "bf9ef421",
974
+ "metadata": {},
975
+ "source": [
976
+ "#### Less diverse setting (`alpha = 0.1, beta=0.3`)\n",
977
+ "This setting uses 90% of the reference timbre and 70% of the reference prosody. This makes it more similar to the reference speaker at cost of less diverse samples. "
978
+ ]
979
+ },
980
+ {
981
+ "cell_type": "code",
982
+ "execution_count": null,
983
+ "id": "9ba406bd",
984
+ "metadata": {},
985
+ "outputs": [],
986
+ "source": [
987
+ "# unseen speaker\n",
988
+ "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n",
989
+ "ref_s = compute_style(path)\n",
990
+ "\n",
991
+ "text = \"How much variation is there?\"\n",
992
+ "for _ in range(5):\n",
993
+ " wav = inference(text, ref_s, diffusion_steps=10, alpha=0.1, beta=0.3, embedding_scale=1)\n",
994
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
995
+ ]
996
+ },
997
+ {
998
+ "cell_type": "markdown",
999
+ "id": "a38fe464",
1000
+ "metadata": {},
1001
+ "source": [
1002
+ "#### More diverse setting (`alpha = 0.5, beta=0.95`)\n",
1003
+ "This setting uses 50% of the reference timbre and 5% of the reference prosody (so it uses 100% of the sampled prosody, which makes it more diverse), but this makes it more dissimilar to the reference speaker. "
1004
+ ]
1005
+ },
1006
+ {
1007
+ "cell_type": "code",
1008
+ "execution_count": null,
1009
+ "id": "5f25bf94",
1010
+ "metadata": {},
1011
+ "outputs": [],
1012
+ "source": [
1013
+ "# unseen speaker\n",
1014
+ "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n",
1015
+ "ref_s = compute_style(path)\n",
1016
+ "\n",
1017
+ "text = \"How much variation is there?\"\n",
1018
+ "for _ in range(5):\n",
1019
+ " wav = inference(text, ref_s, diffusion_steps=10, alpha=0.5, beta=0.95, embedding_scale=1)\n",
1020
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
1021
+ ]
1022
+ },
1023
+ {
1024
+ "cell_type": "markdown",
1025
+ "id": "21c3a071",
1026
+ "metadata": {},
1027
+ "source": [
1028
+ "#### Extreme setting (`alpha = 1, beta=1`)\n",
1029
+ "This setting uses 0% of the reference timbre and prosody and use the diffusion model to sample the entire style. This makes the speaker very dissimilar to the reference speaker. "
1030
+ ]
1031
+ },
1032
+ {
1033
+ "cell_type": "code",
1034
+ "execution_count": null,
1035
+ "id": "fff8bab1",
1036
+ "metadata": {},
1037
+ "outputs": [],
1038
+ "source": [
1039
+ "# unseen speaker\n",
1040
+ "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n",
1041
+ "ref_s = compute_style(path)\n",
1042
+ "\n",
1043
+ "text = \"How much variation is there?\"\n",
1044
+ "for _ in range(5):\n",
1045
+ " wav = inference(text, ref_s, diffusion_steps=10, alpha=1, beta=1, embedding_scale=1)\n",
1046
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
1047
+ ]
1048
+ },
1049
+ {
1050
+ "cell_type": "markdown",
1051
+ "id": "a8741e5a",
1052
+ "metadata": {},
1053
+ "source": [
1054
+ "#### No variation (`alpha = 0, beta=0`)\n",
1055
+ "This setting uses 0% of the reference timbre and prosody and use the diffusion model to sample the entire style. This makes the speaker very similar to the reference speaker, but there is no variation. "
1056
+ ]
1057
+ },
1058
+ {
1059
+ "cell_type": "code",
1060
+ "execution_count": null,
1061
+ "id": "e55dd281",
1062
+ "metadata": {},
1063
+ "outputs": [],
1064
+ "source": [
1065
+ "# unseen speaker\n",
1066
+ "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n",
1067
+ "ref_s = compute_style(path)\n",
1068
+ "\n",
1069
+ "text = \"How much variation is there?\"\n",
1070
+ "for _ in range(5):\n",
1071
+ " wav = inference(text, ref_s, diffusion_steps=10, alpha=0, beta=0, embedding_scale=1)\n",
1072
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
1073
+ ]
1074
+ },
1075
+ {
1076
+ "cell_type": "markdown",
1077
+ "id": "d5e86423",
1078
+ "metadata": {},
1079
+ "source": [
1080
+ "### Extra fun!\n",
1081
+ "\n",
1082
+ "Here we clone some of the authors' voice of the StyleTTS 2 papers with a few seconds of the recording in the wild. None of the voices is in the dataset and all authors agreed to have their voices cloned here."
1083
+ ]
1084
+ },
1085
+ {
1086
+ "cell_type": "code",
1087
+ "execution_count": null,
1088
+ "id": "6f558314",
1089
+ "metadata": {},
1090
+ "outputs": [],
1091
+ "source": [
1092
+ "text = ''' StyleTTS 2 is a text to speech model that leverages style diffusion and adversarial training with large speech language models to achieve human level text to speech synthesis. '''"
1093
+ ]
1094
+ },
1095
+ {
1096
+ "cell_type": "code",
1097
+ "execution_count": null,
1098
+ "id": "caa5747c",
1099
+ "metadata": {},
1100
+ "outputs": [],
1101
+ "source": [
1102
+ "reference_dicts = {}\n",
1103
+ "reference_dicts['Yinghao'] = \"Demo/reference_audio/Yinghao.wav\"\n",
1104
+ "reference_dicts['Gavin'] = \"Demo/reference_audio/Gavin.wav\"\n",
1105
+ "reference_dicts['Vinay'] = \"Demo/reference_audio/Vinay.wav\"\n",
1106
+ "reference_dicts['Nima'] = \"Demo/reference_audio/Nima.wav\""
1107
+ ]
1108
+ },
1109
+ {
1110
+ "cell_type": "code",
1111
+ "execution_count": null,
1112
+ "id": "44a4cea1",
1113
+ "metadata": {
1114
+ "scrolled": false
1115
+ },
1116
+ "outputs": [],
1117
+ "source": [
1118
+ "start = time.time()\n",
1119
+ "noise = torch.randn(1,1,256).to(device)\n",
1120
+ "for k, path in reference_dicts.items():\n",
1121
+ " ref_s = compute_style(path)\n",
1122
+ " \n",
1123
+ " wav = inference(text, ref_s, alpha=0.1, beta=0.5, diffusion_steps=5, embedding_scale=1)\n",
1124
+ " rtf = (time.time() - start) / (len(wav) / 24000)\n",
1125
+ " print('Speaker: ' + k)\n",
1126
+ " import IPython.display as ipd\n",
1127
+ " print('Synthesized:')\n",
1128
+ " display(ipd.Audio(wav, rate=24000, normalize=False))\n",
1129
+ " print('Reference:')\n",
1130
+ " display(ipd.Audio(path, rate=24000, normalize=False))"
1131
+ ]
1132
+ }
1133
+ ],
1134
+ "metadata": {
1135
+ "kernelspec": {
1136
+ "display_name": "NLP",
1137
+ "language": "python",
1138
+ "name": "nlp"
1139
+ },
1140
+ "language_info": {
1141
+ "codemirror_mode": {
1142
+ "name": "ipython",
1143
+ "version": 3
1144
+ },
1145
+ "file_extension": ".py",
1146
+ "mimetype": "text/x-python",
1147
+ "name": "python",
1148
+ "nbconvert_exporter": "python",
1149
+ "pygments_lexer": "ipython3",
1150
+ "version": "3.9.7"
1151
+ }
1152
+ },
1153
+ "nbformat": 4,
1154
+ "nbformat_minor": 5
1155
+ }
Demo/Inference_pod_90h_30k.ipynb ADDED
@@ -0,0 +1,1360 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "9adb7bd1",
6
+ "metadata": {},
7
+ "source": [
8
+ "# StyleTTS 2 Demo (LibriTTS)\n",
9
+ "\n",
10
+ "Before you run the following cells, please make sure you have downloaded [reference_audio.zip](https://huggingface.co/yl4579/StyleTTS2-LibriTTS/resolve/main/reference_audio.zip) and unzipped it under the `demo` folder."
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "markdown",
15
+ "id": "6108384d",
16
+ "metadata": {},
17
+ "source": [
18
+ "### Utils"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": 1,
24
+ "id": "96e173bf",
25
+ "metadata": {},
26
+ "outputs": [],
27
+ "source": [
28
+ "import torch\n",
29
+ "torch.manual_seed(0)\n",
30
+ "torch.backends.cudnn.benchmark = False\n",
31
+ "torch.backends.cudnn.deterministic = True\n",
32
+ "\n",
33
+ "import random\n",
34
+ "random.seed(0)\n",
35
+ "\n",
36
+ "import numpy as np\n",
37
+ "np.random.seed(0)"
38
+ ]
39
+ },
40
+ {
41
+ "cell_type": "code",
42
+ "execution_count": 4,
43
+ "id": "2458c639-10a0-4b57-8602-22bc893c5176",
44
+ "metadata": {},
45
+ "outputs": [
46
+ {
47
+ "name": "stdout",
48
+ "output_type": "stream",
49
+ "text": [
50
+ "Collecting git+https://github.com/resemble-ai/monotonic_align.git (from -r requirements.txt (line 17))\n",
51
+ " Cloning https://github.com/resemble-ai/monotonic_align.git to /tmp/pip-req-build-ps9pa2ga\n",
52
+ " Running command git clone --filter=blob:none --quiet https://github.com/resemble-ai/monotonic_align.git /tmp/pip-req-build-ps9pa2ga\n",
53
+ " Resolved https://github.com/resemble-ai/monotonic_align.git to commit c6e5e6cb19882164027eb6e35118e841eed9298e\n",
54
+ " Installing build dependencies ... \u001b[?25ldone\n",
55
+ "\u001b[?25h Getting requirements to build wheel ... \u001b[?25ldone\n",
56
+ "\u001b[?25h Preparing metadata (pyproject.toml) ... \u001b[?25ldone\n",
57
+ "\u001b[?25hCollecting SoundFile (from -r requirements.txt (line 1))\n",
58
+ " Using cached soundfile-0.13.1-py2.py3-none-manylinux_2_28_x86_64.whl.metadata (16 kB)\n",
59
+ "Requirement already satisfied: torchaudio in /venv/main/lib/python3.12/site-packages (from -r requirements.txt (line 2)) (2.6.0+cu126)\n",
60
+ "Collecting munch (from -r requirements.txt (line 3))\n",
61
+ " Using cached munch-4.0.0-py2.py3-none-any.whl.metadata (5.9 kB)\n",
62
+ "Requirement already satisfied: torch in /venv/main/lib/python3.12/site-packages (from -r requirements.txt (line 4)) (2.6.0+cu126)\n",
63
+ "Collecting pydub (from -r requirements.txt (line 5))\n",
64
+ " Using cached pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)\n",
65
+ "Requirement already satisfied: pyyaml in /venv/main/lib/python3.12/site-packages (from -r requirements.txt (line 6)) (6.0.2)\n",
66
+ "Collecting librosa (from -r requirements.txt (line 7))\n",
67
+ " Using cached librosa-0.11.0-py3-none-any.whl.metadata (8.7 kB)\n",
68
+ "Collecting nltk (from -r requirements.txt (line 8))\n",
69
+ " Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)\n",
70
+ "Collecting matplotlib (from -r requirements.txt (line 9))\n",
71
+ " Downloading matplotlib-3.10.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)\n",
72
+ "Collecting accelerate (from -r requirements.txt (line 10))\n",
73
+ " Using cached accelerate-1.7.0-py3-none-any.whl.metadata (19 kB)\n",
74
+ "Collecting transformers (from -r requirements.txt (line 11))\n",
75
+ " Using cached transformers-4.52.4-py3-none-any.whl.metadata (38 kB)\n",
76
+ "Collecting einops (from -r requirements.txt (line 12))\n",
77
+ " Using cached einops-0.8.1-py3-none-any.whl.metadata (13 kB)\n",
78
+ "Collecting einops-exts (from -r requirements.txt (line 13))\n",
79
+ " Using cached einops_exts-0.0.4-py3-none-any.whl.metadata (621 bytes)\n",
80
+ "Requirement already satisfied: tqdm in /venv/main/lib/python3.12/site-packages (from -r requirements.txt (line 14)) (4.67.1)\n",
81
+ "Collecting typing (from -r requirements.txt (line 15))\n",
82
+ " Using cached typing-3.7.4.3.tar.gz (78 kB)\n",
83
+ " Preparing metadata (setup.py) ... \u001b[?25ldone\n",
84
+ "\u001b[?25hRequirement already satisfied: typing-extensions in /venv/main/lib/python3.12/site-packages (from -r requirements.txt (line 16)) (4.13.2)\n",
85
+ "Collecting cffi>=1.0 (from SoundFile->-r requirements.txt (line 1))\n",
86
+ " Downloading cffi-1.17.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.5 kB)\n",
87
+ "Requirement already satisfied: numpy in /venv/main/lib/python3.12/site-packages (from SoundFile->-r requirements.txt (line 1)) (2.1.2)\n",
88
+ "Requirement already satisfied: filelock in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (3.18.0)\n",
89
+ "Requirement already satisfied: setuptools in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (70.2.0)\n",
90
+ "Requirement already satisfied: sympy==1.13.1 in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (1.13.1)\n",
91
+ "Requirement already satisfied: networkx in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (3.3)\n",
92
+ "Requirement already satisfied: jinja2 in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (3.1.4)\n",
93
+ "Requirement already satisfied: fsspec in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (2025.3.2)\n",
94
+ "Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.6.77 in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (12.6.77)\n",
95
+ "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.6.77 in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (12.6.77)\n",
96
+ "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.6.80 in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (12.6.80)\n",
97
+ "Requirement already satisfied: nvidia-cudnn-cu12==9.5.1.17 in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (9.5.1.17)\n",
98
+ "Requirement already satisfied: nvidia-cublas-cu12==12.6.4.1 in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (12.6.4.1)\n",
99
+ "Requirement already satisfied: nvidia-cufft-cu12==11.3.0.4 in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (11.3.0.4)\n",
100
+ "Requirement already satisfied: nvidia-curand-cu12==10.3.7.77 in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (10.3.7.77)\n",
101
+ "Requirement already satisfied: nvidia-cusolver-cu12==11.7.1.2 in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (11.7.1.2)\n",
102
+ "Requirement already satisfied: nvidia-cusparse-cu12==12.5.4.2 in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (12.5.4.2)\n",
103
+ "Requirement already satisfied: nvidia-cusparselt-cu12==0.6.3 in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (0.6.3)\n",
104
+ "Requirement already satisfied: nvidia-nccl-cu12==2.21.5 in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (2.21.5)\n",
105
+ "Requirement already satisfied: nvidia-nvtx-cu12==12.6.77 in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (12.6.77)\n",
106
+ "Requirement already satisfied: nvidia-nvjitlink-cu12==12.6.85 in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (12.6.85)\n",
107
+ "Requirement already satisfied: triton==3.2.0 in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (3.2.0)\n",
108
+ "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /venv/main/lib/python3.12/site-packages (from sympy==1.13.1->torch->-r requirements.txt (line 4)) (1.3.0)\n",
109
+ "Collecting audioread>=2.1.9 (from librosa->-r requirements.txt (line 7))\n",
110
+ " Using cached audioread-3.0.1-py3-none-any.whl.metadata (8.4 kB)\n",
111
+ "Collecting numba>=0.51.0 (from librosa->-r requirements.txt (line 7))\n",
112
+ " Downloading numba-0.61.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.8 kB)\n",
113
+ "Collecting scipy>=1.6.0 (from librosa->-r requirements.txt (line 7))\n",
114
+ " Downloading scipy-1.15.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)\n",
115
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m62.0/62.0 kB\u001b[0m \u001b[31m1.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
116
+ "\u001b[?25hCollecting scikit-learn>=1.1.0 (from librosa->-r requirements.txt (line 7))\n",
117
+ " Downloading scikit_learn-1.7.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (17 kB)\n",
118
+ "Collecting joblib>=1.0 (from librosa->-r requirements.txt (line 7))\n",
119
+ " Using cached joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)\n",
120
+ "Requirement already satisfied: decorator>=4.3.0 in /venv/main/lib/python3.12/site-packages (from librosa->-r requirements.txt (line 7)) (5.2.1)\n",
121
+ "Collecting pooch>=1.1 (from librosa->-r requirements.txt (line 7))\n",
122
+ " Using cached pooch-1.8.2-py3-none-any.whl.metadata (10 kB)\n",
123
+ "Collecting soxr>=0.3.2 (from librosa->-r requirements.txt (line 7))\n",
124
+ " Downloading soxr-0.5.0.post1-cp312-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)\n",
125
+ "Collecting lazy_loader>=0.1 (from librosa->-r requirements.txt (line 7))\n",
126
+ " Using cached lazy_loader-0.4-py3-none-any.whl.metadata (7.6 kB)\n",
127
+ "Collecting msgpack>=1.0 (from librosa->-r requirements.txt (line 7))\n",
128
+ " Downloading msgpack-1.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.4 kB)\n",
129
+ "Collecting click (from nltk->-r requirements.txt (line 8))\n",
130
+ " Using cached click-8.2.1-py3-none-any.whl.metadata (2.5 kB)\n",
131
+ "Collecting regex>=2021.8.3 (from nltk->-r requirements.txt (line 8))\n",
132
+ " Downloading regex-2024.11.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)\n",
133
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m40.5/40.5 kB\u001b[0m \u001b[31m4.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
134
+ "\u001b[?25hCollecting contourpy>=1.0.1 (from matplotlib->-r requirements.txt (line 9))\n",
135
+ " Downloading contourpy-1.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.5 kB)\n",
136
+ "Collecting cycler>=0.10 (from matplotlib->-r requirements.txt (line 9))\n",
137
+ " Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)\n",
138
+ "Collecting fonttools>=4.22.0 (from matplotlib->-r requirements.txt (line 9))\n",
139
+ " Downloading fonttools-4.58.2-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (106 kB)\n",
140
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m106.3/106.3 kB\u001b[0m \u001b[31m2.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
141
+ "\u001b[?25hCollecting kiwisolver>=1.3.1 (from matplotlib->-r requirements.txt (line 9))\n",
142
+ " Downloading kiwisolver-1.4.8-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.2 kB)\n",
143
+ "Requirement already satisfied: packaging>=20.0 in /venv/main/lib/python3.12/site-packages (from matplotlib->-r requirements.txt (line 9)) (25.0)\n",
144
+ "Requirement already satisfied: pillow>=8 in /venv/main/lib/python3.12/site-packages (from matplotlib->-r requirements.txt (line 9)) (11.0.0)\n",
145
+ "Collecting pyparsing>=2.3.1 (from matplotlib->-r requirements.txt (line 9))\n",
146
+ " Using cached pyparsing-3.2.3-py3-none-any.whl.metadata (5.0 kB)\n",
147
+ "Requirement already satisfied: python-dateutil>=2.7 in /venv/main/lib/python3.12/site-packages (from matplotlib->-r requirements.txt (line 9)) (2.9.0.post0)\n",
148
+ "Requirement already satisfied: psutil in /venv/main/lib/python3.12/site-packages (from accelerate->-r requirements.txt (line 10)) (7.0.0)\n",
149
+ "Requirement already satisfied: huggingface-hub>=0.21.0 in /venv/main/lib/python3.12/site-packages (from accelerate->-r requirements.txt (line 10)) (0.30.2)\n",
150
+ "Collecting safetensors>=0.4.3 (from accelerate->-r requirements.txt (line 10))\n",
151
+ " Using cached safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)\n",
152
+ "Requirement already satisfied: requests in /venv/main/lib/python3.12/site-packages (from transformers->-r requirements.txt (line 11)) (2.32.3)\n",
153
+ "Collecting tokenizers<0.22,>=0.21 (from transformers->-r requirements.txt (line 11))\n",
154
+ " Using cached tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)\n",
155
+ "Collecting pycparser (from cffi>=1.0->SoundFile->-r requirements.txt (line 1))\n",
156
+ " Using cached pycparser-2.22-py3-none-any.whl.metadata (943 bytes)\n",
157
+ "Collecting llvmlite<0.45,>=0.44.0dev0 (from numba>=0.51.0->librosa->-r requirements.txt (line 7))\n",
158
+ " Downloading llvmlite-0.44.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.0 kB)\n",
159
+ "Requirement already satisfied: platformdirs>=2.5.0 in /venv/main/lib/python3.12/site-packages (from pooch>=1.1->librosa->-r requirements.txt (line 7)) (4.3.7)\n",
160
+ "Requirement already satisfied: six>=1.5 in /venv/main/lib/python3.12/site-packages (from python-dateutil>=2.7->matplotlib->-r requirements.txt (line 9)) (1.17.0)\n",
161
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /venv/main/lib/python3.12/site-packages (from requests->transformers->-r requirements.txt (line 11)) (3.4.1)\n",
162
+ "Requirement already satisfied: idna<4,>=2.5 in /venv/main/lib/python3.12/site-packages (from requests->transformers->-r requirements.txt (line 11)) (3.10)\n",
163
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /venv/main/lib/python3.12/site-packages (from requests->transformers->-r requirements.txt (line 11)) (2.4.0)\n",
164
+ "Requirement already satisfied: certifi>=2017.4.17 in /venv/main/lib/python3.12/site-packages (from requests->transformers->-r requirements.txt (line 11)) (2025.4.26)\n",
165
+ "Collecting threadpoolctl>=3.1.0 (from scikit-learn>=1.1.0->librosa->-r requirements.txt (line 7))\n",
166
+ " Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)\n",
167
+ "Requirement already satisfied: MarkupSafe>=2.0 in /venv/main/lib/python3.12/site-packages (from jinja2->torch->-r requirements.txt (line 4)) (2.1.5)\n",
168
+ "Using cached soundfile-0.13.1-py2.py3-none-manylinux_2_28_x86_64.whl (1.3 MB)\n",
169
+ "Using cached munch-4.0.0-py2.py3-none-any.whl (9.9 kB)\n",
170
+ "Using cached pydub-0.25.1-py2.py3-none-any.whl (32 kB)\n",
171
+ "Using cached librosa-0.11.0-py3-none-any.whl (260 kB)\n",
172
+ "Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)\n",
173
+ "Downloading matplotlib-3.10.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (8.6 MB)\n",
174
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m8.6/8.6 MB\u001b[0m \u001b[31m28.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
175
+ "\u001b[?25hUsing cached accelerate-1.7.0-py3-none-any.whl (362 kB)\n",
176
+ "Using cached transformers-4.52.4-py3-none-any.whl (10.5 MB)\n",
177
+ "Using cached einops-0.8.1-py3-none-any.whl (64 kB)\n",
178
+ "Using cached einops_exts-0.0.4-py3-none-any.whl (3.9 kB)\n",
179
+ "Using cached audioread-3.0.1-py3-none-any.whl (23 kB)\n",
180
+ "Downloading cffi-1.17.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (479 kB)\n",
181
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m479.4/479.4 kB\u001b[0m \u001b[31m169.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
182
+ "\u001b[?25hDownloading contourpy-1.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (323 kB)\n",
183
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m323.7/323.7 kB\u001b[0m \u001b[31m127.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
184
+ "\u001b[?25hUsing cached cycler-0.12.1-py3-none-any.whl (8.3 kB)\n",
185
+ "Downloading fonttools-4.58.2-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.9 MB)\n",
186
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.9/4.9 MB\u001b[0m \u001b[31m87.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mta \u001b[36m0:00:01\u001b[0m\n",
187
+ "\u001b[?25hUsing cached joblib-1.5.1-py3-none-any.whl (307 kB)\n",
188
+ "Downloading kiwisolver-1.4.8-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.5 MB)\n",
189
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.5/1.5 MB\u001b[0m \u001b[31m185.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
190
+ "\u001b[?25hUsing cached lazy_loader-0.4-py3-none-any.whl (12 kB)\n",
191
+ "Downloading msgpack-1.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (401 kB)\n",
192
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m401.4/401.4 kB\u001b[0m \u001b[31m192.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
193
+ "\u001b[?25hDownloading numba-0.61.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (3.9 MB)\n",
194
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.9/3.9 MB\u001b[0m \u001b[31m42.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mta \u001b[36m0:00:01\u001b[0m\n",
195
+ "\u001b[?25hUsing cached pooch-1.8.2-py3-none-any.whl (64 kB)\n",
196
+ "Using cached pyparsing-3.2.3-py3-none-any.whl (111 kB)\n",
197
+ "Downloading regex-2024.11.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (796 kB)\n",
198
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m796.9/796.9 kB\u001b[0m \u001b[31m125.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
199
+ "\u001b[?25hUsing cached safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (471 kB)\n",
200
+ "Downloading scikit_learn-1.7.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.5 MB)\n",
201
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.5/12.5 MB\u001b[0m \u001b[31m43.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
202
+ "\u001b[?25hDownloading scipy-1.15.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (37.3 MB)\n",
203
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m37.3/37.3 MB\u001b[0m \u001b[31m26.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
204
+ "\u001b[?25hDownloading soxr-0.5.0.post1-cp312-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (248 kB)\n",
205
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m248.5/248.5 kB\u001b[0m \u001b[31m36.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
206
+ "\u001b[?25hUsing cached tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)\n",
207
+ "Using cached click-8.2.1-py3-none-any.whl (102 kB)\n",
208
+ "Downloading llvmlite-0.44.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (42.4 MB)\n",
209
+ "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m42.4/42.4 MB\u001b[0m \u001b[31m14.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
210
+ "\u001b[?25hUsing cached threadpoolctl-3.6.0-py3-none-any.whl (18 kB)\n",
211
+ "Using cached pycparser-2.22-py3-none-any.whl (117 kB)\n",
212
+ "Building wheels for collected packages: typing, monotonic_align\n",
213
+ " Building wheel for typing (setup.py) ... \u001b[?25ldone\n",
214
+ "\u001b[?25h Created wheel for typing: filename=typing-3.7.4.3-py3-none-any.whl size=26304 sha256=7bd8523fe1f7cb4e20da87ee646956891addbdea2d87074f6bbf77fe282e8720\n",
215
+ " Stored in directory: /root/.cache/pip/wheels/12/98/52/2bffe242a9a487f00886e43b8ed8dac46456702e11a0d6abef\n",
216
+ " Building wheel for monotonic_align (pyproject.toml) ... \u001b[?25ldone\n",
217
+ "\u001b[?25h Created wheel for monotonic_align: filename=monotonic_align-1.2-cp312-cp312-linux_x86_64.whl size=1543517 sha256=dc9566d3e5a0656ebf939e760d934e0926d435f336db84e0019c7391576cd4cc\n",
218
+ " Stored in directory: /tmp/pip-ephem-wheel-cache-0gzg26zy/wheels/76/0a/37/00634137cd000799e060087bd1cb49a060ac6a48fc42a15488\n",
219
+ "Successfully built typing monotonic_align\n",
220
+ "Installing collected packages: pydub, typing, threadpoolctl, soxr, scipy, safetensors, regex, pyparsing, pycparser, munch, msgpack, monotonic_align, llvmlite, lazy_loader, kiwisolver, joblib, fonttools, einops, cycler, contourpy, click, audioread, scikit-learn, pooch, numba, nltk, matplotlib, einops-exts, cffi, tokenizers, SoundFile, transformers, librosa, accelerate\n",
221
+ "Successfully installed SoundFile-0.13.1 accelerate-1.7.0 audioread-3.0.1 cffi-1.17.1 click-8.2.1 contourpy-1.3.2 cycler-0.12.1 einops-0.8.1 einops-exts-0.0.4 fonttools-4.58.2 joblib-1.5.1 kiwisolver-1.4.8 lazy_loader-0.4 librosa-0.11.0 llvmlite-0.44.0 matplotlib-3.10.3 monotonic_align-1.2 msgpack-1.1.0 munch-4.0.0 nltk-3.9.1 numba-0.61.2 pooch-1.8.2 pycparser-2.22 pydub-0.25.1 pyparsing-3.2.3 regex-2024.11.6 safetensors-0.5.3 scikit-learn-1.7.0 scipy-1.15.3 soxr-0.5.0.post1 threadpoolctl-3.6.0 tokenizers-0.21.1 transformers-4.52.4 typing-3.7.4.3\n"
222
+ ]
223
+ }
224
+ ],
225
+ "source": [
226
+ "!pip install -r requirements.txt"
227
+ ]
228
+ },
229
+ {
230
+ "cell_type": "code",
231
+ "execution_count": 2,
232
+ "id": "da84c60f",
233
+ "metadata": {},
234
+ "outputs": [
235
+ {
236
+ "name": "stdout",
237
+ "output_type": "stream",
238
+ "text": [
239
+ "/workspace/styletts2\n"
240
+ ]
241
+ }
242
+ ],
243
+ "source": [
244
+ "%cd .."
245
+ ]
246
+ },
247
+ {
248
+ "cell_type": "code",
249
+ "execution_count": 3,
250
+ "id": "5a3ddcc8",
251
+ "metadata": {},
252
+ "outputs": [
253
+ {
254
+ "ename": "ModuleNotFoundError",
255
+ "evalue": "No module named 'munch'",
256
+ "output_type": "error",
257
+ "traceback": [
258
+ "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
259
+ "\u001b[31mModuleNotFoundError\u001b[39m Traceback (most recent call last)",
260
+ "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[3]\u001b[39m\u001b[32m, line 5\u001b[39m\n\u001b[32m 3\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mrandom\u001b[39;00m\n\u001b[32m 4\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01myaml\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m5\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mmunch\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m Munch\n\u001b[32m 6\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mnumpy\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mnp\u001b[39;00m\n\u001b[32m 7\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mtorch\u001b[39;00m\n",
261
+ "\u001b[31mModuleNotFoundError\u001b[39m: No module named 'munch'"
262
+ ]
263
+ }
264
+ ],
265
+ "source": [
266
+ "# load packages\n",
267
+ "import time\n",
268
+ "import random\n",
269
+ "import yaml\n",
270
+ "from munch import Munch\n",
271
+ "import numpy as np\n",
272
+ "import torch\n",
273
+ "from torch import nn\n",
274
+ "import torch.nn.functional as F\n",
275
+ "import torchaudio\n",
276
+ "import librosa\n",
277
+ "from nltk.tokenize import word_tokenize\n",
278
+ "\n",
279
+ "from models import *\n",
280
+ "from utils import *\n",
281
+ "from text_utils import TextCleaner\n",
282
+ "textclenaer = TextCleaner()\n",
283
+ "\n",
284
+ "%matplotlib inline"
285
+ ]
286
+ },
287
+ {
288
+ "cell_type": "code",
289
+ "execution_count": null,
290
+ "id": "00ee05e1",
291
+ "metadata": {},
292
+ "outputs": [],
293
+ "source": [
294
+ "to_mel = torchaudio.transforms.MelSpectrogram(\n",
295
+ " n_mels=80, n_fft=2048, win_length=1200, hop_length=300)\n",
296
+ "mean, std = -4, 4\n",
297
+ "\n",
298
+ "def length_to_mask(lengths):\n",
299
+ " mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)\n",
300
+ " mask = torch.gt(mask+1, lengths.unsqueeze(1))\n",
301
+ " return mask\n",
302
+ "\n",
303
+ "def preprocess(wave):\n",
304
+ " wave_tensor = torch.from_numpy(wave).float()\n",
305
+ " mel_tensor = to_mel(wave_tensor)\n",
306
+ " mel_tensor = (torch.log(1e-5 + mel_tensor.unsqueeze(0)) - mean) / std\n",
307
+ " return mel_tensor\n",
308
+ "\n",
309
+ "def compute_style(path):\n",
310
+ " wave, sr = librosa.load(path, sr=24000)\n",
311
+ " audio, index = librosa.effects.trim(wave, top_db=30)\n",
312
+ " if sr != 24000:\n",
313
+ " audio = librosa.resample(audio, sr, 24000)\n",
314
+ " mel_tensor = preprocess(audio).to(device)\n",
315
+ "\n",
316
+ " with torch.no_grad():\n",
317
+ " ref_s = model.style_encoder(mel_tensor.unsqueeze(1))\n",
318
+ " ref_p = model.predictor_encoder(mel_tensor.unsqueeze(1))\n",
319
+ "\n",
320
+ " return torch.cat([ref_s, ref_p], dim=1)"
321
+ ]
322
+ },
323
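As a usage note: once the models below are loaded, `compute_style` returns a 1×256 style tensor for an utterance; its first 128 dimensions come from `model.style_encoder` (acoustic style / timbre) and its last 128 from `model.predictor_encoder` (prosodic style), which is why `inference` later splits it into `ref_s[:, :128]` and `ref_s[:, 128:]`.

```python
# Example (the path is one of the reference clips used later in this notebook).
ref_s = compute_style("Demo/reference_audio/1221-135767-0014.wav")
print(ref_s.shape)   # expected: torch.Size([1, 256])
```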
+ {
324
+ "cell_type": "code",
325
+ "execution_count": null,
326
+ "id": "bbdc04c0",
327
+ "metadata": {},
328
+ "outputs": [],
329
+ "source": [
330
+ "device = 'cuda' if torch.cuda.is_available() else 'cpu'"
331
+ ]
332
+ },
333
+ {
334
+ "cell_type": "markdown",
335
+ "id": "7b9cecbe",
336
+ "metadata": {},
337
+ "source": [
338
+ "### Load models"
339
+ ]
340
+ },
341
+ {
342
+ "cell_type": "code",
343
+ "execution_count": null,
344
+ "id": "64fc4c0f",
345
+ "metadata": {},
346
+ "outputs": [],
347
+ "source": [
348
+ "# load phonemizer\n",
349
+ "import phonemizer\n",
350
+ "global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)"
351
+ ]
352
+ },
353
+ {
354
+ "cell_type": "code",
355
+ "execution_count": null,
356
+ "id": "48e7b644",
357
+ "metadata": {},
358
+ "outputs": [],
359
+ "source": [
360
+ "config = yaml.safe_load(open(\"Models/LibriTTS/config.yml\"))\n",
361
+ "\n",
362
+ "# load pretrained ASR model\n",
363
+ "ASR_config = config.get('ASR_config', False)\n",
364
+ "ASR_path = config.get('ASR_path', False)\n",
365
+ "text_aligner = load_ASR_models(ASR_path, ASR_config)\n",
366
+ "\n",
367
+ "# load pretrained F0 model\n",
368
+ "F0_path = config.get('F0_path', False)\n",
369
+ "pitch_extractor = load_F0_models(F0_path)\n",
370
+ "\n",
371
+ "# load BERT model\n",
372
+ "from Utils.PLBERT.util import load_plbert\n",
373
+ "BERT_path = config.get('PLBERT_dir', False)\n",
374
+ "plbert = load_plbert(BERT_path)"
375
+ ]
376
+ },
377
+ {
378
+ "cell_type": "code",
379
+ "execution_count": null,
380
+ "id": "ffc18cf7",
381
+ "metadata": {},
382
+ "outputs": [],
383
+ "source": [
384
+ "model_params = recursive_munch(config['model_params'])\n",
385
+ "model = build_model(model_params, text_aligner, pitch_extractor, plbert)\n",
386
+ "_ = [model[key].eval() for key in model]\n",
387
+ "_ = [model[key].to(device) for key in model]"
388
+ ]
389
+ },
390
+ {
391
+ "cell_type": "code",
392
+ "execution_count": null,
393
+ "id": "64529d5c",
394
+ "metadata": {},
395
+ "outputs": [],
396
+ "source": [
397
+ "params_whole = torch.load(\"Models/LibriTTS/epochs_2nd_00020.pth\", map_location='cpu')\n",
398
+ "params = params_whole['net']"
399
+ ]
400
+ },
401
+ {
402
+ "cell_type": "code",
403
+ "execution_count": null,
404
+ "id": "895d9706",
405
+ "metadata": {},
406
+ "outputs": [],
407
+ "source": [
408
+ "for key in model:\n",
409
+ " if key in params:\n",
410
+ " print('%s loaded' % key)\n",
411
+ " try:\n",
412
+ " model[key].load_state_dict(params[key])\n",
413
+ " except:\n",
414
+ " from collections import OrderedDict\n",
415
+ " state_dict = params[key]\n",
416
+ " new_state_dict = OrderedDict()\n",
417
+ " for k, v in state_dict.items():\n",
418
+ " name = k[7:] # remove `module.`\n",
419
+ " new_state_dict[name] = v\n",
420
+ " # load params\n",
421
+ " model[key].load_state_dict(new_state_dict, strict=False)\n",
422
+ "# except:\n",
423
+ "# _load(params[key], model[key])\n",
424
+ "_ = [model[key].eval() for key in model]"
425
+ ]
426
+ },
427
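A brief note on the fallback in the loading loop above: checkpoints saved from a `DataParallel`/DDP-wrapped model store parameter keys with a leading `module.` prefix, so the `except` branch strips those seven characters from each key and retries `load_state_dict` with `strict=False` to tolerate any remaining mismatches.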
+ {
428
+ "cell_type": "code",
429
+ "execution_count": null,
430
+ "id": "c1a59db2",
431
+ "metadata": {},
432
+ "outputs": [],
433
+ "source": [
434
+ "from Modules.diffusion.sampler import DiffusionSampler, ADPM2Sampler, KarrasSchedule"
435
+ ]
436
+ },
437
+ {
438
+ "cell_type": "code",
439
+ "execution_count": null,
440
+ "id": "e30985ab",
441
+ "metadata": {},
442
+ "outputs": [],
443
+ "source": [
444
+ "sampler = DiffusionSampler(\n",
445
+ " model.diffusion.diffusion,\n",
446
+ " sampler=ADPM2Sampler(),\n",
447
+ " sigma_schedule=KarrasSchedule(sigma_min=0.0001, sigma_max=3.0, rho=9.0), # empirical parameters\n",
448
+ " clamp=False\n",
449
+ ")"
450
+ ]
451
+ },
452
+ {
453
+ "cell_type": "markdown",
454
+ "id": "b803110e",
455
+ "metadata": {},
456
+ "source": [
457
+ "### Synthesize speech"
458
+ ]
459
+ },
460
+ {
461
+ "cell_type": "code",
462
+ "execution_count": null,
463
+ "id": "ca57469c",
464
+ "metadata": {},
465
+ "outputs": [],
466
+ "source": [
467
+ "def inference(text, ref_s, alpha = 0.3, beta = 0.7, diffusion_steps=5, embedding_scale=1):\n",
468
+ " text = text.strip()\n",
469
+ " ps = global_phonemizer.phonemize([text])\n",
470
+ " ps = word_tokenize(ps[0])\n",
471
+ " ps = ' '.join(ps)\n",
472
+ " tokens = textclenaer(ps)\n",
473
+ " tokens.insert(0, 0)\n",
474
+ " tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)\n",
475
+ " \n",
476
+ " with torch.no_grad():\n",
477
+ " input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)\n",
478
+ " text_mask = length_to_mask(input_lengths).to(device)\n",
479
+ "\n",
480
+ " t_en = model.text_encoder(tokens, input_lengths, text_mask)\n",
481
+ " bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())\n",
482
+ " d_en = model.bert_encoder(bert_dur).transpose(-1, -2) \n",
483
+ "\n",
484
+ " s_pred = sampler(noise = torch.randn((1, 256)).unsqueeze(1).to(device), \n",
485
+ " embedding=bert_dur,\n",
486
+ " embedding_scale=embedding_scale,\n",
487
+ " features=ref_s, # reference from the same speaker as the embedding\n",
488
+ " num_steps=diffusion_steps).squeeze(1)\n",
489
+ "\n",
490
+ "\n",
491
+ " s = s_pred[:, 128:]\n",
492
+ " ref = s_pred[:, :128]\n",
493
+ "\n",
494
+ " ref = alpha * ref + (1 - alpha) * ref_s[:, :128]\n",
495
+ " s = beta * s + (1 - beta) * ref_s[:, 128:]\n",
496
+ "\n",
497
+ " d = model.predictor.text_encoder(d_en, \n",
498
+ " s, input_lengths, text_mask)\n",
499
+ "\n",
500
+ " x, _ = model.predictor.lstm(d)\n",
501
+ " duration = model.predictor.duration_proj(x)\n",
502
+ "\n",
503
+ " duration = torch.sigmoid(duration).sum(axis=-1)\n",
504
+ " pred_dur = torch.round(duration.squeeze()).clamp(min=1)\n",
505
+ "\n",
506
+ "\n",
507
+ " pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))\n",
508
+ " c_frame = 0\n",
509
+ " for i in range(pred_aln_trg.size(0)):\n",
510
+ " pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1\n",
511
+ " c_frame += int(pred_dur[i].data)\n",
512
+ "\n",
513
+ " # encode prosody\n",
514
+ " en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))\n",
515
+ " if model_params.decoder.type == \"hifigan\":\n",
516
+ " asr_new = torch.zeros_like(en)\n",
517
+ " asr_new[:, :, 0] = en[:, :, 0]\n",
518
+ " asr_new[:, :, 1:] = en[:, :, 0:-1]\n",
519
+ " en = asr_new\n",
520
+ "\n",
521
+ " F0_pred, N_pred = model.predictor.F0Ntrain(en, s)\n",
522
+ "\n",
523
+ " asr = (t_en @ pred_aln_trg.unsqueeze(0).to(device))\n",
524
+ " if model_params.decoder.type == \"hifigan\":\n",
525
+ " asr_new = torch.zeros_like(asr)\n",
526
+ " asr_new[:, :, 0] = asr[:, :, 0]\n",
527
+ " asr_new[:, :, 1:] = asr[:, :, 0:-1]\n",
528
+ " asr = asr_new\n",
529
+ "\n",
530
+ " out = model.decoder(asr, \n",
531
+ " F0_pred, N_pred, ref.squeeze().unsqueeze(0))\n",
532
+ " \n",
533
+ " \n",
534
+ " return out.squeeze().cpu().numpy()[..., :-50] # weird pulse at the end of the model, need to be fixed later"
535
+ ]
536
+ },
537
+ {
538
+ "cell_type": "markdown",
539
+ "id": "d438ef4f",
540
+ "metadata": {},
541
+ "source": [
542
+ "#### Basic synthesis (5 diffusion steps, seen speakers)"
543
+ ]
544
+ },
545
+ {
546
+ "cell_type": "code",
547
+ "execution_count": null,
548
+ "id": "cace9787",
549
+ "metadata": {},
550
+ "outputs": [],
551
+ "source": [
552
+ "text = ''' StyleTTS 2 is a text to speech model that leverages style diffusion and adversarial training with large speech language models to achieve human level text to speech synthesis. '''"
553
+ ]
554
+ },
555
+ {
556
+ "cell_type": "code",
557
+ "execution_count": null,
558
+ "id": "7c88f461",
559
+ "metadata": {},
560
+ "outputs": [],
561
+ "source": [
562
+ "reference_dicts = {}\n",
563
+ "reference_dicts['696_92939'] = \"Demo/reference_audio/696_92939_000016_000006.wav\"\n",
564
+ "reference_dicts['1789_142896'] = \"Demo/reference_audio/1789_142896_000022_000005.wav\""
565
+ ]
566
+ },
567
+ {
568
+ "cell_type": "code",
569
+ "execution_count": null,
570
+ "id": "16e8ac60",
571
+ "metadata": {},
572
+ "outputs": [],
573
+ "source": [
574
+ "start = time.time()\n",
575
+ "noise = torch.randn(1,1,256).to(device)\n",
576
+ "for k, path in reference_dicts.items():\n",
577
+ " ref_s = compute_style(path)\n",
578
+ " \n",
579
+ " wav = inference(text, ref_s, alpha=0.3, beta=0.7, diffusion_steps=5, embedding_scale=1)\n",
580
+ " rtf = (time.time() - start) / (len(wav) / 24000)\n",
581
+ " print(f\"RTF = {rtf:5f}\")\n",
582
+ " import IPython.display as ipd\n",
583
+ " print(k + ' Synthesized:')\n",
584
+ " display(ipd.Audio(wav, rate=24000, normalize=False))\n",
585
+ " print('Reference:')\n",
586
+ " display(ipd.Audio(path, rate=24000, normalize=False))"
587
+ ]
588
+ },
589
+ {
590
+ "cell_type": "markdown",
591
+ "id": "14838708",
592
+ "metadata": {},
593
+ "source": [
594
+ "#### With higher diffusion steps (more diverse)\n",
595
+ "\n",
596
+ "Since the sampler is ancestral, the higher the number of steps, the more diverse the samples are, at the cost of slower synthesis speed."
597
+ ]
598
+ },
599
+ {
600
+ "cell_type": "code",
601
+ "execution_count": null,
602
+ "id": "6fbff03b",
603
+ "metadata": {},
604
+ "outputs": [],
605
+ "source": [
606
+ "noise = torch.randn(1,1,256).to(device)\n",
607
+ "for k, path in reference_dicts.items():\n",
608
+ " ref_s = compute_style(path)\n",
609
+ " start = time.time()\n",
610
+ " wav = inference(text, ref_s, alpha=0.3, beta=0.7, diffusion_steps=10, embedding_scale=1)\n",
611
+ " rtf = (time.time() - start) / (len(wav) / 24000)\n",
612
+ " print(f\"RTF = {rtf:5f}\")\n",
613
+ " import IPython.display as ipd\n",
614
+ " print(k + ' Synthesized:')\n",
615
+ " display(ipd.Audio(wav, rate=24000, normalize=False))\n",
616
+ " print(k + ' Reference:')\n",
617
+ " display(ipd.Audio(path, rate=24000, normalize=False))"
618
+ ]
619
+ },
620
+ {
621
+ "cell_type": "markdown",
622
+ "id": "7e6867fd",
623
+ "metadata": {},
624
+ "source": [
625
+ "#### Basic synthesis (5 diffusion steps, unseen speakers)\n",
626
+ "The following samples reproduce those in [Section 4](https://styletts2.github.io/#libri) of the demo page. All speakers are unseen during training. You can compare the generated samples to popular zero-shot TTS models like Vall-E and NaturalSpeech 2."
627
+ ]
628
+ },
629
+ {
630
+ "cell_type": "code",
631
+ "execution_count": null,
632
+ "id": "f4e8faa0",
633
+ "metadata": {},
634
+ "outputs": [],
635
+ "source": [
636
+ "reference_dicts = {}\n",
637
+ "# format: (path, text)\n",
638
+ "reference_dicts['1221-135767'] = (\"Demo/reference_audio/1221-135767-0014.wav\", \"Yea, his honourable worship is within, but he hath a godly minister or two with him, and likewise a leech.\")\n",
639
+ "reference_dicts['5639-40744'] = (\"Demo/reference_audio/5639-40744-0020.wav\", \"Thus did this humane and right minded father comfort his unhappy daughter, and her mother embracing her again, did all she could to soothe her feelings.\")\n",
640
+ "reference_dicts['908-157963'] = (\"Demo/reference_audio/908-157963-0027.wav\", \"And lay me down in my cold bed and leave my shining lot.\")\n",
641
+ "reference_dicts['4077-13754'] = (\"Demo/reference_audio/4077-13754-0000.wav\", \"The army found the people in poverty and left them in comparative wealth.\")"
642
+ ]
643
+ },
644
+ {
645
+ "cell_type": "code",
646
+ "execution_count": null,
647
+ "id": "653f1406",
648
+ "metadata": {},
649
+ "outputs": [],
650
+ "source": [
651
+ "noise = torch.randn(1,1,256).to(device)\n",
652
+ "for k, v in reference_dicts.items():\n",
653
+ " path, text = v\n",
654
+ " ref_s = compute_style(path)\n",
655
+ " start = time.time()\n",
656
+ " wav = inference(text, ref_s, alpha=0.3, beta=0.7, diffusion_steps=5, embedding_scale=1)\n",
657
+ " rtf = (time.time() - start) / (len(wav) / 24000)\n",
658
+ " print(f\"RTF = {rtf:5f}\")\n",
659
+ " import IPython.display as ipd\n",
660
+ " print(k + ' Synthesized: ' + text)\n",
661
+ " display(ipd.Audio(wav, rate=24000, normalize=False))\n",
662
+ " print(k + ' Reference:')\n",
663
+ " display(ipd.Audio(path, rate=24000, normalize=False))"
664
+ ]
665
+ },
666
+ {
667
+ "cell_type": "markdown",
668
+ "id": "141e91b3",
669
+ "metadata": {},
670
+ "source": [
671
+ "### Speech expressiveness\n",
672
+ "\n",
673
+ "The following section recreates the samples shown in [Section 6](https://styletts2.github.io/#emo) of the demo page. The speaker reference used is `1221-135767-0014.wav`, which is unseen during training. \n",
674
+ "\n",
675
+ "#### With `embedding_scale=1`\n",
676
+ "This is the classifier-free guidance scale. The higher the scale, the more the style is conditioned on the input text, and hence the more emotional the speech.\n",
677
+ "\n"
678
+ ]
679
+ },
680
+ {
681
+ "cell_type": "code",
682
+ "execution_count": null,
683
+ "id": "81addda4",
684
+ "metadata": {},
685
+ "outputs": [],
686
+ "source": [
687
+ "ref_s = compute_style(\"Demo/reference_audio/1221-135767-0014.wav\")"
688
+ ]
689
+ },
690
+ {
691
+ "cell_type": "code",
692
+ "execution_count": null,
693
+ "id": "be1b2a11",
694
+ "metadata": {},
695
+ "outputs": [],
696
+ "source": [
697
+ "texts = {}\n",
698
+ "texts['Happy'] = \"We are happy to invite you to join us on a journey to the past, where we will visit the most amazing monuments ever built by human hands.\"\n",
699
+ "texts['Sad'] = \"I am sorry to say that we have suffered a severe setback in our efforts to restore prosperity and confidence.\"\n",
700
+ "texts['Angry'] = \"The field of astronomy is a joke! Its theories are based on flawed observations and biased interpretations!\"\n",
701
+ "texts['Surprised'] = \"I can't believe it! You mean to tell me that you have discovered a new species of bacteria in this pond?\"\n",
702
+ "\n",
703
+ "for k,v in texts.items():\n",
704
+ " wav = inference(v, ref_s, diffusion_steps=10, alpha=0.3, beta=0.7, embedding_scale=1)\n",
705
+ " print(k + \": \")\n",
706
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
707
+ ]
708
+ },
709
+ {
710
+ "cell_type": "markdown",
711
+ "id": "96d262b8",
712
+ "metadata": {},
713
+ "source": [
714
+ "#### With `embedding_scale=2`"
715
+ ]
716
+ },
717
+ {
718
+ "cell_type": "code",
719
+ "execution_count": null,
720
+ "id": "3e7d40b4",
721
+ "metadata": {},
722
+ "outputs": [],
723
+ "source": [
724
+ "texts = {}\n",
725
+ "texts['Happy'] = \"We are happy to invite you to join us on a journey to the past, where we will visit the most amazing monuments ever built by human hands.\"\n",
726
+ "texts['Sad'] = \"I am sorry to say that we have suffered a severe setback in our efforts to restore prosperity and confidence.\"\n",
727
+ "texts['Angry'] = \"The field of astronomy is a joke! Its theories are based on flawed observations and biased interpretations!\"\n",
728
+ "texts['Surprised'] = \"I can't believe it! You mean to tell me that you have discovered a new species of bacteria in this pond?\"\n",
729
+ "\n",
730
+ "for k,v in texts.items():\n",
731
+ " noise = torch.randn(1,1,256).to(device)\n",
732
+ " wav = inference(v, ref_s, diffusion_steps=10, alpha=0.3, beta=0.7, embedding_scale=2)\n",
733
+ " print(k + \": \")\n",
734
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
735
+ ]
736
+ },
737
+ {
738
+ "cell_type": "markdown",
739
+ "id": "402b2bd6",
740
+ "metadata": {},
741
+ "source": [
742
+ "#### With `embedding_scale=2, alpha = 0.5, beta = 0.9`\n",
743
+ "`alpha` and `beta` determine how much of the style sampled from the text is used instead of the reference style. The higher `alpha` and `beta` are, the better the style suits the text, but the less similar it is to the reference. Using a higher `beta` makes the synthesized speech more emotional, at the cost of lower similarity to the reference. `alpha` controls the timbre of the speaker, while `beta` controls the prosody."
744
+ ]
745
+ },
746
+ {
747
+ "cell_type": "code",
748
+ "execution_count": null,
749
+ "id": "599de5d5",
750
+ "metadata": {},
751
+ "outputs": [],
752
+ "source": [
753
+ "texts = {}\n",
754
+ "texts['Happy'] = \"We are happy to invite you to join us on a journey to the past, where we will visit the most amazing monuments ever built by human hands.\"\n",
755
+ "texts['Sad'] = \"I am sorry to say that we have suffered a severe setback in our efforts to restore prosperity and confidence.\"\n",
756
+ "texts['Angry'] = \"The field of astronomy is a joke! Its theories are based on flawed observations and biased interpretations!\"\n",
757
+ "texts['Surprised'] = \"I can't believe it! You mean to tell me that you have discovered a new species of bacteria in this pond?\"\n",
758
+ "\n",
759
+ "for k,v in texts.items():\n",
760
+ " noise = torch.randn(1,1,256).to(device)\n",
761
+ " wav = inference(v, ref_s, diffusion_steps=10, alpha=0.5, beta=0.9, embedding_scale=2)\n",
762
+ " print(k + \": \")\n",
763
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
764
+ ]
765
+ },
766
+ {
767
+ "cell_type": "markdown",
768
+ "id": "48548866",
769
+ "metadata": {},
770
+ "source": [
771
+ "### Zero-shot speaker adaptation\n",
772
+ "This section recreates the \"Acoustic Environment Maintenance\" and \"Speaker’s Emotion Maintenance\" demos in [Section 4](https://styletts2.github.io/#libri) of the demo page. You can compare the generated samples to popular zero-shot TTS models like Vall-E. Note that the model was trained only on LibriTTS, roughly 250 times less data than was used to train Vall-E, yet it maintains these properties similarly well or better."
773
+ ]
774
+ },
775
+ {
776
+ "cell_type": "markdown",
777
+ "id": "23e81572",
778
+ "metadata": {},
779
+ "source": [
780
+ "#### Acoustic Environment Maintenance\n",
781
+ "\n",
782
+ "Since we want to maintain the acoustic environment in the speaker (timbre), we set `alpha = 0` to keep the speaker as close to the reference as possible while only changing the prosody according to the text."
783
+ ]
784
+ },
785
+ {
786
+ "cell_type": "code",
787
+ "execution_count": null,
788
+ "id": "8087bccb",
789
+ "metadata": {},
790
+ "outputs": [],
791
+ "source": [
792
+ "reference_dicts = {}\n",
793
+ "# format: (path, text)\n",
794
+ "reference_dicts['3'] = (\"Demo/reference_audio/3.wav\", \"As friends thing I definitely I've got more male friends.\")\n",
795
+ "reference_dicts['4'] = (\"Demo/reference_audio/4.wav\", \"Everything is run by computer but you got to know how to think before you can do a computer.\")\n",
796
+ "reference_dicts['5'] = (\"Demo/reference_audio/5.wav\", \"Then out in LA you guys got a whole another ball game within California to worry about.\")"
797
+ ]
798
+ },
799
+ {
800
+ "cell_type": "code",
801
+ "execution_count": null,
802
+ "id": "1e99c200",
803
+ "metadata": {},
804
+ "outputs": [],
805
+ "source": [
806
+ "noise = torch.randn(1,1,256).to(device)\n",
807
+ "for k, v in reference_dicts.items():\n",
808
+ " path, text = v\n",
809
+ " ref_s = compute_style(path)\n",
810
+ " start = time.time()\n",
811
+ " wav = inference(text, ref_s, alpha=0.0, beta=0.5, diffusion_steps=5, embedding_scale=1)\n",
812
+ " rtf = (time.time() - start) / (len(wav) / 24000)\n",
813
+ " print(f\"RTF = {rtf:5f}\")\n",
814
+ " import IPython.display as ipd\n",
815
+ " print('Synthesized: ' + text)\n",
816
+ " display(ipd.Audio(wav, rate=24000, normalize=False))\n",
817
+ " print('Reference:')\n",
818
+ " display(ipd.Audio(path, rate=24000, normalize=False))"
819
+ ]
820
+ },
821
+ {
822
+ "cell_type": "markdown",
823
+ "id": "7d56505d",
824
+ "metadata": {},
825
+ "source": [
826
+ "#### Speaker’s Emotion Maintenance\n",
827
+ "\n",
828
+ "Since we want to maintain the emotion in the speaker (prosody), we set `beta = 0.1` to keep the speaker as close to the reference as possible while allowing some diversity through a slight timbre change."
829
+ ]
830
+ },
831
+ {
832
+ "cell_type": "code",
833
+ "execution_count": null,
834
+ "id": "f90179e7",
835
+ "metadata": {},
836
+ "outputs": [],
837
+ "source": [
838
+ "reference_dicts = {}\n",
839
+ "# format: (path, text)\n",
840
+ "reference_dicts['Anger'] = (\"Demo/reference_audio/anger.wav\", \"We have to reduce the number of plastic bags.\")\n",
841
+ "reference_dicts['Sleepy'] = (\"Demo/reference_audio/sleepy.wav\", \"We have to reduce the number of plastic bags.\")\n",
842
+ "reference_dicts['Amused'] = (\"Demo/reference_audio/amused.wav\", \"We have to reduce the number of plastic bags.\")\n",
843
+ "reference_dicts['Disgusted'] = (\"Demo/reference_audio/disgusted.wav\", \"We have to reduce the number of plastic bags.\")"
844
+ ]
845
+ },
846
+ {
847
+ "cell_type": "code",
848
+ "execution_count": null,
849
+ "id": "2e6bdfed",
850
+ "metadata": {},
851
+ "outputs": [],
852
+ "source": [
853
+ "noise = torch.randn(1,1,256).to(device)\n",
854
+ "for k, v in reference_dicts.items():\n",
855
+ " path, text = v\n",
856
+ " ref_s = compute_style(path)\n",
857
+ " start = time.time()\n",
858
+ " wav = inference(text, ref_s, alpha=0.3, beta=0.1, diffusion_steps=10, embedding_scale=1)\n",
859
+ " rtf = (time.time() - start) / (len(wav) / 24000)\n",
860
+ " print(f\"RTF = {rtf:5f}\")\n",
861
+ " import IPython.display as ipd\n",
862
+ " print(k + ' Synthesized: ' + text)\n",
863
+ " display(ipd.Audio(wav, rate=24000, normalize=False))\n",
864
+ " print(k + ' Reference:')\n",
865
+ " display(ipd.Audio(path, rate=24000, normalize=False))"
866
+ ]
867
+ },
868
+ {
869
+ "cell_type": "markdown",
870
+ "id": "37ae3963",
871
+ "metadata": {},
872
+ "source": [
873
+ "### Longform Narration\n",
874
+ "\n",
875
+ "This section includes a basic implementation of Algorithm 1 in the paper for consistent longform audio generation. The example passage is taken from [Section 5](https://styletts2.github.io/#long) of the demo page."
876
+ ]
877
+ },
878
+ {
879
+ "cell_type": "code",
880
+ "execution_count": null,
881
+ "id": "f12a716b",
882
+ "metadata": {},
883
+ "outputs": [],
884
+ "source": [
885
+ "passage = '''If the supply of fruit is greater than the family needs, it may be made a source of income by sending the fresh fruit to the market if there is one near enough, or by preserving, canning, and making jelly for sale. To make such an enterprise a success the fruit and work must be first class. There is magic in the word \"Homemade,\" when the product appeals to the eye and the palate; but many careless and incompetent people have found to their sorrow that this word has not magic enough to float inferior goods on the market. As a rule large canning and preserving establishments are clean and have the best appliances, and they employ chemists and skilled labor. The home product must be very good to compete with the attractive goods that are sent out from such establishments. Yet for first class home made products there is a market in all large cities. All first-class grocers have customers who purchase such goods.'''"
886
+ ]
887
+ },
888
+ {
889
+ "cell_type": "code",
890
+ "execution_count": null,
891
+ "id": "a1a38079",
892
+ "metadata": {},
893
+ "outputs": [],
894
+ "source": [
895
+ "def LFinference(text, s_prev, ref_s, alpha = 0.3, beta = 0.7, t = 0.7, diffusion_steps=5, embedding_scale=1):\n",
896
+ " text = text.strip()\n",
897
+ " ps = global_phonemizer.phonemize([text])\n",
898
+ " ps = word_tokenize(ps[0])\n",
899
+ " ps = ' '.join(ps)\n",
900
+ " ps = ps.replace('``', '\"')\n",
901
+ " ps = ps.replace(\"''\", '\"')\n",
902
+ "\n",
903
+ " tokens = textclenaer(ps)\n",
904
+ " tokens.insert(0, 0)\n",
905
+ " tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)\n",
906
+ " \n",
907
+ " with torch.no_grad():\n",
908
+ " input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)\n",
909
+ " text_mask = length_to_mask(input_lengths).to(device)\n",
910
+ "\n",
911
+ " t_en = model.text_encoder(tokens, input_lengths, text_mask)\n",
912
+ " bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())\n",
913
+ " d_en = model.bert_encoder(bert_dur).transpose(-1, -2) \n",
914
+ "\n",
915
+ " s_pred = sampler(noise = torch.randn((1, 256)).unsqueeze(1).to(device), \n",
916
+ " embedding=bert_dur,\n",
917
+ " embedding_scale=embedding_scale,\n",
918
+ " features=ref_s, # reference from the same speaker as the embedding\n",
919
+ " num_steps=diffusion_steps).squeeze(1)\n",
920
+ " \n",
921
+ " if s_prev is not None:\n",
922
+ " # convex combination of previous and current style\n",
923
+ " s_pred = t * s_prev + (1 - t) * s_pred\n",
924
+ " \n",
925
+ " s = s_pred[:, 128:]\n",
926
+ " ref = s_pred[:, :128]\n",
927
+ " \n",
928
+ " ref = alpha * ref + (1 - alpha) * ref_s[:, :128]\n",
929
+ " s = beta * s + (1 - beta) * ref_s[:, 128:]\n",
930
+ "\n",
931
+ " s_pred = torch.cat([ref, s], dim=-1)\n",
932
+ "\n",
933
+ " d = model.predictor.text_encoder(d_en, \n",
934
+ " s, input_lengths, text_mask)\n",
935
+ "\n",
936
+ " x, _ = model.predictor.lstm(d)\n",
937
+ " duration = model.predictor.duration_proj(x)\n",
938
+ "\n",
939
+ " duration = torch.sigmoid(duration).sum(axis=-1)\n",
940
+ " pred_dur = torch.round(duration.squeeze()).clamp(min=1)\n",
941
+ "\n",
942
+ "\n",
943
+ " pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))\n",
944
+ " c_frame = 0\n",
945
+ " for i in range(pred_aln_trg.size(0)):\n",
946
+ " pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1\n",
947
+ " c_frame += int(pred_dur[i].data)\n",
948
+ "\n",
949
+ " # encode prosody\n",
950
+ " en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))\n",
951
+ " if model_params.decoder.type == \"hifigan\":\n",
952
+ " asr_new = torch.zeros_like(en)\n",
953
+ " asr_new[:, :, 0] = en[:, :, 0]\n",
954
+ " asr_new[:, :, 1:] = en[:, :, 0:-1]\n",
955
+ " en = asr_new\n",
956
+ "\n",
957
+ " F0_pred, N_pred = model.predictor.F0Ntrain(en, s)\n",
958
+ "\n",
959
+ " asr = (t_en @ pred_aln_trg.unsqueeze(0).to(device))\n",
960
+ " if model_params.decoder.type == \"hifigan\":\n",
961
+ " asr_new = torch.zeros_like(asr)\n",
962
+ " asr_new[:, :, 0] = asr[:, :, 0]\n",
963
+ " asr_new[:, :, 1:] = asr[:, :, 0:-1]\n",
964
+ " asr = asr_new\n",
965
+ "\n",
966
+ " out = model.decoder(asr, \n",
967
+ " F0_pred, N_pred, ref.squeeze().unsqueeze(0))\n",
968
+ " \n",
969
+ " \n",
970
+ " return out.squeeze().cpu().numpy()[..., :-100], s_pred # weird pulse at the end of the model, need to be fixed later"
971
+ ]
972
+ },
973
+ {
974
+ "cell_type": "code",
975
+ "execution_count": null,
976
+ "id": "e9088f7a",
977
+ "metadata": {},
978
+ "outputs": [],
979
+ "source": [
980
+ "# unseen speaker\n",
981
+ "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n",
982
+ "s_ref = compute_style(path)\n",
983
+ "sentences = passage.split('.') # simple split by period\n",
984
+ "wavs = []\n",
985
+ "s_prev = None\n",
986
+ "for text in sentences:\n",
987
+ " if text.strip() == \"\": continue\n",
988
+ " text += '.' # add it back\n",
989
+ " \n",
990
+ " wav, s_prev = LFinference(text, \n",
991
+ " s_prev, \n",
992
+ " s_ref, \n",
993
+ " alpha = 0.3, \n",
994
+ " beta = 0.9, # make it more suitable for the text\n",
995
+ " t = 0.7, \n",
996
+ " diffusion_steps=10, embedding_scale=1.5)\n",
997
+ " wavs.append(wav)\n",
998
+ "print('Synthesized: ')\n",
999
+ "display(ipd.Audio(np.concatenate(wavs), rate=24000, normalize=False))\n",
1000
+ "print('Reference: ')\n",
1001
+ "display(ipd.Audio(path, rate=24000, normalize=False))"
1002
+ ]
1003
+ },
1004
+ {
1005
+ "cell_type": "markdown",
1006
+ "id": "7517b657",
1007
+ "metadata": {},
1008
+ "source": [
1009
+ "### Style Transfer\n",
1010
+ "\n",
1011
+ "The following section demonstrates the style transfer capability for unseen speakers in [Section 6](https://styletts2.github.io/#emo) of the demo page. For this, we set `alpha=0.5, beta = 0.9` for the most pronounced effects (mostly using the sampled style)."
1012
+ ]
1013
+ },
1014
+ {
1015
+ "cell_type": "code",
1016
+ "execution_count": null,
1017
+ "id": "ed95d0f7",
1018
+ "metadata": {},
1019
+ "outputs": [],
1020
+ "source": [
1021
+ "def STinference(text, ref_s, ref_text, alpha = 0.3, beta = 0.7, diffusion_steps=5, embedding_scale=1):\n",
1022
+ " text = text.strip()\n",
1023
+ " ps = global_phonemizer.phonemize([text])\n",
1024
+ " ps = word_tokenize(ps[0])\n",
1025
+ " ps = ' '.join(ps)\n",
1026
+ "\n",
1027
+ " tokens = textclenaer(ps)\n",
1028
+ " tokens.insert(0, 0)\n",
1029
+ " tokens = torch.LongTensor(tokens).to(device).unsqueeze(0)\n",
1030
+ " \n",
1031
+ " ref_text = ref_text.strip()\n",
1032
+ " ps = global_phonemizer.phonemize([ref_text])\n",
1033
+ " ps = word_tokenize(ps[0])\n",
1034
+ " ps = ' '.join(ps)\n",
1035
+ "\n",
1036
+ " ref_tokens = textclenaer(ps)\n",
1037
+ " ref_tokens.insert(0, 0)\n",
1038
+ " ref_tokens = torch.LongTensor(ref_tokens).to(device).unsqueeze(0)\n",
1039
+ " \n",
1040
+ " \n",
1041
+ " with torch.no_grad():\n",
1042
+ " input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)\n",
1043
+ " text_mask = length_to_mask(input_lengths).to(device)\n",
1044
+ "\n",
1045
+ " t_en = model.text_encoder(tokens, input_lengths, text_mask)\n",
1046
+ " bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())\n",
1047
+ " d_en = model.bert_encoder(bert_dur).transpose(-1, -2) \n",
1048
+ " \n",
1049
+ " ref_input_lengths = torch.LongTensor([ref_tokens.shape[-1]]).to(device)\n",
1050
+ " ref_text_mask = length_to_mask(ref_input_lengths).to(device)\n",
1051
+ " ref_bert_dur = model.bert(ref_tokens, attention_mask=(~ref_text_mask).int())\n",
1052
+ " s_pred = sampler(noise = torch.randn((1, 256)).unsqueeze(1).to(device), \n",
1053
+ " embedding=bert_dur,\n",
1054
+ " embedding_scale=embedding_scale,\n",
1055
+ " features=ref_s, # reference from the same speaker as the embedding\n",
1056
+ " num_steps=diffusion_steps).squeeze(1)\n",
1057
+ "\n",
1058
+ "\n",
1059
+ " s = s_pred[:, 128:]\n",
1060
+ " ref = s_pred[:, :128]\n",
1061
+ "\n",
1062
+ " ref = alpha * ref + (1 - alpha) * ref_s[:, :128]\n",
1063
+ " s = beta * s + (1 - beta) * ref_s[:, 128:]\n",
1064
+ "\n",
1065
+ " d = model.predictor.text_encoder(d_en, \n",
1066
+ " s, input_lengths, text_mask)\n",
1067
+ "\n",
1068
+ " x, _ = model.predictor.lstm(d)\n",
1069
+ " duration = model.predictor.duration_proj(x)\n",
1070
+ "\n",
1071
+ " duration = torch.sigmoid(duration).sum(axis=-1)\n",
1072
+ " pred_dur = torch.round(duration.squeeze()).clamp(min=1)\n",
1073
+ "\n",
1074
+ "\n",
1075
+ " pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))\n",
1076
+ " c_frame = 0\n",
1077
+ " for i in range(pred_aln_trg.size(0)):\n",
1078
+ " pred_aln_trg[i, c_frame:c_frame + int(pred_dur[i].data)] = 1\n",
1079
+ " c_frame += int(pred_dur[i].data)\n",
1080
+ "\n",
1081
+ " # encode prosody\n",
1082
+ " en = (d.transpose(-1, -2) @ pred_aln_trg.unsqueeze(0).to(device))\n",
1083
+ " if model_params.decoder.type == \"hifigan\":\n",
1084
+ " asr_new = torch.zeros_like(en)\n",
1085
+ " asr_new[:, :, 0] = en[:, :, 0]\n",
1086
+ " asr_new[:, :, 1:] = en[:, :, 0:-1]\n",
1087
+ " en = asr_new\n",
1088
+ "\n",
1089
+ " F0_pred, N_pred = model.predictor.F0Ntrain(en, s)\n",
1090
+ "\n",
1091
+ " asr = (t_en @ pred_aln_trg.unsqueeze(0).to(device))\n",
1092
+ " if model_params.decoder.type == \"hifigan\":\n",
1093
+ " asr_new = torch.zeros_like(asr)\n",
1094
+ " asr_new[:, :, 0] = asr[:, :, 0]\n",
1095
+ " asr_new[:, :, 1:] = asr[:, :, 0:-1]\n",
1096
+ " asr = asr_new\n",
1097
+ "\n",
1098
+ " out = model.decoder(asr, \n",
1099
+ " F0_pred, N_pred, ref.squeeze().unsqueeze(0))\n",
1100
+ " \n",
1101
+ " \n",
1102
+ " return out.squeeze().cpu().numpy()[..., :-50] # weird pulse at the end of the model, need to be fixed later"
1103
+ ]
1104
+ },
1105
+ {
1106
+ "cell_type": "code",
1107
+ "execution_count": null,
1108
+ "id": "ec3f0da4",
1109
+ "metadata": {},
1110
+ "outputs": [],
1111
+ "source": [
1112
+ "# reference texts to sample styles\n",
1113
+ "\n",
1114
+ "ref_texts = {}\n",
1115
+ "ref_texts['Happy'] = \"We are happy to invite you to join us on a journey to the past, where we will visit the most amazing monuments ever built by human hands.\"\n",
1116
+ "ref_texts['Sad'] = \"I am sorry to say that we have suffered a severe setback in our efforts to restore prosperity and confidence.\"\n",
1117
+ "ref_texts['Angry'] = \"The field of astronomy is a joke! Its theories are based on flawed observations and biased interpretations!\"\n",
1118
+ "ref_texts['Surprised'] = \"I can't believe it! You mean to tell me that you have discovered a new species of bacteria in this pond?\""
1119
+ ]
1120
+ },
1121
+ {
1122
+ "cell_type": "code",
1123
+ "execution_count": null,
1124
+ "id": "6d0a3825",
1125
+ "metadata": {},
1126
+ "outputs": [],
1127
+ "source": [
1128
+ "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n",
1129
+ "s_ref = compute_style(path)\n",
1130
+ "\n",
1131
+ "text = \"Yea, his honourable worship is within, but he hath a godly minister or two with him, and likewise a leech.\"\n",
1132
+ "for k,v in ref_texts.items():\n",
1133
+ " wav = STinference(text, s_ref, v, diffusion_steps=10, alpha=0.5, beta=0.9, embedding_scale=1.5)\n",
1134
+ " print(k + \": \")\n",
1135
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
1136
+ ]
1137
+ },
1138
+ {
1139
+ "cell_type": "markdown",
1140
+ "id": "6750aed9",
1141
+ "metadata": {},
1142
+ "source": [
1143
+ "### Speech diversity\n",
1144
+ "\n",
1145
+ "This section reproduces samples in [Section 7](https://styletts2.github.io/#var) of the demo page. \n",
1146
+ "\n",
1147
+ "`alpha` and `beta` determine the diversity of the synthesized speech. There are two extreme cases:\n",
1148
+ "- If `alpha = 1` and `beta = 1`, the synthesized speech sounds the most dissimilar to the reference speaker, but it is also the most diverse (each time you synthesize speech it will be totally different). \n",
1149
+ "- If `alpha = 0` and `beta = 0`, the synthesized speech sounds the most similar to the reference speaker, but it is deterministic (i.e., the sampled style is not used for speech synthesis). \n"
1150
+ ]
1151
+ },
1152
+ {
1153
+ "cell_type": "markdown",
1154
+ "id": "f6ae0aa5",
1155
+ "metadata": {},
1156
+ "source": [
1157
+ "#### Default setting (`alpha = 0.3, beta=0.7`)\n",
1158
+ "This setting uses 70% of the reference timbre and 30% of the reference prosody, and uses the diffusion model to sample the rest based on the text."
1159
+ ]
1160
+ },
1161
+ {
1162
+ "cell_type": "code",
1163
+ "execution_count": null,
1164
+ "id": "36dc0148",
1165
+ "metadata": {},
1166
+ "outputs": [],
1167
+ "source": [
1168
+ "# unseen speaker\n",
1169
+ "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n",
1170
+ "ref_s = compute_style(path)\n",
1171
+ "\n",
1172
+ "text = \"How much variation is there?\"\n",
1173
+ "for _ in range(5):\n",
1174
+ " wav = inference(text, ref_s, diffusion_steps=10, alpha=0.3, beta=0.7, embedding_scale=1)\n",
1175
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
1176
+ ]
1177
+ },
1178
+ {
1179
+ "cell_type": "markdown",
1180
+ "id": "bf9ef421",
1181
+ "metadata": {},
1182
+ "source": [
1183
+ "#### Less diverse setting (`alpha = 0.1, beta=0.3`)\n",
1184
+ "This setting uses 90% of the reference timbre and 70% of the reference prosody. This makes it more similar to the reference speaker at the cost of less diverse samples."
1185
+ ]
1186
+ },
1187
+ {
1188
+ "cell_type": "code",
1189
+ "execution_count": null,
1190
+ "id": "9ba406bd",
1191
+ "metadata": {},
1192
+ "outputs": [],
1193
+ "source": [
1194
+ "# unseen speaker\n",
1195
+ "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n",
1196
+ "ref_s = compute_style(path)\n",
1197
+ "\n",
1198
+ "text = \"How much variation is there?\"\n",
1199
+ "for _ in range(5):\n",
1200
+ " wav = inference(text, ref_s, diffusion_steps=10, alpha=0.1, beta=0.3, embedding_scale=1)\n",
1201
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
1202
+ ]
1203
+ },
1204
+ {
1205
+ "cell_type": "markdown",
1206
+ "id": "a38fe464",
1207
+ "metadata": {},
1208
+ "source": [
1209
+ "#### More diverse setting (`alpha = 0.5, beta=0.95`)\n",
1210
+ "This setting uses 50% of the reference timbre and 5% of the reference prosody (so it uses 95% of the sampled prosody, which makes it more diverse), but this makes it more dissimilar to the reference speaker."
1211
+ ]
1212
+ },
1213
+ {
1214
+ "cell_type": "code",
1215
+ "execution_count": null,
1216
+ "id": "5f25bf94",
1217
+ "metadata": {},
1218
+ "outputs": [],
1219
+ "source": [
1220
+ "# unseen speaker\n",
1221
+ "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n",
1222
+ "ref_s = compute_style(path)\n",
1223
+ "\n",
1224
+ "text = \"How much variation is there?\"\n",
1225
+ "for _ in range(5):\n",
1226
+ " wav = inference(text, ref_s, diffusion_steps=10, alpha=0.5, beta=0.95, embedding_scale=1)\n",
1227
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
1228
+ ]
1229
+ },
1230
+ {
1231
+ "cell_type": "markdown",
1232
+ "id": "21c3a071",
1233
+ "metadata": {},
1234
+ "source": [
1235
+ "#### Extreme setting (`alpha = 1, beta=1`)\n",
1236
+ "This setting uses 0% of the reference timbre and prosody and relies entirely on the diffusion model to sample the style. This makes the speaker very dissimilar to the reference speaker."
1237
+ ]
1238
+ },
1239
+ {
1240
+ "cell_type": "code",
1241
+ "execution_count": null,
1242
+ "id": "fff8bab1",
1243
+ "metadata": {},
1244
+ "outputs": [],
1245
+ "source": [
1246
+ "# unseen speaker\n",
1247
+ "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n",
1248
+ "ref_s = compute_style(path)\n",
1249
+ "\n",
1250
+ "text = \"How much variation is there?\"\n",
1251
+ "for _ in range(5):\n",
1252
+ " wav = inference(text, ref_s, diffusion_steps=10, alpha=1, beta=1, embedding_scale=1)\n",
1253
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
1254
+ ]
1255
+ },
1256
+ {
1257
+ "cell_type": "markdown",
1258
+ "id": "a8741e5a",
1259
+ "metadata": {},
1260
+ "source": [
1261
+ "#### No variation (`alpha = 0, beta=0`)\n",
1262
+ "This setting uses 100% of the reference timbre and prosody and does not use the sampled style at all. This makes the speaker very similar to the reference speaker, but there is no variation."
1263
+ ]
1264
+ },
1265
+ {
1266
+ "cell_type": "code",
1267
+ "execution_count": null,
1268
+ "id": "e55dd281",
1269
+ "metadata": {},
1270
+ "outputs": [],
1271
+ "source": [
1272
+ "# unseen speaker\n",
1273
+ "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n",
1274
+ "ref_s = compute_style(path)\n",
1275
+ "\n",
1276
+ "text = \"How much variation is there?\"\n",
1277
+ "for _ in range(5):\n",
1278
+ " wav = inference(text, ref_s, diffusion_steps=10, alpha=0, beta=0, embedding_scale=1)\n",
1279
+ " display(ipd.Audio(wav, rate=24000, normalize=False))"
1280
+ ]
1281
+ },
1282
+ {
1283
+ "cell_type": "markdown",
1284
+ "id": "d5e86423",
1285
+ "metadata": {},
1286
+ "source": [
1287
+ "### Extra fun!\n",
1288
+ "\n",
1289
+ "Here we clone the voices of some of the StyleTTS 2 paper's authors from a few seconds of in-the-wild recordings. None of these voices is in the dataset, and all authors agreed to have their voices cloned here."
1290
+ ]
1291
+ },
1292
+ {
1293
+ "cell_type": "code",
1294
+ "execution_count": null,
1295
+ "id": "6f558314",
1296
+ "metadata": {},
1297
+ "outputs": [],
1298
+ "source": [
1299
+ "text = ''' StyleTTS 2 is a text to speech model that leverages style diffusion and adversarial training with large speech language models to achieve human level text to speech synthesis. '''"
1300
+ ]
1301
+ },
1302
+ {
1303
+ "cell_type": "code",
1304
+ "execution_count": null,
1305
+ "id": "caa5747c",
1306
+ "metadata": {},
1307
+ "outputs": [],
1308
+ "source": [
1309
+ "reference_dicts = {}\n",
1310
+ "reference_dicts['Yinghao'] = \"Demo/reference_audio/Yinghao.wav\"\n",
1311
+ "reference_dicts['Gavin'] = \"Demo/reference_audio/Gavin.wav\"\n",
1312
+ "reference_dicts['Vinay'] = \"Demo/reference_audio/Vinay.wav\"\n",
1313
+ "reference_dicts['Nima'] = \"Demo/reference_audio/Nima.wav\""
1314
+ ]
1315
+ },
1316
+ {
1317
+ "cell_type": "code",
1318
+ "execution_count": null,
1319
+ "id": "44a4cea1",
1320
+ "metadata": {},
1321
+ "outputs": [],
1322
+ "source": [
1323
+ "start = time.time()\n",
1324
+ "noise = torch.randn(1,1,256).to(device)\n",
1325
+ "for k, path in reference_dicts.items():\n",
1326
+ " ref_s = compute_style(path)\n",
1327
+ " \n",
1328
+ " wav = inference(text, ref_s, alpha=0.1, beta=0.5, diffusion_steps=5, embedding_scale=1)\n",
1329
+ " rtf = (time.time() - start) / (len(wav) / 24000)\n",
1330
+ " print('Speaker: ' + k)\n",
1331
+ " import IPython.display as ipd\n",
1332
+ " print('Synthesized:')\n",
1333
+ " display(ipd.Audio(wav, rate=24000, normalize=False))\n",
1334
+ " print('Reference:')\n",
1335
+ " display(ipd.Audio(path, rate=24000, normalize=False))"
1336
+ ]
1337
+ }
1338
+ ],
1339
+ "metadata": {
1340
+ "kernelspec": {
1341
+ "display_name": "Python3 (main venv)",
1342
+ "language": "python",
1343
+ "name": "main"
1344
+ },
1345
+ "language_info": {
1346
+ "codemirror_mode": {
1347
+ "name": "ipython",
1348
+ "version": 3
1349
+ },
1350
+ "file_extension": ".py",
1351
+ "mimetype": "text/x-python",
1352
+ "name": "python",
1353
+ "nbconvert_exporter": "python",
1354
+ "pygments_lexer": "ipython3",
1355
+ "version": "3.12.3"
1356
+ }
1357
+ },
1358
+ "nbformat": 4,
1359
+ "nbformat_minor": 5
1360
+ }
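Note on the notebook above: every inference-style function in it blends the diffusion-sampled style with the reference style using the same `alpha`/`beta` interpolation over the 256-dim style vector (first 128 dims timbre, last 128 dims prosody). The helper below is a minimal standalone sketch of that single step; `blend_style` is a hypothetical name and is not part of the notebook or the repository.

import torch

def blend_style(s_pred: torch.Tensor, ref_s: torch.Tensor,
                alpha: float, beta: float) -> tuple[torch.Tensor, torch.Tensor]:
    # s_pred, ref_s: (batch, 256) style vectors; first 128 dims = timbre, last 128 = prosody
    ref = alpha * s_pred[:, :128] + (1 - alpha) * ref_s[:, :128]  # timbre mix
    s = beta * s_pred[:, 128:] + (1 - beta) * ref_s[:, 128:]      # prosody mix
    return ref, s

# alpha=0, beta=0 keeps the reference style entirely; alpha=1, beta=1 uses only the sampled style.
ref, s = blend_style(torch.randn(1, 256), torch.randn(1, 256), alpha=0.3, beta=0.7)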
Modules/.ipynb_checkpoints/slmadv-checkpoint.py ADDED
@@ -0,0 +1,177 @@
1
+ import torch
2
+ import numpy as np
3
+ import torch.nn.functional as F
4
+
5
+
6
+ class SLMAdversarialLoss(torch.nn.Module):
7
+ def __init__(
8
+ self,
9
+ model,
10
+ wl,
11
+ sampler,
12
+ min_len,
13
+ max_len,
14
+ batch_percentage=0.5,
15
+ skip_update=10,
16
+ sig=1.5,
17
+ ):
18
+ super().__init__()
19
+ self.model = model
20
+ self.wl = wl
21
+ self.sampler = sampler
22
+
23
+ self.min_len = min_len
24
+ self.max_len = max_len
25
+ self.batch_percentage = batch_percentage
26
+
27
+ self.sig = sig
28
+ self.skip_update = skip_update
29
+
30
+ # ------------------------------------------------------------------ #
31
+ def forward(
32
+ self,
33
+ iters,
34
+ y_rec_gt,
35
+ y_rec_gt_pred,
36
+ waves,
37
+ mel_input_length,
38
+ ref_text,
39
+ ref_lengths,
40
+ use_ind,
41
+ s_trg,
42
+ ref_s=None,
43
+ ):
44
+ # ---- full-width mask (matches ref_text.size(1)) ----------------
45
+ seq_len = ref_text.size(1)
46
+ text_mask = (
47
+ torch.arange(seq_len, device=ref_text.device)
48
+ .unsqueeze(0)
49
+ >= ref_lengths.unsqueeze(1)
50
+ ) # shape [B, seq_len]
51
+
52
+ bert_dur = self.model.bert(ref_text, attention_mask=(~text_mask).int())
53
+ d_en = self.model.bert_encoder(bert_dur).transpose(-1, -2)
54
+
55
+ # ----- style / prosody sampling ---------------------------------
56
+ if use_ind and np.random.rand() < 0.5:
57
+ s_preds = s_trg
58
+ else:
59
+ num_steps = np.random.randint(3, 5)
60
+ noise = torch.randn_like(s_trg).unsqueeze(1).to(ref_text.device)
61
+ sampler_kwargs = dict(
62
+ noise=noise,
63
+ embedding=bert_dur,
64
+ embedding_scale=1,
65
+ embedding_mask_proba=0.1,
66
+ num_steps=num_steps,
67
+ )
68
+ if ref_s is not None:
69
+ sampler_kwargs["features"] = ref_s
70
+ s_preds = self.sampler(**sampler_kwargs).squeeze(1)
71
+
72
+ s_dur, s = s_preds[:, 128:], s_preds[:, :128]
73
+
74
+ # random alignment placeholder must match the *padded* token width
75
+ seq_len = ref_text.size(1)
76
+ rand_align = torch.randn(ref_text.size(0), seq_len, 2, device=ref_text.device)
77
+
78
+ d, _ = self.model.predictor(
79
+ d_en, s_dur, ref_lengths,
80
+ rand_align,
81
+ text_mask,
82
+ )
83
+
84
+ # ----- differentiable duration modelling -----------------------
85
+ attn_preds, output_lengths = [], []
86
+ for _s2s_pred, _len in zip(d, ref_lengths):
87
+ _s2s_pred_org = _s2s_pred[: _len]
88
+ _s2s_pred_sig = torch.sigmoid(_s2s_pred_org)
89
+ _dur_pred = _s2s_pred_sig.sum(dim=-1)
90
+
91
+ l = int(torch.round(_s2s_pred_sig.sum()).item())
92
+ t = torch.arange(l, device=ref_text.device).unsqueeze(0).expand(_len, l)
93
+ loc = torch.cumsum(_dur_pred, dim=0) - _dur_pred / 2
94
+ h = torch.exp(-0.5 * (t - (l - loc.unsqueeze(-1))) ** 2 / (self.sig**2))
95
+
96
+ out = F.conv1d(
97
+ _s2s_pred_org.unsqueeze(0),
98
+ h.unsqueeze(1),
99
+ padding=h.size(-1) - 1,
100
+ groups=int(_len),
101
+ )[..., :l]
102
+ attn_preds.append(F.softmax(out.squeeze(), dim=0))
103
+ output_lengths.append(l)
104
+
105
+ max_len = max(output_lengths)
106
+
107
+ # ----- build full-width alignment matrix -----------------------
108
+ with torch.no_grad():
109
+ t_en = self.model.text_encoder(ref_text, ref_lengths, text_mask)
110
+
111
+ seq_len = ref_text.size(1)
112
+ s2s_attn = torch.zeros(
113
+ len(ref_lengths), seq_len, max_len, device=ref_text.device
114
+ )
115
+ for bib, (attn, L) in enumerate(zip(attn_preds, output_lengths)):
116
+ s2s_attn[bib, : ref_lengths[bib], :L] = attn
117
+
118
+ asr_pred = t_en @ s2s_attn
119
+
120
+ _, p_pred = self.model.predictor(
121
+ d_en, s_dur, ref_lengths, s2s_attn, text_mask
122
+ )
123
+
124
+ # ----- clip extraction -----------------------------------------
125
+ mel_len = max(int(min(output_lengths) / 2 - 1), self.min_len // 2)
126
+ mel_len = min(mel_len, self.max_len // 2)
127
+
128
+ en, p_en, sp, wav = [], [], [], []
129
+ for bib, L_pred in enumerate(output_lengths):
130
+ L_gt = int(mel_input_length[bib].item() / 2)
131
+ if L_gt <= mel_len or L_pred <= mel_len:
132
+ continue
133
+
134
+ sp.append(s_preds[bib])
135
+
136
+ start = np.random.randint(0, L_pred - mel_len)
137
+ en.append(asr_pred[bib, :, start : start + mel_len])
138
+ p_en.append(p_pred[bib, :, start : start + mel_len])
139
+
140
+ start_gt = np.random.randint(0, L_gt - mel_len)
141
+ y = waves[bib][(start_gt * 2) * 300 : ((start_gt + mel_len) * 2) * 300]
142
+ wav.append(torch.from_numpy(y).to(ref_text.device))
143
+
144
+ if len(wav) >= self.batch_percentage * len(waves):
145
+ break
146
+
147
+ if len(sp) <= 1:
148
+ return None
149
+
150
+ sp = torch.stack(sp)
151
+ wav = torch.stack(wav).float()
152
+ en = torch.stack(en)
153
+ p_en = torch.stack(p_en)
154
+
155
+ F0_fake, N_fake = self.model.predictor.F0Ntrain(p_en, sp[:, 128:])
156
+ y_pred = self.model.decoder(en, F0_fake, N_fake, sp[:, :128])
157
+
158
+ # -------------- adversarial losses -----------------------------
159
+ if (iters + 1) % self.skip_update == 0:
160
+ d_loss = self.wl.discriminator(wav.squeeze(), y_pred.detach().squeeze()).mean()
161
+ else:
162
+ d_loss = 0
163
+
164
+ gen_loss = self.wl.generator(y_pred.squeeze()).mean()
165
+ return d_loss, gen_loss, y_pred.detach().cpu().numpy()
166
+
167
+
168
+ # ------------------------------------------------------------------ #
169
+ def length_to_mask(lengths: torch.Tensor) -> torch.Tensor:
170
+ """Classic length mask: 1 → PAD, 0 → real token."""
171
+ max_len = lengths.max()
172
+ mask = (
173
+ torch.arange(max_len, device=lengths.device)
174
+ .unsqueeze(0)
175
+ .expand(lengths.size(0), -1)
176
+ )
177
+ return mask >= lengths.unsqueeze(1)
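The main behavioural change in this checkpoint is the text mask: instead of calling `length_to_mask` (width `ref_lengths.max()`), `forward` now builds the mask against the padded token width `ref_text.size(1)`, so it always lines up with `ref_text` even when the batch is padded beyond its longest item. A minimal sketch of the two conventions follows; `mask_from_lengths` is an illustrative name, not repository code.

import torch

def mask_from_lengths(lengths: torch.Tensor, width: int) -> torch.Tensor:
    # True marks padding positions, False marks real tokens
    return torch.arange(width, device=lengths.device).unsqueeze(0) >= lengths.unsqueeze(1)

lengths = torch.tensor([3, 5])
print(mask_from_lengths(lengths, int(lengths.max())))  # classic behaviour: width 5
print(mask_from_lengths(lengths, 7))                   # full padded width: width 7, extra columns are all padding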
Modules/slmadv.py CHANGED
@@ -2,194 +2,176 @@ import torch
2
  import numpy as np
3
  import torch.nn.functional as F
4
 
5
- class SLMAdversarialLoss(torch.nn.Module):
6
 
7
- def __init__(self, model, wl, sampler, min_len, max_len, batch_percentage=0.5, skip_update=10, sig=1.5):
8
- super(SLMAdversarialLoss, self).__init__()
 
 
 
 
 
 
 
 
 
 
 
9
  self.model = model
10
  self.wl = wl
11
  self.sampler = sampler
12
-
13
  self.min_len = min_len
14
  self.max_len = max_len
15
  self.batch_percentage = batch_percentage
16
-
17
  self.sig = sig
18
  self.skip_update = skip_update
19
-
20
- def forward(self, iters, y_rec_gt, y_rec_gt_pred, waves, mel_input_length, ref_text, ref_lengths, use_ind, s_trg, ref_s=None):
21
- text_mask = length_to_mask(ref_lengths).to(ref_text.device)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  bert_dur = self.model.bert(ref_text, attention_mask=(~text_mask).int())
23
- d_en = self.model.bert_encoder(bert_dur).transpose(-1, -2)
24
-
 
25
  if use_ind and np.random.rand() < 0.5:
26
  s_preds = s_trg
27
  else:
28
  num_steps = np.random.randint(3, 5)
 
 
 
 
 
 
 
 
29
  if ref_s is not None:
30
- s_preds = self.sampler(noise = torch.randn_like(s_trg).unsqueeze(1).to(ref_text.device),
31
- embedding=bert_dur,
32
- embedding_scale=1,
33
- features=ref_s, # reference from the same speaker as the embedding
34
- embedding_mask_proba=0.1,
35
- num_steps=num_steps).squeeze(1)
36
- else:
37
- s_preds = self.sampler(noise = torch.randn_like(s_trg).unsqueeze(1).to(ref_text.device),
38
- embedding=bert_dur,
39
- embedding_scale=1,
40
- embedding_mask_proba=0.1,
41
- num_steps=num_steps).squeeze(1)
42
-
43
- s_dur = s_preds[:, 128:]
44
- s = s_preds[:, :128]
45
-
46
- d, _ = self.model.predictor(d_en, s_dur,
47
- ref_lengths,
48
- torch.randn(ref_lengths.shape[0], ref_lengths.max(), 2).to(ref_text.device),
49
- text_mask)
50
-
51
- bib = 0
52
-
53
- output_lengths = []
54
- attn_preds = []
55
-
56
- # differentiable duration modeling
57
- for _s2s_pred, _text_length in zip(d, ref_lengths):
58
-
59
- _s2s_pred_org = _s2s_pred[:_text_length, :]
60
-
61
- _s2s_pred = torch.sigmoid(_s2s_pred_org)
62
- _dur_pred = _s2s_pred.sum(axis=-1)
63
-
64
- l = int(torch.round(_s2s_pred.sum()).item())
65
- t = torch.arange(0, l).expand(l)
66
-
67
- t = torch.arange(0, l).unsqueeze(0).expand((len(_s2s_pred), l)).to(ref_text.device)
68
  loc = torch.cumsum(_dur_pred, dim=0) - _dur_pred / 2
69
-
70
- h = torch.exp(-0.5 * torch.square(t - (l - loc.unsqueeze(-1))) / (self.sig)**2)
71
-
72
- out = torch.nn.functional.conv1d(_s2s_pred_org.unsqueeze(0),
73
- h.unsqueeze(1),
74
- padding=h.shape[-1] - 1, groups=int(_text_length))[..., :l]
 
 
75
  attn_preds.append(F.softmax(out.squeeze(), dim=0))
76
-
77
  output_lengths.append(l)
78
 
79
  max_len = max(output_lengths)
80
-
 
81
  with torch.no_grad():
82
  t_en = self.model.text_encoder(ref_text, ref_lengths, text_mask)
83
-
84
- s2s_attn = torch.zeros(len(ref_lengths), int(ref_lengths.max()), max_len).to(ref_text.device)
85
- for bib in range(len(output_lengths)):
86
- s2s_attn[bib, :ref_lengths[bib], :output_lengths[bib]] = attn_preds[bib]
 
 
 
87
 
88
  asr_pred = t_en @ s2s_attn
89
 
90
- _, p_pred = self.model.predictor(d_en, s_dur,
91
- ref_lengths,
92
- s2s_attn,
93
- text_mask)
94
-
95
  mel_len = max(int(min(output_lengths) / 2 - 1), self.min_len // 2)
96
  mel_len = min(mel_len, self.max_len // 2)
97
-
98
- # get clips
99
-
100
- en = []
101
- p_en = []
102
- sp = []
103
-
104
- F0_fakes = []
105
- N_fakes = []
106
-
107
- wav = []
108
-
109
- for bib in range(len(output_lengths)):
110
- mel_length_pred = output_lengths[bib]
111
- mel_length_gt = int(mel_input_length[bib].item() / 2)
112
- if mel_length_gt <= mel_len or mel_length_pred <= mel_len:
113
  continue
114
 
115
  sp.append(s_preds[bib])
116
 
117
- random_start = np.random.randint(0, mel_length_pred - mel_len)
118
- en.append(asr_pred[bib, :, random_start:random_start+mel_len])
119
- p_en.append(p_pred[bib, :, random_start:random_start+mel_len])
120
 
121
- # get ground truth clips
122
- random_start = np.random.randint(0, mel_length_gt - mel_len)
123
- y = waves[bib][(random_start * 2) * 300:((random_start+mel_len) * 2) * 300]
124
  wav.append(torch.from_numpy(y).to(ref_text.device))
125
-
126
- if len(wav) >= self.batch_percentage * len(waves): # prevent OOM due to longer lengths
127
  break
128
 
129
  if len(sp) <= 1:
130
  return None
131
-
132
  sp = torch.stack(sp)
133
  wav = torch.stack(wav).float()
134
  en = torch.stack(en)
135
  p_en = torch.stack(p_en)
136
-
137
  F0_fake, N_fake = self.model.predictor.F0Ntrain(p_en, sp[:, 128:])
138
  y_pred = self.model.decoder(en, F0_fake, N_fake, sp[:, :128])
139
-
140
- # discriminator loss
141
  if (iters + 1) % self.skip_update == 0:
142
- if np.random.randint(0, 2) == 0:
143
- wav = y_rec_gt_pred
144
- use_rec = True
145
- else:
146
- use_rec = False
147
-
148
- crop_size = min(wav.size(-1), y_pred.size(-1))
149
- if use_rec: # use reconstructed (shorter lengths), do length invariant regularization
150
- if wav.size(-1) > y_pred.size(-1):
151
- real_GP = wav[:, : , :crop_size]
152
- out_crop = self.wl.discriminator_forward(real_GP.detach().squeeze())
153
- out_org = self.wl.discriminator_forward(wav.detach().squeeze())
154
- loss_reg = F.l1_loss(out_crop, out_org[..., :out_crop.size(-1)])
155
-
156
- if np.random.randint(0, 2) == 0:
157
- d_loss = self.wl.discriminator(real_GP.detach().squeeze(), y_pred.detach().squeeze()).mean()
158
- else:
159
- d_loss = self.wl.discriminator(wav.detach().squeeze(), y_pred.detach().squeeze()).mean()
160
- else:
161
- real_GP = y_pred[:, : , :crop_size]
162
- out_crop = self.wl.discriminator_forward(real_GP.detach().squeeze())
163
- out_org = self.wl.discriminator_forward(y_pred.detach().squeeze())
164
- loss_reg = F.l1_loss(out_crop, out_org[..., :out_crop.size(-1)])
165
-
166
- if np.random.randint(0, 2) == 0:
167
- d_loss = self.wl.discriminator(wav.detach().squeeze(), real_GP.detach().squeeze()).mean()
168
- else:
169
- d_loss = self.wl.discriminator(wav.detach().squeeze(), y_pred.detach().squeeze()).mean()
170
-
171
- # regularization (ignore length variation)
172
- d_loss += loss_reg
173
-
174
- out_gt = self.wl.discriminator_forward(y_rec_gt.detach().squeeze())
175
- out_rec = self.wl.discriminator_forward(y_rec_gt_pred.detach().squeeze())
176
-
177
- # regularization (ignore reconstruction artifacts)
178
- d_loss += F.l1_loss(out_gt, out_rec)
179
-
180
- else:
181
- d_loss = self.wl.discriminator(wav.detach().squeeze(), y_pred.detach().squeeze()).mean()
182
  else:
183
  d_loss = 0
184
-
185
- # generator loss
186
- gen_loss = self.wl.generator(y_pred.squeeze())
187
-
188
- gen_loss = gen_loss.mean()
189
-
190
  return d_loss, gen_loss, y_pred.detach().cpu().numpy()
191
-
192
- def length_to_mask(lengths):
193
- mask = torch.arange(lengths.max()).unsqueeze(0).expand(lengths.shape[0], -1).type_as(lengths)
194
- mask = torch.gt(mask+1, lengths.unsqueeze(1))
195
- return mask
 
 
 
 
 
 
 
 
2
  import numpy as np
3
  import torch.nn.functional as F
4
 
 
5
 
6
+ class SLMAdversarialLoss(torch.nn.Module):
7
+ def __init__(
8
+ self,
9
+ model,
10
+ wl,
11
+ sampler,
12
+ min_len,
13
+ max_len,
14
+ batch_percentage=0.5,
15
+ skip_update=10,
16
+ sig=1.5,
17
+ ):
18
+ super().__init__()
19
  self.model = model
20
  self.wl = wl
21
  self.sampler = sampler
22
+
23
  self.min_len = min_len
24
  self.max_len = max_len
25
  self.batch_percentage = batch_percentage
26
+
27
  self.sig = sig
28
  self.skip_update = skip_update
29
+
30
+ # ------------------------------------------------------------------ #
31
+ def forward(
32
+ self,
33
+ iters,
34
+ y_rec_gt,
35
+ y_rec_gt_pred,
36
+ waves,
37
+ mel_input_length,
38
+ ref_text,
39
+ ref_lengths,
40
+ use_ind,
41
+ s_trg,
42
+ ref_s=None,
43
+ ):
44
+ # ---- full-width mask (matches ref_text.size(1)) ----------------
45
+ seq_len = ref_text.size(1)
46
+ text_mask = (
47
+ torch.arange(seq_len, device=ref_text.device)
48
+ .unsqueeze(0)
49
+ >= ref_lengths.unsqueeze(1)
50
+ ) # shape [B, seq_len]
51
+
52
  bert_dur = self.model.bert(ref_text, attention_mask=(~text_mask).int())
53
+ d_en = self.model.bert_encoder(bert_dur).transpose(-1, -2)
54
+
55
+ # ----- style / prosody sampling ---------------------------------
56
  if use_ind and np.random.rand() < 0.5:
57
  s_preds = s_trg
58
  else:
59
  num_steps = np.random.randint(3, 5)
60
+ noise = torch.randn_like(s_trg).unsqueeze(1).to(ref_text.device)
61
+ sampler_kwargs = dict(
62
+ noise=noise,
63
+ embedding=bert_dur,
64
+ embedding_scale=1,
65
+ embedding_mask_proba=0.1,
66
+ num_steps=num_steps,
67
+ )
68
  if ref_s is not None:
69
+ sampler_kwargs["features"] = ref_s
70
+ s_preds = self.sampler(**sampler_kwargs).squeeze(1)
71
+
72
+ s_dur, s = s_preds[:, 128:], s_preds[:, :128]
73
+
74
+ # random alignment placeholder must match the *padded* token width
75
+ seq_len = ref_text.size(1)
76
+ rand_align = torch.randn(ref_text.size(0), seq_len, 2, device=ref_text.device)
77
+
78
+ d, _ = self.model.predictor(
79
+ d_en, s_dur, ref_lengths,
80
+ rand_align,
81
+ text_mask,
82
+ )
83
+
84
+ # ----- differentiable duration modelling -----------------------
85
+ attn_preds, output_lengths = [], []
86
+ for _s2s_pred, _len in zip(d, ref_lengths):
87
+ _s2s_pred_org = _s2s_pred[: _len]
88
+ _s2s_pred_sig = torch.sigmoid(_s2s_pred_org)
89
+ _dur_pred = _s2s_pred_sig.sum(dim=-1)
90
+
91
+ l = int(torch.round(_s2s_pred_sig.sum()).item())
92
+ t = torch.arange(l, device=ref_text.device).unsqueeze(0).expand(_len, l)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  loc = torch.cumsum(_dur_pred, dim=0) - _dur_pred / 2
94
+ h = torch.exp(-0.5 * (t - (l - loc.unsqueeze(-1))) ** 2 / (self.sig**2))
95
+
96
+ out = F.conv1d(
97
+ _s2s_pred_org.unsqueeze(0),
98
+ h.unsqueeze(1),
99
+ padding=h.size(-1) - 1,
100
+ groups=int(_len),
101
+ )[..., :l]
102
  attn_preds.append(F.softmax(out.squeeze(), dim=0))
 
103
  output_lengths.append(l)
104
 
105
  max_len = max(output_lengths)
106
+
107
+ # ----- build full-width alignment matrix -----------------------
108
  with torch.no_grad():
109
  t_en = self.model.text_encoder(ref_text, ref_lengths, text_mask)
110
+
111
+ seq_len = ref_text.size(1)
112
+ s2s_attn = torch.zeros(
113
+ len(ref_lengths), seq_len, max_len, device=ref_text.device
114
+ )
115
+ for bib, (attn, L) in enumerate(zip(attn_preds, output_lengths)):
116
+ s2s_attn[bib, : ref_lengths[bib], :L] = attn
117
 
118
  asr_pred = t_en @ s2s_attn
119
 
120
+ _, p_pred = self.model.predictor(
121
+ d_en, s_dur, ref_lengths, s2s_attn, text_mask
122
+ )
123
+
124
+ # ----- clip extraction -----------------------------------------
125
  mel_len = max(int(min(output_lengths) / 2 - 1), self.min_len // 2)
126
  mel_len = min(mel_len, self.max_len // 2)
127
+
128
+ en, p_en, sp, wav = [], [], [], []
129
+ for bib, L_pred in enumerate(output_lengths):
130
+ L_gt = int(mel_input_length[bib].item() / 2)
131
+ if L_gt <= mel_len or L_pred <= mel_len:
 
 
 
 
 
 
 
 
 
 
 
132
  continue
133
 
134
  sp.append(s_preds[bib])
135
 
136
+ start = np.random.randint(0, L_pred - mel_len)
137
+ en.append(asr_pred[bib, :, start : start + mel_len])
138
+ p_en.append(p_pred[bib, :, start : start + mel_len])
139
 
140
+ start_gt = np.random.randint(0, L_gt - mel_len)
141
+ y = waves[bib][(start_gt * 2) * 300 : ((start_gt + mel_len) * 2) * 300]
 
142
  wav.append(torch.from_numpy(y).to(ref_text.device))
143
+
144
+ if len(wav) >= self.batch_percentage * len(waves):
145
  break
146
 
147
  if len(sp) <= 1:
148
  return None
149
+
150
  sp = torch.stack(sp)
151
  wav = torch.stack(wav).float()
152
  en = torch.stack(en)
153
  p_en = torch.stack(p_en)
154
+
155
  F0_fake, N_fake = self.model.predictor.F0Ntrain(p_en, sp[:, 128:])
156
  y_pred = self.model.decoder(en, F0_fake, N_fake, sp[:, :128])
157
+
158
+ # -------------- adversarial losses -----------------------------
159
  if (iters + 1) % self.skip_update == 0:
160
+ d_loss = self.wl.discriminator(wav.squeeze(), y_pred.detach().squeeze()).mean()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
  else:
162
  d_loss = 0
163
+
164
+ gen_loss = self.wl.generator(y_pred.squeeze()).mean()
 
 
 
 
165
  return d_loss, gen_loss, y_pred.detach().cpu().numpy()
166
+
167
+
168
+ # ------------------------------------------------------------------ #
169
+ def length_to_mask(lengths: torch.Tensor) -> torch.Tensor:
170
+ """Classic length mask: 1 → PAD, 0 → real token."""
171
+ max_len = lengths.max()
172
+ mask = (
173
+ torch.arange(max_len, device=lengths.device)
174
+ .unsqueeze(0)
175
+ .expand(lengths.size(0), -1)
176
+ )
177
+ return mask >= lengths.unsqueeze(1)
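The "differentiable duration modelling" block kept in both versions of slmadv.py turns per-token duration predictions into a soft token-to-frame alignment so the decoder can be trained end-to-end. The sketch below shows the underlying idea in isolation: place a Gaussian window at each token's cumulative-duration midpoint and softmax over tokens. It is a simplified stand-in, not the exact kernel/conv1d formulation used in the file.

import torch

def soft_alignment(dur: torch.Tensor, sigma: float = 1.5) -> torch.Tensor:
    # dur: (num_tokens,) predicted durations in frames -> (num_tokens, num_frames) soft alignment
    num_frames = int(torch.round(dur.sum()).item())
    centres = torch.cumsum(dur, dim=0) - dur / 2         # midpoint of each token's span
    frames = torch.arange(num_frames, dtype=dur.dtype)   # frame index grid
    logits = -0.5 * (frames.unsqueeze(0) - centres.unsqueeze(1)) ** 2 / sigma ** 2
    return torch.softmax(logits, dim=0)                  # each frame column sums to 1 over tokens

# toy usage: three tokens lasting roughly 2, 3 and 1 frames
attn = soft_alignment(torch.tensor([2.0, 3.0, 1.0]))
print(attn.shape)  # torch.Size([3, 6])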
__pycache__/losses.cpython-310.pyc CHANGED
Binary files a/__pycache__/losses.cpython-310.pyc and b/__pycache__/losses.cpython-310.pyc differ
 
__pycache__/meldataset.cpython-310.pyc CHANGED
Binary files a/__pycache__/meldataset.cpython-310.pyc and b/__pycache__/meldataset.cpython-310.pyc differ
 
__pycache__/models.cpython-310.pyc CHANGED
Binary files a/__pycache__/models.cpython-310.pyc and b/__pycache__/models.cpython-310.pyc differ
 
__pycache__/optimizers.cpython-310.pyc CHANGED
Binary files a/__pycache__/optimizers.cpython-310.pyc and b/__pycache__/optimizers.cpython-310.pyc differ
 
__pycache__/utils.cpython-310.pyc CHANGED
Binary files a/__pycache__/utils.cpython-310.pyc and b/__pycache__/utils.cpython-310.pyc differ
 
events.out.tfevents.1749451143.164-152-17-237.47710.0 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ca5ac7da0de1cd8b2940a042eddfe0f7ea50cc867411a91d90240fa2186962b0
3
- size 88
 
 
 
 
events.out.tfevents.1749451143.164-152-17-237.47712.0 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:24f7b8986b9471590fd70ce3705e31a5b5a97854cdc1887585591ba318c1c150
3
- size 88
 
 
 
 
events.out.tfevents.1749451144.164-152-17-237.47706.0 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:a4f87f7a9fa06bc2a39e77d91d3dd4c7d76ee7c9bbbf2f6d6b73f3a9d6836d0a
3
- size 88
 
 
 
 
events.out.tfevents.1749451144.164-152-17-237.47708.0 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:4f57d72c8bb8f7d68c2a16d4e5eea3151d1cd8aa752be3a879c003aa481c19b3
3
- size 88
 
 
 
 
events.out.tfevents.1749451144.164-152-17-237.47709.0 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:103d5b2f29512166ac9979033248e0fb344847396ec0ed3dea7e96e5fad84e80
3
- size 88
 
 
 
 
events.out.tfevents.1749451144.164-152-17-237.47711.0 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:7fa0f5d9031c97fbf708f0a40c4e2950dcbd07c683488a659710ab9fcfd1c224
3
- size 88
 
 
 
 
events.out.tfevents.1749451220.164-152-17-237.48862.0 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:1f67c96bcdf2b41944f1f6710d3735137dd4254b5d58570d3b304e894de5acc8
3
- size 88
 
 
 
 
events.out.tfevents.1749451220.164-152-17-237.48863.0 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:2e09c7f592ec6d1f20e8a0b1e0fce4ff9b209f4c6d2e466ee6c2a10c761207a4
3
- size 88
 
 
 
 
events.out.tfevents.1749451220.164-152-17-237.48864.0 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:6358d1685327b1fc73bcdfc1ba181c900d63e4bc2a679a646aa697446cbcc818
3
- size 88
 
 
 
 
events.out.tfevents.1749451220.164-152-17-237.48865.0 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:0c2cfe09514438496a286074eff7b5d988953c53eca8c446a57b833aca2cd233
3
- size 88
 
 
 
 
events.out.tfevents.1749451220.164-152-17-237.48868.0 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:a2d99a10e442411fc79a40bba5ba012773c90f5be44254d6b872f3e350d0bb98
3
- size 88
 
 
 
 
events.out.tfevents.1749451221.164-152-17-237.48861.0 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:72554714ef2293ef47dc12bdc8698f70024abd16bfa736eb3de03d0e8b1c0eee
3
- size 88
 
 
 
 
events.out.tfevents.1749451221.164-152-17-237.48867.0 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:7a882509295dfd1193368519c9cad538370392117ee7b6c483ee939ee7979769
3
- size 88
 
 
 
 
events.out.tfevents.1749451222.164-152-17-237.48866.0 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:37a6fd68ad5ef36692d7d5389ba938e318c6287b20c2684a628e5245f186048c
3
- size 88
 
 
 
 
events.out.tfevents.1749453792.164-152-17-237.51057.0 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:8d42df1e6023c4e593c8479eecb153738ecc7600b94a1c388173708d38fc3688
3
- size 88
 
 
 
 
events.out.tfevents.1749453792.164-152-17-237.51059.0 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:3be5b37365ae5d97c1f0573f94f70a082f707c6c7d49926d118111ac7e48a818
3
- size 88
 
 
 
 
events.out.tfevents.1749453792.164-152-17-237.51061.0 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:16300e80b48b67bd14ce00c5751e4a6841df3da3be4873c677dc08a99a6c3aae
3
- size 88
 
 
 
 
events.out.tfevents.1749453792.164-152-17-237.51063.0 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:84bbbebc6a97e89725078bcf6533475d3286f6af25b23f781558e7cb8d8957e3
3
- size 88
 
 
 
 
events.out.tfevents.1749453793.164-152-17-237.51056.0 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:9a3560e484d3bd2c79ebe1507599d497f261cb3b62d1644697a5a2953d156c0d
3
- size 88
 
 
 
 
events.out.tfevents.1749453793.164-152-17-237.51058.0 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:c70617864f6b48f9177c3051cf3d0e857374656601e1f3b2130e91dd6d3090ed
3
- size 88
 
 
 
 
events.out.tfevents.1749453793.164-152-17-237.51060.0 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e2a2cccc5958217207ffb6d11f98e619f4dedf92682121709eb5870fb3db085d
3
- size 88
 
 
 
 
events.out.tfevents.1749453794.164-152-17-237.51062.0 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b66ebcaedfeb8d0a1d4a801e5e18783d8dfa41eedf362b2a0733a38c8f0a82fa
3
- size 88
 
 
 
 
events.out.tfevents.1749453905.164-152-17-237.52357.0 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:39098f172013dbdfbddcfb78c6c126b7b67671f89989a344d4262fcd433c3e9f
3
- size 88
 
 
 
 
events.out.tfevents.1749453905.164-152-17-237.52358.0 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e31a13e3e32ec6f2081c987fd6b1bd6c20bce2ca61312a576eea1aeceea533dc
3
- size 88
 
 
 
 
events.out.tfevents.1749453905.164-152-17-237.52360.0 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:c85710b39b9fb9764094a24aa502a715e94261613d7865f89e037553679ee109
3
- size 88
 
 
 
 
events.out.tfevents.1749453905.164-152-17-237.52361.0 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:521592225e7079f153bc3a7f18f79d9122eedbe6dffc5d342912d69ba4a9a1e1
3
- size 88
 
 
 
 
events.out.tfevents.1749453906.164-152-17-237.52355.0 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:914db70df7d7e69fc3396fb49f23b5a6849c9a251533d681c447df73ab81df34
3
- size 88
 
 
 
 
events.out.tfevents.1749453906.164-152-17-237.52356.0 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e1c4479d3572d16ab62395082c0c4a300c42b739f724966866a1a7b15c08344e
3
- size 88
 
 
 
 
events.out.tfevents.1749453906.164-152-17-237.52359.0 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f2daad6b2a7a604a0853f70aee92c15f79557490acd34797d875c670446db7e3
3
- size 88
 
 
 
 
events.out.tfevents.1749453906.164-152-17-237.52362.0 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:6abbb8d2b926e553f18af6c4c9203b122736ea0c2a508407150076c1b2842dad
3
- size 88
 
 
 
 
events.out.tfevents.1749453977.164-152-17-237.53096.0 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:7a6eaa7928ccbca758c1c4d170f439c3938a2f300a78a12746749cfca3b997cf
3
- size 88
 
 
 
 
events.out.tfevents.1749453977.164-152-17-237.53097.0 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f90c29e2391c038fb896ced73c9099014dfbff51e889bf6075ea8181a59da78d
3
- size 88
 
 
 
 
events.out.tfevents.1749453977.164-152-17-237.53098.0 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:0d82622b483fc183e71198f54b5b10c60851af9752b27ddf4dbaca39e988e15d
3
- size 88
 
 
 
 
events.out.tfevents.1749453977.164-152-17-237.53099.0 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:85fb51c72b78ca2c571570edd1a44288737f426a7e314f865c395d3f1d42d764
3
- size 88
 
 
 
 
events.out.tfevents.1749453977.164-152-17-237.53100.0 DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:9d3516306f7671fae64a2df06e433f84e451c5d5e6387a9159d6861958371c75
3
- size 88