ak36 committed on Jun 14, 2025

Commit

9c5b5e6

1 Parent(s): bf65828

v2

Browse files

Files changed (40) hide show

.ipynb_checkpoints/models-checkpoint.py +2 -1
.ipynb_checkpoints/optimizers-checkpoint.py +73 -0
Configs/.ipynb_checkpoints/config_ft_single-checkpoint.yml +5 -5
Configs/config_ft_single.yml +4 -4
Demo/.ipynb_checkpoints/Inference_pod_90h_30k-checkpoint.ipynb +423 -31
__pycache__/models.cpython-310.pyc +0 -0
logs/pod_90h_30k_second_v2/.ipynb_checkpoints/train-checkpoint.log +0 -0
logs/pod_90h_30k_second_v2/config_ft_single.yml +22 -0
logs/pod_90h_30k_second_v2/epoch_2nd_00000.pth +3 -0
logs/pod_90h_30k_second_v2/epoch_2nd_00001.pth +3 -0
logs/pod_90h_30k_second_v2/epoch_2nd_00002.pth +3 -0
logs/pod_90h_30k_second_v2/epoch_2nd_00003.pth +3 -0
logs/pod_90h_30k_second_v2/epoch_2nd_00004.pth +3 -0
logs/pod_90h_30k_second_v2/epoch_2nd_00005.pth +3 -0
logs/pod_90h_30k_second_v2/epoch_2nd_00006.pth +3 -0
logs/pod_90h_30k_second_v2/epoch_2nd_00007.pth +3 -0
logs/pod_90h_30k_second_v2/epoch_2nd_00008.pth +3 -0
logs/pod_90h_30k_second_v2/epoch_2nd_00009.pth +3 -0
logs/pod_90h_30k_second_v2/epoch_2nd_00010.pth +3 -0
logs/pod_90h_30k_second_v2/epoch_2nd_00011.pth +3 -0
logs/pod_90h_30k_second_v2/epoch_2nd_00012.pth +3 -0
logs/pod_90h_30k_second_v2/epoch_2nd_00013.pth +3 -0
logs/pod_90h_30k_second_v2/epoch_2nd_00014.pth +3 -0
logs/pod_90h_30k_second_v2/epoch_2nd_00015.pth +3 -0
logs/pod_90h_30k_second_v2/epoch_2nd_00016.pth +3 -0
logs/pod_90h_30k_second_v2/epoch_2nd_00017.pth +3 -0
logs/pod_90h_30k_second_v2/epoch_2nd_00018.pth +3 -0
logs/pod_90h_30k_second_v2/tensorboard/events.out.tfevents.1749758267.7f09b0e2c0b0.17026.0 +3 -0
logs/pod_90h_30k_second_v2/tensorboard/events.out.tfevents.1749758489.7f09b0e2c0b0.18353.0 +3 -0
logs/pod_90h_30k_second_v2/tensorboard/events.out.tfevents.1749758524.7f09b0e2c0b0.18773.0 +3 -0
logs/pod_90h_30k_second_v2/tensorboard/events.out.tfevents.1749758552.7f09b0e2c0b0.19160.0 +3 -0
logs/pod_90h_30k_second_v2/tensorboard/events.out.tfevents.1749758602.7f09b0e2c0b0.19654.0 +3 -0
logs/pod_90h_30k_second_v2/tensorboard/events.out.tfevents.1749763142.7f09b0e2c0b0.41611.0 +3 -0
logs/pod_90h_30k_second_v2/tensorboard/events.out.tfevents.1749763329.7f09b0e2c0b0.42740.0 +3 -0
logs/pod_90h_30k_second_v2/tensorboard/events.out.tfevents.1749763548.7f09b0e2c0b0.44123.0 +3 -0
logs/pod_90h_30k_second_v2/tensorboard/events.out.tfevents.1749789808.7f09b0e2c0b0.1500.0 +3 -0
logs/pod_90h_30k_second_v2/tensorboard/events.out.tfevents.1749790964.7f09b0e2c0b0.2345.0 +3 -0
logs/pod_90h_30k_second_v2/tensorboard/events.out.tfevents.1749791414.7f09b0e2c0b0.1465.0 +3 -0
logs/pod_90h_30k_second_v2/train.log +0 -0
models.py +2 -1

.ipynb_checkpoints/models-checkpoint.py CHANGED Viewed

@@ -703,8 +703,9 @@ def load_checkpoint(model, optimizer, path, load_only_params=True, ignore_module
     _ = [model[key].eval() for key in model]
     if not load_only_params:
-        epoch = state["epoch"]
         iters = state["iters"]
         optimizer.load_state_dict(state["optimizer"])
     else:
         epoch = 0

     _ = [model[key].eval() for key in model]
     if not load_only_params:
+        epoch = state["epoch"] + 1
         iters = state["iters"]
+        print('Load checkpoint from %s, epoch %d, iters %d' % (path, epoch, iters))
         optimizer.load_state_dict(state["optimizer"])
     else:
         epoch = 0

.ipynb_checkpoints/optimizers-checkpoint.py ADDED Viewed

	@@ -0,0 +1,73 @@

+#coding:utf-8
+import os, sys
+import os.path as osp
+import numpy as np
+import torch
+from torch import nn
+from torch.optim import Optimizer
+from functools import reduce
+from torch.optim import AdamW
+class MultiOptimizer:
+    def __init__(self, optimizers={}, schedulers={}):
+        self.optimizers = optimizers
+        self.schedulers = schedulers
+        self.keys = list(optimizers.keys())
+        self.param_groups = reduce(lambda x,y: x+y, [v.param_groups for v in self.optimizers.values()])
+    def state_dict(self):
+        state_dicts = [(key, self.optimizers[key].state_dict())\
+                       for key in self.keys]
+        return state_dicts
+    def load_state_dict(self, state_dict):
+        for key, val in state_dict:
+            try:
+                self.optimizers[key].load_state_dict(val)
+            except:
+                print("Unloaded %s" % key)
+    def step(self, key=None, scaler=None):
+        keys = [key] if key is not None else self.keys
+        _ = [self._step(key, scaler) for key in keys]
+    def _step(self, key, scaler=None):
+        if scaler is not None:
+            scaler.step(self.optimizers[key])
+            scaler.update()
+        else:
+            self.optimizers[key].step()
+    def zero_grad(self, key=None):
+        if key is not None:
+            self.optimizers[key].zero_grad()
+        else:
+            _ = [self.optimizers[key].zero_grad() for key in self.keys]
+    def scheduler(self, *args, key=None):
+        if key is not None:
+            self.schedulers[key].step(*args)
+        else:
+            _ = [self.schedulers[key].step(*args) for key in self.keys]
+def define_scheduler(optimizer, params):
+    scheduler = torch.optim.lr_scheduler.OneCycleLR(
+        optimizer,
+        max_lr=params.get('max_lr', 2e-4),
+        epochs=params.get('epochs', 200),
+        steps_per_epoch=params.get('steps_per_epoch', 1000),
+        pct_start=params.get('pct_start', 0.0),
+        div_factor=1,
+        final_div_factor=1)
+    return scheduler
+def build_optimizer(parameters_dict, scheduler_params_dict, lr):
+    optim = dict([(key, AdamW(params, lr=lr, weight_decay=1e-4, betas=(0.0, 0.99), eps=1e-9))
+                   for key, params in parameters_dict.items()])
+    schedulers = dict([(key, define_scheduler(opt, scheduler_params_dict[key])) \
+                       for key, opt in optim.items()])
+    multi_optim = MultiOptimizer(optim, schedulers)
+    return multi_optim

Configs/.ipynb_checkpoints/config_ft_single-checkpoint.yml CHANGED Viewed

@@ -1,5 +1,5 @@
 # ─── GLOBAL ──────────────────────────────────────────────────────────
-log_dir: logs/pod_90h_30k_second_lr1
 device: "cuda"
 batch_size: 12        # 40 GB A100, fp16
@@ -11,7 +11,7 @@ save_freq: 1
 log_interval: 50
 # leave blank on first run
-pretrained_model: "" #"/workspace/styletts2/logs/pod_90h_30k/epoch_2nd_00003.pth"
 second_stage_load_pretrained: true
 load_only_params: false
@@ -48,14 +48,14 @@ loss_params:
     lambda_sty: 1. # style reconstruction loss (2nd stage)
     lambda_diff: 1. # score matching loss (2nd stage)
-    diff_epoch: 0 # style diffusion starting epoch (2nd stage)
-    joint_epoch: 0 # joint training starting epoch (2nd stage)
 # ─── OPTIMISER ──────────────────────────────────────────────────────
 optimizer_params:
   lr: 0.0001
   bert_lr: 0.00001
-  ft_lr: 0.0001
   grad_accum_steps: 2
 # ─── MODEL (core network & sub-modules) ─────────────────────────────

 # ─── GLOBAL ──────────────────────────────────────────────────────────
+log_dir: logs/pod_90h_30k_second_v2
 device: "cuda"
 batch_size: 12        # 40 GB A100, fp16
 log_interval: 50
 # leave blank on first run
+pretrained_model: "/workspace/styletts2/logs/pod_90h_30k_second_v2/epoch_2nd_00005.pth"
 second_stage_load_pretrained: true
 load_only_params: false
     lambda_sty: 1. # style reconstruction loss (2nd stage)
     lambda_diff: 1. # score matching loss (2nd stage)
+    diff_epoch: 1 # style diffusion starting epoch (2nd stage)
+    joint_epoch: 5 # joint training starting epoch (2nd stage)
 # ─── OPTIMISER ──────────────────────────────────────────────────────
 optimizer_params:
   lr: 0.0001
   bert_lr: 0.00001
+  ft_lr: 0.00001
   grad_accum_steps: 2
 # ─── MODEL (core network & sub-modules) ─────────────────────────────

Configs/config_ft_single.yml CHANGED Viewed

@@ -1,5 +1,5 @@
 # ─── GLOBAL ──────────────────────────────────────────────────────────
-log_dir: logs/pod_90h_30k_second_lr1
 device: "cuda"
 batch_size: 12        # 40 GB A100, fp16
@@ -11,7 +11,7 @@ save_freq: 1
 log_interval: 50
 # leave blank on first run
-pretrained_model: "" #"/workspace/styletts2/logs/pod_90h_30k/epoch_2nd_00003.pth"
 second_stage_load_pretrained: true
 load_only_params: false
@@ -48,8 +48,8 @@ loss_params:
     lambda_sty: 1. # style reconstruction loss (2nd stage)
     lambda_diff: 1. # score matching loss (2nd stage)
-    diff_epoch: 0 # style diffusion starting epoch (2nd stage)
-    joint_epoch: 0 # joint training starting epoch (2nd stage)
 # ─── OPTIMISER ──────────────────────────────────────────────────────
 optimizer_params:

 # ─── GLOBAL ──────────────────────────────────────────────────────────
+log_dir: logs/pod_90h_30k_second_v2
 device: "cuda"
 batch_size: 12        # 40 GB A100, fp16
 log_interval: 50
 # leave blank on first run
+pretrained_model: "/workspace/styletts2/logs/pod_90h_30k_second_v2/epoch_2nd_00005.pth"
 second_stage_load_pretrained: true
 load_only_params: false
     lambda_sty: 1. # style reconstruction loss (2nd stage)
     lambda_diff: 1. # score matching loss (2nd stage)
+    diff_epoch: 1 # style diffusion starting epoch (2nd stage)
+    joint_epoch: 5 # joint training starting epoch (2nd stage)
 # ─── OPTIMISER ──────────────────────────────────────────────────────
 optimizer_params:

Demo/.ipynb_checkpoints/Inference_pod_90h_30k-checkpoint.ipynb CHANGED Viewed

@@ -20,7 +20,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "id": "96e173bf",
    "metadata": {},
    "outputs": [],
@@ -39,20 +39,227 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "id": "da84c60f",
    "metadata": {},
-   "outputs": [],
    "source": [
     "%cd .."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "id": "5a3ddcc8",
    "metadata": {},
-   "outputs": [],
    "source": [
     "# load packages\n",
     "import time\n",
@@ -77,7 +284,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "id": "00ee05e1",
    "metadata": {},
    "outputs": [],
@@ -113,7 +320,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "id": "bbdc04c0",
    "metadata": {},
    "outputs": [],
@@ -121,6 +328,111 @@
     "device = 'cuda' if torch.cuda.is_available() else 'cpu'"
    ]
   },
   {
    "cell_type": "markdown",
    "id": "7b9cecbe",
@@ -131,7 +443,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "id": "64fc4c0f",
    "metadata": {},
    "outputs": [],
@@ -143,12 +455,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "id": "48e7b644",
    "metadata": {},
    "outputs": [],
    "source": [
-    "config = yaml.safe_load(open(\"Models/LibriTTS/config.yml\"))\n",
     "\n",
     "# load pretrained ASR model\n",
     "ASR_config = config.get('ASR_config', False)\n",
@@ -167,10 +479,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "id": "ffc18cf7",
    "metadata": {},
-   "outputs": [],
    "source": [
     "model_params = recursive_munch(config['model_params'])\n",
     "model = build_model(model_params, text_aligner, pitch_extractor, plbert)\n",
@@ -180,21 +503,41 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "id": "64529d5c",
    "metadata": {},
    "outputs": [],
    "source": [
-    "params_whole = torch.load(\"Models/LibriTTS/epochs_2nd_00020.pth\", map_location='cpu')\n",
     "params = params_whole['net']"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "id": "895d9706",
    "metadata": {},
-   "outputs": [],
    "source": [
     "for key in model:\n",
     "    if key in params:\n",
@@ -217,7 +560,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "id": "c1a59db2",
    "metadata": {},
    "outputs": [],
@@ -227,7 +570,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "id": "e30985ab",
    "metadata": {},
    "outputs": [],
@@ -250,7 +593,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "id": "ca57469c",
    "metadata": {},
    "outputs": [],
@@ -335,7 +678,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "id": "cace9787",
    "metadata": {},
    "outputs": [],
@@ -454,6 +797,22 @@
     "    display(ipd.Audio(path, rate=24000, normalize=False))"
    ]
   },
   {
    "cell_type": "markdown",
    "id": "141e91b3",
@@ -470,7 +829,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "id": "81addda4",
    "metadata": {},
    "outputs": [],
@@ -481,9 +840,46 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "be1b2a11",
    "metadata": {},
    "outputs": [],
    "source": [
     "texts = {}\n",
     "texts['Happy'] = \"We are happy to invite you to join us on a journey to the past, where we will visit the most amazing monuments ever built by human hands.\"\n",
@@ -913,9 +1309,7 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "6d0a3825",
-   "metadata": {
-    "scrolled": false
-   },
    "outputs": [],
    "source": [
     "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n",
@@ -1110,9 +1504,7 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "44a4cea1",
-   "metadata": {
-    "scrolled": false
-   },
    "outputs": [],
    "source": [
     "start = time.time()\n",
@@ -1133,9 +1525,9 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "NLP",
    "language": "python",
-   "name": "nlp"
   },
   "language_info": {
    "codemirror_mode": {
@@ -1147,7 +1539,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.7"
   }
  },
  "nbformat": 4,

   },
   {
    "cell_type": "code",
+   "execution_count": 1,
    "id": "96e173bf",
    "metadata": {},
    "outputs": [],
   },
   {
    "cell_type": "code",
+   "execution_count": 4,
+   "id": "2458c639-10a0-4b57-8602-22bc893c5176",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Collecting git+https://github.com/resemble-ai/monotonic_align.git (from -r requirements.txt (line 17))\n",
+      "  Cloning https://github.com/resemble-ai/monotonic_align.git to /tmp/pip-req-build-ps9pa2ga\n",
+      "  Running command git clone --filter=blob:none --quiet https://github.com/resemble-ai/monotonic_align.git /tmp/pip-req-build-ps9pa2ga\n",
+      "  Resolved https://github.com/resemble-ai/monotonic_align.git to commit c6e5e6cb19882164027eb6e35118e841eed9298e\n",
+      "  Installing build dependencies ... \u001b[?25ldone\n",
+      "\u001b[?25h  Getting requirements to build wheel ... \u001b[?25ldone\n",
+      "\u001b[?25h  Preparing metadata (pyproject.toml) ... \u001b[?25ldone\n",
+      "\u001b[?25hCollecting SoundFile (from -r requirements.txt (line 1))\n",
+      "  Using cached soundfile-0.13.1-py2.py3-none-manylinux_2_28_x86_64.whl.metadata (16 kB)\n",
+      "Requirement already satisfied: torchaudio in /venv/main/lib/python3.12/site-packages (from -r requirements.txt (line 2)) (2.6.0+cu126)\n",
+      "Collecting munch (from -r requirements.txt (line 3))\n",
+      "  Using cached munch-4.0.0-py2.py3-none-any.whl.metadata (5.9 kB)\n",
+      "Requirement already satisfied: torch in /venv/main/lib/python3.12/site-packages (from -r requirements.txt (line 4)) (2.6.0+cu126)\n",
+      "Collecting pydub (from -r requirements.txt (line 5))\n",
+      "  Using cached pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)\n",
+      "Requirement already satisfied: pyyaml in /venv/main/lib/python3.12/site-packages (from -r requirements.txt (line 6)) (6.0.2)\n",
+      "Collecting librosa (from -r requirements.txt (line 7))\n",
+      "  Using cached librosa-0.11.0-py3-none-any.whl.metadata (8.7 kB)\n",
+      "Collecting nltk (from -r requirements.txt (line 8))\n",
+      "  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)\n",
+      "Collecting matplotlib (from -r requirements.txt (line 9))\n",
+      "  Downloading matplotlib-3.10.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)\n",
+      "Collecting accelerate (from -r requirements.txt (line 10))\n",
+      "  Using cached accelerate-1.7.0-py3-none-any.whl.metadata (19 kB)\n",
+      "Collecting transformers (from -r requirements.txt (line 11))\n",
+      "  Using cached transformers-4.52.4-py3-none-any.whl.metadata (38 kB)\n",
+      "Collecting einops (from -r requirements.txt (line 12))\n",
+      "  Using cached einops-0.8.1-py3-none-any.whl.metadata (13 kB)\n",
+      "Collecting einops-exts (from -r requirements.txt (line 13))\n",
+      "  Using cached einops_exts-0.0.4-py3-none-any.whl.metadata (621 bytes)\n",
+      "Requirement already satisfied: tqdm in /venv/main/lib/python3.12/site-packages (from -r requirements.txt (line 14)) (4.67.1)\n",
+      "Collecting typing (from -r requirements.txt (line 15))\n",
+      "  Using cached typing-3.7.4.3.tar.gz (78 kB)\n",
+      "  Preparing metadata (setup.py) ... \u001b[?25ldone\n",
+      "\u001b[?25hRequirement already satisfied: typing-extensions in /venv/main/lib/python3.12/site-packages (from -r requirements.txt (line 16)) (4.13.2)\n",
+      "Collecting cffi>=1.0 (from SoundFile->-r requirements.txt (line 1))\n",
+      "  Downloading cffi-1.17.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.5 kB)\n",
+      "Requirement already satisfied: numpy in /venv/main/lib/python3.12/site-packages (from SoundFile->-r requirements.txt (line 1)) (2.1.2)\n",
+      "Requirement already satisfied: filelock in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (3.18.0)\n",
+      "Requirement already satisfied: setuptools in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (70.2.0)\n",
+      "Requirement already satisfied: sympy==1.13.1 in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (1.13.1)\n",
+      "Requirement already satisfied: networkx in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (3.3)\n",
+      "Requirement already satisfied: jinja2 in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (3.1.4)\n",
+      "Requirement already satisfied: fsspec in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (2025.3.2)\n",
+      "Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.6.77 in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (12.6.77)\n",
+      "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.6.77 in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (12.6.77)\n",
+      "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.6.80 in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (12.6.80)\n",
+      "Requirement already satisfied: nvidia-cudnn-cu12==9.5.1.17 in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (9.5.1.17)\n",
+      "Requirement already satisfied: nvidia-cublas-cu12==12.6.4.1 in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (12.6.4.1)\n",
+      "Requirement already satisfied: nvidia-cufft-cu12==11.3.0.4 in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (11.3.0.4)\n",
+      "Requirement already satisfied: nvidia-curand-cu12==10.3.7.77 in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (10.3.7.77)\n",
+      "Requirement already satisfied: nvidia-cusolver-cu12==11.7.1.2 in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (11.7.1.2)\n",
+      "Requirement already satisfied: nvidia-cusparse-cu12==12.5.4.2 in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (12.5.4.2)\n",
+      "Requirement already satisfied: nvidia-cusparselt-cu12==0.6.3 in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (0.6.3)\n",
+      "Requirement already satisfied: nvidia-nccl-cu12==2.21.5 in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (2.21.5)\n",
+      "Requirement already satisfied: nvidia-nvtx-cu12==12.6.77 in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (12.6.77)\n",
+      "Requirement already satisfied: nvidia-nvjitlink-cu12==12.6.85 in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (12.6.85)\n",
+      "Requirement already satisfied: triton==3.2.0 in /venv/main/lib/python3.12/site-packages (from torch->-r requirements.txt (line 4)) (3.2.0)\n",
+      "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /venv/main/lib/python3.12/site-packages (from sympy==1.13.1->torch->-r requirements.txt (line 4)) (1.3.0)\n",
+      "Collecting audioread>=2.1.9 (from librosa->-r requirements.txt (line 7))\n",
+      "  Using cached audioread-3.0.1-py3-none-any.whl.metadata (8.4 kB)\n",
+      "Collecting numba>=0.51.0 (from librosa->-r requirements.txt (line 7))\n",
+      "  Downloading numba-0.61.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.8 kB)\n",
+      "Collecting scipy>=1.6.0 (from librosa->-r requirements.txt (line 7))\n",
+      "  Downloading scipy-1.15.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m62.0/62.0 kB\u001b[0m \u001b[31m1.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
+      "\u001b[?25hCollecting scikit-learn>=1.1.0 (from librosa->-r requirements.txt (line 7))\n",
+      "  Downloading scikit_learn-1.7.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (17 kB)\n",
+      "Collecting joblib>=1.0 (from librosa->-r requirements.txt (line 7))\n",
+      "  Using cached joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)\n",
+      "Requirement already satisfied: decorator>=4.3.0 in /venv/main/lib/python3.12/site-packages (from librosa->-r requirements.txt (line 7)) (5.2.1)\n",
+      "Collecting pooch>=1.1 (from librosa->-r requirements.txt (line 7))\n",
+      "  Using cached pooch-1.8.2-py3-none-any.whl.metadata (10 kB)\n",
+      "Collecting soxr>=0.3.2 (from librosa->-r requirements.txt (line 7))\n",
+      "  Downloading soxr-0.5.0.post1-cp312-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)\n",
+      "Collecting lazy_loader>=0.1 (from librosa->-r requirements.txt (line 7))\n",
+      "  Using cached lazy_loader-0.4-py3-none-any.whl.metadata (7.6 kB)\n",
+      "Collecting msgpack>=1.0 (from librosa->-r requirements.txt (line 7))\n",
+      "  Downloading msgpack-1.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.4 kB)\n",
+      "Collecting click (from nltk->-r requirements.txt (line 8))\n",
+      "  Using cached click-8.2.1-py3-none-any.whl.metadata (2.5 kB)\n",
+      "Collecting regex>=2021.8.3 (from nltk->-r requirements.txt (line 8))\n",
+      "  Downloading regex-2024.11.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m40.5/40.5 kB\u001b[0m \u001b[31m4.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hCollecting contourpy>=1.0.1 (from matplotlib->-r requirements.txt (line 9))\n",
+      "  Downloading contourpy-1.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.5 kB)\n",
+      "Collecting cycler>=0.10 (from matplotlib->-r requirements.txt (line 9))\n",
+      "  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)\n",
+      "Collecting fonttools>=4.22.0 (from matplotlib->-r requirements.txt (line 9))\n",
+      "  Downloading fonttools-4.58.2-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (106 kB)\n",
+      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m106.3/106.3 kB\u001b[0m \u001b[31m2.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
+      "\u001b[?25hCollecting kiwisolver>=1.3.1 (from matplotlib->-r requirements.txt (line 9))\n",
+      "  Downloading kiwisolver-1.4.8-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.2 kB)\n",
+      "Requirement already satisfied: packaging>=20.0 in /venv/main/lib/python3.12/site-packages (from matplotlib->-r requirements.txt (line 9)) (25.0)\n",
+      "Requirement already satisfied: pillow>=8 in /venv/main/lib/python3.12/site-packages (from matplotlib->-r requirements.txt (line 9)) (11.0.0)\n",
+      "Collecting pyparsing>=2.3.1 (from matplotlib->-r requirements.txt (line 9))\n",
+      "  Using cached pyparsing-3.2.3-py3-none-any.whl.metadata (5.0 kB)\n",
+      "Requirement already satisfied: python-dateutil>=2.7 in /venv/main/lib/python3.12/site-packages (from matplotlib->-r requirements.txt (line 9)) (2.9.0.post0)\n",
+      "Requirement already satisfied: psutil in /venv/main/lib/python3.12/site-packages (from accelerate->-r requirements.txt (line 10)) (7.0.0)\n",
+      "Requirement already satisfied: huggingface-hub>=0.21.0 in /venv/main/lib/python3.12/site-packages (from accelerate->-r requirements.txt (line 10)) (0.30.2)\n",
+      "Collecting safetensors>=0.4.3 (from accelerate->-r requirements.txt (line 10))\n",
+      "  Using cached safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)\n",
+      "Requirement already satisfied: requests in /venv/main/lib/python3.12/site-packages (from transformers->-r requirements.txt (line 11)) (2.32.3)\n",
+      "Collecting tokenizers<0.22,>=0.21 (from transformers->-r requirements.txt (line 11))\n",
+      "  Using cached tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)\n",
+      "Collecting pycparser (from cffi>=1.0->SoundFile->-r requirements.txt (line 1))\n",
+      "  Using cached pycparser-2.22-py3-none-any.whl.metadata (943 bytes)\n",
+      "Collecting llvmlite<0.45,>=0.44.0dev0 (from numba>=0.51.0->librosa->-r requirements.txt (line 7))\n",
+      "  Downloading llvmlite-0.44.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.0 kB)\n",
+      "Requirement already satisfied: platformdirs>=2.5.0 in /venv/main/lib/python3.12/site-packages (from pooch>=1.1->librosa->-r requirements.txt (line 7)) (4.3.7)\n",
+      "Requirement already satisfied: six>=1.5 in /venv/main/lib/python3.12/site-packages (from python-dateutil>=2.7->matplotlib->-r requirements.txt (line 9)) (1.17.0)\n",
+      "Requirement already satisfied: charset-normalizer<4,>=2 in /venv/main/lib/python3.12/site-packages (from requests->transformers->-r requirements.txt (line 11)) (3.4.1)\n",
+      "Requirement already satisfied: idna<4,>=2.5 in /venv/main/lib/python3.12/site-packages (from requests->transformers->-r requirements.txt (line 11)) (3.10)\n",
+      "Requirement already satisfied: urllib3<3,>=1.21.1 in /venv/main/lib/python3.12/site-packages (from requests->transformers->-r requirements.txt (line 11)) (2.4.0)\n",
+      "Requirement already satisfied: certifi>=2017.4.17 in /venv/main/lib/python3.12/site-packages (from requests->transformers->-r requirements.txt (line 11)) (2025.4.26)\n",
+      "Collecting threadpoolctl>=3.1.0 (from scikit-learn>=1.1.0->librosa->-r requirements.txt (line 7))\n",
+      "  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)\n",
+      "Requirement already satisfied: MarkupSafe>=2.0 in /venv/main/lib/python3.12/site-packages (from jinja2->torch->-r requirements.txt (line 4)) (2.1.5)\n",
+      "Using cached soundfile-0.13.1-py2.py3-none-manylinux_2_28_x86_64.whl (1.3 MB)\n",
+      "Using cached munch-4.0.0-py2.py3-none-any.whl (9.9 kB)\n",
+      "Using cached pydub-0.25.1-py2.py3-none-any.whl (32 kB)\n",
+      "Using cached librosa-0.11.0-py3-none-any.whl (260 kB)\n",
+      "Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)\n",
+      "Downloading matplotlib-3.10.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (8.6 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m8.6/8.6 MB\u001b[0m \u001b[31m28.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
+      "\u001b[?25hUsing cached accelerate-1.7.0-py3-none-any.whl (362 kB)\n",
+      "Using cached transformers-4.52.4-py3-none-any.whl (10.5 MB)\n",
+      "Using cached einops-0.8.1-py3-none-any.whl (64 kB)\n",
+      "Using cached einops_exts-0.0.4-py3-none-any.whl (3.9 kB)\n",
+      "Using cached audioread-3.0.1-py3-none-any.whl (23 kB)\n",
+      "Downloading cffi-1.17.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (479 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m479.4/479.4 kB\u001b[0m \u001b[31m169.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading contourpy-1.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (323 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m323.7/323.7 kB\u001b[0m \u001b[31m127.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hUsing cached cycler-0.12.1-py3-none-any.whl (8.3 kB)\n",
+      "Downloading fonttools-4.58.2-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.9 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.9/4.9 MB\u001b[0m \u001b[31m87.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mta \u001b[36m0:00:01\u001b[0m\n",
+      "\u001b[?25hUsing cached joblib-1.5.1-py3-none-any.whl (307 kB)\n",
+      "Downloading kiwisolver-1.4.8-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.5 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.5/1.5 MB\u001b[0m \u001b[31m185.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hUsing cached lazy_loader-0.4-py3-none-any.whl (12 kB)\n",
+      "Downloading msgpack-1.1.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (401 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m401.4/401.4 kB\u001b[0m \u001b[31m192.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hDownloading numba-0.61.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (3.9 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.9/3.9 MB\u001b[0m \u001b[31m42.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mta \u001b[36m0:00:01\u001b[0m\n",
+      "\u001b[?25hUsing cached pooch-1.8.2-py3-none-any.whl (64 kB)\n",
+      "Using cached pyparsing-3.2.3-py3-none-any.whl (111 kB)\n",
+      "Downloading regex-2024.11.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (796 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m796.9/796.9 kB\u001b[0m \u001b[31m125.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hUsing cached safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (471 kB)\n",
+      "Downloading scikit_learn-1.7.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.5 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.5/12.5 MB\u001b[0m \u001b[31m43.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
+      "\u001b[?25hDownloading scipy-1.15.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (37.3 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m37.3/37.3 MB\u001b[0m \u001b[31m26.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
+      "\u001b[?25hDownloading soxr-0.5.0.post1-cp312-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (248 kB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m248.5/248.5 kB\u001b[0m \u001b[31m36.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+      "\u001b[?25hUsing cached tokenizers-0.21.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)\n",
+      "Using cached click-8.2.1-py3-none-any.whl (102 kB)\n",
+      "Downloading llvmlite-0.44.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (42.4 MB)\n",
+      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m42.4/42.4 MB\u001b[0m \u001b[31m14.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
+      "\u001b[?25hUsing cached threadpoolctl-3.6.0-py3-none-any.whl (18 kB)\n",
+      "Using cached pycparser-2.22-py3-none-any.whl (117 kB)\n",
+      "Building wheels for collected packages: typing, monotonic_align\n",
+      "  Building wheel for typing (setup.py) ... \u001b[?25ldone\n",
+      "\u001b[?25h  Created wheel for typing: filename=typing-3.7.4.3-py3-none-any.whl size=26304 sha256=7bd8523fe1f7cb4e20da87ee646956891addbdea2d87074f6bbf77fe282e8720\n",
+      "  Stored in directory: /root/.cache/pip/wheels/12/98/52/2bffe242a9a487f00886e43b8ed8dac46456702e11a0d6abef\n",
+      "  Building wheel for monotonic_align (pyproject.toml) ... \u001b[?25ldone\n",
+      "\u001b[?25h  Created wheel for monotonic_align: filename=monotonic_align-1.2-cp312-cp312-linux_x86_64.whl size=1543517 sha256=dc9566d3e5a0656ebf939e760d934e0926d435f336db84e0019c7391576cd4cc\n",
+      "  Stored in directory: /tmp/pip-ephem-wheel-cache-0gzg26zy/wheels/76/0a/37/00634137cd000799e060087bd1cb49a060ac6a48fc42a15488\n",
+      "Successfully built typing monotonic_align\n",
+      "Installing collected packages: pydub, typing, threadpoolctl, soxr, scipy, safetensors, regex, pyparsing, pycparser, munch, msgpack, monotonic_align, llvmlite, lazy_loader, kiwisolver, joblib, fonttools, einops, cycler, contourpy, click, audioread, scikit-learn, pooch, numba, nltk, matplotlib, einops-exts, cffi, tokenizers, SoundFile, transformers, librosa, accelerate\n",
+      "Successfully installed SoundFile-0.13.1 accelerate-1.7.0 audioread-3.0.1 cffi-1.17.1 click-8.2.1 contourpy-1.3.2 cycler-0.12.1 einops-0.8.1 einops-exts-0.0.4 fonttools-4.58.2 joblib-1.5.1 kiwisolver-1.4.8 lazy_loader-0.4 librosa-0.11.0 llvmlite-0.44.0 matplotlib-3.10.3 monotonic_align-1.2 msgpack-1.1.0 munch-4.0.0 nltk-3.9.1 numba-0.61.2 pooch-1.8.2 pycparser-2.22 pydub-0.25.1 pyparsing-3.2.3 regex-2024.11.6 safetensors-0.5.3 scikit-learn-1.7.0 scipy-1.15.3 soxr-0.5.0.post1 threadpoolctl-3.6.0 tokenizers-0.21.1 transformers-4.52.4 typing-3.7.4.3\n"
+     ]
+    }
+   ],
+   "source": [
+    "!pip install -r requirements.txt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
    "id": "da84c60f",
    "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "/workspace/styletts2\n"
+     ]
+    }
+   ],
    "source": [
     "%cd .."
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 5,
    "id": "5a3ddcc8",
    "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "177\n"
+     ]
+    }
+   ],
    "source": [
     "# load packages\n",
     "import time\n",
   },
   {
    "cell_type": "code",
+   "execution_count": 6,
    "id": "00ee05e1",
    "metadata": {},
    "outputs": [],
   },
   {
    "cell_type": "code",
+   "execution_count": 7,
    "id": "bbdc04c0",
    "metadata": {},
    "outputs": [],
     "device = 'cuda' if torch.cuda.is_available() else 'cpu'"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "bc8a517e-915c-427f-a3e0-b96310317bec",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Requirement already satisfied: phonemizer in /venv/main/lib/python3.12/site-packages (3.3.0)\n",
+      "Requirement already satisfied: joblib in /venv/main/lib/python3.12/site-packages (from phonemizer) (1.5.1)\n",
+      "Requirement already satisfied: segments in /venv/main/lib/python3.12/site-packages (from phonemizer) (2.3.0)\n",
+      "Requirement already satisfied: attrs>=18.1 in /venv/main/lib/python3.12/site-packages (from phonemizer) (25.3.0)\n",
+      "Requirement already satisfied: dlinfo in /venv/main/lib/python3.12/site-packages (from phonemizer) (2.0.0)\n",
+      "Requirement already satisfied: typing-extensions in /venv/main/lib/python3.12/site-packages (from phonemizer) (4.13.2)\n",
+      "Requirement already satisfied: regex in /venv/main/lib/python3.12/site-packages (from segments->phonemizer) (2024.11.6)\n",
+      "Requirement already satisfied: csvw>=1.5.6 in /venv/main/lib/python3.12/site-packages (from segments->phonemizer) (3.5.1)\n",
+      "Requirement already satisfied: isodate in /venv/main/lib/python3.12/site-packages (from csvw>=1.5.6->segments->phonemizer) (0.7.2)\n",
+      "Requirement already satisfied: python-dateutil in /venv/main/lib/python3.12/site-packages (from csvw>=1.5.6->segments->phonemizer) (2.9.0.post0)\n",
+      "Requirement already satisfied: rfc3986<2 in /venv/main/lib/python3.12/site-packages (from csvw>=1.5.6->segments->phonemizer) (1.5.0)\n",
+      "Requirement already satisfied: uritemplate>=3.0.0 in /venv/main/lib/python3.12/site-packages (from csvw>=1.5.6->segments->phonemizer) (4.2.0)\n",
+      "Requirement already satisfied: babel in /venv/main/lib/python3.12/site-packages (from csvw>=1.5.6->segments->phonemizer) (2.17.0)\n",
+      "Requirement already satisfied: requests in /venv/main/lib/python3.12/site-packages (from csvw>=1.5.6->segments->phonemizer) (2.32.3)\n",
+      "Requirement already satisfied: language-tags in /venv/main/lib/python3.12/site-packages (from csvw>=1.5.6->segments->phonemizer) (1.2.0)\n",
+      "Requirement already satisfied: rdflib in /venv/main/lib/python3.12/site-packages (from csvw>=1.5.6->segments->phonemizer) (7.1.4)\n",
+      "Requirement already satisfied: colorama in /venv/main/lib/python3.12/site-packages (from csvw>=1.5.6->segments->phonemizer) (0.4.6)\n",
+      "Requirement already satisfied: jsonschema in /venv/main/lib/python3.12/site-packages (from csvw>=1.5.6->segments->phonemizer) (4.24.0)\n",
+      "Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /venv/main/lib/python3.12/site-packages (from jsonschema->csvw>=1.5.6->segments->phonemizer) (2025.4.1)\n",
+      "Requirement already satisfied: referencing>=0.28.4 in /venv/main/lib/python3.12/site-packages (from jsonschema->csvw>=1.5.6->segments->phonemizer) (0.36.2)\n",
+      "Requirement already satisfied: rpds-py>=0.7.1 in /venv/main/lib/python3.12/site-packages (from jsonschema->csvw>=1.5.6->segments->phonemizer) (0.25.1)\n",
+      "Requirement already satisfied: six>=1.5 in /venv/main/lib/python3.12/site-packages (from python-dateutil->csvw>=1.5.6->segments->phonemizer) (1.17.0)\n",
+      "Requirement already satisfied: pyparsing<4,>=2.1.0 in /venv/main/lib/python3.12/site-packages (from rdflib->csvw>=1.5.6->segments->phonemizer) (3.2.3)\n",
+      "Requirement already satisfied: charset-normalizer<4,>=2 in /venv/main/lib/python3.12/site-packages (from requests->csvw>=1.5.6->segments->phonemizer) (3.4.1)\n",
+      "Requirement already satisfied: idna<4,>=2.5 in /venv/main/lib/python3.12/site-packages (from requests->csvw>=1.5.6->segments->phonemizer) (3.10)\n",
+      "Requirement already satisfied: urllib3<3,>=1.21.1 in /venv/main/lib/python3.12/site-packages (from requests->csvw>=1.5.6->segments->phonemizer) (2.4.0)\n",
+      "Requirement already satisfied: certifi>=2017.4.17 in /venv/main/lib/python3.12/site-packages (from requests->csvw>=1.5.6->segments->phonemizer) (2025.4.26)\n"
+     ]
+    }
+   ],
+   "source": [
+    "!pip install phonemizer"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "48f471f2-ae4a-489e-9d6b-11caff294cf6",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Reading package lists... Done\n",
+      "Building dependency tree... Done\n",
+      "Reading state information... Done\n",
+      "The following additional packages will be installed:\n",
+      "  espeak-ng-data libespeak-ng1 libpcaudio0 libsonic0\n",
+      "The following NEW packages will be installed:\n",
+      "  espeak-ng espeak-ng-data libespeak-ng1 libpcaudio0 libsonic0\n",
+      "0 upgraded, 5 newly installed, 0 to remove and 42 not upgraded.\n",
+      "Need to get 5128 kB of archives.\n",
+      "After this operation, 13.7 MB of additional disk space will be used.\n",
+      "Get:1 http://archive.ubuntu.com/ubuntu noble/main amd64 libpcaudio0 amd64 1.2-2build3 [9144 B]\n",
+      "Get:2 http://archive.ubuntu.com/ubuntu noble/main amd64 libsonic0 amd64 0.2.0-13build1 [10.3 kB]\n",
+      "Get:3 http://archive.ubuntu.com/ubuntu noble/main amd64 espeak-ng-data amd64 1.51+dfsg-12build1 [4538 kB]\n",
+      "Get:4 http://archive.ubuntu.com/ubuntu noble/main amd64 libespeak-ng1 amd64 1.51+dfsg-12build1 [206 kB]\n",
+      "Get:5 http://archive.ubuntu.com/ubuntu noble/universe amd64 espeak-ng amd64 1.51+dfsg-12build1 [364 kB]\n",
+      "Fetched 5128 kB in 2s (3310 kB/s)     \n",
+      "debconf: delaying package configuration, since apt-utils is not installed\n",
+      "Selecting previously unselected package libpcaudio0:amd64.\n",
+      "(Reading database ... 41253 files and directories currently installed.)\n",
+      "Preparing to unpack .../libpcaudio0_1.2-2build3_amd64.deb ...\n",
+      "Unpacking libpcaudio0:amd64 (1.2-2build3) ...\n",
+      "Selecting previously unselected package libsonic0:amd64.\n",
+      "Preparing to unpack .../libsonic0_0.2.0-13build1_amd64.deb ...\n",
+      "Unpacking libsonic0:amd64 (0.2.0-13build1) ...\n",
+      "Selecting previously unselected package espeak-ng-data:amd64.\n",
+      "Preparing to unpack .../espeak-ng-data_1.51+dfsg-12build1_amd64.deb ...\n",
+      "Unpacking espeak-ng-data:amd64 (1.51+dfsg-12build1) ...\n",
+      "Selecting previously unselected package libespeak-ng1:amd64.\n",
+      "Preparing to unpack .../libespeak-ng1_1.51+dfsg-12build1_amd64.deb ...\n",
+      "Unpacking libespeak-ng1:amd64 (1.51+dfsg-12build1) ...\n",
+      "Selecting previously unselected package espeak-ng.\n",
+      "Preparing to unpack .../espeak-ng_1.51+dfsg-12build1_amd64.deb ...\n",
+      "Unpacking espeak-ng (1.51+dfsg-12build1) ...\n",
+      "Setting up libpcaudio0:amd64 (1.2-2build3) ...\n",
+      "Setting up libsonic0:amd64 (0.2.0-13build1) ...\n",
+      "Setting up espeak-ng-data:amd64 (1.51+dfsg-12build1) ...\n",
+      "Setting up libespeak-ng1:amd64 (1.51+dfsg-12build1) ...\n",
+      "Setting up espeak-ng (1.51+dfsg-12build1) ...\n",
+      "Processing triggers for man-db (2.12.0-4build2) ...\n",
+      "Processing triggers for libc-bin (2.39-0ubuntu8.4) ...\n"
+     ]
+    }
+   ],
+   "source": [
+    "!sudo apt-get install -y -V espeak-ng"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "7b9cecbe",
   },
   {
    "cell_type": "code",
+   "execution_count": 14,
    "id": "64fc4c0f",
    "metadata": {},
    "outputs": [],
   },
   {
    "cell_type": "code",
+   "execution_count": 15,
    "id": "48e7b644",
    "metadata": {},
    "outputs": [],
    "source": [
+    "config = yaml.safe_load(open(\"logs/pod_90h_30k_second_lr1/config_ft_single.yml\"))\n",
     "\n",
     "# load pretrained ASR model\n",
     "ASR_config = config.get('ASR_config', False)\n",
   },
   {
    "cell_type": "code",
+   "execution_count": 16,
    "id": "ffc18cf7",
    "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/venv/main/lib/python3.12/site-packages/torch/nn/utils/weight_norm.py:143: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`.\n",
+      "  WeightNorm.apply(module, name, dim)\n",
+      "/venv/main/lib/python3.12/site-packages/torch/nn/modules/rnn.py:123: UserWarning: dropout option adds dropout after all but last recurrent layer, so non-zero dropout expects num_layers greater than 1, but got dropout=0.2 and num_layers=1\n",
+      "  warnings.warn(\n"
+     ]
+    }
+   ],
    "source": [
     "model_params = recursive_munch(config['model_params'])\n",
     "model = build_model(model_params, text_aligner, pitch_extractor, plbert)\n",
   },
   {
    "cell_type": "code",
+   "execution_count": 18,
    "id": "64529d5c",
    "metadata": {},
    "outputs": [],
    "source": [
+    "params_whole = torch.load(\"logs/pod_90h_30k_second_lr1/epoch_2nd_00018.pth\", map_location='cpu')\n",
     "params = params_whole['net']"
    ]
   },
   {
    "cell_type": "code",
+   "execution_count": 19,
    "id": "895d9706",
    "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "bert loaded\n",
+      "bert_encoder loaded\n",
+      "predictor loaded\n",
+      "decoder loaded\n",
+      "text_encoder loaded\n",
+      "predictor_encoder loaded\n",
+      "style_encoder loaded\n",
+      "diffusion loaded\n",
+      "text_aligner loaded\n",
+      "pitch_extractor loaded\n",
+      "mpd loaded\n",
+      "msd loaded\n",
+      "wd loaded\n"
+     ]
+    }
+   ],
    "source": [
     "for key in model:\n",
     "    if key in params:\n",
   },
   {
    "cell_type": "code",
+   "execution_count": 20,
    "id": "c1a59db2",
    "metadata": {},
    "outputs": [],
   },
   {
    "cell_type": "code",
+   "execution_count": 21,
    "id": "e30985ab",
    "metadata": {},
    "outputs": [],
   },
   {
    "cell_type": "code",
+   "execution_count": 22,
    "id": "ca57469c",
    "metadata": {},
    "outputs": [],
   },
   {
    "cell_type": "code",
+   "execution_count": 23,
    "id": "cace9787",
    "metadata": {},
    "outputs": [],
     "    display(ipd.Audio(path, rate=24000, normalize=False))"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "62747cbb-bb33-4be4-8275-8c292e306987",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ec8fb32f-91dd-4fca-a7c6-7f156449c296",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
   {
    "cell_type": "markdown",
    "id": "141e91b3",
   },
   {
    "cell_type": "code",
+   "execution_count": 25,
    "id": "81addda4",
    "metadata": {},
    "outputs": [],
   {
    "cell_type": "code",
    "execution_count": null,
+   "id": "c0deea36-de7c-4b65-bbc4-8e00697c6796",
    "metadata": {},
    "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "41b18368-2fcb-4bc8-8963-00734227267c",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "id": "be1b2a11",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "ename": "LookupError",
+     "evalue": "\n**********************************************************************\n  Resource \u001b[93mpunkt_tab\u001b[0m not found.\n  Please use the NLTK Downloader to obtain the resource:\n\n  \u001b[31m>>> import nltk\n  >>> nltk.download('punkt_tab')\n  \u001b[0m\n  For more information see: https://www.nltk.org/data.html\n\n  Attempted to load \u001b[93mtokenizers/punkt_tab/english/\u001b[0m\n\n  Searched in:\n    - '/root/nltk_data'\n    - '/venv/main/nltk_data'\n    - '/venv/main/share/nltk_data'\n    - '/venv/main/lib/nltk_data'\n    - '/usr/share/nltk_data'\n    - '/usr/local/share/nltk_data'\n    - '/usr/lib/nltk_data'\n    - '/usr/local/lib/nltk_data'\n**********************************************************************\n",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
+      "\u001b[31mLookupError\u001b[39m                               Traceback (most recent call last)",
+      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[26]\u001b[39m\u001b[32m, line 8\u001b[39m\n\u001b[32m      5\u001b[39m texts[\u001b[33m'\u001b[39m\u001b[33mSurprised\u001b[39m\u001b[33m'\u001b[39m] = \u001b[33m\"\u001b[39m\u001b[33mI can\u001b[39m\u001b[33m'\u001b[39m\u001b[33mt believe it! You mean to tell me that you have discovered a new species of bacteria in this pond?\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m      7\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m k,v \u001b[38;5;129;01min\u001b[39;00m texts.items():\n\u001b[32m----> \u001b[39m\u001b[32m8\u001b[39m     wav = \u001b[43minference\u001b[49m\u001b[43m(\u001b[49m\u001b[43mv\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mref_s\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdiffusion_steps\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m10\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43malpha\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m0.3\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbeta\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m0.7\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43membedding_scale\u001b[49m\u001b[43m=\u001b[49m\u001b[32;43m1\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m      9\u001b[39m     \u001b[38;5;28mprint\u001b[39m(k + \u001b[33m\"\u001b[39m\u001b[33m: \u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m     10\u001b[39m     display(ipd.Audio(wav, rate=\u001b[32m24000\u001b[39m, normalize=\u001b[38;5;28;01mFalse\u001b[39;00m))\n",
+      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[22]\u001b[39m\u001b[32m, line 4\u001b[39m, in \u001b[36minference\u001b[39m\u001b[34m(text, ref_s, alpha, beta, diffusion_steps, embedding_scale)\u001b[39m\n\u001b[32m      2\u001b[39m text = text.strip()\n\u001b[32m      3\u001b[39m ps = global_phonemizer.phonemize([text])\n\u001b[32m----> \u001b[39m\u001b[32m4\u001b[39m ps = \u001b[43mword_tokenize\u001b[49m\u001b[43m(\u001b[49m\u001b[43mps\u001b[49m\u001b[43m[\u001b[49m\u001b[32;43m0\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m      5\u001b[39m ps = \u001b[33m'\u001b[39m\u001b[33m \u001b[39m\u001b[33m'\u001b[39m.join(ps)\n\u001b[32m      6\u001b[39m tokens = textclenaer(ps)\n",
+      "\u001b[36mFile \u001b[39m\u001b[32m/venv/main/lib/python3.12/site-packages/nltk/tokenize/__init__.py:142\u001b[39m, in \u001b[36mword_tokenize\u001b[39m\u001b[34m(text, language, preserve_line)\u001b[39m\n\u001b[32m    127\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mword_tokenize\u001b[39m(text, language=\u001b[33m\"\u001b[39m\u001b[33menglish\u001b[39m\u001b[33m\"\u001b[39m, preserve_line=\u001b[38;5;28;01mFalse\u001b[39;00m):\n\u001b[32m    128\u001b[39m \u001b[38;5;250m    \u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m    129\u001b[39m \u001b[33;03m    Return a tokenized copy of *text*,\u001b[39;00m\n\u001b[32m    130\u001b[39m \u001b[33;03m    using NLTK's recommended word tokenizer\u001b[39;00m\n\u001b[32m   (...)\u001b[39m\u001b[32m    140\u001b[39m \u001b[33;03m    :type preserve_line: bool\u001b[39;00m\n\u001b[32m    141\u001b[39m \u001b[33;03m    \"\"\"\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m142\u001b[39m     sentences = [text] \u001b[38;5;28;01mif\u001b[39;00m preserve_line \u001b[38;5;28;01melse\u001b[39;00m \u001b[43msent_tokenize\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtext\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mlanguage\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m    143\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m [\n\u001b[32m    144\u001b[39m         token \u001b[38;5;28;01mfor\u001b[39;00m sent \u001b[38;5;129;01min\u001b[39;00m sentences \u001b[38;5;28;01mfor\u001b[39;00m token \u001b[38;5;129;01min\u001b[39;00m _treebank_word_tokenizer.tokenize(sent)\n\u001b[32m    145\u001b[39m     ]\n",
+      "\u001b[36mFile \u001b[39m\u001b[32m/venv/main/lib/python3.12/site-packages/nltk/tokenize/__init__.py:119\u001b[39m, in \u001b[36msent_tokenize\u001b[39m\u001b[34m(text, language)\u001b[39m\n\u001b[32m    109\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34msent_tokenize\u001b[39m(text, language=\u001b[33m\"\u001b[39m\u001b[33menglish\u001b[39m\u001b[33m\"\u001b[39m):\n\u001b[32m    110\u001b[39m \u001b[38;5;250m    \u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m    111\u001b[39m \u001b[33;03m    Return a sentence-tokenized copy of *text*,\u001b[39;00m\n\u001b[32m    112\u001b[39m \u001b[33;03m    using NLTK's recommended sentence tokenizer\u001b[39;00m\n\u001b[32m   (...)\u001b[39m\u001b[32m    117\u001b[39m \u001b[33;03m    :param language: the model name in the Punkt corpus\u001b[39;00m\n\u001b[32m    118\u001b[39m \u001b[33;03m    \"\"\"\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m119\u001b[39m     tokenizer = \u001b[43m_get_punkt_tokenizer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlanguage\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m    120\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m tokenizer.tokenize(text)\n",
+      "\u001b[36mFile \u001b[39m\u001b[32m/venv/main/lib/python3.12/site-packages/nltk/tokenize/__init__.py:105\u001b[39m, in \u001b[36m_get_punkt_tokenizer\u001b[39m\u001b[34m(language)\u001b[39m\n\u001b[32m     96\u001b[39m \u001b[38;5;129m@functools\u001b[39m.lru_cache\n\u001b[32m     97\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34m_get_punkt_tokenizer\u001b[39m(language=\u001b[33m\"\u001b[39m\u001b[33menglish\u001b[39m\u001b[33m\"\u001b[39m):\n\u001b[32m     98\u001b[39m \u001b[38;5;250m    \u001b[39m\u001b[33;03m\"\"\"\u001b[39;00m\n\u001b[32m     99\u001b[39m \u001b[33;03m    A constructor for the PunktTokenizer that utilizes\u001b[39;00m\n\u001b[32m    100\u001b[39m \u001b[33;03m    a lru cache for performance.\u001b[39;00m\n\u001b[32m   (...)\u001b[39m\u001b[32m    103\u001b[39m \u001b[33;03m    :type language: str\u001b[39;00m\n\u001b[32m    104\u001b[39m \u001b[33;03m    \"\"\"\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m105\u001b[39m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mPunktTokenizer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlanguage\u001b[49m\u001b[43m)\u001b[49m\n",
+      "\u001b[36mFile \u001b[39m\u001b[32m/venv/main/lib/python3.12/site-packages/nltk/tokenize/punkt.py:1744\u001b[39m, in \u001b[36mPunktTokenizer.__init__\u001b[39m\u001b[34m(self, lang)\u001b[39m\n\u001b[32m   1742\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34m__init__\u001b[39m(\u001b[38;5;28mself\u001b[39m, lang=\u001b[33m\"\u001b[39m\u001b[33menglish\u001b[39m\u001b[33m\"\u001b[39m):\n\u001b[32m   1743\u001b[39m     PunktSentenceTokenizer.\u001b[34m__init__\u001b[39m(\u001b[38;5;28mself\u001b[39m)\n\u001b[32m-> \u001b[39m\u001b[32m1744\u001b[39m     \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mload_lang\u001b[49m\u001b[43m(\u001b[49m\u001b[43mlang\u001b[49m\u001b[43m)\u001b[49m\n",
+      "\u001b[36mFile \u001b[39m\u001b[32m/venv/main/lib/python3.12/site-packages/nltk/tokenize/punkt.py:1749\u001b[39m, in \u001b[36mPunktTokenizer.load_lang\u001b[39m\u001b[34m(self, lang)\u001b[39m\n\u001b[32m   1746\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mload_lang\u001b[39m(\u001b[38;5;28mself\u001b[39m, lang=\u001b[33m\"\u001b[39m\u001b[33menglish\u001b[39m\u001b[33m\"\u001b[39m):\n\u001b[32m   1747\u001b[39m     \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mnltk\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mdata\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m find\n\u001b[32m-> \u001b[39m\u001b[32m1749\u001b[39m     lang_dir = \u001b[43mfind\u001b[49m\u001b[43m(\u001b[49m\u001b[33;43mf\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[33;43mtokenizers/punkt_tab/\u001b[39;49m\u001b[38;5;132;43;01m{\u001b[39;49;00m\u001b[43mlang\u001b[49m\u001b[38;5;132;43;01m}\u001b[39;49;00m\u001b[33;43m/\u001b[39;49m\u001b[33;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[32m   1750\u001b[39m     \u001b[38;5;28mself\u001b[39m._params = load_punkt_params(lang_dir)\n\u001b[32m   1751\u001b[39m     \u001b[38;5;28mself\u001b[39m._lang = lang\n",
+      "\u001b[36mFile \u001b[39m\u001b[32m/venv/main/lib/python3.12/site-packages/nltk/data.py:579\u001b[39m, in \u001b[36mfind\u001b[39m\u001b[34m(resource_name, paths)\u001b[39m\n\u001b[32m    577\u001b[39m sep = \u001b[33m\"\u001b[39m\u001b[33m*\u001b[39m\u001b[33m\"\u001b[39m * \u001b[32m70\u001b[39m\n\u001b[32m    578\u001b[39m resource_not_found = \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;132;01m{\u001b[39;00msep\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;132;01m{\u001b[39;00mmsg\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;132;01m{\u001b[39;00msep\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[33m\"\u001b[39m\n\u001b[32m--> \u001b[39m\u001b[32m579\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mLookupError\u001b[39;00m(resource_not_found)\n",
+      "\u001b[31mLookupError\u001b[39m: \n**********************************************************************\n  Resource \u001b[93mpunkt_tab\u001b[0m not found.\n  Please use the NLTK Downloader to obtain the resource:\n\n  \u001b[31m>>> import nltk\n  >>> nltk.download('punkt_tab')\n  \u001b[0m\n  For more information see: https://www.nltk.org/data.html\n\n  Attempted to load \u001b[93mtokenizers/punkt_tab/english/\u001b[0m\n\n  Searched in:\n    - '/root/nltk_data'\n    - '/venv/main/nltk_data'\n    - '/venv/main/share/nltk_data'\n    - '/venv/main/lib/nltk_data'\n    - '/usr/share/nltk_data'\n    - '/usr/local/share/nltk_data'\n    - '/usr/lib/nltk_data'\n    - '/usr/local/lib/nltk_data'\n**********************************************************************\n"
+     ]
+    }
+   ],
    "source": [
     "texts = {}\n",
     "texts['Happy'] = \"We are happy to invite you to join us on a journey to the past, where we will visit the most amazing monuments ever built by human hands.\"\n",
    "cell_type": "code",
    "execution_count": null,
    "id": "6d0a3825",
+   "metadata": {},
    "outputs": [],
    "source": [
     "path = \"Demo/reference_audio/1221-135767-0014.wav\"\n",
    "cell_type": "code",
    "execution_count": null,
    "id": "44a4cea1",
+   "metadata": {},
    "outputs": [],
    "source": [
     "start = time.time()\n",
  ],
  "metadata": {
   "kernelspec": {
+   "display_name": "Python3 (main venv)",
    "language": "python",
+   "name": "main"
   },
   "language_info": {
    "codemirror_mode": {
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
+   "version": "3.12.3"
   }
  },
  "nbformat": 4,

__pycache__/models.cpython-310.pyc CHANGED Viewed

Binary files a/__pycache__/models.cpython-310.pyc and b/__pycache__/models.cpython-310.pyc differ

logs/pod_90h_30k_second_v2/.ipynb_checkpoints/train-checkpoint.log ADDED Viewed

The diff for this file is too large to render. See raw diff

logs/pod_90h_30k_second_v2/config_ft_single.yml ADDED Viewed

	@@ -0,0 +1,22 @@

+{ASR_config: Utils/ASR/config.yml, ASR_path: Utils/ASR/epoch_00080.pth, F0_path: Utils/JDC/bst.t7,
+  PLBERT_dir: Utils/PLBERT/, batch_size: 12, data_params: {OOD_data: /workspace/styletts2/data/OOD_texts.txt,
+    min_length: 50, root_path: /workspace, train_data: /workspace/styletts2/data/train_list.txt,
+    val_data: /workspace/styletts2/data/val_list.txt}, device: cuda, epochs_1st: 25,
+  epochs_2nd: 20, first_stage_path: /workspace/styletts2/stage1_final.pth, load_only_params: false,
+  log_dir: logs/pod_90h_30k_second_v2, log_interval: 50, loss_params: {TMA_epoch: 14,
+    diff_epoch: 1, joint_epoch: 5, lambda_F0: 1.0, lambda_ce: 20.0, lambda_diff: 1.0,
+    lambda_dur: 1.0, lambda_gen: 1.0, lambda_mel: 5.0, lambda_mono: 1.0, lambda_norm: 1.0,
+    lambda_s2s: 1.0, lambda_slm: 1.0, lambda_sty: 1.0}, max_len: 300, model_params: {
+    decoder: {resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]], resblock_kernel_sizes: [
+        3, 7, 11], type: hifigan, upsample_initial_channel: 512, upsample_kernel_sizes: [
+        20, 10, 6, 4], upsample_rates: [10, 5, 3, 2]}, diffusion: {dist: {estimate_sigma_data: true,
+        mean: -3.0, sigma_data: 0.3631309394446902, std: 1.0}, embedding_mask_proba: 0.1,
+      transformer: {head_features: 64, multiplier: 2, num_heads: 8, num_layers: 3}},
+    dim_in: 64, dropout: 0.2, hidden_dim: 512, max_conv_dim: 512, max_dur: 50, multispeaker: true,
+    n_layer: 3, n_mels: 80, n_token: 178, slm: {hidden: 768, initial_channel: 64,
+      model: microsoft/wavlm-base-plus, nlayers: 13, sr: 16000}, style_dim: 128},
+  optimizer_params: {bert_lr: 1.0e-05, ft_lr: 1.0e-05, grad_accum_steps: 2, lr: 0.0001},
+  preprocess_params: {spect_params: {hop_length: 300, n_fft: 2048, win_length: 1200},
+    sr: 24000}, pretrained_model: /workspace/styletts2/logs/pod_90h_30k_second_v2/epoch_2nd_00005.pth,
+  save_freq: 1, second_stage_load_pretrained: true, slmadv_params: {batch_percentage: 0.5,
+    iter: 20, max_len: 500, min_len: 400, scale: 0.01, sig: 1.5, thresh: 5}}

logs/pod_90h_30k_second_v2/epoch_2nd_00000.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0928157cdc46e1d7e76b85e06e5264bd3cde20091d3042dc4665caf2bfe02526
+size 1055973030

logs/pod_90h_30k_second_v2/epoch_2nd_00001.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1e0e105ad5e3a8c28cd29e0b3f02eb057076d1a8be9eb2b683f1533635e8401e
+size 1589850598

logs/pod_90h_30k_second_v2/epoch_2nd_00002.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:26a7b2881b5543005c7581da0d2055750057dea608d01ea788adb1fafc3945f4
+size 1589850598

logs/pod_90h_30k_second_v2/epoch_2nd_00003.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c0065304e4bf5ed0559cb9a45c2d4bdd0e609e89d6e21474ffc4b1cf33922571
+size 1589850598

logs/pod_90h_30k_second_v2/epoch_2nd_00004.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:256ee7584931924080191070f86a26ee9d32828465a52a33b0a7a94fdd109eb7
+size 1589850598

logs/pod_90h_30k_second_v2/epoch_2nd_00005.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:74fc75095ee7d28d84006849e9103d2640292e34f57fe58638cf0dc355e7ed0b
+size 2144951284

logs/pod_90h_30k_second_v2/epoch_2nd_00006.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:637f68e2b35a7b60431920cfc9a9494d15ca971d3edab962b11c54c02c8728ad
+size 2144951284

logs/pod_90h_30k_second_v2/epoch_2nd_00007.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f05c6d4bba2e93159df4e9b2bb8d3552e695476d173062467cadb57694875f49
+size 2144951284

logs/pod_90h_30k_second_v2/epoch_2nd_00008.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4d97bc932af34f115414e10ac988870e48e50921b0832d14ea142deb89e9d99e
+size 2144951284

logs/pod_90h_30k_second_v2/epoch_2nd_00009.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5b08801054bf85f09cbc03240bdf5e1ec1d834d97d0933d4df29385dc3808d3f
+size 2144951284

logs/pod_90h_30k_second_v2/epoch_2nd_00010.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1ae7cbe8be0a7f72aa499f92cbc17f6e7a379314675a5d4eae526eff30f1af72
+size 2144951284

logs/pod_90h_30k_second_v2/epoch_2nd_00011.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:873f640d25ba128814ef68d1c63ff537bbf9d32e3d3c263584c9feeede06c5b5
+size 2144951284

logs/pod_90h_30k_second_v2/epoch_2nd_00012.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a26b0348252ea0c3525df5b6f9f88c51291d7a191d5502fa7a604ebbeb89dcca
+size 2144951284

logs/pod_90h_30k_second_v2/epoch_2nd_00013.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e4b6e188d004bbb60c34f4807cb2c2a6593242deab4ec59e7c672a5485fe9988
+size 2144951284

logs/pod_90h_30k_second_v2/epoch_2nd_00014.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2252d3258328915ef20c3569e7297723537e691479690ebd026fc426dbcf2384
+size 2144951284

logs/pod_90h_30k_second_v2/epoch_2nd_00015.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:43544b5d118dbdaacb98500b2f68c8e355ad53222189948e9ee2890e2fec4430
+size 2144951284

logs/pod_90h_30k_second_v2/epoch_2nd_00016.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:edfe04c4831cd8d61fafc6bb413572806f30a8346c033fcae3bedeb05f94e9ae
+size 2144951284

logs/pod_90h_30k_second_v2/epoch_2nd_00017.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8684fe24d5bd9a1ae3e24529c19147503367aa61df2e550f34902b87179e1e10
+size 2144951284

logs/pod_90h_30k_second_v2/epoch_2nd_00018.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:71b08d3be791931c8c1534e2349e2d37dada50a72e700685fd8e4cf6b2d6e381
+size 2144951284

logs/pod_90h_30k_second_v2/tensorboard/events.out.tfevents.1749758267.7f09b0e2c0b0.17026.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1d06bd4591368eb36ccaedb27cf696641c63f468363381dee869e800814cb7a9
+size 1984

logs/pod_90h_30k_second_v2/tensorboard/events.out.tfevents.1749758489.7f09b0e2c0b0.18353.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f67fd20bb1e7c93e62e1f9c7195dc5abcc7781698b70c679dc8ec3a16e4301e8
+size 88

logs/pod_90h_30k_second_v2/tensorboard/events.out.tfevents.1749758524.7f09b0e2c0b0.18773.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c6f25e7eb7c86858883dd108c75d5748b4c35d1c2523cd938aadd0c1d6aeb8b1
+size 88

logs/pod_90h_30k_second_v2/tensorboard/events.out.tfevents.1749758552.7f09b0e2c0b0.19160.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c2256999fdd31fea21179c68e82673acd9189401e9d6ae24fed4e3c2eba5c6fd
+size 88

logs/pod_90h_30k_second_v2/tensorboard/events.out.tfevents.1749758602.7f09b0e2c0b0.19654.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:de73c06f010bf27d9dddc26d28870a7b5cbeb43e7b1d8be67585861622895a2d
+size 6761012

logs/pod_90h_30k_second_v2/tensorboard/events.out.tfevents.1749763142.7f09b0e2c0b0.41611.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:048f50f6a49f65fd244e93aed39d98e13e34c77499fc09d3bde9c1e80833a4f9
+size 716

logs/pod_90h_30k_second_v2/tensorboard/events.out.tfevents.1749763329.7f09b0e2c0b0.42740.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2622d704b1a3eb973e99275d7ad036d28e17836bc139b3fe77060ca5332b2c96
+size 1984

logs/pod_90h_30k_second_v2/tensorboard/events.out.tfevents.1749763548.7f09b0e2c0b0.44123.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:453ad43bcbf767292cc38b1b68b95bce85be1f0554f5773779d1c6d851056587
+size 1953908

logs/pod_90h_30k_second_v2/tensorboard/events.out.tfevents.1749789808.7f09b0e2c0b0.1500.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1f8e754635df505b43fc9ce9b08109bdcc715d84659fc6e8fa4dc93ba2727991
+size 1344

logs/pod_90h_30k_second_v2/tensorboard/events.out.tfevents.1749790964.7f09b0e2c0b0.2345.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:704b38d288963faaa1da5a8302dc10f92e904b92545dd17d2c38f520d9bbea01
+size 88

logs/pod_90h_30k_second_v2/tensorboard/events.out.tfevents.1749791414.7f09b0e2c0b0.1465.0 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6022662a58865c2f726d48b227e4a35f5643c01cdd8781d20152470fc8905558
+size 3881480

logs/pod_90h_30k_second_v2/train.log ADDED Viewed

The diff for this file is too large to render. See raw diff

models.py CHANGED Viewed

@@ -703,8 +703,9 @@ def load_checkpoint(model, optimizer, path, load_only_params=True, ignore_module
     _ = [model[key].eval() for key in model]
     if not load_only_params:
-        epoch = state["epoch"]
         iters = state["iters"]
         optimizer.load_state_dict(state["optimizer"])
     else:
         epoch = 0

     _ = [model[key].eval() for key in model]
     if not load_only_params:
+        epoch = state["epoch"] + 1
         iters = state["iters"]
+        print('Load checkpoint from %s, epoch %d, iters %d' % (path, epoch, iters))
         optimizer.load_state_dict(state["optimizer"])
     else:
         epoch = 0