BlueV2

Running

App Files Files Community

notmax123 committed on Apr 24

Commit

863d06f

1 Parent(s): 4818895

Clone tab: search fonts/pt_models with filename aliases; vendor models/

Browse files

Files changed (7) hide show

.gitignore +2 -0
app.py +57 -33
models/__init__.py +0 -0
models/reference_encoder.py +86 -0
models/text_encoder.py +358 -0
models/utils.py +197 -0
models/vf_estimator.py +507 -0

.gitignore CHANGED Viewed

@@ -13,6 +13,8 @@ voices
 renikud.onnx
 model.onnx
 pt_weights
 # Virtual environments
 venv/

 renikud.onnx
 model.onnx
 pt_weights
+fonts/pt_models
+pt_models
 # Virtual environments
 venv/

app.py CHANGED Viewed

@@ -122,7 +122,7 @@ class TextProcessor:
             from phonemizer.separator import Separator
             EspeakWrapper.set_library(espeakng_loader.get_library_path())
             if hasattr(EspeakWrapper, "set_data_path"):
-                EspeakWrapper.set_data_path(espeakng_loader.get_data_path())
             self._espeak_separator = Separator(phone="", word=" ", syllable="")
             self._espeak_ready = True
         except Exception as e:
@@ -157,7 +157,7 @@ class TextProcessor:
             return re.sub(r"\s+", " ", r.stdout.replace("\n", " ")).strip()
         except Exception as e:
             print(f"[WARN] espeak-ng subprocess failed for {lang}: {e}")
-            return text
     def _phonemize_segment(self, content: str, lang: str) -> str:
         content = content.strip()
@@ -211,7 +211,7 @@ class UnicodeProcessor:
             if isinstance(raw, dict) and "char_to_id" in raw:
                 self.pad_id = int(raw.get("pad_id", 0))
                 self._char_to_id = {k: int(v) for k, v in raw["char_to_id"].items()}
-            else:
                 self._char_to_id = {chr(int(k)): int(v) for k, v in raw.items()}
             print(f"[INFO] Loaded vocab from {indexer_path} ({len(self._char_to_id)} entries)")
         else:
@@ -327,7 +327,7 @@ def chunk_text(text: str, max_len: int = 300) -> List[str]:
         for sentence in re.split(pattern, paragraph):
             if len(current) + len(sentence) + 1 <= max_len:
                 current += (" " if current else "") + sentence
-            else:
                 if current:
                     chunks.append(current.strip())
                 current = sentence
@@ -550,7 +550,30 @@ def synthesize_text(text: str, voice: str, lang: str, steps: int, speed: float,
 # Voice-clone tab (runs export_new_voice.py)
 # ============================================================
 EXPORT_SCRIPT = os.path.join(os.path.dirname(__file__), "export_new_voice.py")
-PT_WEIGHTS_DIR = "pt_weights"
 def _refresh_voices() -> None:
@@ -568,19 +591,20 @@ def clone_voice(ref_wav: Optional[str], voice_name: str):
     safe = re.sub(r"[^\w\-]+", "_", voice_name.strip())
     out_path = os.path.join(VOICES_DIR, f"{safe}.json")
-    needed = {
-        "ae_ckpt":  os.path.join(PT_WEIGHTS_DIR, "blue_codec.safetensors"),
-        "ttl_ckpt": os.path.join(PT_WEIGHTS_DIR, "vf_estimator.safetensors"),
-        "dp_ckpt":  os.path.join(PT_WEIGHTS_DIR, "duration_predictor.safetensors"),
-        "stats":    os.path.join(PT_WEIGHTS_DIR, "stats_multilingual.pt"),
-    }
-    missing = [v for v in needed.values() if not os.path.exists(v)]
     if missing:
         return (
-            "Voice cloning needs PyTorch checkpoints. Please fetch them first:\n"
-            "  hf download notmax123/blue blue_codec.safetensors duration_predictor.safetensors "
-            "vf_estimator.safetensors stats_multilingual.pt --local-dir pt_weights\n\n"
-            f"Missing: {', '.join(missing)}"
         ), gr.update()
     cmd = [
@@ -666,40 +690,40 @@ with gr.Blocks(title="BlueTTS — Multilingual TTS") as demo:
     with gr.Tabs():
         with gr.TabItem("Synthesize"):
-            with gr.Column(elem_classes="card"):
-                text_input = gr.Textbox(
                     label="Text", placeholder="Type or paste text here…",
                     lines=4, elem_classes="big-input",
-                    value="Great ideas become real when a small team keeps building every single day.",
-                )
-                with gr.Column(elem_classes="controls-row"):
-                    with gr.Row(elem_classes="ctrl-row1"):
-                        lang_input = gr.Dropdown(
                             choices=[("English 🇺🇸", "en"), ("Hebrew 🇮🇱", "he"),
                                      ("Spanish 🇪🇸", "es"), ("German 🇩🇪", "de"),
                                      ("Italian 🇮🇹", "it")],
                             value="en", label="Language", elem_classes="ctrl-lang",
                         )
-                        voice_input = gr.Dropdown(
                             choices=list(VOICES.keys()),
                             value=next(iter(VOICES.keys()), None),
                             label="Voice", elem_classes="ctrl-voice",
-                        )
-                    with gr.Row(elem_classes="ctrl-row2"):
                         steps_input = gr.Slider(2, 32, 8, step=1, label="Quality (steps)", elem_classes="ctrl-steps")
                         speed_input = gr.Slider(0.5, 2.0, 1.0, step=0.05, label="Speed", elem_classes="ctrl-speed")
                         cfg_input   = gr.Slider(1.0, 7.0, 3.0, step=0.1, label="CFG Scale", elem_classes="ctrl-cfg")
-                btn = gr.Button("⚡ Generate Speech", elem_classes="gen-btn")
-            audio_out = gr.Audio(label="Output", type="numpy", autoplay=True)
-            stats_out = gr.HTML()
             gr.Examples(examples=EXAMPLES, inputs=[text_input, lang_input], label="Examples")
-            btn.click(
-                synthesize_text,
                 inputs=[text_input, voice_input, lang_input, steps_input, speed_input, cfg_input],
-                outputs=[audio_out, stats_out],
-            )
         with gr.TabItem("Clone Voice"):
             with gr.Column(elem_classes="card"):

             from phonemizer.separator import Separator
             EspeakWrapper.set_library(espeakng_loader.get_library_path())
             if hasattr(EspeakWrapper, "set_data_path"):
+            EspeakWrapper.set_data_path(espeakng_loader.get_data_path())
             self._espeak_separator = Separator(phone="", word=" ", syllable="")
             self._espeak_ready = True
         except Exception as e:
             return re.sub(r"\s+", " ", r.stdout.replace("\n", " ")).strip()
         except Exception as e:
             print(f"[WARN] espeak-ng subprocess failed for {lang}: {e}")
+        return text
     def _phonemize_segment(self, content: str, lang: str) -> str:
         content = content.strip()
             if isinstance(raw, dict) and "char_to_id" in raw:
                 self.pad_id = int(raw.get("pad_id", 0))
                 self._char_to_id = {k: int(v) for k, v in raw["char_to_id"].items()}
+        else:
                 self._char_to_id = {chr(int(k)): int(v) for k, v in raw.items()}
             print(f"[INFO] Loaded vocab from {indexer_path} ({len(self._char_to_id)} entries)")
         else:
         for sentence in re.split(pattern, paragraph):
             if len(current) + len(sentence) + 1 <= max_len:
                 current += (" " if current else "") + sentence
+        else:
                 if current:
                     chunks.append(current.strip())
                 current = sentence
 # Voice-clone tab (runs export_new_voice.py)
 # ============================================================
 EXPORT_SCRIPT = os.path.join(os.path.dirname(__file__), "export_new_voice.py")
+# Accept checkpoints from a handful of common locations (with the filename
+# variants we've seen in the wild) so the clone tab works out of the box.
+# Directories probed by _find_pt_weight, in this order. All entries are
+# relative paths, so they resolve against the process working directory.
+PT_WEIGHTS_SEARCH = [
+    "pt_weights",
+    os.path.join("fonts", "pt_models"),
+    "pt_models",
+]
+# Maps each checkpoint role to the filenames accepted for it, in the order
+# they are tried inside a single directory.
+# NOTE(review): "vf_estimetor.pt" looks like a misspelling kept on purpose as
+# one of the "in the wild" variants -- confirm before "fixing" it.
+PT_WEIGHT_ALIASES: dict[str, list[str]] = {
+    "ae_ckpt":  ["blue_codec.safetensors", "blue_codec.pt"],
+    "ttl_ckpt": ["vf_estimator.safetensors", "vf_estimator.pt", "vf_estimetor.pt"],
+    "dp_ckpt":  ["duration_predictor.safetensors", "duration_predictor.pt",
+                 "duration_predictor_final.pt"],
+    "stats":    ["stats_multilingual.pt", "stats.pt"],
+}
+def _find_pt_weight(aliases: list[str]) -> Optional[str]:
+    """Return the first existing path for any of ``aliases``, or ``None``.
+
+    Every directory in ``PT_WEIGHTS_SEARCH`` is tried in order, and inside a
+    directory the aliases are tried in order, so an earlier directory wins
+    over an earlier alias.
+    """
+    candidates = (
+        os.path.join(folder, filename)
+        for folder in PT_WEIGHTS_SEARCH
+        for filename in aliases
+    )
+    return next((path for path in candidates if os.path.exists(path)), None)
 def _refresh_voices() -> None:
     safe = re.sub(r"[^\w\-]+", "_", voice_name.strip())
     out_path = os.path.join(VOICES_DIR, f"{safe}.json")
+    needed: dict[str, Optional[str]] = {k: _find_pt_weight(v) for k, v in PT_WEIGHT_ALIASES.items()}
+    missing = [k for k, v in needed.items() if v is None]
     if missing:
+        searched = ", ".join(PT_WEIGHTS_SEARCH)
+        wanted = "\n".join(
+            f"  {k}: any of {PT_WEIGHT_ALIASES[k]}" for k in missing
+        )
         return (
+            "Voice cloning needs PyTorch checkpoints. I looked in: "
+            f"{searched}\nMissing:\n{wanted}\n\n"
+            "Fetch them with:\n"
+            "  hf download notmax123/blue blue_codec.safetensors "
+            "duration_predictor.safetensors vf_estimator.safetensors "
+            "stats_multilingual.pt --local-dir pt_weights"
         ), gr.update()
     cmd = [
     with gr.Tabs():
         with gr.TabItem("Synthesize"):
+    with gr.Column(elem_classes="card"):
+        text_input = gr.Textbox(
                     label="Text", placeholder="Type or paste text here…",
                     lines=4, elem_classes="big-input",
+            value="Great ideas become real when a small team keeps building every single day.",
+        )
+        with gr.Column(elem_classes="controls-row"):
+            with gr.Row(elem_classes="ctrl-row1"):
+                    lang_input = gr.Dropdown(
                             choices=[("English 🇺🇸", "en"), ("Hebrew 🇮🇱", "he"),
                                      ("Spanish 🇪🇸", "es"), ("German 🇩🇪", "de"),
                                      ("Italian 🇮🇹", "it")],
                             value="en", label="Language", elem_classes="ctrl-lang",
                         )
+                    voice_input = gr.Dropdown(
                             choices=list(VOICES.keys()),
                             value=next(iter(VOICES.keys()), None),
                             label="Voice", elem_classes="ctrl-voice",
+                    )
+            with gr.Row(elem_classes="ctrl-row2"):
                         steps_input = gr.Slider(2, 32, 8, step=1, label="Quality (steps)", elem_classes="ctrl-steps")
                         speed_input = gr.Slider(0.5, 2.0, 1.0, step=0.05, label="Speed", elem_classes="ctrl-speed")
                         cfg_input   = gr.Slider(1.0, 7.0, 3.0, step=0.1, label="CFG Scale", elem_classes="ctrl-cfg")
+        btn = gr.Button("⚡ Generate Speech", elem_classes="gen-btn")
+    audio_out = gr.Audio(label="Output", type="numpy", autoplay=True)
+    stats_out = gr.HTML()
             gr.Examples(examples=EXAMPLES, inputs=[text_input, lang_input], label="Examples")
+    btn.click(
+        synthesize_text,
                 inputs=[text_input, voice_input, lang_input, steps_input, speed_input, cfg_input],
+        outputs=[audio_out, stats_out],
+    )
         with gr.TabItem("Clone Voice"):
             with gr.Column(elem_classes="card"):

models/__init__.py ADDED Viewed

File without changes

models/reference_encoder.py ADDED Viewed

	@@ -0,0 +1,86 @@

+import torch
+import torch.nn as nn
+from .text_encoder import ConvNeXtWrapper
+# Turns a reference sequence into a fixed set of `num_tokens` style vectors:
+# 1x1 Conv1d projection -> ConvNeXt stack -> two cross-attention passes whose
+# queries come from the learned `ref_keys` parameter.
+class ReferenceEncoder(nn.Module):
+    # Constructor arguments, as they are used below:
+    #   in_channels     - input channels of the 1x1 Conv1d `input_proj`.
+    #   d_model         - width after `input_proj`; also kdim/vdim of both attentions.
+    #   hidden_dim      - must be a multiple of d_model; hidden_dim // d_model is
+    #                     handed to ConvNeXtWrapper as `expansion_factor`.
+    #   num_blocks      - handed to ConvNeXtWrapper as `n_layers`.
+    #   num_tokens      - number of learned query vectors (rows of `ref_keys`).
+    #   num_heads       - head count of both nn.MultiheadAttention modules.
+    #   kernel_size, dilation_lst - forwarded unchanged to ConvNeXtWrapper.
+    #   prototype_dim   - width of each `ref_keys` row.
+    #   n_units         - attention embed_dim (width of the queries).
+    #   style_value_dim - width of the returned vectors.
+    # Raises ValueError when hidden_dim is not divisible by d_model.
+    def __init__(
+        self,
+        in_channels: int = 144,
+        d_model: int = 256,
+        hidden_dim: int = 1024,
+        num_blocks: int = 6,
+        num_tokens: int = 50,
+        num_heads: int = 2,
+        kernel_size: int = 5,
+        dilation_lst: list = None,
+        prototype_dim: int = 256,
+        n_units: int = 256,
+        style_value_dim: int = 256,
+    ):
+        super().__init__()
+        self.d_model = d_model
+        self.num_tokens = num_tokens
+        if hidden_dim % d_model != 0:
+            raise ValueError(f"hidden_dim ({hidden_dim}) must be divisible by d_model ({d_model})")
+        mlp_ratio = hidden_dim // d_model
+        self.input_proj = nn.Conv1d(in_channels, d_model, kernel_size=1)
+        self.convnext = ConvNeXtWrapper(
+            d_model,
+            n_layers=num_blocks,
+            expansion_factor=mlp_ratio,
+            kernel_size=kernel_size,
+            dilation_lst=dilation_lst,
+        )
+        # Learned queries, initialised with small Gaussian noise (std 0.02).
+        self.ref_keys = nn.Parameter(torch.randn(num_tokens, prototype_dim) * 0.02)
+        # Projections are only materialised when the widths differ; otherwise
+        # they are nn.Identity and add no parameters.
+        self.q_proj = nn.Linear(prototype_dim, n_units) if prototype_dim != n_units else nn.Identity()
+        self.out_proj = nn.Linear(n_units, style_value_dim) if n_units != style_value_dim else nn.Identity()
+        self.attn1 = nn.MultiheadAttention(
+            embed_dim=n_units, num_heads=num_heads, kdim=d_model, vdim=d_model, batch_first=True
+        )
+        self.attn2 = nn.MultiheadAttention(
+            embed_dim=n_units, num_heads=num_heads, kdim=d_model, vdim=d_model, batch_first=True
+        )
+    # z_ref is channel-first (B, in_channels, T): it is fed straight into a Conv1d.
+    # mask, when given, is squeezed on dim 1 and positions equal to 0 are
+    # excluded as attention keys -- presumably shaped (B, 1, T); TODO confirm.
+    # Returns a (B, num_tokens, style_value_dim) tensor.
+    def forward(self, z_ref: torch.Tensor, mask: torch.Tensor = None):
+        B = z_ref.shape[0]
+        x = self.input_proj(z_ref)
+        x = self.convnext(x, mask=mask)
+        # Both attentions are batch_first, so keys/values go in as (B, T, d_model).
+        kv = x.transpose(1, 2)
+        key_padding_mask = None
+        if mask is not None:
+            # nn.MultiheadAttention expects True where a key must be ignored.
+            key_padding_mask = (mask.squeeze(1) == 0)
+        # Share the same learned queries across the batch without copying.
+        q0 = self.ref_keys.unsqueeze(0).expand(B, -1, -1)
+        q0 = self.q_proj(q0)
+        q1, _ = self.attn1(query=q0, key=kv, value=kv, key_padding_mask=key_padding_mask, need_weights=False)
+        # Second pass: the queries are refined by the first pass (residual sum),
+        # but only the second attention's output is returned.
+        q2 = q0 + q1
+        out, _ = self.attn2(query=q2, key=kv, value=kv, key_padding_mask=key_padding_mask, need_weights=False)
+        return self.out_proj(out)
+    @staticmethod
+    def remap_legacy_state_dict(state_dict: dict) -> dict:
+        """Remap pre-refactor checkpoints (per-layer pre-norm + FFN) onto current layout."""
+        remapped = {}
+        # Old attention prefixes -> names of the two attention modules above.
+        legacy_prefix_map = {
+            "attn_layers.0.attn.": "attn1.",
+            "attn_layers.1.attn.": "attn2.",
+        }
+        # Entries whose key contains any of these substrings are discarded.
+        drop_substrings = (".norm_q.", ".norm_kv.", ".ffn.", "pos_emb.")
+        for k, v in state_dict.items():
+            if any(s in k for s in drop_substrings):
+                continue
+            new_key = k
+            for old, new in legacy_prefix_map.items():
+                if new_key.startswith(old):
+                    new_key = new + new_key[len(old):]
+                    break
+            # Keys matching no legacy prefix are copied through unchanged.
+            remapped[new_key] = v
+        return remapped

models/text_encoder.py ADDED Viewed

	@@ -0,0 +1,358 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+class LayerNorm(nn.Module):
+    """``nn.LayerNorm`` applied over dim 1 of a channel-first tensor."""
+    def __init__(self, channels: int, eps: float = 1e-6):
+        super().__init__()
+        self.norm = nn.LayerNorm(channels, eps=eps)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Swap dims 1 and 2, normalise over the (now last) channel dim, swap back."""
+        channels_last = x.transpose(1, 2)
+        return self.norm(channels_last).transpose(1, 2)
+class ConvNeXtBlock(nn.Module):
+    """Residual 1-D ConvNeXt block for channel-first input.
+
+    Depthwise dilated convolution (replicate-padded so the length is kept) ->
+    LayerNorm -> pointwise expansion -> GELU -> pointwise projection. The
+    branch is scaled by the learnable per-channel ``gamma`` and added back to
+    the input. When a mask is given it is multiplied in before the
+    convolution, after it, and on the block output.
+    """
+    def __init__(self, dim: int, expansion_factor: int = 4, kernel_size: int = 5, dilation: int = 1, layer_scale_init_value: float = 1e-6):
+        super().__init__()
+        inner_dim = dim * expansion_factor
+        if (kernel_size % 2) != 1:
+            raise ValueError(f"ConvNeXtBlock expects odd kernel_size, got {kernel_size}")
+        # Padding is applied manually in forward(), hence padding=0 on the conv.
+        half_width = (kernel_size - 1) // 2
+        self.pad = half_width * dilation
+        self.dwconv = nn.Conv1d(dim, dim, kernel_size=kernel_size, padding=0, groups=dim, dilation=dilation)
+        self.norm = LayerNorm(dim, eps=1e-6)
+        self.pwconv1 = nn.Conv1d(dim, inner_dim, kernel_size=1)
+        self.act = nn.GELU()
+        self.pwconv2 = nn.Conv1d(inner_dim, dim, kernel_size=1)
+        self.gamma = nn.Parameter(layer_scale_init_value * torch.ones((1, dim, 1)), requires_grad=True)
+    def forward(self, x: torch.Tensor, mask: torch.Tensor | None = None) -> torch.Tensor:
+        """Apply the block; ``mask`` (if any) is broadcast-multiplied onto ``x``."""
+        def _masked(t: torch.Tensor) -> torch.Tensor:
+            return t if mask is None else t * mask
+        skip = _masked(x)
+        branch = F.pad(skip, (self.pad, self.pad), mode="replicate")
+        branch = _masked(self.dwconv(branch))
+        branch = self.pwconv2(self.act(self.pwconv1(self.norm(branch))))
+        return _masked(skip + self.gamma * branch)
+class ConvNeXtWrapper(nn.Module):
+    """A sequential stack of ``n_layers`` ConvNeXtBlock modules.
+
+    ``dilation_lst[i]`` is the dilation of block ``i``; when it is ``None``
+    every block uses dilation 1.
+    """
+    def __init__(self, d_model, n_layers, expansion_factor, kernel_size=5, dilation_lst=None):
+        super().__init__()
+        dilations = [1] * n_layers if dilation_lst is None else dilation_lst
+        blocks = []
+        for layer_idx in range(n_layers):
+            blocks.append(
+                ConvNeXtBlock(
+                    d_model,
+                    expansion_factor=expansion_factor,
+                    kernel_size=kernel_size,
+                    dilation=dilations[layer_idx],
+                )
+            )
+        self.convnext = nn.ModuleList(blocks)
+    def forward(self, x, mask=None):
+        """Run ``x`` through every block in order, passing the same ``mask`` to each."""
+        out = x
+        for layer in self.convnext:
+            out = layer(out, mask=mask)
+        return out
+# Multi-head self-attention over channel-first (B, C, L) input with learned
+# relative-position embeddings for keys and values. The relative terms only
+# contribute for position pairs at most `window_size` apart.
+class RelativeMultiHeadAttention(nn.Module):
+    # channels must be divisible by n_heads (checked with an assert).
+    # p_dropout is applied to the attention weights.
+    def __init__(self, channels: int, n_heads: int, window_size: int = 4, p_dropout: float = 0.0):
+        super().__init__()
+        assert channels % n_heads == 0
+        self.channels = channels
+        self.n_heads = n_heads
+        self.head_dim = channels // n_heads
+        self.scale = self.head_dim ** -0.5
+        self.window_size = window_size
+        self.conv_q = nn.Conv1d(channels, channels, 1)
+        self.conv_k = nn.Conv1d(channels, channels, 1)
+        self.conv_v = nn.Conv1d(channels, channels, 1)
+        self.conv_o = nn.Conv1d(channels, channels, 1)
+        # One embedding per relative offset in [-window_size, +window_size];
+        # the same table is used by every head.
+        self.emb_rel_k = nn.Parameter(torch.randn(1, 2 * window_size + 1, self.head_dim) * 0.02)
+        self.emb_rel_v = nn.Parameter(torch.randn(1, 2 * window_size + 1, self.head_dim) * 0.02)
+        self.drop = nn.Dropout(p_dropout)
+    # x: (B, C, L). attn_mask, when given, must broadcast against the
+    # (B, n_heads, L, L) score tensor; entries equal to 0 are masked out.
+    # Returns a (B, C, L) tensor.
+    def forward(self, x: torch.Tensor, attn_mask: torch.Tensor | None = None) -> torch.Tensor:
+        B, C, L = x.shape
+        # Split channels into heads: (B, C, L) -> (B, n_heads, L, head_dim).
+        q = self.conv_q(x).view(B, self.n_heads, self.head_dim, L).transpose(2, 3)
+        # Queries are scaled once, so both content and relative scores are scaled.
+        q = q * self.scale
+        k = self.conv_k(x).view(B, self.n_heads, self.head_dim, L).transpose(2, 3)
+        v = self.conv_v(x).view(B, self.n_heads, self.head_dim, L).transpose(2, 3)
+        scores = torch.matmul(q, k.transpose(-2, -1))
+        t = torch.arange(L, device=x.device)
+        # diff[i, j] = j - i: offset of key position j relative to query position i.
+        diff = t[None, :] - t[:, None]
+        window_mask = (diff.abs() <= self.window_size)
+        # Clamp so every pair indexes a valid table row; pairs outside the
+        # window are zeroed via window_mask below.
+        diff_clamped = torch.clamp(diff, -self.window_size, self.window_size)
+        indices = diff_clamped + self.window_size
+        rel_k = self.emb_rel_k[0][indices]
+        rel_scores = torch.einsum("bhld,ljd->bhlj", q, rel_k)
+        rel_scores = rel_scores * window_mask[None, None, :, :]
+        scores = scores + rel_scores
+        if attn_mask is not None:
+            # Uses a large negative constant (-1e4) rather than -inf.
+            scores = scores.masked_fill(attn_mask == 0, -1e4)
+        attn = torch.softmax(scores, dim=-1)
+        attn = self.drop(attn)
+        out = torch.matmul(attn, v)
+        # Relative-position contribution on the value side, same windowing.
+        rel_v = self.emb_rel_v[0][indices]
+        rel_v = rel_v * window_mask[:, :, None]
+        out_rel = torch.einsum("bhlj,ljd->bhld", attn, rel_v)
+        out = out + out_rel
+        # Merge heads back: (B, n_heads, L, head_dim) -> (B, C, L).
+        out = out.transpose(2, 3).contiguous().view(B, C, L)
+        out = self.conv_o(out)
+        return out
+class FeedForward(nn.Module):
+    """Two-layer Conv1d feed-forward network: conv -> ReLU -> dropout -> conv.
+
+    When a mask is supplied it is multiplied in before each convolution and on
+    the output. The convolutions use no padding, so with the default
+    ``kernel_size=1`` the sequence length is unchanged.
+    """
+    def __init__(self, channels: int, filter_channels: int, kernel_size: int = 1, p_dropout: float = 0.0):
+        super().__init__()
+        self.conv_1 = nn.Conv1d(channels, filter_channels, kernel_size)
+        self.relu = nn.ReLU()
+        self.drop = nn.Dropout(p_dropout)
+        self.conv_2 = nn.Conv1d(filter_channels, channels, kernel_size)
+    def forward(self, x: torch.Tensor, mask: torch.Tensor | None = None) -> torch.Tensor:
+        """Apply the network to channel-first ``x``, optionally masked."""
+        if mask is None:
+            return self.conv_2(self.drop(self.relu(self.conv_1(x))))
+        hidden = self.drop(self.relu(self.conv_1(x * mask)))
+        return self.conv_2(hidden * mask) * mask
+class AttnEncoder(nn.Module):
+    """Stack of post-norm layers: relative self-attention, then feed-forward.
+
+    Each layer computes ``norm(x + attn(x))`` followed by ``norm(x + ffn(x))``.
+    The attention window size is fixed at 4.
+    """
+    def __init__(self, channels: int, n_heads: int, filter_channels: int, n_layers: int, p_dropout: float = 0.0):
+        super().__init__()
+        self.attn_layers = nn.ModuleList(
+            [RelativeMultiHeadAttention(channels, n_heads, window_size=4, p_dropout=p_dropout) for _ in range(n_layers)]
+        )
+        self.norm_layers_1 = nn.ModuleList([LayerNorm(channels) for _ in range(n_layers)])
+        self.ffn_layers = nn.ModuleList(
+            [FeedForward(channels, filter_channels, p_dropout=p_dropout) for _ in range(n_layers)]
+        )
+        self.norm_layers_2 = nn.ModuleList([LayerNorm(channels) for _ in range(n_layers)])
+    def forward(self, x: torch.Tensor, mask: torch.Tensor | None = None) -> torch.Tensor:
+        """Encode channel-first ``x``; ``mask`` zeroes the input and the final output."""
+        attn_mask = None
+        if mask is not None:
+            x = x * mask
+            # Outer product of the mask with itself: a pair is kept only when
+            # both of its positions are kept.
+            attn_mask = mask.unsqueeze(-1) * mask.unsqueeze(-2)
+        stages = zip(self.attn_layers, self.norm_layers_1, self.ffn_layers, self.norm_layers_2)
+        for attend, norm_attn, ffn, norm_ffn in stages:
+            x = norm_attn(x + attend(x, attn_mask=attn_mask))
+            x = norm_ffn(x + ffn(x, mask=mask))
+        return x if mask is None else x * mask
+class LinearWrapped(nn.Module):
+    """``nn.Linear`` held under the attribute name ``linear``.
+
+    ``out_dim`` defaults to ``in_dim`` when omitted.
+    """
+    def __init__(self, in_dim, out_dim=None):
+        super().__init__()
+        width_out = in_dim if out_dim is None else out_dim
+        self.linear = nn.Linear(in_dim, width_out)
+    def forward(self, x):
+        """Apply the wrapped linear layer to ``x``."""
+        projected = self.linear(x)
+        return projected
+class StyleNorm(nn.Module):
+    """LayerNorm over the last dim, followed by a swap of dims 1 and 2.
+
+    Unlike ``LayerNorm`` in this file, the input is expected with the
+    normalised dim last, and the output is returned with dims 1 and 2 swapped.
+    """
+    def __init__(self, dim, eps: float = 1e-6):
+        super().__init__()
+        self.norm = nn.LayerNorm(dim, eps=eps)
+    def forward(self, x):
+        """Normalise ``x`` over its last dim, then transpose dims 1 and 2."""
+        return self.norm(x).transpose(1, 2)
+class TextEmbedderWrapper(nn.Module):
+    """Embedding table held under the attribute name ``char_embedder``."""
+    def __init__(self, vocab_size, d_model):
+        super().__init__()
+        self.char_embedder = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)
+    def forward(self, x):
+        """Look up the embedding of every id in ``x``; adds a trailing ``d_model`` dim."""
+        embedded = self.char_embedder(x)
+        return embedded
+# Cross-attention from text positions (queries) to style tokens. The keys are
+# not computed from the input: they are the learned parameter `key_const`.
+# Only the values are derived from the `values` argument.
+class StyleAttentionLayer(nn.Module):
+    # n_units must be divisible by num_heads (checked with an assert).
+    # num_style_tokens fixes the last dim of `key_const`, so `values` must
+    # supply that many tokens for the matmul in forward() to line up.
+    def __init__(self, text_dim: int, style_dim: int, n_units: int, num_heads: int = 2, num_style_tokens: int = 50):
+        super().__init__()
+        assert n_units % num_heads == 0
+        self.num_heads = num_heads
+        self.dim = n_units
+        self.head_dim = n_units // num_heads
+        # NOTE(review): scores are scaled by n_units ** -0.5, not head_dim ** -0.5
+        # -- presumably to match the exported model; confirm before changing.
+        self.scale = n_units ** -0.5
+        self.W_query = LinearWrapped(text_dim, n_units)
+        self.W_value = LinearWrapped(style_dim, n_units)
+        self.out_fc = LinearWrapped(n_units, text_dim)
+        # ONNX folds `tanh(W_key(style_key))` into a baked constant; mirror with a learnable parameter.
+        self.key_const = nn.Parameter(torch.randn(num_heads, 1, self.head_dim, num_style_tokens) * 0.02)
+    # x: (B, T, C) text features with the channel dim last.
+    # values: style values, either 2-D (a batch dim is added) or 3-D; a batch
+    #   dim that differs from B is expanded to B.
+    # mask_t: optional mask multiplied onto the (B, T, text_dim) output --
+    #   presumably shaped (B, T, 1); TODO confirm against the caller.
+    # Returns a (B, T, text_dim) tensor.
+    def forward(self, x: torch.Tensor, values: torch.Tensor, mask_t: torch.Tensor | None = None) -> torch.Tensor:
+        B, T, C = x.shape
+        q = self.W_query(x)
+        # Heads are stacked on a new leading dim: (num_heads, B, T, head_dim).
+        qs = q.chunk(self.num_heads, dim=-1)
+        q = torch.stack(qs, dim=0)
+        k = self.key_const
+        if values.dim() == 2:
+            values = values.unsqueeze(0)
+        if values.shape[0] != B:
+            values = values.expand(B, -1, -1)
+        v = self.W_value(values)
+        vs = v.chunk(self.num_heads, dim=-1)
+        v = torch.stack(vs, dim=0)
+        scores = torch.matmul(q, k) * self.scale
+        attn = torch.softmax(scores, dim=-1)
+        if mask_t is not None:
+            # Masking happens after the softmax: rows of masked text positions
+            # are set to zero rather than renormalised.
+            attn_mask = (mask_t.unsqueeze(0) == 0)
+            attn = attn.masked_fill(attn_mask, 0.0)
+        out = torch.matmul(attn, v)
+        # Undo the head stacking: concatenate heads on the last dim and drop
+        # the leading singleton left by chunking.
+        outs = out.chunk(self.num_heads, dim=0)
+        out = torch.cat(outs, dim=-1).squeeze(0)
+        out = self.out_fc(out)
+        if mask_t is not None:
+            out = out * mask_t
+        return out
+class StyleAttention(nn.Module):
+    """Two chained style cross-attention layers followed by ``StyleNorm``.
+
+    Takes channel-first text features, attends to the style values twice, and
+    returns channel-first features again (``StyleNorm`` swaps dims 1 and 2 back).
+    """
+    def __init__(self, text_dim: int, style_dim: int, n_units: int, num_heads: int = 2, num_style_tokens: int = 50):
+        super().__init__()
+        # attention1 / attention2 are separate: each owns its baked key constant.
+        self.attention1 = StyleAttentionLayer(text_dim, style_dim, n_units, num_heads, num_style_tokens)
+        self.attention2 = StyleAttentionLayer(text_dim, style_dim, n_units, num_heads, num_style_tokens)
+        self.norm = StyleNorm(text_dim)
+    def forward(self, x: torch.Tensor, style_values: torch.Tensor, mask: torch.Tensor | None = None) -> torch.Tensor:
+        """Condition ``x`` on ``style_values``; ``mask`` is applied to the final output."""
+        tokens = x.transpose(1, 2)
+        mask_t = mask.transpose(1, 2) if mask is not None else None
+        first = self.attention1(tokens, style_values, mask_t=mask_t)
+        second = self.attention2(tokens + first, style_values, mask_t=mask_t)
+        # NOTE(review): the final residual adds the original input, not
+        # ``tokens + first`` -- the first pass only shapes the second pass's
+        # queries. Presumably intentional; confirm against the reference model.
+        normed = self.norm(tokens + second)
+        return normed if mask is None else normed * mask
+# Text encoder: token embedding -> ConvNeXt stack -> relative-attention
+# encoder (with a skip connection around it) -> style cross-attention.
+class TextEncoder(nn.Module):
+    # Constructor arguments, as they are used below:
+    #   vocab_size, d_model        - size of the embedding table / model width.
+    #   n_conv_layers, expansion_factor, kernel_size, dilation_lst
+    #                              - forwarded to ConvNeXtWrapper.
+    #   n_attn_layers, attn_n_heads, attn_filter_channels, p_dropout
+    #                              - forwarded to AttnEncoder.
+    #   spte_*                     - forwarded to StyleAttention (text width,
+    #                                style width, attention width, head count,
+    #                                number of style tokens).
+    # NOTE(review): spte_text_dim is passed independently of d_model although
+    # StyleAttention consumes the d_model-wide encoder output -- presumably
+    # the two must be equal; confirm.
+    def __init__(
+        self,
+        vocab_size: int = 256,
+        d_model: int = 256,
+        n_conv_layers: int = 6,
+        n_attn_layers: int = 4,
+        expansion_factor: int = 4,
+        p_dropout: float = 0.1,
+        kernel_size: int = 5,
+        dilation_lst: list = None,
+        attn_n_heads: int = 4,
+        attn_filter_channels: int = 1024,
+        spte_n_heads: int = 2,
+        spte_text_dim: int = 256,
+        spte_style_dim: int = 256,
+        spte_n_units: int = 256,
+        spte_n_style: int = 50,
+    ):
+        super().__init__()
+        self.d_model = d_model
+        self.text_embedder = TextEmbedderWrapper(vocab_size, d_model)
+        self.convnext = ConvNeXtWrapper(
+            d_model, n_conv_layers, expansion_factor, kernel_size=kernel_size, dilation_lst=dilation_lst
+        )
+        self.attn_encoder = AttnEncoder(
+            d_model,
+            n_heads=attn_n_heads,
+            filter_channels=attn_filter_channels,
+            n_layers=n_attn_layers,
+            p_dropout=p_dropout,
+        )
+        self.speech_prompted_text_encoder = StyleAttention(
+            text_dim=spte_text_dim,
+            style_dim=spte_style_dim,
+            n_units=spte_n_units,
+            num_heads=spte_n_heads,
+            num_style_tokens=spte_n_style,
+        )
+        # Placeholder: currently a no-op.
+        self.proj_out = nn.Identity()
+    # text_ids: integer token ids; embedding then transposing dims 1 and 2
+    #   gives a channel-first tensor -- presumably text_ids is (B, T); TODO confirm.
+    # style_ttl: style values handed to the style cross-attention.
+    # text_mask: optional mask multiplied onto channel-first features --
+    #   presumably (B, 1, T); TODO confirm.
+    # Returns the channel-first output of the style cross-attention.
+    def forward(self, text_ids: torch.Tensor, style_ttl: torch.Tensor, text_mask: torch.Tensor | None = None) -> torch.Tensor:
+        x = self.text_embedder(text_ids)
+        # Embedding output has channels last; the conv stack wants them on dim 1.
+        x = x.transpose(1, 2)
+        if text_mask is not None:
+            x = x * text_mask
+        x = self.convnext(x, mask=text_mask)
+        convnext_output = x
+        x = self.attn_encoder(x, mask=text_mask)
+        # Skip connection around the whole attention encoder.
+        x = x + convnext_output
+        x = self.proj_out(x)
+        if text_mask is not None:
+            x = x * text_mask
+        x = self.speech_prompted_text_encoder(x, style_values=style_ttl, mask=text_mask)
+        return x

models/utils.py ADDED Viewed

	@@ -0,0 +1,197 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torchaudio.transforms as T
+def compress_latents(z: torch.Tensor, factor: int = 6) -> torch.Tensor:
+    """Fold groups of ``factor`` consecutive frames into the channel dim.
+
+    ``z`` is ``(B, C, T)``. The time axis is right-padded with zeros up to a
+    multiple of ``factor``, and the result is ``(B, C * factor, T // factor)``
+    with ``out[b, c * factor + f, t] == z[b, c, t * factor + f]``.
+    """
+    batch, channels, frames = z.shape
+    remainder = frames % factor
+    if remainder != 0:
+        extra = factor - remainder
+        z = F.pad(z, (0, extra))
+        frames += extra
+    grouped = z.view(batch, channels, frames // factor, factor)
+    return grouped.permute(0, 1, 3, 2).flatten(1, 2)
+def decompress_latents(z: torch.Tensor, factor: int = 6, target_channels: int = 24) -> torch.Tensor:
+    """Unfold channel-packed frames back onto the time axis.
+
+    ``z`` is ``(B, target_channels * factor, T_low)`` and the result is
+    ``(B, target_channels, T_low * factor)`` with
+    ``out[b, c, t * factor + f] == z[b, c * factor + f, t]``.
+    """
+    batch, _packed, frames_low = z.shape
+    unpacked = z.view(batch, target_channels, factor, frames_low)
+    time_major = unpacked.permute(0, 1, 3, 2)
+    return time_major.flatten(2, 3)
+def _resolve_vocab_size(char_dict_path, default=256):
+    """Infer the vocabulary size from a character-dictionary JSON file.
+
+    Supported layouts, checked in this order:
+      * ``{"vocab_size": N, ...}``            -> ``N``
+      * ``{"char_to_id": {char: id, ...}}``   -> ``max(id) + 1``
+      * ``{key: id, ...}`` (flat mapping)     -> ``max(id) + 1``
+      * any other JSON container (e.g. list)  -> its length
+
+    Ids are coerced with ``int()`` so numeric strings are accepted.
+
+    Args:
+        char_dict_path: Path to the JSON file; may be ``None`` or empty.
+        default: Value returned when the path is missing or the file cannot
+            be interpreted.
+
+    Returns:
+        The resolved vocabulary size, or ``default``. This function is
+        best-effort and never raises for a bad file; it logs a warning.
+    """
+    import json as _json
+    import os as _os
+    if not char_dict_path or not _os.path.exists(char_dict_path):
+        return default
+    try:
+        # Read as UTF-8 explicitly: character dictionaries routinely contain
+        # non-ASCII symbols, and the platform default encoding may not decode
+        # them, which would silently fall back to `default`.
+        with open(char_dict_path, "r", encoding="utf-8") as f:
+            cd = _json.load(f)
+        if not isinstance(cd, dict):
+            return len(cd)
+        if "vocab_size" in cd:
+            return int(cd["vocab_size"])
+        nested = cd.get("char_to_id")
+        id_table = nested if isinstance(nested, dict) else cd
+        if not id_table:
+            return default
+        return max(int(v) for v in id_table.values()) + 1
+    except Exception as e:
+        # Keep the best-effort contract, but say so instead of hiding it: a
+        # wrong vocab size otherwise only shows up later as a shape mismatch.
+        print(f"[WARN] Could not resolve vocab size from {char_dict_path}: {e}; using default {default}")
+        return default
+# Load the TTS JSON config and flatten the values the model builders need
+# into a single dict.
+#
+# The "ttl" section and every key read from it with [] are required (a missing
+# one raises KeyError). The "ae" and "dp" sections are optional: every value
+# taken from them is read with .get() and has a hard-coded fallback.
+# The returned dict also carries the raw sections under "full_config", "ttl",
+# "ae" and "dp".
+# NOTE(review): the file is opened without an explicit encoding, so it is
+# decoded with the platform default.
+def load_ttl_config(config_path="configs/tts.json"):
+    import json
+    with open(config_path, "r") as f:
+        full_config = json.load(f)
+    ttl = full_config["ttl"]
+    ae  = full_config.get("ae", {})
+    dp  = full_config.get("dp", {})
+    te = ttl["text_encoder"]
+    se = ttl["style_encoder"]
+    vf = ttl["vector_field"]
+    um = ttl["uncond_masker"]
+    # The char dict path may sit directly on the text encoder or nested under
+    # its "text_embedder" entry; the vocab size falls back to 256.
+    char_dict_path = te.get("char_dict_path", te.get("text_embedder", {}).get("char_dict_path"))
+    vocab_size = _resolve_vocab_size(char_dict_path, default=256)
+    # Same lookup for the "dp" section, falling back to the vocab size
+    # resolved above.
+    dp_char_dict_path = (
+        dp.get("sentence_encoder", {}).get("char_dict_path")
+        or dp.get("sentence_encoder", {}).get("text_embedder", {}).get("char_dict_path")
+    )
+    dp_vocab_size = _resolve_vocab_size(dp_char_dict_path, default=vocab_size)
+    # Decoder settings of the "ae" section, with defaults for every field.
+    ae_dec = ae.get("decoder", {})
+    ae_dec_cfg = {
+        "idim":                 ae_dec.get("idim", 24),
+        "hdim":                 ae_dec.get("hdim", 512),
+        "intermediate_dim":     ae_dec.get("intermediate_dim", 2048),
+        "ksz":                  ae_dec.get("ksz", 7),
+        "dilation_lst":         ae_dec.get("dilation_lst", [1, 2, 4, 1, 2, 4, 1, 1, 1, 1]),
+        "chunk_compress_factor": ae.get("chunk_compress_factor", 1),
+        "head": {
+            # The head input width defaults to the decoder's hidden width.
+            "idim": ae_dec.get("head", {}).get("idim", ae_dec.get("hdim", 512)),
+            "hdim": ae_dec.get("head", {}).get("hdim", 2048),
+            "odim": ae_dec.get("head", {}).get("odim", 512),
+            "ksz":  ae_dec.get("head", {}).get("ksz", 3),
+        },
+    }
+    # Encoder settings of the "ae" section, with defaults for every field.
+    ae_enc = ae.get("encoder", {})
+    ae_enc_spec = ae_enc.get("spec_processor", {})
+    ae_enc_cfg = {
+        "ksz":              ae_enc.get("ksz", 7),
+        "hdim":             ae_enc.get("hdim", 512),
+        "intermediate_dim": ae_enc.get("intermediate_dim", 2048),
+        "dilation_lst":     ae_enc.get("dilation_lst", [1] * 10),
+        "odim":             ae_enc.get("odim", 24),
+        "idim":             ae_enc.get("idim", 1253),
+    }
+    dp_se = dp.get("style_encoder", {}).get("style_token_layer", {})
+    return {
+        # Raw sections, passed through untouched.
+        "full_config":  full_config,
+        "ttl":          ttl,
+        "ae":           ae,
+        "dp":           dp,
+        "vocab_size":       vocab_size,
+        "char_dict_path":   char_dict_path,
+        "dp_vocab_size":    dp_vocab_size,
+        # Top-level "ttl" values.
+        "latent_dim":           ttl["latent_dim"],
+        "chunk_compress_factor": ttl["chunk_compress_factor"],
+        "compressed_channels":  ttl["latent_dim"] * ttl["chunk_compress_factor"],
+        "normalizer_scale":     ttl["normalizer"]["scale"],
+        "sigma_min":            ttl["flow_matching"]["sig_min"],
+        "Ke":                   ttl["batch_expander"]["n_batch_expand"],
+        # ttl.text_encoder ("te_" prefix).
+        "te_d_model":           te["text_embedder"]["char_emb_dim"],
+        "te_convnext_layers":   te["convnext"]["num_layers"],
+        # Expansion factor = ConvNeXt intermediate width // embedding width.
+        "te_expansion_factor":  te["convnext"]["intermediate_dim"] // te["text_embedder"]["char_emb_dim"],
+        "te_attn_n_layers":     te["attn_encoder"]["n_layers"],
+        "te_attn_p_dropout":    te["attn_encoder"]["p_dropout"],
+        # ttl.style_encoder ("se_" prefix).
+        "se_d_model":    se["proj_in"]["odim"],
+        "se_hidden_dim": se["convnext"]["intermediate_dim"],
+        "se_num_blocks": se["convnext"]["num_layers"],
+        "se_n_style":    se["style_token_layer"]["n_style"],
+        "se_n_heads":    se["style_token_layer"]["n_heads"],
+        # ttl.uncond_masker.
+        "prob_both_uncond": um["prob_both_uncond"],
+        "prob_text_uncond": um["prob_text_uncond"],
+        "uncond_init_std":  um["std"],
+        "um_text_dim":      um["text_dim"],
+        "um_n_style":       um["n_style"],
+        "um_style_key_dim":   um["style_key_dim"],
+        "um_style_value_dim": um["style_value_dim"],
+        # ttl.vector_field ("vf_" prefix).
+        "vf_hidden":        vf["proj_in"]["odim"],
+        "vf_time_dim":      vf["time_encoder"]["time_dim"],
+        "vf_n_blocks":      vf["main_blocks"]["n_blocks"],
+        "vf_text_dim":      vf["main_blocks"]["text_cond_layer"]["text_dim"],
+        "vf_text_n_heads":  vf["main_blocks"]["text_cond_layer"]["n_heads"],
+        "vf_style_dim":     vf["main_blocks"]["style_cond_layer"]["style_dim"],
+        "vf_rotary_scale":  vf["main_blocks"]["text_cond_layer"]["rotary_scale"],
+        # Optional "ae" / "dp" values, all with fallbacks.
+        "ae_dec_cfg":    ae_dec_cfg,
+        "ae_enc_cfg":    ae_enc_cfg,
+        "ae_sample_rate":  ae.get("sample_rate", 44100),
+        "ae_n_fft":        ae_enc_spec.get("n_fft", 2048),
+        "ae_hop_length":   ae_enc_spec.get("hop_length", 512),
+        "ae_n_mels":       ae_enc_spec.get("n_mels", 1253),
+        "dp_style_tokens": dp_se.get("n_style", 8),
+        "dp_style_dim":    dp_se.get("style_value_dim", 16),
+    }
+class MelSpectrogram(nn.Module):
+    def __init__(self, sample_rate=44100, n_fft=2048, win_length=2048,
+                 hop_length=512, n_mels=1253, f_min=0, f_max=None):
+        super().__init__()
+        self.mel = T.MelSpectrogram(
+            sample_rate=sample_rate, n_fft=n_fft, win_length=win_length,
+            hop_length=hop_length, n_mels=n_mels, f_min=f_min, f_max=f_max,
+            center=True, power=1.0,
+        )
+    def forward(self, audio):
+        mel = torch.log(torch.clamp(self.mel(audio), min=1e-5))
+        return mel.squeeze(1) if mel.dim() == 4 and mel.shape[1] == 1 else mel
+class MelSpectrogramNoLog(nn.Module):
+    def __init__(self, sample_rate=44100, n_fft=2048, win_length=2048,
+                 hop_length=512, n_mels=1253, f_min=0, f_max=12000, power=1.0):
+        super().__init__()
+        self.mel = T.MelSpectrogram(
+            sample_rate=sample_rate, n_fft=n_fft, win_length=win_length,
+            hop_length=hop_length, n_mels=n_mels, f_min=f_min, f_max=f_max,
+            center=True, power=power,
+        )
+    def forward(self, audio):
+        mel = self.mel(audio)
+        return mel.squeeze(1) if mel.dim() == 4 and mel.shape[1] == 1 else mel
+class LinearMelSpectrogram(nn.Module):
+    def __init__(self, sample_rate=44100, n_fft=2048, win_length=2048,
+                 hop_length=512, n_mels=1253, f_min=0, f_max=None):
+        super().__init__()
+        self.spectrogram = T.Spectrogram(
+            n_fft=n_fft, win_length=win_length, hop_length=hop_length,
+            center=True, power=1.0,
+        )
+        self.mel_scale = T.MelScale(
+            n_mels=n_mels, sample_rate=sample_rate,
+            n_stft=n_fft // 2 + 1, f_min=f_min, f_max=f_max,
+        )
+    def forward(self, audio):
+        spec = self.spectrogram(audio)
+        mel = self.mel_scale(spec)
+        log_spec = torch.log(torch.clamp(spec, min=1e-5))
+        log_mel = torch.log(torch.clamp(mel, min=1e-5))
+        return torch.cat([log_spec, log_mel], dim=1)

models/vf_estimator.py ADDED Viewed

	@@ -0,0 +1,507 @@

+import math
+from typing import Optional
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+class LinearWrapper(nn.Module):
+    def __init__(self, in_features: int, out_features: int):
+        super().__init__()
+        self.linear = nn.Linear(in_features, out_features)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.linear(x)
+class LayerNormWrapper(nn.Module):
+    def __init__(self, dim: int, eps: float = 1e-6):
+        super().__init__()
+        self.norm = nn.LayerNorm(dim, eps=eps)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = x.transpose(1, 2)
+        x = self.norm(x)
+        x = x.transpose(1, 2)
+        return x
+class ProjectionWrapper(nn.Module):
+    def __init__(self, in_channels: int, out_channels: int):
+        super().__init__()
+        self.net = nn.Conv1d(in_channels, out_channels, kernel_size=1, bias=False)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.net(x)
+class Mish(nn.Module):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return x * torch.tanh(F.softplus(x))
+class SinusoidalPosEmb(nn.Module):
+    def __init__(self, dim: int, scale: float = 1000.0):
+        super().__init__()
+        self.dim = dim
+        self.scale = scale
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = x * self.scale
+        device = x.device
+        half_dim = self.dim // 2
+        emb = math.log(10000) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
+        emb = x[:, None] * emb[None, :]
+        emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
+        return emb
+class TimeEncoder(nn.Module):
+    def __init__(self, embed_dim: int, hdim: int = 256):
+        super().__init__()
+        self.sinusoidal = SinusoidalPosEmb(embed_dim, scale=1000.0)
+        self.mlp = nn.Sequential(
+            LinearWrapper(embed_dim, hdim),
+            Mish(),
+            LinearWrapper(hdim, embed_dim),
+        )
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.sinusoidal(x)
+        x = self.mlp(x)
+        return x
+class TimeCondBlock(nn.Module):
+    def __init__(self, time_dim: int, channels: int):
+        super().__init__()
+        self.linear = LinearWrapper(time_dim, channels)
+        # Zero-init so the block starts as identity.
+        nn.init.zeros_(self.linear.linear.weight)
+        nn.init.zeros_(self.linear.linear.bias)
+    def forward(self, x: torch.Tensor, time_emb: torch.Tensor) -> torch.Tensor:
+        cond = self.linear(time_emb)
+        cond = cond.unsqueeze(-1)
+        return x + cond
+class ConvNeXtBlock1D(nn.Module):
+    def __init__(self, dim: int, kernel_size: int = 5, expansion: int = 2, dropout: float = 0.0, dilation: int = 1):
+        super().__init__()
+        self.pad = ((kernel_size - 1) // 2) * dilation
+        self.dwconv = nn.Conv1d(dim, dim, kernel_size=kernel_size, padding=0, groups=dim, dilation=dilation)
+        self.norm = LayerNormWrapper(dim)
+        self.pwconv1 = nn.Conv1d(dim, dim * expansion, kernel_size=1)
+        self.act = nn.GELU()
+        self.pwconv2 = nn.Conv1d(dim * expansion, dim, kernel_size=1)
+        self.gamma = nn.Parameter(torch.ones(1, dim, 1) * 1e-6)
+        self.dropout = nn.Dropout(dropout) if dropout > 0.0 else nn.Identity()
+    def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+        if mask is not None:
+            x = x * mask
+        residual = x
+        x = F.pad(x, (self.pad, self.pad), mode="replicate")
+        x = self.dwconv(x)
+        if mask is not None:
+            x = x * mask
+        x = self.norm(x)
+        x = self.pwconv1(x)
+        x = self.act(x)
+        x = self.pwconv2(x)
+        x = self.gamma * x
+        x = self.dropout(x)
+        x = x + residual
+        if mask is not None:
+            x = x * mask
+        return x
+class ConvNeXtStack(nn.Module):
+    def __init__(self, channels, kernel_size, dilations):
+        super().__init__()
+        self.convnext = nn.ModuleList([
+            ConvNeXtBlock1D(channels, kernel_size=kernel_size, dilation=d, expansion=2)
+            for d in dilations
+        ])
+    def forward(self, x, mask=None):
+        for blk in self.convnext:
+            x = blk(x, mask)
+        return x
+def apply_rotary_pos_emb(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
+    B, H, T, D = x.shape
+    assert D % 2 == 0, "head_dim must be even for RoPE"
+    x1 = x[..., : D // 2]
+    x2 = x[..., D // 2 :]
+    if cos.dim() == 2:
+        cos = cos[None, None, :, :]
+        sin = sin[None, None, :, :]
+    elif cos.dim() == 3:
+        cos = cos.unsqueeze(1)
+        sin = sin.unsqueeze(1)
+    x1_rot = x1 * cos - x2 * sin
+    x2_rot = x1 * sin + x2 * cos
+    return torch.cat([x1_rot, x2_rot], dim=-1)
+class AttentionModule(nn.Module):
+    """Text path uses LARoPE; style path uses tanh on keys (no RoPE)."""
+    def __init__(
+        self,
+        d_model: int,
+        d_context: int,
+        num_heads: int,
+        attn_dim: int,
+        use_rope: bool,
+        dropout: float = 0.0,
+        rope_gamma: float = 10.0,
+        attn_scale: Optional[float] = None,
+        rotary_base: float = 10000.0,
+        use_residual: bool = True,
+    ):
+        super().__init__()
+        assert attn_dim % num_heads == 0
+        self.d_model = d_model
+        self.num_heads = num_heads
+        self.head_dim = attn_dim // num_heads
+        self.attn_dim = attn_dim
+        self.use_rope = use_rope
+        self.use_residual = use_residual
+        self.rope_gamma = rope_gamma
+        self.attn_scale = attn_scale if attn_scale is not None else math.sqrt(self.attn_dim)
+        self.W_query = LinearWrapper(d_model, attn_dim)
+        self.W_key = LinearWrapper(d_context, attn_dim)
+        self.W_value = LinearWrapper(d_context, attn_dim)
+        self.out_fc = LinearWrapper(attn_dim, d_model)
+        self.dropout = nn.Dropout(dropout) if dropout > 0.0 else nn.Identity()
+        if use_rope:
+            inv_freq = 1.0 / (rotary_base ** (torch.arange(0, self.head_dim, 2, dtype=torch.float32) / self.head_dim))
+            theta = (inv_freq * rope_gamma).view(1, 1, -1)
+            self.register_buffer("theta", theta, persistent=True)
+            self.register_buffer("increments", torch.arange(1000).view(1, 1000, 1), persistent=True)
+            self.tanh = None
+        else:
+            self.theta = None
+            self.increments = None
+            self.tanh = nn.Tanh()
+    def forward(
+        self,
+        x: torch.Tensor,
+        context: torch.Tensor,
+        context_keys: Optional[torch.Tensor] = None,
+        x_mask: Optional[torch.Tensor] = None,
+        context_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        B, d_model, T = x.shape
+        L = context.shape[1]
+        x_t = x.transpose(1, 2)
+        q = self.W_query(x_t)
+        k_src = context_keys if context_keys is not None else context
+        k = self.W_key(k_src)
+        v = self.W_value(context)
+        if not self.use_rope and self.tanh is not None:
+            k = self.tanh(k)
+        H = self.num_heads
+        D = self.head_dim
+        q = q.view(B, T, H, D).permute(2, 0, 1, 3)
+        k = k.view(B, L, H, D).permute(2, 0, 1, 3)
+        v = v.view(B, L, H, D).permute(2, 0, 1, 3)
+        if self.use_rope:
+            device = x.device
+            if x_mask is not None:
+                len_q = x_mask.sum(dim=(-2, -1)).reshape(-1, 1, 1)
+            else:
+                len_q = torch.tensor([T], device=device, dtype=torch.float32).reshape(1, 1, 1)
+            if context_mask is not None:
+                len_k = context_mask.sum(dim=(-2, -1)).reshape(-1, 1, 1)
+            else:
+                len_k = torch.tensor([L], device=device, dtype=torch.float32).reshape(1, 1, 1)
+            if self.increments is not None and self.increments.shape[1] >= max(T, L):
+                pos_q = self.increments[:, :T, :].to(device).float()
+                pos_k = self.increments[:, :L, :].to(device).float()
+            else:
+                pos_q = torch.arange(T, device=device, dtype=torch.float32).reshape(1, -1, 1)
+                pos_k = torch.arange(L, device=device, dtype=torch.float32).reshape(1, -1, 1)
+            norm_pos_q = pos_q / len_q
+            norm_pos_k = pos_k / len_k
+            theta = self.theta if self.theta is not None else (
+                (1.0 / (10000 ** (torch.arange(0, D, 2, device=device).float() / D))) * self.rope_gamma
+            ).view(1, 1, -1)
+            freqs_q = norm_pos_q * theta
+            freqs_k = norm_pos_k * theta
+            cos_q, sin_q = freqs_q.cos(), freqs_q.sin()
+            cos_k, sin_k = freqs_k.cos(), freqs_k.sin()
+            cos_q, sin_q = cos_q.unsqueeze(0), sin_q.unsqueeze(0)
+            cos_k, sin_k = cos_k.unsqueeze(0), sin_k.unsqueeze(0)
+            q = apply_rotary_pos_emb(q, cos_q, sin_q)
+            k = apply_rotary_pos_emb(k, cos_k, sin_k)
+        attn_logits = torch.matmul(q, k.transpose(-1, -2)) / self.attn_scale
+        if context_mask is not None:
+            if context_mask.dim() == 2:
+                context_mask = context_mask.unsqueeze(1)
+            cm = (context_mask == 0)
+            attn_logits = attn_logits.masked_fill(cm.unsqueeze(0), float("-inf"))
+        attn = torch.softmax(attn_logits, dim=-1)
+        if x_mask is not None:
+            if x_mask.dim() == 2:
+                x_mask = x_mask.unsqueeze(1)
+            qm = (x_mask == 0).permute(1, 0, 2).unsqueeze(-1)
+            attn = attn.masked_fill(qm, 0.0)
+        out = torch.matmul(attn, v)
+        out = out.permute(1, 2, 0, 3).contiguous().view(B, T, self.attn_dim)
+        out = self.out_fc(out)
+        out = self.dropout(out)
+        if x_mask is not None:
+            out = out * x_mask.transpose(1, 2)
+        out = out.transpose(1, 2)
+        return out
+class CrossAttentionBlock(nn.Module):
+    def __init__(
+        self,
+        d_model: int,
+        d_context: int,
+        num_heads: int = 8,
+        attn_dim: int = 256,
+        use_rope: bool = True,
+        rope_gamma: float = 10.0,
+        attn_scale: Optional[float] = None,
+        use_residual: bool = True,
+        rotary_base: float = 10000.0,
+    ):
+        super().__init__()
+        self.use_rope = use_rope
+        self.use_residual = use_residual
+        attn_module = AttentionModule(
+            d_model, d_context, num_heads, attn_dim, use_rope,
+            rope_gamma=rope_gamma, attn_scale=attn_scale, rotary_base=rotary_base, use_residual=use_residual,
+        )
+        # Checkpoint naming: text (RoPE) -> 'attn'; style (no RoPE) -> 'attention'.
+        if use_rope:
+            self.attn = attn_module
+        else:
+            self.attention = attn_module
+        self.norm = LayerNormWrapper(d_model)
+    def forward(
+        self,
+        x: torch.Tensor,
+        context: torch.Tensor,
+        context_keys: Optional[torch.Tensor],
+        x_mask: Optional[torch.Tensor],
+        context_mask: Optional[torch.Tensor],
+    ) -> torch.Tensor:
+        if x_mask is not None:
+            x = x * x_mask
+        residual = x
+        if self.use_rope:
+            attn_out = self.attn(x, context, context_keys, x_mask, context_mask)
+        else:
+            attn_out = self.attention(x, context, context_keys, x_mask, context_mask)
+        if self.use_residual:
+            x = residual + attn_out
+        else:
+            x = attn_out
+        x = self.norm(x)
+        if x_mask is not None:
+            x = x * x_mask
+        return x
+class VectorFieldEstimator(nn.Module):
+    """Vector-field network conditioned on time, text and style.
+
+    The trunk is ``num_superblocks`` repetitions of six modules appended to
+    ``main_blocks`` in this fixed order: ConvNeXt stack, time conditioning,
+    ConvNeXt stack, text cross-attention (RoPE), ConvNeXt stack, style
+    cross-attention (no RoPE). ``forward`` dispatches on ``index % 6``, so
+    this order must not change.
+    """
+    def __init__(
+        self,
+        in_channels: int = 144,
+        hidden_channels: int = 512,
+        out_channels: int = 144,
+        text_dim: int = 256,
+        style_dim: int = 256,
+        num_style_tokens: int = 50,
+        num_superblocks: int = 4,
+        time_embed_dim: int = 64,
+        rope_gamma: float = 10.0,
+        main_blocks_cfg: dict = None,
+        last_convnext_cfg: dict = None,
+        text_n_heads: int = 4,
+        time_hdim: int = 256,
+        use_residual: bool = True,
+        rotary_base: float = 10000.0,
+    ):
+        """Build the network.
+
+        ``main_blocks_cfg`` may hold ``convnext_0`` / ``convnext_1`` /
+        ``convnext_2`` sub-dicts and ``last_convnext_cfg`` is a single dict;
+        each is read for ``ksz`` and ``dilation_lst``, with the defaults
+        visible below used for missing keys. Both cross-attention layers use
+        a fixed ``attn_dim`` of 256 and a logit scale of ``sqrt(256)``; the
+        style layer uses a fixed 2 heads.
+        """
+        super().__init__()
+        self.in_channels = in_channels
+        self.hidden_channels = hidden_channels
+        self.out_channels = out_channels
+        self.text_dim = text_dim
+        self.style_dim = style_dim
+        self.rope_gamma = rope_gamma
+        # Shared tiled constant ([1, 50, 256]) consumed by every style-attn W_key.
+        # (Those are the default sizes; the actual shape is [1, num_style_tokens, style_dim].)
+        self.tile = nn.Parameter(torch.randn(1, num_style_tokens, style_dim) * 0.02)
+        self.proj_in = ProjectionWrapper(in_channels, hidden_channels)
+        self.time_encoder = TimeEncoder(time_embed_dim, hdim=time_hdim)
+        self.main_blocks = nn.ModuleList()
+        shared_attn_scale = math.sqrt(256)
+        # Treat a missing config as empty so every .get() falls back to its default.
+        mb_cfg = main_blocks_cfg or {}
+        lc_cfg = last_convnext_cfg or {}
+        c0_cfg = mb_cfg.get("convnext_0", {})
+        c1_cfg = mb_cfg.get("convnext_1", {})
+        c2_cfg = mb_cfg.get("convnext_2", {})
+        for _ in range(num_superblocks):
+            # Slot 0: dilated ConvNeXt stack.
+            self.main_blocks.append(
+                ConvNeXtStack(hidden_channels, kernel_size=c0_cfg.get("ksz", 5), dilations=c0_cfg.get("dilation_lst", [1, 2, 4, 8]))
+            )
+            # Slot 1: additive time conditioning.
+            self.main_blocks.append(
+                TimeCondBlock(time_dim=time_embed_dim, channels=hidden_channels)
+            )
+            # Slot 2: ConvNeXt stack.
+            self.main_blocks.append(
+                ConvNeXtStack(hidden_channels, kernel_size=c1_cfg.get("ksz", 5), dilations=c1_cfg.get("dilation_lst", [1]))
+            )
+            # Slot 3: text cross-attention with rotary position embedding.
+            self.main_blocks.append(
+                CrossAttentionBlock(
+                    d_model=hidden_channels,
+                    d_context=text_dim,
+                    num_heads=text_n_heads,
+                    attn_dim=256,
+                    use_rope=True,
+                    rope_gamma=self.rope_gamma,
+                    attn_scale=shared_attn_scale,
+                    use_residual=use_residual,
+                    rotary_base=rotary_base,
+                )
+            )
+            # Slot 4: ConvNeXt stack.
+            self.main_blocks.append(
+                ConvNeXtStack(hidden_channels, kernel_size=c2_cfg.get("ksz", 5), dilations=c2_cfg.get("dilation_lst", [1]))
+            )
+            # Slot 5: style cross-attention without rotary embedding.
+            self.main_blocks.append(
+                CrossAttentionBlock(
+                    d_model=hidden_channels,
+                    d_context=style_dim,
+                    num_heads=2,
+                    attn_dim=256,
+                    use_rope=False,
+                    attn_scale=shared_attn_scale,
+                    use_residual=use_residual,
+                    rotary_base=rotary_base,
+                )
+            )
+        self.last_convnext = ConvNeXtStack(
+            hidden_channels, kernel_size=lc_cfg.get("ksz", 5), dilations=lc_cfg.get("dilation_lst", [1, 1, 1, 1])
+        )
+        self.proj_out = ProjectionWrapper(hidden_channels, out_channels)
+    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
+                              missing_keys, unexpected_keys, error_msgs):
+        """Rename a legacy ``style_key`` entry to ``tile`` before loading.
+
+        The rename mutates the passed ``state_dict`` in place (the legacy
+        key is popped) and only happens when ``tile`` is not already present.
+        """
+        # Back-compat: older checkpoints stored the tiled style-key under `style_key`.
+        legacy_key = prefix + "style_key"
+        new_key = prefix + "tile"
+        if legacy_key in state_dict and new_key not in state_dict:
+            state_dict[new_key] = state_dict.pop(legacy_key)
+        return super()._load_from_state_dict(
+            state_dict, prefix, local_metadata, strict,
+            missing_keys, unexpected_keys, error_msgs,
+        )
+    def forward(
+        self,
+        noisy_latent: torch.Tensor,
+        text_emb: torch.Tensor,
+        style_ttl: torch.Tensor,
+        latent_mask: torch.Tensor,
+        text_mask: torch.Tensor,
+        current_step: torch.Tensor,
+        total_step: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """Run the network on a noisy latent.
+
+        Args:
+            noisy_latent: ``[B, in_channels, T]`` input to the 1x1 conv ``proj_in``.
+            text_emb: Text features, channel-first; transposed to
+                ``[B, L, text_dim]`` before being used as attention context.
+            style_ttl: Style values passed as attention context unchanged.
+                Its token count must match ``self.tile``, which supplies the keys.
+            latent_mask: Multiplied into the hidden states and the output;
+                presumably ``[B, 1, T]`` with zeros on padding -- confirm with callers.
+            text_mask: Context mask for the text cross-attention.
+            current_step: Per-sample step (with ``total_step``) or time value
+                (without it); reshaped to ``[B]``.
+            total_step: Optional per-sample total number of steps.
+
+        Returns:
+            With ``total_step``: ``(noisy_latent + diff_out / total_step) * latent_mask``,
+            i.e. the latent advanced by one step. Without it: the masked
+            network output ``diff_out`` itself.
+        """
+        B = noisy_latent.shape[0]
+        if total_step is not None:
+            # Time fed to the encoder is the fraction current_step / total_step.
+            t_norm = current_step.reshape(B, 1, 1) / total_step.reshape(B, 1, 1)
+            # Step size used for the update at the end of forward.
+            reciprocal = 1.0 / total_step.reshape(B, 1, 1)
+            t_norm_flat = t_norm.reshape(B)
+        else:
+            t_norm_flat = current_step.reshape(B)
+        t_emb = self.time_encoder(t_norm_flat)
+        text_blc = text_emb.transpose(1, 2)
+        x = self.proj_in(noisy_latent)
+        x = x * latent_mask
+        for i, block in enumerate(self.main_blocks):
+            # Position within the six-module superblock selects the call signature.
+            idx_in_super = i % 6
+            if idx_in_super == 0:
+                x = block(x, mask=latent_mask)
+            elif idx_in_super == 1:
+                x = block(x, t_emb)
+                # The time shift is added everywhere, so padding must be re-zeroed.
+                x = x * latent_mask
+            elif idx_in_super == 2:
+                x = block(x, mask=latent_mask)
+            elif idx_in_super == 3:
+                x = block(x, context=text_blc, context_keys=None,
+                          x_mask=latent_mask, context_mask=text_mask)
+            elif idx_in_super == 4:
+                x = block(x, mask=latent_mask)
+            elif idx_in_super == 5:
+                # Style attention: keys come from the learned tile, values from style_ttl.
+                x = block(x, context=style_ttl,
+                          context_keys=self.tile.expand(B, -1, -1),
+                          x_mask=latent_mask, context_mask=None)
+        x = self.last_convnext(x, mask=latent_mask)
+        diff_out = self.proj_out(x) * latent_mask
+        if total_step is not None:
+            denoised = noisy_latent + reciprocal * diff_out
+            return denoised * latent_mask
+        return diff_out