acul3
/

Qwen3-TTS-1.7B-Base-ExecuTorch

+#!/usr/bin/env python3
+"""
+Phase 1: Deep Architecture Analysis of Qwen3-TTS for ExecuTorch Export
+======================================================================
+Loads the model, maps all modules with parameter counts, traces a real
+voice-clone inference to capture shapes, and identifies export blockers.
+"""
+import sys
+import os
+import time
+import json
+import numpy as np
+import torch
+import torch.nn as nn
+# ── paths ────────────────────────────────────────────────────────────
+MODEL_PATH = os.path.expanduser("~/Documents/Qwen3-TTS/models/1.7B-Base")
+VENV_SITE = os.path.expanduser("~/Documents/Qwen3-TTS/.venv/lib/python3.10/site-packages")
+QWEN_TTS_SRC = os.path.expanduser("~/Documents/Qwen3-TTS")
+# Ensure the venv's site-packages is on the path so qwen_tts can be imported
+if VENV_SITE not in sys.path:
+    sys.path.insert(0, VENV_SITE)
+if QWEN_TTS_SRC not in sys.path:
+    sys.path.insert(0, QWEN_TTS_SRC)
+# ── helpers ──────────────────────────────────────────────────────────
+def count_params(module: nn.Module) -> int:
+    return sum(p.numel() for p in module.parameters())
+def fmt(n: int) -> str:
+    if n >= 1e9:
+        return f"{n / 1e9:.1f}B"
+    if n >= 1e6:
+        return f"{n / 1e6:.1f}M"
+    if n >= 1e3:
+        return f"{n / 1e3:.1f}K"
+    return str(n)
+def param_table(module: nn.Module, prefix: str = "", depth: int = 0, max_depth: int = 3):
+    """Print a hierarchical parameter table."""
+    total = count_params(module)
+    indent = "  " * depth
+    name = prefix or module.__class__.__name__
+    print(f"{indent}{name}: {fmt(total)} params")
+    if depth < max_depth:
+        for child_name, child in module.named_children():
+            child_prefix = f"{prefix}.{child_name}" if prefix else child_name
+            param_table(child, child_prefix, depth + 1, max_depth)
+# ── 1. Load Model ───────────────────────────────────────────────────
+print("=" * 70)
+print("PHASE 1: Deep Architecture Analysis — Qwen3-TTS 1.7B-Base")
+print("=" * 70)
+print("\n[1/5] Loading model from", MODEL_PATH)
+t0 = time.time()
+from qwen_tts.core.models.configuration_qwen3_tts import Qwen3TTSConfig
+from qwen_tts.core.models.modeling_qwen3_tts import (
+    Qwen3TTSForConditionalGeneration,
+    mel_spectrogram,
+)
+config = Qwen3TTSConfig.from_pretrained(MODEL_PATH)
+# Force SDPA attention for exportability
+model = Qwen3TTSForConditionalGeneration.from_pretrained(
+    MODEL_PATH,
+    config=config,
+    torch_dtype=torch.float32,
+    attn_implementation="sdpa",
+    device_map="cpu",
+)
+model.eval()
+print(f"  Loaded in {time.time() - t0:.1f}s")
+# ── 2. Parameter Map ────────────────────────────────────────────────
+print("\n[2/5] Parameter Map (hierarchical)")
+print("-" * 60)
+param_table(model, "Qwen3TTSForConditionalGeneration", max_depth=4)
+print("\n--- Top-level component sizes ---")
+components = {
+    "speaker_encoder": model.speaker_encoder,
+    "talker": model.talker,
+    "talker.model": model.talker.model,
+    "talker.text_projection": model.talker.text_projection,
+    "talker.codec_head": model.talker.codec_head,
+    "talker.code_predictor": model.talker.code_predictor,
+}
+for name, mod in components.items():
+    print(f"  {name:40s}: {fmt(count_params(mod)):>8s} params")
+if model.speech_tokenizer is not None and hasattr(model.speech_tokenizer, 'model'):
+    st = model.speech_tokenizer.model  # Qwen3TTSTokenizerV2Model (nn.Module)
+    print(f"  {'speech_tokenizer.model':40s}: {fmt(count_params(st)):>8s} params")
+    if hasattr(st, 'encoder'):
+        print(f"  {'speech_tokenizer.model.encoder':40s}: {fmt(count_params(st.encoder)):>8s} params")
+    if hasattr(st, 'decoder'):
+        print(f"  {'speech_tokenizer.model.decoder':40s}: {fmt(count_params(st.decoder)):>8s} params")
+# ── 3. Config Summary ───────────────────────────────────────────────
+print("\n[3/5] Key Config Values")
+print("-" * 60)
+tc = config.talker_config
+cpc = tc.code_predictor_config
+sec = config.speaker_encoder_config
+info = {
+    "Speaker Encoder": {
+        "mel_dim": sec.mel_dim,
+        "enc_dim (output)": sec.enc_dim,
+        "enc_channels": sec.enc_channels,
+        "sample_rate": sec.sample_rate,
+    },
+    "Talker (Main LM)": {
+        "hidden_size": tc.hidden_size,
+        "num_hidden_layers": tc.num_hidden_layers,
+        "num_attention_heads": tc.num_attention_heads,
+        "num_key_value_heads": tc.num_key_value_heads,
+        "head_dim": tc.head_dim,
+        "intermediate_size": tc.intermediate_size,
+        "text_vocab_size": tc.text_vocab_size,
+        "codec_vocab_size": tc.vocab_size,
+        "num_code_groups": tc.num_code_groups,
+        "max_position_embeddings": tc.max_position_embeddings,
+        "rope_scaling": tc.rope_scaling,
+    },
+    "Code Predictor": {
+        "hidden_size": cpc.hidden_size,
+        "num_hidden_layers": cpc.num_hidden_layers,
+        "num_attention_heads": cpc.num_attention_heads,
+        "num_key_value_heads": cpc.num_key_value_heads,
+        "num_code_groups": cpc.num_code_groups,
+        "vocab_size": cpc.vocab_size,
+    },
+}
+for section, kvs in info.items():
+    print(f"\n  {section}:")
+    for k, v in kvs.items():
+        print(f"    {k:35s}: {v}")
+# ── 4. Trace Real Inference ─────────────────────────────────────────
+print("\n[4/5] Tracing Real Voice-Clone Inference")
+print("-" * 60)
+# Create synthetic reference audio: 3 seconds of white noise at 24kHz
+ref_sr = 24000
+ref_duration = 3.0
+ref_audio = np.random.randn(int(ref_sr * ref_duration)).astype(np.float32) * 0.1
+# --- 4a. Speaker Encoder ---
+print("\n  === Speaker Encoder ===")
+mels = mel_spectrogram(
+    torch.from_numpy(ref_audio).unsqueeze(0),
+    n_fft=1024,
+    num_mels=128,
+    sampling_rate=24000,
+    hop_size=256,
+    win_size=1024,
+    fmin=0,
+    fmax=12000,
+).transpose(1, 2)
+print(f"  Mel input shape:        {list(mels.shape)}")  # [1, T, 128]
+with torch.no_grad():
+    spk_embed = model.speaker_encoder(mels)
+print(f"  Speaker embedding shape: {list(spk_embed.shape)}")  # [1, enc_dim]
+x_vector = spk_embed[0]
+print(f"  X-vector (per sample):   {list(x_vector.shape)}")  # [enc_dim]
+# --- 4b. Speech Tokenizer Encode (ref audio -> codes) ---
+print("\n  === Speech Tokenizer Encode ===")
+if model.speech_tokenizer is not None:
+    st_model = model.speech_tokenizer.model
+    ref_wav_tensor = torch.from_numpy(ref_audio).unsqueeze(0).float()  # [1, samples]
+    padding_mask = torch.ones_like(ref_wav_tensor, dtype=torch.long)
+    with torch.no_grad():
+        enc_out = st_model.encode(ref_wav_tensor, padding_mask=padding_mask, return_dict=True)
+    ref_codes = enc_out.audio_codes
+    print(f"  Ref audio samples:      {ref_wav_tensor.shape[1]}")
+    print(f"  Number of code tensors: {len(ref_codes)}")
+    for i, c in enumerate(ref_codes):
+        print(f"  ref_codes[{i}] shape:     {list(c.shape)}")  # [T, num_quantizers]
+else:
+    print("  Speech tokenizer not loaded (will skip encode)")
+    ref_codes = None
+# --- 4c. Talker Prefill Input Construction ---
+print("\n  === Talker Input Construction ===")
+# Simulate tokenized text: "<|im_start|>assistant\nHello world<|im_end|>\n<|im_start|>assistant\n"
+# Using config token IDs
+from transformers import AutoTokenizer
+try:
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)
+    text = "Hello world."
+    chat_text = f"<|im_start|>assistant\n{text}<|im_end|>\n<|im_start|>assistant\n"
+    input_ids = tokenizer(chat_text, return_tensors="pt", add_special_tokens=False).input_ids
+    print(f"  Text input_ids shape:   {list(input_ids.shape)}")
+    print(f"  Text input_ids:         {input_ids[0].tolist()[:20]}...")
+except Exception as e:
+    print(f"  Tokenizer load failed: {e}")
+    # Fallback: synthetic token IDs
+    input_ids = torch.tensor([[config.im_start_token_id, 77091, 198, 9707, 1879, 13,
+                               config.im_end_token_id, 198,
+                               config.im_start_token_id, 77091, 198]])
+    print(f"  Fallback input_ids shape: {list(input_ids.shape)}")
+# --- 4d. Talker Key Shapes ---
+print("\n  === Talker Architecture Key Shapes ===")
+talker = model.talker
+# Text embedding
+text_emb = talker.get_text_embeddings()
+print(f"  text_embedding:         {text_emb.weight.shape}")  # [text_vocab, hidden]
+# Codec embedding
+codec_emb = talker.get_input_embeddings()
+print(f"  codec_embedding:        {codec_emb.weight.shape}")  # [codec_vocab, hidden]
+# text_projection (ResizeMLP)
+print(f"  text_projection type:   {type(talker.text_projection).__name__}")
+with torch.no_grad():
+    sample_text_hidden = text_emb(torch.tensor([[0]]))
+    proj_out = talker.text_projection(sample_text_hidden)
+print(f"  text_projection in/out: {list(sample_text_hidden.shape)} -> {list(proj_out.shape)}")
+# codec_head
+print(f"  codec_head:             Linear({talker.codec_head.in_features} -> {talker.codec_head.out_features})")
+# KV cache dimensions
+num_layers = tc.num_hidden_layers
+num_kv_heads = tc.num_key_value_heads
+head_dim = tc.head_dim
+print(f"\n  Static KV cache per layer: 2 x [B, {num_kv_heads}, max_seq_len, {head_dim}]")
+print(f"  Total KV layers:        {num_layers}")
+print(f"  Total KV cache (fp32, B=1, seq=2048): "
+      f"{2 * num_layers * num_kv_heads * 2048 * head_dim * 4 / 1e6:.1f} MB")
+# --- 4e. Code Predictor Key Shapes ---
+print("\n  === Code Predictor Key Shapes ===")
+cp = talker.code_predictor
+print(f"  small_to_mtp_projection: {type(cp.small_to_mtp_projection).__name__}")
+if hasattr(cp.small_to_mtp_projection, 'weight'):
+    print(f"    weight shape:         {list(cp.small_to_mtp_projection.weight.shape)}")
+print(f"  lm_heads:               {len(cp.lm_head)} heads")
+for i, head in enumerate(cp.lm_head):
+    print(f"    lm_head[{i}]:           Linear({head.in_features} -> {head.out_features})")
+print(f"  codec_embeddings:       {len(cp.model.codec_embedding)} embeddings")
+for i, emb in enumerate(cp.model.codec_embedding):
+    print(f"    codec_embedding[{i}]:   {emb.weight.shape}")
+cp_layers = cpc.num_hidden_layers
+cp_kv_heads = cpc.num_key_value_heads
+cp_head_dim = cpc.head_dim
+print(f"\n  Static KV cache per layer: 2 x [B, {cp_kv_heads}, max_seq_len, {cp_head_dim}]")
+print(f"  Total KV layers:        {cp_layers}")
+# --- 4f. Speech Tokenizer Decoder Key Shapes ---
+print("\n  === Speech Tokenizer Decoder Key Shapes ===")
+if model.speech_tokenizer is not None:
+    st_dec = model.speech_tokenizer.model.decoder
+    print(f"  Decoder type:           {type(st_dec).__name__}")
+    print(f"  Total params:           {fmt(count_params(st_dec))}")
+    # Test decode with synthetic codes
+    # codes shape: [batch, num_quantizers, seq_len]
+    test_codes = torch.randint(0, 2048, (1, 16, 10))
+    with torch.no_grad():
+        test_wav = st_dec(test_codes)
+    print(f"  Test input codes:       {list(test_codes.shape)}")
+    print(f"  Test output wav:        {list(test_wav.shape)}")
+    upsample_factor = test_wav.shape[-1] // test_codes.shape[-1]
+    print(f"  Upsample factor:        {upsample_factor}x")
+# ── 5. Export Blocker Analysis ───────────────────────────────────────
+print("\n[5/5] Export Blocker Analysis")
+print("-" * 60)
+blockers = []
+# Check speaker encoder
+print("\n  === Speaker Encoder Export Blockers ===")
+se_issues = []
+# Conv1d with padding="same" and padding_mode="reflect"
+for name, mod in model.speaker_encoder.named_modules():
+    if isinstance(mod, nn.Conv1d):
+        if hasattr(mod, 'padding') and mod.padding == 'same':
+            se_issues.append(f"Conv1d '{name}' uses padding='same' (dynamic pad calc)")
+        if hasattr(mod, 'padding_mode') and mod.padding_mode == 'reflect':
+            se_issues.append(f"Conv1d '{name}' uses padding_mode='reflect'")
+# AttentiveStatisticsPooling dynamic masking
+se_issues.append("AttentiveStatisticsPooling: dynamic _length_to_mask(), .repeat(), masked_fill_")
+se_issues.append("Res2NetBlock: torch.chunk + for loop (but fixed scale=8, should be OK)")
+for issue in se_issues:
+    print(f"  [!] {issue}")
+blockers.extend([("speaker_encoder", i) for i in se_issues])
+# Check talker
+print("\n  === Talker Export Blockers ===")
+t_issues = []
+t_issues.append("MROPE: 3D rotary embedding with sections [24,20,20] — need custom handling")
+t_issues.append("DynamicCache: must replace with static KV cache tensors")
+t_issues.append("create_causal_mask/create_sliding_window_causal_mask from transformers")
+t_issues.append("Two embedding tables (text + codec) with interleaving logic")
+t_issues.append("code_predictor.generate() called inside forward() — autoregressive sub-loop")
+t_issues.append("trailing_text_hidden conditional addition in decode step")
+t_issues.append("@can_return_tuple decorator")
+t_issues.append("@use_kernel_forward_from_hub on RMSNorm")
+for issue in t_issues:
+    print(f"  [!] {issue}")
+blockers.extend([("talker", i) for i in t_issues])
+# Check code predictor
+print("\n  === Code Predictor Export Blockers ===")
+cp_issues = []
+cp_issues.append("Uses GenerationMixin.generate() — full autoregressive loop")
+cp_issues.append("generation_steps counter used to index into lm_head ModuleList")
+cp_issues.append("DynamicCache")
+cp_issues.append("get_input_embeddings() returns ModuleList (indexed by generation step)")
+for issue in cp_issues:
+    print(f"  [!] {issue}")
+blockers.extend([("code_predictor", i) for i in cp_issues])
+# Check speech tokenizer
+print("\n  === Speech Tokenizer Export Blockers ===")
+st_issues = []
+if model.speech_tokenizer is not None:
+    st_issues.append("chunked_decode: while loop with dynamic chunk boundaries")
+    st_issues.append("ConvTranspose1d with dynamic slicing (right_pad removal)")
+    st_issues.append("CausalConv1d: dynamic padding calculation")
+    st_issues.append("SnakeBeta: custom activation (should be OK)")
+    st_issues.append("SplitResidualVectorQuantizer: F.embedding based (OK)")
+    st_issues.append("Transformer decoder with @dynamic_rope_update and torch.autocast")
+    st_issues.append("Sliding window attention (window=72)")
+for issue in st_issues:
+    print(f"  [!] {issue}")
+blockers.extend([("speech_tokenizer", i) for i in st_issues])
+# ── Summary ───────────────────────────────────────────────────────��──
+print("\n" + "=" * 70)
+print("SUMMARY")
+print("=" * 70)
+print(f"""
+Model: Qwen3TTSForConditionalGeneration (1.7B-Base)
+Total params: {fmt(count_params(model))}
+Export Targets (4 modules):
+  1. Speaker Encoder       ({fmt(count_params(model.speaker_encoder))} params) — ECAPA-TDNN
+  2. Talker (Main LM)      ({fmt(count_params(model.talker.model))} + heads) — Qwen3 28L
+  3. Code Predictor         ({fmt(count_params(model.talker.code_predictor))} params) — 5L transformer
+  4. Speech Tokenizer Dec   ({fmt(count_params(model.speech_tokenizer.model.decoder)) if model.speech_tokenizer else 'N/A'} params) — Transformer + ConvTranspose
+Voice Clone Pipeline:
+  ref_audio (24kHz)
+    -> mel_spectrogram -> [B, T, 128]
+    -> speaker_encoder -> x_vector [B, {sec.enc_dim}]
+  ref_audio -> speech_tokenizer.encode -> ref_codes [T, 16]
+  text -> tokenizer -> input_ids
+  [x_vector, ref_codes, input_ids]
+    -> talker.generate() -> codec_tokens [T', 16]
+    (internally calls code_predictor.generate() per step)
+  codec_tokens -> speech_tokenizer.decode -> PCM waveform
+Key Dimensions:
+  Talker: hidden=2048, layers=28, heads=16, kv_heads=8, head_dim=128
+  Code Predictor: hidden=1024, layers=5, heads=16, kv_heads=8
+  Codec: vocab=3072 (talker), 2048 (code_predictor), 16 code groups
+  Speaker: enc_dim={sec.enc_dim}
+Export Strategy:
+  Phase 2: Speaker encoder — fixed mel length, handle Conv1d padding
+  Phase 3: Talker — static KV cache, unrolled MROPE, separate prefill/decode
+  Phase 4: Code predictor — static KV, unroll 15-step generation
+  Phase 5: Vocoder (decoder only) — fixed code length, handle ConvTranspose1d
+  Phase 6: INT8 via torchao int8_weight_only (instant, no calibration)
+Total export blockers found: {len(blockers)}
+""")
+print("Phase 1 analysis complete!")