Upload scripts/validate_tokens_v2.py with huggingface_hub
scripts/validate_tokens_v2.py  ADDED  (+232, -0)
@@ -0,0 +1,232 @@
#!/usr/bin/env python3
"""
Token-by-token validation v2: Build TalkerForExport inline (no import of export script).
Compares generated tokens: Original HF talker vs Fixed wrapper (same as .pte source).
Runs on CPU, greedy decoding, 10 steps.
"""

import sys, os, time, copy, torch, torch.nn as nn, torch.nn.functional as F

sys.path.insert(0, os.path.expanduser("~/Documents/Qwen3-TTS"))

MAX_SEQ_LEN = 2048
NUM_LAYERS = 28
NUM_HEADS = 16
NUM_KV_HEADS = 8
HEAD_DIM = 128
HIDDEN_SIZE = 2048
KV_GROUPS = NUM_HEADS // NUM_KV_HEADS
NUM_STEPS = 10

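# Standalone RMSNorm with no HF dependency; weights are copied from the corresponding
# HF modules wherever it is instantiated below (fp32 compute, cast back to input dtype).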
class RMSNorm(nn.Module):
    def __init__(self, dim, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(dim))
        self.eps = eps
    def forward(self, x):
        return (self.weight * (x.float() * torch.rsqrt(x.float().pow(2).mean(-1, keepdim=True) + self.eps))).to(x.dtype)


def rotate_half(x):
    x1, x2 = x[..., :x.shape[-1]//2], x[..., x.shape[-1]//2:]
    return torch.cat((-x2, x1), dim=-1)

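# Attention with an explicit, caller-managed KV cache: the full-length cache tensors are
# passed in, written at the current cache positions, expanded for GQA, and returned
# alongside the attention output.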
class FixedAttn(nn.Module):
    def __init__(self, orig):
        super().__init__()
        self.q_proj = copy.deepcopy(orig.q_proj)
        self.k_proj = copy.deepcopy(orig.k_proj)
        self.v_proj = copy.deepcopy(orig.v_proj)
        self.o_proj = copy.deepcopy(orig.o_proj)
        self.q_norm = RMSNorm(HEAD_DIM); self.q_norm.weight = nn.Parameter(orig.q_norm.weight.clone())
        self.k_norm = RMSNorm(HEAD_DIM); self.k_norm.weight = nn.Parameter(orig.k_norm.weight.clone())
        self.scale = HEAD_DIM ** -0.5

    def forward(self, h, cos, sin, cp, kc, vc, am):
        B, S, _ = h.shape
        q = self.q_norm(self.q_proj(h).view(B, S, NUM_HEADS, HEAD_DIM)).transpose(1, 2)
        k = self.k_norm(self.k_proj(h).view(B, S, NUM_KV_HEADS, HEAD_DIM)).transpose(1, 2)
        v = self.v_proj(h).view(B, S, NUM_KV_HEADS, HEAD_DIM).transpose(1, 2)
        # RoPE
        q = q * cos + rotate_half(q) * sin
        k = k * cos + rotate_half(k) * sin
        # Update KV cache
        kc = kc.clone(); vc = vc.clone()
        kc[:, :, cp, :] = k; vc[:, :, cp, :] = v
        # GQA expand
        cache_len = kc.shape[2]
        ke = kc.unsqueeze(2).repeat(1, 1, KV_GROUPS, 1, 1).reshape(B, NUM_HEADS, cache_len, HEAD_DIM)
        ve = vc.unsqueeze(2).repeat(1, 1, KV_GROUPS, 1, 1).reshape(B, NUM_HEADS, cache_len, HEAD_DIM)
        # Attention
        o = F.scaled_dot_product_attention(q, ke, ve, attn_mask=am, scale=self.scale)
        return self.o_proj(o.transpose(1, 2).reshape(B, S, -1)), kc, vc

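# Pre-norm transformer block: FixedAttn plus a SwiGLU MLP, with all weights deep-copied
# from the original HF decoder layer.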
class FixedLayer(nn.Module):
    def __init__(self, orig):
        super().__init__()
        self.attn = FixedAttn(orig.self_attn)
        self.n1 = RMSNorm(HIDDEN_SIZE); self.n1.weight = nn.Parameter(orig.input_layernorm.weight.clone())
        self.n2 = RMSNorm(HIDDEN_SIZE); self.n2.weight = nn.Parameter(orig.post_attention_layernorm.weight.clone())
        self.gp = copy.deepcopy(orig.mlp.gate_proj)
        self.up = copy.deepcopy(orig.mlp.up_proj)
        self.dp = copy.deepcopy(orig.mlp.down_proj)

    def forward(self, h, cos, sin, cp, kc, vc, am):
        r = h; a, kc, vc = self.attn(self.n1(h), cos, sin, cp, kc, vc, am); h = r + a
        r = h; x = self.n2(h); h = r + self.dp(F.silu(self.gp(x)) * self.up(x))
        return h, kc, vc

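# Full talker decoder as a flat tensor-in / tensor-out module: inputs_embeds, position ids,
# cache positions, an additive attention mask, and one (key, value) cache tensor per layer
# go in; codec-head logits followed by the updated caches come out.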
class FixedTalker(nn.Module):
    def __init__(self, orig_talker):
        super().__init__()
        self.layers = nn.ModuleList([FixedLayer(l) for l in orig_talker.model.layers])
        self.norm = RMSNorm(HIDDEN_SIZE); self.norm.weight = nn.Parameter(orig_talker.model.norm.weight.clone())
        self.codec_head = copy.deepcopy(orig_talker.codec_head)
        self.register_buffer("inv_freq", orig_talker.model.rotary_emb.inv_freq.clone())
        self.rs = getattr(orig_talker.model.rotary_emb, 'attention_scaling', 1.0)

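    # Rotary cos/sin are recomputed from inv_freq on every call, so the module carries no
    # HF rotary-embedding object; the flattened *kv caches are threaded through each layer.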
    def forward(self, ie, pid, cp, am, *kv):
        pos = pid[0].float()
        freqs = pos.unsqueeze(-1) * self.inv_freq.float().unsqueeze(0).unsqueeze(0)
        emb = torch.cat([freqs, freqs], dim=-1)
        cos = (emb.cos() * self.rs).to(ie.dtype).unsqueeze(1)
        sin = (emb.sin() * self.rs).to(ie.dtype).unsqueeze(1)
        h = ie
        ukv = []
        for i, l in enumerate(self.layers):
            h, nk, nv = l(h, cos, sin, cp, kv[i*2], kv[i*2+1], am)
            ukv.append(nk); ukv.append(nv)
        return (self.codec_head(self.norm(h)), *ukv)

def main():
    print("="*60)
    print(f"Token-by-Token Validation (v2, {NUM_STEPS} steps, greedy)")
    print("="*60)

    from qwen_tts import Qwen3TTSModel
    from transformers import AutoTokenizer
    from transformers.cache_utils import DynamicCache

    print("\n[1] Loading model...")
    model = Qwen3TTSModel.from_pretrained(
        os.path.expanduser("~/Documents/Qwen3-TTS/models/1.7B-Base"),
        device_map="cpu", dtype=torch.float32, attn_implementation="sdpa")
    talker = model.model.talker
    talker.eval()

    tokenizer = AutoTokenizer.from_pretrained(
        os.path.expanduser("~/Documents/Qwen3-TTS/models/1.7B-Base"))

    # Build input
    text = "Hi"
    text_ids = tokenizer.encode(text, add_special_tokens=False)
    print(f"  Text: '{text}' -> {text_ids}")

    # Embeddings
    emb_w = talker.model.text_embedding.weight.data
    codec_w = talker.model.codec_embedding.weight.data
    proj = talker.text_projection

    raw = F.embedding(torch.tensor(text_ids), emb_w)
    with torch.no_grad():
        text_embeds = proj(raw)
    inputs_embeds = text_embeds.unsqueeze(0)  # [1, T, 2048]
    seq_len = inputs_embeds.shape[1]

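    # Reference path: the unmodified HF talker with a DynamicCache; the attention mask
    # grows by one column per generated token.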
    # ── Original talker ──
    print(f"\n[2] Original talker ({NUM_STEPS} steps)...")
    orig_tokens = []
    with torch.no_grad():
        past_kv = DynamicCache()
        pos_ids = torch.arange(seq_len).unsqueeze(0).unsqueeze(0).expand(3, 1, -1)
        cache_pos = torch.arange(seq_len)
        out = talker.model(input_ids=None, inputs_embeds=inputs_embeds,
                           position_ids=pos_ids, cache_position=cache_pos,
                           attention_mask=torch.ones(1, seq_len),
                           past_key_values=past_kv, use_cache=True)
        logits = talker.codec_head(out.last_hidden_state)
        next_token = logits[0, -1].argmax().item()
        orig_tokens.append(next_token)
        past_kv = out.past_key_values

        for step in range(NUM_STEPS - 1):
            te = F.embedding(torch.tensor([[next_token]]), codec_w)
            pi = torch.tensor([[[seq_len + step]]]).expand(3, 1, 1)
            cp = torch.tensor([seq_len + step])
            out = talker.model(input_ids=None, inputs_embeds=te,
                               position_ids=pi, cache_position=cp,
                               attention_mask=torch.ones(1, seq_len + step + 1),
                               past_key_values=past_kv, use_cache=True)
            logits = talker.codec_head(out.last_hidden_state)
            next_token = logits[0, -1].argmax().item()
            orig_tokens.append(next_token)
            past_kv = out.past_key_values
    print(f"  Tokens: {orig_tokens}")

    # ── Fixed talker wrapper ──
    print(f"\n[3] Building FixedTalker wrapper...")
    t0 = time.time()
    fixed = FixedTalker(talker)
    fixed.eval()
    print(f"  Built in {time.time()-t0:.1f}s")

    # Free original to save RAM
    del talker, model
    import gc; gc.collect()

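    # Export-style path: preallocated per-layer KV tensors of shape
    # [1, NUM_KV_HEADS, MAX_SEQ_LEN, HEAD_DIM] and an additive causal mask over the full
    # cache length (0.0 for visible positions, -inf elsewhere).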
    print(f"\n[4] Fixed talker ({NUM_STEPS} steps)...")
    kv = [torch.zeros(1, NUM_KV_HEADS, MAX_SEQ_LEN, HEAD_DIM) for _ in range(NUM_LAYERS * 2)]
    pid = torch.arange(seq_len).unsqueeze(0).unsqueeze(0).expand(3, 1, -1)
    cp = torch.arange(seq_len)
    mask = torch.full((1, 1, seq_len, MAX_SEQ_LEN), float('-inf'))
    for i in range(seq_len):
        mask[0, 0, i, :i+1] = 0.0

    fixed_tokens = []
    with torch.no_grad():
        t0 = time.time()
        result = fixed(inputs_embeds, pid, cp, mask, *kv)
        logits = result[0]; kv = list(result[1:])
        next_token = logits[0, -1].argmax().item()
        fixed_tokens.append(next_token)
        print(f"  Prefill: {time.time()-t0:.1f}s, token={next_token}", flush=True)

        for step in range(NUM_STEPS - 1):
            cur = seq_len + step
            te = F.embedding(torch.tensor([[next_token]]), codec_w)
            pi = torch.tensor([[[cur]]]).expand(3, 1, 1)
            cp = torch.tensor([cur])
            dm = torch.full((1, 1, 1, MAX_SEQ_LEN), float('-inf'))
            dm[0, 0, 0, :cur+1] = 0.0
            t1 = time.time()
            result = fixed(te, pi, cp, dm, *kv)
            logits = result[0]; kv = list(result[1:])
            next_token = logits[0, -1].argmax().item()
            fixed_tokens.append(next_token)
            print(f"  Step {step+1}: {time.time()-t1:.1f}s, token={next_token}", flush=True)

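    # Step-by-step comparison of the greedy token ids: a full match counts as success,
    # >= 80% is reported as minor numerical drift, anything less as divergence.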
    # ── Compare ──
    print("\n" + "="*60)
    print("COMPARISON")
    print("="*60)
    match = 0
    for i in range(NUM_STEPS):
        m = orig_tokens[i] == fixed_tokens[i]
        if m: match += 1
        print(f"  Step {i+1:2d}: orig={orig_tokens[i]:5d}  fixed={fixed_tokens[i]:5d}  {'OK' if m else 'MISMATCH'}")
    print(f"\n  Match: {match}/{NUM_STEPS} ({100*match/NUM_STEPS:.0f}%)")
    if match == NUM_STEPS:
        print("  PERFECT - Fixed wrapper produces identical tokens!")
    elif match >= NUM_STEPS * 0.8:
        print("  NEAR-PERFECT - minor numerical drift")
    else:
        print("  DIVERGENCE - needs investigation")


if __name__ == "__main__":
    main()
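Usage note (a sketch, assuming the Qwen3-TTS checkout and the 1.7B-Base checkpoint live at the paths hard-coded above): the script takes no arguments and runs entirely on CPU.

    python scripts/validate_tokens_v2.py

It prints the ten greedy codec tokens from the HF talker, the ten from the FixedTalker wrapper, and the final match percentage.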