#!/usr/bin/env python3
"""
Phase 5: Export Speech Tokenizer Decoder (Vocoder) to ExecuTorch .pte
======================================================================
The vocoder converts codec tokens β†’ audio waveform.
Architecture:
codes [B, 16, T] β†’ VQ decode β†’ [B, codebook_dim, T]
β†’ Conv1d β†’ Transformer (8 layers) β†’ Conv1d
β†’ Upsample (2x, 2x) via ConvTranspose1d + ConvNeXt
β†’ Decoder (8x, 5x, 4x, 3x) via ConvTranspose1d + SnakeBeta + ResBlocks
β†’ Conv1d β†’ waveform [B, 1, T*1920]
Total upsample: 2*2*8*5*4*3 = 3840x (but code downsample is 1920x, so net 1920x)
Wait β€” the decoder forward uses total_upsample which is upsample_rates * upsampling_ratios.
"""
import sys
import os
import copy
import time
import math
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
# ── Paths & export constants ────────────────────────────────────────
MODEL_PATH = os.path.expanduser("~/Documents/Qwen3-TTS/models/1.7B-Base")
VENV_SITE = os.path.expanduser("~/Documents/Qwen3-TTS/.venv/lib/python3.10/site-packages")
QWEN_TTS_SRC = os.path.expanduser("~/Documents/Qwen3-TTS")
OUTPUT_DIR = os.path.expanduser("~/Documents/Qwen3-TTS-ExecuTorch/exported")

# Make the project venv and source tree importable without installation.
# Insert order matters: QWEN_TTS_SRC is inserted last, so it ends up in
# front of VENV_SITE on sys.path (same order as the original code).
for _extra_path in (VENV_SITE, QWEN_TTS_SRC):
    if _extra_path not in sys.path:
        sys.path.insert(0, _extra_path)

os.makedirs(OUTPUT_DIR, exist_ok=True)

# Fixed code length for export (50 frames ≈ 4 seconds of audio)
FIXED_CODE_LEN = 50
NUM_QUANTIZERS = 16

banner = "=" * 70
print(banner)
print("PHASE 5: Export Vocoder (Speech Tokenizer Decoder) β†’ .pte")
print(banner)
# ── 1. Load Model ───────────────────────────────────────────────────
print("\n[1/5] Loading model...")
# Project-local imports: these resolve only because QWEN_TTS_SRC / the venv
# site-packages were pushed onto sys.path above.
from qwen_tts.core.models.configuration_qwen3_tts import Qwen3TTSConfig
from qwen_tts.core.models.modeling_qwen3_tts import Qwen3TTSForConditionalGeneration

# Load the full TTS model on CPU in float32 — the torch.export / XNNPACK
# lowering below works on fp32 CPU tensors. "sdpa" selects PyTorch's
# scaled-dot-product-attention implementation for tracing.
config = Qwen3TTSConfig.from_pretrained(MODEL_PATH)
model = Qwen3TTSForConditionalGeneration.from_pretrained(
    MODEL_PATH, config=config, dtype=torch.float32,
    attn_implementation="sdpa", device_map="cpu",
)
model.eval()  # inference mode: disables dropout etc. for deterministic export
print(" Model loaded.")

# ── 2. Create Vocoder Wrapper ────────────────────────────────────────
print("\n[2/5] Creating vocoder wrapper...")
# The decoder has dynamic padding calculations that depend on input length.
# With a FIXED input length, these become constants. We wrap the original
# decoder directly and let torch.export trace through the fixed-size logic.
class VocoderForExport(nn.Module):
    """Export-friendly shell around the speech-tokenizer decoder.

    Holds a deep copy of the decoder and feeds codec indices straight to
    its ``forward`` (no chunked_decode path), so torch.export observes a
    single fixed-shape call.

    Input:  codes [1, num_quantizers, code_len] — int64 codec indices
    Output: waveform [1, 1, code_len * decode_upsample_rate]
    """

    def __init__(self, original_decoder):
        super().__init__()
        # Deep copy so export-time tracing cannot perturb the live model.
        self.decoder = copy.deepcopy(original_decoder)

    def forward(self, codes: torch.Tensor) -> torch.Tensor:
        """Decode ``codes`` [1, 16, code_len] into a float waveform in [-1, 1]."""
        waveform = self.decoder(codes)
        return waveform
# Instantiate the wrapper (deep-copies the decoder) and freeze it.
vocoder = VocoderForExport(model.speech_tokenizer.model.decoder)
vocoder.eval()
param_count = sum(p.numel() for p in vocoder.parameters())
print(f" Vocoder parameters: {param_count / 1e6:.1f}M")

# ── 3. Validate ─────────────────────────────────────────────────────
print("\n[3/5] Validating vocoder wrapper...")
# Random codec indices with the fixed export shape; dtype is int64 by default.
test_codes = torch.randint(0, 2048, (1, NUM_QUANTIZERS, FIXED_CODE_LEN))
with torch.no_grad():
    reference_wav = model.speech_tokenizer.model.decoder(test_codes)  # ground truth
    wrapper_wav = vocoder(test_codes)                                 # wrapper path
print(f" Input codes shape: {list(test_codes.shape)}")
print(f" Original output shape: {list(reference_wav.shape)}")
print(f" Wrapper output shape: {list(wrapper_wav.shape)}")
# Compare the two waveforms as flat vectors.
similarity = F.cosine_similarity(
    reference_wav.reshape(1, -1), wrapper_wav.reshape(1, -1)
).item()
peak_error = (reference_wav - wrapper_wav).abs().max().item()
print(f" Cosine similarity: {similarity:.6f}")
print(f" Max abs difference: {peak_error:.2e}")
assert similarity > 0.999, f"Mismatch! cos_sim={similarity}"
print(" PASS β€” vocoder validated")
# Samples produced per code frame (used again in the final summary).
upsample_rate = wrapper_wav.shape[-1] // FIXED_CODE_LEN
print(f" Upsample rate: {upsample_rate}x")
print(f" Output duration: {wrapper_wav.shape[-1] / 24000:.1f}s at 24kHz")
# ── 4. torch.export ─────────────────────────────────────────────────
print("\n[4/5] Running torch.export...")
t0 = time.time()
sample_args = (test_codes,)
try:
    # strict=False: non-strict export traces with real tensors, which is
    # more permissive about Python-side control flow in the decoder.
    exported = torch.export.export(vocoder, sample_args, strict=False)
    print(f" torch.export succeeded in {time.time() - t0:.1f}s")
    print(f" Graph nodes: {len(exported.graph.nodes)}")
except Exception as e:
    # Best-effort: record the failure and let the script fall through to
    # the state-dict fallback below.
    print(f" torch.export FAILED: {e}")
    exported = None
# ── 5. Lower to .pte ────────────────────────────────────────────────
print("\n[5/5] Lowering to ExecuTorch .pte...")
t0 = time.time()
if exported is not None:
    try:
        # ExecuTorch imports are local so a missing install only breaks
        # this phase, not the whole script.
        from executorch.exir import to_edge_transform_and_lower, EdgeCompileConfig
        from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner

        # Lower the exported program to the Edge dialect and partition
        # supported subgraphs onto the XNNPACK CPU backend.
        edge = to_edge_transform_and_lower(
            exported,
            compile_config=EdgeCompileConfig(_check_ir_validity=False),
            partitioner=[XnnpackPartitioner()],
        )
        et_program = edge.to_executorch()
        pte_path = os.path.join(OUTPUT_DIR, "vocoder.pte")
        with open(pte_path, "wb") as f:
            f.write(et_program.buffer)
        pte_size = os.path.getsize(pte_path) / 1e6
        print(f" .pte saved: {pte_path}")
        print(f" .pte size: {pte_size:.1f} MB")
        print(f" Lowered in {time.time() - t0:.1f}s")
    except Exception as e:
        print(f" ExecuTorch lowering failed: {e}")
    # Also persist the raw torch.export program so the export work is
    # recoverable even if lowering failed. (Guard is redundant inside
    # this branch but kept for safety.)
    if exported is not None:
        pt2_path = os.path.join(OUTPUT_DIR, "vocoder.pt2")
        torch.export.save(exported, pt2_path)
        print(f" Saved exported program: {pt2_path}")
    # Validate .pte by running it and comparing against eager PyTorch.
    if os.path.exists(os.path.join(OUTPUT_DIR, "vocoder.pte")):
        print("\n Validating .pte execution...")
        try:
            from executorch.runtime import Runtime
            runtime = Runtime.get()
            # FIX: read the program bytes via a context manager so the
            # file handle is closed deterministically (previously the
            # open() result was never closed).
            pte_file = os.path.join(OUTPUT_DIR, "vocoder.pte")
            with open(pte_file, "rb") as f:
                program_bytes = f.read()
            program = runtime.load_program(program_bytes)
            method = program.load_method("forward")
            # NOTE(review): assumes execute() returns a torch tensor or a
            # list/tuple of them — confirm against the ExecuTorch runtime.
            pte_out = method.execute([test_codes])
            if isinstance(pte_out, (list, tuple)):
                pte_out = pte_out[0]
            with torch.no_grad():
                ref_out = vocoder(test_codes)
            cos_pte = F.cosine_similarity(
                ref_out.flatten().unsqueeze(0),
                pte_out.flatten().unsqueeze(0)
            ).item()
            print(f" .pte vs PyTorch cosine sim: {cos_pte:.6f}")
        except Exception as e:
            print(f" .pte validation: {e}")
else:
    print(" No exported program to lower.")
    # Save state dict as fallback so the weights are still exportable later.
    torch.save(vocoder.state_dict(), os.path.join(OUTPUT_DIR, "vocoder_state_dict.pt"))
    print(f" Saved state dict: {OUTPUT_DIR}/vocoder_state_dict.pt")
print("\n" + "=" * 70)
print("Phase 5 complete!")
print(f" Fixed code length: {FIXED_CODE_LEN} frames")
print(f" Output: {FIXED_CODE_LEN * upsample_rate} samples ({FIXED_CODE_LEN * upsample_rate / 24000:.1f}s)")
print("=" * 70)