HY-2012

First commit

b3a7ca2 verified about 1 month ago

61.2 kB

	from __future__ import annotations

	import json
	import logging
	import time
	import unicodedata
	import wave
	from collections import defaultdict
	from pathlib import Path
	from typing import Any, Callable, Optional, Sequence

	import numpy as np
	import sentencepiece as spm

	from scripts.axe_session import AxeSession

	try:
	import onnxruntime as ort
	_HAVE_ORT = True
	except ImportError:
	_HAVE_ORT = False

	SAMPLE_MODE_GREEDY = "greedy"
	SAMPLE_MODE_FIXED = "fixed"
	SAMPLE_MODE_FULL = "full"

	SENTENCE_END_PUNCTUATION = set(".!?。！？")
	CLAUSE_SPLIT_PUNCTUATION = set(",，、；;：:")
	CLOSING_PUNCTUATION = set(['"', "'", "\u2019", "\u201d", ")", "]", "}", "）", "】", "》", "」", "』"])

	_MANIFEST_CANDIDATES = ("browser_poc_manifest.json",)

	_FULL8_IO_MAP: dict[str, tuple[list[str], list[str]]] = {
	"prefill": (
	["input_ids", "attention_mask"],
	["global_hidden",
	"present_key_0", "present_value_0", "present_key_1", "present_value_1",
	"present_key_2", "present_value_2", "present_key_3", "present_value_3",
	"present_key_4", "present_value_4", "present_key_5", "present_value_5",
	"present_key_6", "present_value_6", "present_key_7", "present_value_7",
	"present_key_8", "present_value_8", "present_key_9", "present_value_9",
	"present_key_10", "present_value_10", "present_key_11", "present_value_11"],
	),
	"decode": (
	["input_ids", "past_valid_lengths",
	"past_key_0", "past_value_0", "past_key_1", "past_value_1",
	"past_key_2", "past_value_2", "past_key_3", "past_value_3",
	"past_key_4", "past_value_4", "past_key_5", "past_value_5",
	"past_key_6", "past_value_6", "past_key_7", "past_value_7",
	"past_key_8", "past_value_8", "past_key_9", "past_value_9",
	"past_key_10", "past_value_10", "past_key_11", "past_value_11"],
	["global_hidden",
	"present_key_0", "present_value_0", "present_key_1", "present_value_1",
	"present_key_2", "present_value_2", "present_key_3", "present_value_3",
	"present_key_4", "present_value_4", "present_key_5", "present_value_5",
	"present_key_6", "present_value_6", "present_key_7", "present_value_7",
	"present_key_8", "present_value_8", "present_key_9", "present_value_9",
	"present_key_10", "present_value_10", "present_key_11", "present_value_11"],
	),
	"local_decoder": (
	["global_hidden", "text_token_id", "audio_prefix_token_ids"],
	["text_logits", "audio_logits"],
	),
	"local_fixed_sampled_frame": (
	["global_hidden", "repetition_seen_mask", "assistant_random_u", "audio_random_u"],
	["should_continue", "frame_token_ids"],
	),
	"codec_encode": (
	["waveform", "input_lengths"],
	["audio_codes", "audio_code_lengths"],
	),
	"codec_decode": (
	["audio_codes", "audio_code_lengths"],
	["audio", "audio_lengths"],
	),
	}

	_AXMODEL_FILE_MAP: dict[str, str] = {
	"prefill": "tts_prefill.axmodel",
	"decode": "tts_decode_step.axmodel",
	"local_decoder": "tts_local_decoder.axmodel",
	"local_fixed_sampled_frame": "tts_local_fixed_sampled_frame.axmodel",
	"codec_encode": "codec_encode.axmodel",
	"codec_decode": "codec_decode.axmodel",
	}

	_ONNX_FILE_MAP: dict[str, str] = {
	"decode": "tts_decode_step.onnx",
	}


	def _argmax(values: np.ndarray) -> int:
	return int(np.argmax(values))


	def _softmax(values: np.ndarray) -> np.ndarray:
	max_v = float(np.max(values))
	shifted = np.asarray(values - max_v, dtype=np.float64)
	exps = np.exp(shifted)
	return exps / np.sum(exps)


	def _apply_repetition_penalty(
	values: np.ndarray, previous_token_ids: list[int], penalty: float
	) -> np.ndarray:
	if not previous_token_ids or penalty == 1.0:
	return values
	result = values.copy()
	for tid in set(int(t) for t in previous_token_ids):
	if 0 <= tid < result.shape[0]:
	result[tid] = result[tid] * penalty if result[tid] < 0 else result[tid] / penalty
	return result


	def _argmax_with_repetition_penalty(
	values: np.ndarray, previous_token_set: set[int], penalty: float
	) -> int:
	best_idx, best_val = 0, float("-inf")
	apply_penalty = bool(previous_token_set) and penalty != 1.0
	for idx, v in enumerate(values):
	score = float(v)
	if apply_penalty and idx in previous_token_set:
	score = score * penalty if score < 0 else score / penalty
	if score > best_val:
	best_val = score
	best_idx = idx
	return int(best_idx)


	def _sample_from_scores(
	values: np.ndarray,
	*,
	do_sample: bool,
	temperature: float,
	top_k: int,
	top_p: float,
	rng: np.random.Generator,
	) -> int:
	if not do_sample:
	return _argmax(values)
	if temperature <= 0:
	raise ValueError("temperature must be positive when do_sample=True")
	scores = np.asarray(values, dtype=np.float32).copy() / float(temperature)
	if top_k > 0 and top_k < scores.shape[0]:
	threshold = float(np.sort(scores)[::-1][top_k - 1])
	scores[scores < threshold] = float("-inf")
	if 0 < top_p < 1:
	indexed = sorted(enumerate(scores.tolist()), key=lambda x: x[1], reverse=True)
	sorted_scores = np.asarray([x[1] for x in indexed], dtype=np.float32)
	sorted_probs = _softmax(sorted_scores)
	remove = [False] * len(indexed)
	cumulative = 0.0
	for i, p in enumerate(sorted_probs):
	cumulative += float(p)
	if cumulative > float(top_p):
	remove[i] = True
	for i in range(len(remove) - 1, 0, -1):
	remove[i] = remove[i - 1]
	if remove:
	remove[0] = False
	for i, should_remove in enumerate(remove):
	if should_remove:
	scores[indexed[i][0]] = float("-inf")
	probs = _softmax(scores)
	rv = float(rng.random())
	for i, p in enumerate(probs):
	rv -= float(p)
	if rv <= 0:
	return int(i)
	return _argmax(scores)


	def _sample_assistant_text_token(
	text_logits: np.ndarray,
	manifest: dict[str, Any],
	generation_defaults: dict[str, Any],
	rng: np.random.Generator,
	) -> int:
	candidate_ids = np.asarray(
	[int(manifest["tts_config"]["audio_assistant_slot_token_id"]),
	int(manifest["tts_config"]["audio_end_token_id"])],
	dtype=np.int32,
	)
	candidate_scores = text_logits[candidate_ids]
	sampled_idx = _sample_from_scores(
	candidate_scores,
	do_sample=False, # deterministic: always pick the higher-scoring token
	temperature=float(generation_defaults["text_temperature"]),
	top_k=min(int(generation_defaults["text_top_k"]), int(candidate_scores.shape[0])),
	top_p=float(generation_defaults["text_top_p"]),
	rng=rng,
	)
	return int(candidate_ids[sampled_idx])


	def _sample_audio_token(
	audio_logits: np.ndarray,
	previous_token_ids: list[int],
	previous_token_set: set[int],
	generation_defaults: dict[str, Any],
	rng: np.random.Generator,
	) -> int:
	penalty = float(generation_defaults["audio_repetition_penalty"])
	if not bool(generation_defaults["do_sample"]):
	return _argmax_with_repetition_penalty(audio_logits, previous_token_set, penalty)
	penalized = _apply_repetition_penalty(audio_logits, previous_token_ids, penalty)
	return _sample_from_scores(
	penalized,
	do_sample=True,
	temperature=float(generation_defaults["audio_temperature"]),
	top_k=int(generation_defaults["audio_top_k"]),
	top_p=float(generation_defaults["audio_top_p"]),
	rng=rng,
	)


	def _flatten3d_int32(nested: list[list[list[int]]]) -> tuple[np.ndarray, list[int]]:
	d0, d1, d2 = len(nested), len(nested[0]), len(nested[0][0])
	data = np.zeros((d0 * d1 * d2,), dtype=np.int32)
	offset = 0
	for i in range(d0):
	for j in range(d1):
	for k in range(d2):
	data[offset] = int(nested[i][j][k])
	offset += 1
	return data, [d0, d1, d2]


	def _flatten2d_int32(nested: list[list[int]]) -> tuple[np.ndarray, list[int]]:
	d0, d1 = len(nested), len(nested[0])
	data = np.zeros((d0 * d1,), dtype=np.int32)
	offset = 0
	for i in range(d0):
	for j in range(d1):
	data[offset] = int(nested[i][j])
	offset += 1
	return data, [d0, d1]


	def _extract_last_hidden(hidden: np.ndarray) -> np.ndarray:
	if hidden.ndim == 2:
	return hidden.astype(np.float32, copy=False)
	if hidden.ndim != 3 or hidden.shape[0] != 1:
	raise ValueError(f"Unexpected global_hidden shape: {hidden.shape}")
	return hidden[:, -1, :].astype(np.float32, copy=False)


	def _slice_channel_major_audio(
	audio: np.ndarray, start_sample: int = 0, end_sample: int \| None = None
	) -> list[np.ndarray]:
	if audio.ndim != 3 or audio.shape[0] != 1:
	raise ValueError(f"Unexpected audio tensor shape: {audio.shape}")
	channels = int(audio.shape[1])
	total = int(audio.shape[2])
	start = max(0, int(start_sample))
	end = total if end_sample is None else max(start, min(int(end_sample), total))
	return [audio[0, c, start:end].astype(np.float32, copy=False) for c in range(channels)]


	def _merge_audio_channels(channel_arrays: list[np.ndarray]) -> np.ndarray:
	if not channel_arrays:
	return np.zeros((0, 1), dtype=np.float32)
	if len(channel_arrays) == 1:
	return np.asarray(channel_arrays[0], dtype=np.float32).reshape(-1, 1)
	min_len = min(int(c.shape[0]) for c in channel_arrays)
	return np.stack([np.asarray(c[:min_len], dtype=np.float32) for c in channel_arrays], axis=1)


	def _concat_waveforms(waveforms: list[np.ndarray]) -> np.ndarray:
	if not waveforms:
	return np.zeros((0, 1), dtype=np.float32)
	non_empty = [w for w in waveforms if w.size > 0]
	if not non_empty:
	n_ch = int(waveforms[0].shape[1]) if waveforms[0].ndim == 2 else 1
	return np.zeros((0, n_ch), dtype=np.float32)
	return np.concatenate(non_empty, axis=0)


	def _write_waveform_to_wav(path: str \| Path, waveform: np.ndarray, sample_rate: int) -> Path:
	output_path = Path(path).expanduser().resolve()
	output_path.parent.mkdir(parents=True, exist_ok=True)
	audio = np.asarray(waveform, dtype=np.float32)
	if audio.ndim == 1:
	audio = audio.reshape(-1, 1)
	clipped = np.clip(audio, -1.0, 1.0)
	pcm16 = np.round(clipped * 32767.0).astype(np.int16)
	with wave.open(str(output_path), "wb") as wf:
	wf.setnchannels(int(pcm16.shape[1]))
	wf.setsampwidth(2)
	wf.setframerate(int(sample_rate))
	wf.writeframes(pcm16.tobytes())
	return output_path


	def _load_audio_numpy(path: str \| Path, target_sample_rate: int, target_channels: int) -> np.ndarray:
	path = Path(path).expanduser().resolve()

	waveform: np.ndarray \| None = None
	src_sr: int = 0

	try:
	import soundfile as sf
	data, src_sr = sf.read(str(path), dtype="float32", always_2d=True) # (samples, channels)
	waveform = data.T # (channels, samples)
	except Exception:
	pass

	if waveform is None:
	with wave.open(str(path), "rb") as wf:
	src_sr = wf.getframerate()
	n_ch = wf.getnchannels()
	sw = wf.getsampwidth()
	raw = wf.readframes(wf.getnframes())
	if sw == 2:
	pcm = np.frombuffer(raw, dtype=np.int16).astype(np.float32) / 32768.0
	elif sw == 4:
	pcm = np.frombuffer(raw, dtype=np.int32).astype(np.float32) / 2147483648.0
	elif sw == 1:
	pcm = (np.frombuffer(raw, dtype=np.uint8).astype(np.float32) - 128.0) / 128.0
	else:
	raise ValueError(f"Unsupported sample width: {sw}")
	waveform = pcm.reshape(-1, n_ch).T # (channels, samples)

	assert waveform is not None

	src_channels = int(waveform.shape[0])
	if src_channels != target_channels:
	if src_channels == 1 and target_channels > 1:
	waveform = np.repeat(waveform, target_channels, axis=0)
	elif src_channels > 1 and target_channels == 1:
	waveform = waveform.mean(axis=0, keepdims=True)
	else:
	raise ValueError(f"Unsupported channel conversion: {src_channels} → {target_channels}")

	if src_sr != target_sample_rate:
	try:
	from scipy.signal import resample_poly
	from math import gcd
	g = gcd(int(target_sample_rate), int(src_sr))
	up = int(target_sample_rate) // g
	down = int(src_sr) // g
	resampled = []
	for ch in range(waveform.shape[0]):
	resampled.append(resample_poly(waveform[ch], up, down).astype(np.float32))
	waveform = np.stack(resampled, axis=0)
	except ImportError:
	logging.warning("scipy not available, using linear interpolation for audio resampling")
	old_len = int(waveform.shape[1])
	new_len = int(round(old_len * target_sample_rate / src_sr))
	old_idx = np.linspace(0, old_len - 1, new_len)
	resampled = []
	for ch in range(waveform.shape[0]):
	resampled.append(np.interp(old_idx, np.arange(old_len), waveform[ch]).astype(np.float32))
	waveform = np.stack(resampled, axis=0)

	return waveform[np.newaxis, :, :] # (1, channels, samples)


	def _contains_cjk(text: str) -> bool:
	for ch in str(text or ""):
	if (
	"\u4e00" <= ch <= "\u9fff"
	or "\u3400" <= ch <= "\u4dbf"
	or "\u3040" <= ch <= "\u30ff"
	or "\uac00" <= ch <= "\ud7af"
	):
	return True
	return False


	def _normalize_punctuation(text: str) -> str:
	_WORD_INTERNAL = frozenset(["'", "\u2019", "-", "\u2013"]) # don't touch these

	result: list[str] = []
	prev_is_punct: bool = False

	for ch in text:
	is_sent_end = ch in SENTENCE_END_PUNCTUATION
	is_punct = is_sent_end or (
	ch not in _WORD_INTERNAL and unicodedata.category(ch).startswith("P")
	)

	if is_punct:
	if prev_is_punct:
	if is_sent_end:
	result[-1] = ch
	else:
	result.append(ch if is_sent_end else ",")
	prev_is_punct = True
	else:
	result.append(ch)
	prev_is_punct = False

	return "".join(result)


	def _prepare_text_for_sentence_chunking(text: str) -> str:
	t = str(text or "").strip()
	if not t:
	raise ValueError("Text prompt cannot be empty.")
	t = t.replace("\r", " ").replace("\n", " ")
	while " " in t:
	t = t.replace(" ", " ")
	t = _normalize_punctuation(t)
	while ", ," in t:
	t = t.replace(", ,", ",")
	if _contains_cjk(t):
	if t[-1] not in SENTENCE_END_PUNCTUATION:
	t += "。"
	return t
	if t[:1].islower():
	t = t[:1].upper() + t[1:]
	if t[-1].isalnum():
	t += "."
	if len([x for x in t.split() if x]) < 5:
	t = f" {t}"
	return t


	def _split_text_by_punctuation(text: str, punctuation: set[str]) -> list[str]:
	sentences: list[str] = []
	current: list[str] = []
	idx = 0
	t = str(text or "")
	while idx < len(t):
	ch = t[idx]
	current.append(ch)
	if ch in punctuation:
	lookahead = idx + 1
	while lookahead < len(t) and t[lookahead] in CLOSING_PUNCTUATION:
	current.append(t[lookahead])
	lookahead += 1
	sentence = "".join(current).strip()
	if sentence:
	sentences.append(sentence)
	current.clear()
	while lookahead < len(t) and t[lookahead].isspace():
	lookahead += 1
	idx = lookahead
	continue
	idx += 1
	tail = "".join(current).strip()
	if tail:
	sentences.append(tail)
	return sentences


	def _join_sentence_parts(left: str, right: str) -> str:
	if not left:
	return right
	if not right:
	return left
	if _contains_cjk(left) or _contains_cjk(right):
	return left + right
	return f"{left} {right}"


	class AxTtsRuntime:

	def __init__(
	self,
	config_dir: str \| Path,
	axmodel_dir: str \| Path,
	*,
	onnx_dir: str \| Path \| None = None,
	use_onnx_decode: bool = False,
	max_new_frames: int \| None = None,
	do_sample: bool = True,
	sample_mode: str \| None = None,
	) -> None:
	self.config_dir = Path(config_dir).expanduser().resolve()
	self.axmodel_dir = Path(axmodel_dir).expanduser().resolve()
	self.use_onnx_decode = bool(use_onnx_decode)
	if onnx_dir is not None:
	self.onnx_dir = Path(onnx_dir).expanduser().resolve()
	else:
	self.onnx_dir = self.axmodel_dir.parent / "onnxmodels"

	self.manifest_path = self._find_manifest()
	self.manifest_dir = self.manifest_path.parent
	self.manifest: dict[str, Any] = json.loads(
	self.manifest_path.read_text(encoding="utf-8")
	)

	gen = self.manifest["generation_defaults"]
	if max_new_frames is not None:
	gen["max_new_frames"] = int(max_new_frames)
	if do_sample is not None:
	gen["do_sample"] = bool(do_sample)

	raw_mode = sample_mode if sample_mode is not None else gen.get("sample_mode", "fixed")
	gen["sample_mode"] = self._normalize_sample_mode(raw_mode, bool(gen["do_sample"]))
	gen["do_sample"] = gen["sample_mode"] != SAMPLE_MODE_GREEDY

	self.tts_meta: dict[str, Any] = json.loads(
	self._resolve_path(self.manifest["model_files"]["tts_meta"]).read_text("utf-8")
	)
	self.codec_meta: dict[str, Any] = json.loads(
	self._resolve_path(self.manifest["model_files"]["codec_meta"]).read_text("utf-8")
	)
	self.tts_static_shapes = dict(self.tts_meta.get("static_shapes", {}))
	self.codec_static_shapes = dict(self.codec_meta.get("static_shapes", {}))

	tok_rel = str(self.manifest["model_files"].get("tokenizer_model", "tokenizer.model"))
	self.sp_model = spm.SentencePieceProcessor(model_file=str(self._resolve_path(tok_rel)))

	self.rng = np.random.default_rng(1234)

	self._model_display_names: dict[str, str] = {}
	self.sessions: dict = self._create_sessions()
	self._model_time_stats: dict[str, float] = defaultdict(float)
	self._model_call_stats: dict[str, int] = defaultdict(int)
	logging.info("AxTtsRuntime ready: %d sessions loaded", len(self.sessions))


	def _find_manifest(self) -> Path:
	for name in _MANIFEST_CANDIDATES:
	p = (self.config_dir / name).resolve()
	if p.is_file():
	return p
	raise FileNotFoundError(
	f"browser_poc_manifest.json not found in {self.config_dir}"
	)

	def _resolve_path(self, rel: str \| Path) -> Path:
	p = (self.manifest_dir / Path(rel)).resolve()
	if p.exists():
	return p
	fallback = (self.config_dir / Path(rel).name).resolve()
	if fallback.exists():
	return fallback
	return p

	@staticmethod
	def _normalize_sample_mode(raw: str \| None, do_sample: bool = True) -> str:
	s = str(raw or "").strip()
	if s in {SAMPLE_MODE_GREEDY, SAMPLE_MODE_FIXED, SAMPLE_MODE_FULL}:
	return s
	if s == "mixed3":
	return SAMPLE_MODE_FIXED if do_sample else SAMPLE_MODE_GREEDY
	return SAMPLE_MODE_GREEDY if not do_sample else SAMPLE_MODE_FIXED

	def _resolve_axmodel_path(self, key: str, axmodel_name: str) -> Path:
	flat = self.axmodel_dir / axmodel_name
	if key != "local_fixed_sampled_frame":
	return flat
	if flat.exists():
	return flat
	nested = self.axmodel_dir / "build-tts_local_fixed_sampled_frame" / axmodel_name
	return nested

	def _create_sessions(self) -> dict:
	sessions: dict[str, Any] = {}
	self._session_input_names: dict[str, list[str]] = {}

	if self.use_onnx_decode:
	onnx_name = _ONNX_FILE_MAP["decode"]
	onnx_path = self.onnx_dir / onnx_name
	if not onnx_path.exists():
	raise FileNotFoundError(f"ONNX not found: {onnx_path}")
	if not _HAVE_ORT:
	raise RuntimeError("onnxruntime not installed; cannot load ONNX decode_step.")
	sess = ort.InferenceSession(str(onnx_path), providers=["CPUExecutionProvider"])
	sessions["decode"] = sess
	self._model_display_names["decode"] = f"{onnx_path.name} [onnxruntime CPU]"
	self._session_input_names["decode"] = [inp.name for inp in sess.get_inputs()]
	logging.info("loaded decode → %s (onnxruntime CPU)", onnx_path.name)
	else:
	axmodel_name = _AXMODEL_FILE_MAP["decode"]
	axmodel_path = self.axmodel_dir / axmodel_name
	if not axmodel_path.exists():
	raise FileNotFoundError(f"axmodel not found: {axmodel_path}")
	input_names, output_names = _FULL8_IO_MAP["decode"]
	sess = AxeSession(axmodel_path, input_names, output_names)
	sessions["decode"] = sess
	self._model_display_names["decode"] = f"{axmodel_path.name} [axmodel]"
	self._session_input_names["decode"] = [inp.name for inp in sess.get_inputs()]
	logging.info("loaded decode → %s (axmodel)", axmodel_path.name)

	for key in ("prefill", "local_decoder", "local_fixed_sampled_frame", "codec_decode", "codec_encode"):
	if key not in _AXMODEL_FILE_MAP:
	continue
	axmodel_name = _AXMODEL_FILE_MAP[key]
	axmodel_path = self._resolve_axmodel_path(key, axmodel_name)
	if not axmodel_path.exists():
	logging.warning("axmodel not found, skipping session %s: %s", key, axmodel_path)
	continue
	input_names, output_names = _FULL8_IO_MAP[key]
	sess = AxeSession(axmodel_path, input_names, output_names)
	sessions[key] = sess
	self._model_display_names[key] = f"{axmodel_path.name} [axmodel]"
	self._session_input_names[key] = [inp.name for inp in sess.get_inputs()]
	logging.info("loaded %s → %s (axmodel)", key, axmodel_path.name)

	if "codec_encode" not in sessions:
	logging.info("codec_encode axmodel not found; voice clone (reference audio) is disabled.")

	return sessions

	def _reset_timing_stats(self) -> None:
	self._model_time_stats = defaultdict(float)
	self._model_call_stats = defaultdict(int)
	self._used_model_keys: list[str] = []
	self._used_model_key_set: set[str] = set()

	def _mark_model_used(self, key: str) -> None:
	if key in self._used_model_key_set:
	return
	self._used_model_key_set.add(key)
	self._used_model_keys.append(key)
	logging.info(
	"[usage] first_use session=%s model=%s",
	key,
	self._model_display_names.get(key, key),
	)

	def _log_used_model_summary(self) -> None:
	if not getattr(self, "_used_model_keys", None):
	logging.info("[usage] used_models=none")
	return
	logging.info("[usage] used_models_count=%d", len(self._used_model_keys))
	for idx, key in enumerate(self._used_model_keys, start=1):
	logging.info(
	"[usage] used_model[%d]=session=%s model=%s calls=%d",
	idx,
	key,
	self._model_display_names.get(key, key),
	self._model_call_stats.get(key, 0),
	)

	def _run_session(
	self,
	key: str,
	input_feed: dict[str, np.ndarray],
	output_names: list[str] \| None = None,
	) -> list[np.ndarray]:
	sess = self.sessions[key]
	actual_input_names = [inp.name for inp in sess.get_inputs()]
	actual_input_name_set = set(actual_input_names)
	filtered_input_feed = {
	name: value for name, value in input_feed.items() if name in actual_input_name_set
	}
	missing_inputs = [name for name in actual_input_names if name not in filtered_input_feed]
	if missing_inputs:
	raise RuntimeError(
	f"session={key} missing required inputs: {missing_inputs}; "
	f"provided={sorted(input_feed.keys())}"
	)
	extra_inputs = sorted(name for name in input_feed if name not in actual_input_name_set)
	self._mark_model_used(key)
	start = time.perf_counter()
	try:
	outputs = sess.run(output_names, filtered_input_feed)
	except Exception as exc:
	input_summary = {
	name: {"shape": list(np.asarray(value).shape), "dtype": str(np.asarray(value).dtype)}
	for name, value in filtered_input_feed.items()
	}
	raise RuntimeError(
	f"session={key} model={self._model_display_names.get(key, key)} run failed; "
	f"used_models_so_far={self._used_model_keys}; "
	f"expected_inputs={actual_input_names}; extra_inputs={extra_inputs}; "
	f"input_summary={input_summary}"
	) from exc
	elapsed = time.perf_counter() - start
	self._model_time_stats[key] += elapsed
	self._model_call_stats[key] += 1
	return outputs


	def encode_text(self, text: str) -> list[int]:
	return [int(t) for t in self.sp_model.encode(str(text or ""), out_type=int)]

	def count_text_tokens(self, text: str) -> int:
	return len(self.encode_text(text))

	def split_text_by_token_budget(self, text: str, max_tokens: int) -> list[str]:
	remaining = str(text or "").strip()
	if not remaining:
	return []
	pieces: list[str] = []
	preferred_boundary = set(CLAUSE_SPLIT_PUNCTUATION) \| set(SENTENCE_END_PUNCTUATION) \| {" "}
	while remaining:
	if self.count_text_tokens(remaining) <= max_tokens:
	pieces.append(remaining)
	break
	lo, hi, best = 1, len(remaining), 1
	while lo <= hi:
	mid = (lo + hi) // 2
	cand = remaining[:mid].strip()
	if not cand:
	lo = mid + 1
	continue
	if self.count_text_tokens(cand) <= max_tokens:
	best = mid
	lo = mid + 1
	else:
	hi = mid - 1
	cut = best
	prefix = remaining[:best]
	preferred = -1
	scan_min = max(-1, len(prefix) - 25)
	for si in range(len(prefix) - 1, scan_min, -1):
	if prefix[si] in preferred_boundary:
	preferred = si + 1
	break
	if preferred > 0:
	cut = preferred
	piece = remaining[:cut].strip()
	if not piece:
	piece = remaining[:best].strip()
	cut = best
	pieces.append(piece)
	remaining = remaining[cut:].strip()
	return pieces

	def split_voice_clone_text(self, text: str, max_tokens: int = 75) -> list[str]:
	t = str(text or "").strip()
	if not t:
	return []
	safe_max = max(1, int(max_tokens))
	prepared = _prepare_text_for_sentence_chunking(t)
	sents = _split_text_by_punctuation(prepared, SENTENCE_END_PUNCTUATION) or [prepared.strip()]
	slices: list[tuple[int, str]] = []
	for sent in sents:
	s = sent.strip()
	if not s:
	continue
	cnt = self.count_text_tokens(s)
	if cnt <= safe_max:
	slices.append((cnt, s))
	continue
	clauses = _split_text_by_punctuation(s, CLAUSE_SPLIT_PUNCTUATION)
	if len(clauses) <= 1:
	clauses = [s]
	for clause in clauses:
	c = clause.strip()
	if not c:
	continue
	ccnt = self.count_text_tokens(c)
	if ccnt <= safe_max:
	slices.append((ccnt, c))
	continue
	for piece in self.split_text_by_token_budget(c, safe_max):
	p = piece.strip()
	if p:
	slices.append((self.count_text_tokens(p), p))
	chunks: list[str] = []
	cur_chunk, cur_cnt = "", 0
	for cnt, s in slices:
	if not cur_chunk:
	cur_chunk, cur_cnt = s, cnt
	continue
	if cur_cnt + cnt > safe_max:
	chunks.append(cur_chunk.strip())
	cur_chunk, cur_cnt = s, cnt
	else:
	cur_chunk = _join_sentence_parts(cur_chunk, s)
	cur_cnt = self.count_text_tokens(cur_chunk)
	if cur_chunk:
	chunks.append(cur_chunk.strip())
	return chunks or [t]

	def _split_text_by_sentence_punctuation(self, text: str) -> list[str]:
	t = str(text or "").strip()
	if not t:
	return []

	codec_limit = self._static_decode_code_length()
	if codec_limit is not None:
	_safe_text_tokens = max(1, int(codec_limit / 3 * 0.9))
	_min_tokens = max(1, int(_safe_text_tokens * 0.25))
	else:
	_safe_text_tokens = 80
	_min_tokens = 20

	prepared = _prepare_text_for_sentence_chunking(t)
	sents = _split_text_by_punctuation(prepared, SENTENCE_END_PUNCTUATION)
	if not sents:
	sents = [prepared.strip()]

	result: list[str] = []
	for sent in sents:
	s = sent.strip()
	if not s:
	continue
	cnt = self.count_text_tokens(s)
	if cnt <= _safe_text_tokens:
	result.append(s)
	continue
	clauses = _split_text_by_punctuation(s, CLAUSE_SPLIT_PUNCTUATION)
	if len(clauses) <= 1:
	clauses = [s]

	merged: list[str] = []
	buf = ""
	buf_tokens = 0
	for clause in clauses:
	c = clause.strip()
	if not c:
	continue
	ccnt = self.count_text_tokens(c)
	if ccnt >= _safe_text_tokens:
	if buf:
	merged.append(buf.strip())
	buf = ""
	buf_tokens = 0
	merged.extend(
	p.strip() for p in self.split_text_by_token_budget(c, _safe_text_tokens) if p.strip()
	)
	elif buf_tokens > 0 and buf_tokens + ccnt > _safe_text_tokens:
	merged.append(buf.strip())
	buf = c
	buf_tokens = ccnt
	else:
	buf = (buf + " " + c) if buf else c
	buf_tokens += ccnt
	if buf:
	merged.append(buf.strip())

	for m in merged:
	mcnt = self.count_text_tokens(m)
	if mcnt <= _safe_text_tokens:
	result.append(m)
	else:
	result.extend(
	p.strip() for p in self.split_text_by_token_budget(m, _safe_text_tokens) if p.strip()
	)
	return result or [t]


	def build_text_rows(self, token_ids: list[int]) -> list[list[int]]:
	row_width = int(self.manifest["tts_config"]["n_vq"]) + 1
	rows: list[list[int]] = []
	for tid in token_ids:
	row = [int(self.manifest["tts_config"]["audio_pad_token_id"])] * row_width
	row[0] = int(tid)
	rows.append(row)
	return rows

	def build_audio_prefix_rows(
	self, prompt_audio_codes: list[list[int]], slot_token_id: int \| None = None
	) -> list[list[int]]:
	row_width = int(self.manifest["tts_config"]["n_vq"]) + 1
	slot = int(
	self.manifest["tts_config"]["audio_user_slot_token_id"]
	if slot_token_id is None else slot_token_id
	)
	rows: list[list[int]] = []
	for code_row in prompt_audio_codes:
	row = [int(self.manifest["tts_config"]["audio_pad_token_id"])] * row_width
	row[0] = slot
	for i in range(min(len(code_row), int(self.manifest["tts_config"]["n_vq"]))):
	row[i + 1] = int(code_row[i])
	rows.append(row)
	return rows

	def build_voice_clone_request_rows(
	self, prompt_audio_codes: list[list[int]], text_token_ids: list[int]
	) -> dict[str, list[list[int]]]:
	prefix_ids = [
	*self.manifest["prompt_templates"]["user_prompt_prefix_token_ids"],
	int(self.manifest["tts_config"]["audio_start_token_id"]),
	]
	suffix_ids = [
	int(self.manifest["tts_config"]["audio_end_token_id"]),
	*self.manifest["prompt_templates"]["user_prompt_after_reference_token_ids"],
	*text_token_ids,
	*self.manifest["prompt_templates"]["assistant_prompt_prefix_token_ids"],
	int(self.manifest["tts_config"]["audio_start_token_id"]),
	]
	rows = [
	*self.build_text_rows(prefix_ids),
	*self.build_audio_prefix_rows(prompt_audio_codes),
	*self.build_text_rows(suffix_ids),
	]
	return {"inputIds": rows, "attentionMask": [[1 for _ in rows]]}


	def _static_prefill_length(self) -> int \| None:
	v = self.tts_static_shapes.get("prefill_seq")
	return None if v is None else int(v)

	def _static_decode_code_length(self) -> int \| None:
	v = self.codec_static_shapes.get("decode_code_length")
	return None if v is None else int(v)

	def _static_encode_waveform_length(self) -> int \| None:
	v = self.codec_static_shapes.get("waveform_length")
	return None if v is None else int(v)

	def _pad_request_rows(
	self, request_rows: dict[str, list[list[int]]]
	) -> tuple[dict[str, list[list[int]]], int]:
	static_len = self._static_prefill_length()
	actual_len = len(request_rows["inputIds"])
	if static_len is None:
	return request_rows, actual_len
	if actual_len > static_len:
	raise ValueError(
	f"static prefill_seq={static_len} < request length={actual_len}. "
	"Reduce prompt/text length or re-export with larger --sample-seq-len."
	)
	if actual_len == static_len:
	return request_rows, actual_len
	row_width = int(self.manifest["tts_config"]["n_vq"]) + 1
	pad_row = [int(self.manifest["tts_config"]["audio_pad_token_id"])] * row_width
	pad_row[0] = int(self.manifest["tts_config"]["pad_token_id"])
	padded_rows = [list(r) for r in request_rows["inputIds"]]
	padded_rows.extend([list(pad_row) for _ in range(static_len - actual_len)])
	padded_mask = [1] * actual_len + [0] * (static_len - actual_len)
	return {"inputIds": padded_rows, "attentionMask": [padded_mask]}, actual_len


	def _pad_kv(
	self, kv_dict: dict[str, np.ndarray], target_len: int
	) -> dict[str, np.ndarray]:
	result: dict[str, np.ndarray] = {}
	for key, val in kv_dict.items():
	cur = val.shape[1]
	if cur > target_len:
	padded = val[:, :target_len, :, :].astype(np.float32)
	elif cur < target_len:
	pad = np.zeros((1, target_len - cur, *val.shape[2:]), dtype=np.float32)
	padded = np.concatenate([val.astype(np.float32), pad], axis=1)
	else:
	padded = val.astype(np.float32)
	result[key] = padded
	return result


	def run_local_fixed_sampled_frame(
	self,
	global_hidden: np.ndarray,
	*,
	previous_token_sets_by_channel: list[set[int]],
	) -> tuple[bool, list[int]]:
	n_vq = int(self.manifest["tts_config"]["n_vq"])
	codebook_size = int(self.tts_meta["model_config"]["audio_codebook_sizes"][0])
	rep_mask = np.zeros((1, n_vq, codebook_size), dtype=np.int32)
	for ci, token_set in enumerate(previous_token_sets_by_channel):
	for tid in token_set:
	if 0 <= tid < codebook_size:
	rep_mask[0, ci, tid] = 1
	sess = self.sessions["local_fixed_sampled_frame"]
	session_input_names = {inp.name for inp in sess.get_inputs()}
	input_feed: dict = {
	"global_hidden": global_hidden.astype(np.float32, copy=False),
	"repetition_seen_mask": rep_mask,
	}
	asst_u: float \| None = None
	if "assistant_random_u" in session_input_names:
	asst_u = float(self.rng.random())
	input_feed["assistant_random_u"] = np.array([asst_u], dtype=np.float32)
	if "audio_random_u" in session_input_names:
	input_feed["audio_random_u"] = np.asarray(
	[[float(self.rng.random()) for _ in range(n_vq)]],
	dtype=np.float32,
	)
	outputs = self._run_session("local_fixed_sampled_frame", input_feed)
	out_names = [o.name for o in sess.get_outputs()]
	named = dict(zip(out_names, outputs, strict=True))
	frame_ids = np.asarray(named["frame_token_ids"]).reshape(-1).astype(np.int32).tolist()
	should_continue = bool(int(np.asarray(named["should_continue"]).reshape(-1)[0]))
	logging.debug(
	"[fixed_frame] asst_u=%.4f should_continue=%s tokens=%s",
	asst_u if asst_u is not None else -1.0,
	should_continue,
	frame_ids[:4],
	)
	return should_continue, [int(x) for x in frame_ids]

	def _can_use_local_fixed_sampled_frame(self) -> bool:
	sess = self.sessions.get("local_fixed_sampled_frame")
	if sess is None:
	return False
	session_input_names = {inp.name for inp in sess.get_inputs()}
	return "audio_random_u" in session_input_names

	def run_local_decoder(
	self,
	global_hidden: np.ndarray,
	text_token_id: int,
	frame_prefix: list[int],
	) -> tuple[np.ndarray, np.ndarray]:
	n_vq = int(self.manifest["tts_config"]["n_vq"])
	audio_pad = int(self.manifest["tts_config"]["audio_pad_token_id"])
	padded_prefix = np.full((1, n_vq - 1), audio_pad, dtype=np.int32)
	for i in range(min(len(frame_prefix), n_vq - 1)):
	padded_prefix[0, i] = int(frame_prefix[i])
	outputs = self._run_session(
	"local_decoder",
	{
	"global_hidden": global_hidden.astype(np.float32, copy=False),
	"text_token_id": np.asarray([int(text_token_id)], dtype=np.int32),
	"audio_prefix_token_ids": padded_prefix,
	},
	)
	out_names = [o.name for o in self.sessions["local_decoder"].get_outputs()]
	named = dict(zip(out_names, outputs, strict=True))
	return named["text_logits"].reshape(-1), named["audio_logits"]

	def slice_audio_channel_logits(self, audio_logits: np.ndarray, channel_index: int) -> np.ndarray:
	per_ch = int(audio_logits.shape[-1])
	flat = audio_logits.reshape(-1)
	return flat[channel_index * per_ch : (channel_index + 1) * per_ch]


	def _infer_global_kv_len(self) -> int \| None:
	if "decode" in self.sessions:
	dec_sess = self.sessions["decode"]
	if hasattr(dec_sess, "get_inputs"):
	for inp in dec_sess.get_inputs():
	if inp.name == "past_key_0":
	dims = getattr(inp, "shape", []) or []
	if len(dims) >= 2 and isinstance(dims[1], str):
	return None
	break
	v = self.tts_static_shapes.get("kv_seq") or self.tts_static_shapes.get("global_kv_len")
	if v is not None:
	return int(v)
	return 512

	def generate_audio_frames(
	self,
	request_rows: dict[str, list[list[int]]],
	on_frame: Callable \| None = None,
	) -> list[list[int]]:
	padded_rows, actual_prefill_len = self._pad_request_rows(request_rows)
	return self._generate_frames_with_prefill(
	padded_rows,
	actual_prefill_length=actual_prefill_len,
	on_frame=on_frame,
	)

	def _generate_frames_with_prefill(
	self,
	request_rows: dict[str, list[list[int]]],
	*,
	actual_prefill_length: int,
	on_frame: Callable \| None = None,
	) -> list[list[int]]:
	global_kv_len = self._infer_global_kv_len()
	generation_defaults = self.manifest["generation_defaults"]
	row_width = int(self.manifest["tts_config"]["n_vq"]) + 1

	prefill_ids, prefill_dims = _flatten3d_int32([request_rows["inputIds"]])
	prefill_mask, prefill_mask_dims = _flatten2d_int32(request_rows["attentionMask"])
	prefill_outputs = self._run_session(
	"prefill",
	{
	"input_ids": prefill_ids.reshape(prefill_dims),
	"attention_mask": prefill_mask.reshape(prefill_mask_dims),
	},
	)
	prefill_out_names = [o.name for o in self.sessions["prefill"].get_outputs()]
	named_prefill = dict(zip(prefill_out_names, prefill_outputs, strict=True))
	global_hidden = named_prefill["global_hidden"][:, actual_prefill_length - 1, :].astype(np.float32)
	past_valid_length = int(actual_prefill_length)

	raw_past: dict[str, np.ndarray] = {
	out.replace("present_", "past_"): named_prefill[out][:, :actual_prefill_length, :, :]
	for out in self.tts_meta["onnx"]["prefill_output_names"][1:]
	}

	use_dynamic_kv = global_kv_len is None
	if use_dynamic_kv:
	past_by_name = raw_past # shape: (1, actual_len, 12, 64)，步进增长
	else:
	past_by_name = self._pad_kv(raw_past, global_kv_len) # shape: (1, 512, 12, 64)

	generated_frames: list[list[int]] = []
	prev_tokens = [[] for _ in range(int(self.manifest["tts_config"]["n_vq"]))]
	prev_tok_sets = [set() for _ in range(int(self.manifest["tts_config"]["n_vq"]))]
	audio_pad_id = int(self.manifest["tts_config"]["audio_pad_token_id"])
	consecutive_pad_frames = 0
	_MAX_CONSECUTIVE_PAD_FRAMES = 10 # 0.8s silence → force stop (guard against quantized AXModel)

	for step in range(int(generation_defaults["max_new_frames"])):
	frame: list[int] = []

	if (
	generation_defaults["sample_mode"] == SAMPLE_MODE_FIXED
	and self._can_use_local_fixed_sampled_frame()
	):
	should_continue, frame = self.run_local_fixed_sampled_frame(
	global_hidden, previous_token_sets_by_channel=prev_tok_sets
	)
	if not should_continue:
	logging.info("[gen] frame %d: should_continue=False → 停止生成", step)
	break
	for ci, tok in enumerate(frame):
	prev_tokens[ci].append(tok)
	prev_tok_sets[ci].add(tok)
	elif "local_decoder" in self.sessions:
	if (
	generation_defaults["sample_mode"] == SAMPLE_MODE_FIXED
	and "local_fixed_sampled_frame" in self.sessions
	and step == 0
	):
	logging.warning(
	"local_fixed_sampled_frame 缺少 audio_random_u 输入，回退到 local_decoder 随机采样路径"
	)
	local_text_logits, _ = self.run_local_decoder(global_hidden, 0, [])
	next_text = _sample_assistant_text_token(
	local_text_logits, self.manifest, generation_defaults, self.rng
	)
	if next_text != int(self.manifest["tts_config"]["audio_assistant_slot_token_id"]):
	break
	for ci in range(int(self.manifest["tts_config"]["n_vq"])):
	_, audio_logits = self.run_local_decoder(global_hidden, next_text, frame)
	ch_logits = self.slice_audio_channel_logits(audio_logits, ci).astype(np.float32)
	tok = _sample_audio_token(
	ch_logits, prev_tokens[ci], prev_tok_sets[ci], generation_defaults, self.rng
	)
	frame.append(tok)
	prev_tokens[ci].append(tok)
	prev_tok_sets[ci].add(tok)
	else:
	raise RuntimeError("Neither local_fixed_sampled_frame nor local_decoder session found")

	if not frame:
	break

	if all(tok == audio_pad_id for tok in frame):
	consecutive_pad_frames += 1
	if consecutive_pad_frames >= _MAX_CONSECUTIVE_PAD_FRAMES:
	logging.info(
	"[gen] %d consecutive all-pad frames → force stop (trim %d silent frames)",
	consecutive_pad_frames, consecutive_pad_frames,
	)
	generated_frames = generated_frames[:-(consecutive_pad_frames - 1)] if len(generated_frames) >= consecutive_pad_frames else []
	break
	else:
	consecutive_pad_frames = 0

	generated_frames.append(frame)

	next_row = np.full((1, 1, row_width), int(self.manifest["tts_config"]["audio_pad_token_id"]), dtype=np.int32)
	next_row[0, 0, 0] = int(self.manifest["tts_config"]["audio_assistant_slot_token_id"])
	for idx, tok in enumerate(frame):
	next_row[0, 0, idx + 1] = int(tok)

	decode_feeds: dict[str, np.ndarray] = {
	"input_ids": next_row,
	"past_valid_lengths": np.asarray([past_valid_length], dtype=np.int32),
	}
	for inp_name in self.tts_meta["onnx"]["decode_input_names"][2:]:
	decode_feeds[inp_name] = past_by_name[inp_name]

	dec_outputs = self._run_session("decode", decode_feeds)
	dec_out_names = [o.name for o in self.sessions["decode"].get_outputs()]
	named_dec = dict(zip(dec_out_names, dec_outputs, strict=True))
	global_hidden = _extract_last_hidden(named_dec["global_hidden"])
	past_valid_length += 1

	if use_dynamic_kv:
	past_by_name = {
	out_name.replace("present_", "past_"): named_dec[out_name]
	for out_name in self.tts_meta["onnx"]["decode_output_names"][1:]
	}
	else:
	new_pos = past_valid_length - 1 # 新 token 的逻辑位置（0-indexed）
	new_padded: dict[str, np.ndarray] = {}
	for out_name in self.tts_meta["onnx"]["decode_output_names"][1:]:
	past_name = out_name.replace("present_", "past_")
	v = named_dec[out_name]
	buf = np.zeros((1, global_kv_len, 12, 64), dtype=np.float32)
	if new_pos < global_kv_len:
	buf[:, :new_pos, :, :] = v[:, :new_pos, :, :]
	buf[:, new_pos, :, :] = v[:, global_kv_len, :, :]
	else:
	buf[:, :global_kv_len - 1, :, :] = v[:, 1:global_kv_len, :, :]
	buf[:, global_kv_len - 1, :, :] = v[:, global_kv_len, :, :]
	new_padded[past_name] = buf
	past_by_name = new_padded
	if past_valid_length > global_kv_len:
	past_valid_length = global_kv_len

	if on_frame is not None:
	on_frame(generated_frames, step, frame)

	logging.info("[gen] 生成完毕: 共 %d 帧 (max_new_frames=%s)", len(generated_frames), generation_defaults["max_new_frames"])
	return generated_frames


	def decode_full_audio(self, generated_frames: list[list[int]]) -> tuple[list[np.ndarray], int]:
	if not generated_frames:
	return [], 0
	static_len = self._static_decode_code_length()
	n_vq = int(self.manifest["tts_config"]["n_vq"])
	frame_count = len(generated_frames)

	if static_len is None or frame_count <= static_len:
	return self._decode_one_segment(generated_frames, frame_count, n_vq, static_len)

	all_audio: list[np.ndarray] = []
	total_samples = 0
	for seg_start in range(0, frame_count, static_len):
	seg_end = min(seg_start + static_len, frame_count)
	seg_frames = generated_frames[seg_start:seg_end]
	seg_count = seg_end - seg_start
	channel_arrays, seg_samples = self._decode_one_segment(seg_frames, seg_count, n_vq, static_len)
	all_audio.extend(channel_arrays)
	total_samples += seg_samples
	return all_audio, total_samples

	def _decode_one_segment(
	self,
	frames: list[list[int]],
	frame_count: int,
	n_vq: int,
	static_len: int \| None,
	) -> tuple[list[np.ndarray], int]:
	max_len = static_len if static_len is not None else frame_count
	audio_codes = np.zeros((1, max_len, n_vq), dtype=np.int32)
	for fi, frame in enumerate(frames):
	for ci in range(n_vq):
	audio_codes[0, fi, ci] = int(frame[ci] if ci < len(frame) else 0)
	outputs = self._run_session(
	"codec_decode",
	{
	"audio_codes": audio_codes,
	"audio_code_lengths": np.asarray([frame_count], dtype=np.int32),
	},
	)
	out_names = [o.name for o in self.sessions["codec_decode"].get_outputs()]
	named = dict(zip(out_names, outputs, strict=True))
	audio_length = int(named["audio_lengths"].reshape(-1)[0])
	return _slice_channel_major_audio(named["audio"], 0, audio_length), audio_length


	def encode_reference_audio(self, reference_audio_path: str \| Path) -> list[list[int]]:
	codec_cfg = self.codec_meta["codec_config"]
	target_sr = int(codec_cfg["sample_rate"])
	target_ch = int(codec_cfg["channels"])
	waveform = _load_audio_numpy(reference_audio_path, target_sr, target_ch)
	waveform_length = int(waveform.shape[-1])

	if bool(codec_cfg.get("pad_reference_audio_to_downsample_rate", False)):
	ds_rate = int(codec_cfg["downsample_rate"])
	remainder = waveform_length % ds_rate
	if remainder:
	pad_len = ds_rate - remainder
	waveform = np.concatenate(
	[waveform, np.zeros((1, target_ch, pad_len), dtype=np.float32)], axis=2
	)
	waveform_length += pad_len

	static_wl = self._static_encode_waveform_length()
	num_quantizers = int(codec_cfg["num_quantizers"])
	prompt_audio_codes: list[list[int]] = []
	out_names = [o.name for o in self.sessions["codec_encode"].get_outputs()]

	if static_wl and static_wl > 0:
	for start in range(0, waveform_length, static_wl):
	chunk = waveform[..., start : start + static_wl]
	chunk_len = int(chunk.shape[-1])
	if chunk_len <= 0:
	continue
	if chunk_len < static_wl:
	padded = np.zeros((1, target_ch, static_wl), dtype=np.float32)
	padded[..., :chunk_len] = chunk
	chunk = padded
	outputs = self._run_session(
	"codec_encode",
	{
	"waveform": chunk,
	"input_lengths": np.asarray([chunk_len], dtype=np.int32),
	},
	)
	named = dict(zip(out_names, outputs, strict=True))
	codes = np.asarray(named["audio_codes"], dtype=np.int32)
	code_len = int(np.asarray(named["audio_code_lengths"]).reshape(-1)[0])
	for fi in range(code_len):
	prompt_audio_codes.append([int(codes[0, fi, qi]) for qi in range(num_quantizers)])
	else:
	outputs = self._run_session(
	"codec_encode",
	{
	"waveform": waveform,
	"input_lengths": np.asarray([waveform_length], dtype=np.int32),
	},
	)
	named = dict(zip(out_names, outputs, strict=True))
	codes = np.asarray(named["audio_codes"], dtype=np.int32)
	code_len = int(np.asarray(named["audio_code_lengths"]).reshape(-1)[0])
	for fi in range(code_len):
	prompt_audio_codes.append([int(codes[0, fi, qi]) for qi in range(num_quantizers)])

	return prompt_audio_codes


	def resolve_prompt_audio_codes(
	self, *, voice: str \| None, prompt_audio_path: str \| Path \| None
	) -> list[list[int]]:
	if prompt_audio_path:
	return self.encode_reference_audio(prompt_audio_path)
	resolved_voice = str(voice or self.manifest["builtin_voices"][0]["voice"])
	for v in self.manifest["builtin_voices"]:
	if v["voice"] == resolved_voice:
	return list(v["prompt_audio_codes"])
	raise ValueError(f"Built-in voice not found: {resolved_voice}")

	def list_builtin_voices(self) -> list[dict[str, Any]]:
	return list(self.manifest["builtin_voices"])


	def synthesize(
	self,
	*,
	text: str,
	voice: str \| None = None,
	prompt_audio_path: str \| Path \| None = None,
	output_audio_path: str \| Path \| None = None,
	sample_mode: str \| None = None,
	do_sample: bool = True,
	streaming: bool = True,
	max_new_frames: int \| None = None,
	voice_clone_max_text_tokens: int = 75,
	seed: int \| None = None,
	) -> dict[str, Any]:
	self._reset_timing_stats()
	gen = self.manifest["generation_defaults"]
	if max_new_frames is not None:
	gen["max_new_frames"] = int(max_new_frames)

	static_codec_len = self._static_decode_code_length()
	if static_codec_len is not None and gen.get("max_new_frames", float("inf")) > static_codec_len:
	gen["max_new_frames"] = static_codec_len
	logging.info("capped max_new_frames=%d (static codec limit)", static_codec_len)

	normalized_mode = self._normalize_sample_mode(sample_mode, do_sample)
	gen["sample_mode"] = normalized_mode
	gen["do_sample"] = normalized_mode != SAMPLE_MODE_GREEDY
	if seed is not None:
	self.rng = np.random.default_rng(int(seed))

	infer_start_time = time.perf_counter()

	try:
	text_chunks = self._split_text_by_sentence_punctuation(text)
	prompt_audio_codes = self.resolve_prompt_audio_codes(
	voice=voice, prompt_audio_path=prompt_audio_path
	)
	sample_rate = int(self.codec_meta["codec_config"]["sample_rate"])
	channels = int(self.codec_meta["codec_config"]["channels"])
	static_decode_code_length = self._static_decode_code_length()
	out_path = (
	Path(output_audio_path).expanduser().resolve()
	if output_audio_path
	else (Path("output.wav")).resolve()
	)

	all_waveforms: list[np.ndarray] = []
	all_generated_frames: list[list[int]] = []
	final_text_chunks: list[str] = []
	pending_text_chunks = list(text_chunks)

	wav_file = None
	if streaming:
	out_path.parent.mkdir(parents=True, exist_ok=True)
	wav_file = wave.open(str(out_path), "wb")
	wav_file.setnchannels(channels)
	wav_file.setsampwidth(2)
	wav_file.setframerate(sample_rate)

	try:
	while pending_text_chunks:
	chunk_idx = len(final_text_chunks) + 1
	total_chunks = len(final_text_chunks) + len(pending_text_chunks)
	chunk_text = pending_text_chunks.pop(0)
	logging.info("chunk %d/%d: %r", chunk_idx, total_chunks, chunk_text[:40])
	text_token_ids = self.encode_text(chunk_text)

	request_rows = self.build_voice_clone_request_rows(prompt_audio_codes, text_token_ids)
	generated_frames = self.generate_audio_frames(request_rows)

	channel_arrays, _ = self.decode_full_audio(generated_frames)
	waveform = np.asarray(_merge_audio_channels(channel_arrays), dtype=np.float32)
	all_waveforms.append(waveform)
	all_generated_frames.extend(generated_frames)
	final_text_chunks.append(chunk_text)

	if wav_file is not None and waveform.size > 0:
	clipped = np.clip(waveform, -1.0, 1.0)
	pcm16 = np.round(clipped * 32767.0).astype(np.int16)
	wav_file.writeframes(pcm16.tobytes())
	logging.info(
	"stream emitted chunk %d: samples=%d duration=%.3fs",
	len(final_text_chunks),
	int(waveform.shape[0]),
	float(waveform.shape[0]) / float(sample_rate),
	)

	if pending_text_chunks:
	pause_words = len([w for w in chunk_text.strip().split() if w])
	pause_sec = 0.40 if pause_words <= 4 else 0.24
	pause_samples = max(0, int(round(sample_rate * pause_sec)))
	if pause_samples > 0:
	pause_waveform = np.zeros((pause_samples, channels), dtype=np.float32)
	all_waveforms.append(pause_waveform)
	if wav_file is not None:
	wav_file.writeframes(np.zeros((pause_samples, channels), dtype=np.int16).tobytes())
	finally:
	if wav_file is not None:
	wav_file.close()

	final_waveform = _concat_waveforms(all_waveforms)
	total_infer_time = time.perf_counter() - infer_start_time
	audio_duration_sec = (
	float(final_waveform.shape[0]) / float(sample_rate)
	if sample_rate > 0 and final_waveform.size > 0
	else 0.0
	)
	rtf = (total_infer_time / audio_duration_sec) if audio_duration_sec > 0 else float("inf")
	model_time_total = sum(self._model_time_stats.values())
	audio_path = out_path if streaming else _write_waveform_to_wav(out_path, final_waveform, sample_rate)
	logging.info("已保存 %s sample_rate=%s frames=%s", audio_path, sample_rate, len(all_generated_frames))
	logging.info(
	"[timing] total_infer_time=%.3fs audio_duration=%.3fs rtf=%.4f model_time_total=%.3fs",
	total_infer_time,
	audio_duration_sec,
	rtf,
	model_time_total,
	)
	for key in sorted(self._model_display_names):
	model_name = self._model_display_names[key]
	logging.info(
	"[timing] session=%s model=%s calls=%d total=%.3fs",
	key,
	model_name,
	self._model_call_stats.get(key, 0),
	self._model_time_stats.get(key, 0.0),
	)
	return {
	"audio_path": str(audio_path),
	"waveform": final_waveform,
	"sample_rate": sample_rate,
	"audio_token_ids": np.asarray(all_generated_frames, dtype=np.int32),
	"text_chunks": final_text_chunks,
	"sample_mode": normalized_mode,
	"do_sample": normalized_mode != SAMPLE_MODE_GREEDY,
	"streaming": bool(streaming),
	"timing": {
	"total_infer_time_sec": total_infer_time,
	"audio_duration_sec": audio_duration_sec,
	"rtf": rtf,
	"model_time_total_sec": model_time_total,
	"per_model_time_sec": dict(self._model_time_stats),
	"per_model_calls": dict(self._model_call_stats),
	"per_model_display_name": dict(self._model_display_names),
	"used_model_keys": list(self._used_model_keys),
	},
	}
	finally:
	self._log_used_model_summary()