# Copyright 2026 Patrick Lumbantobing, Vertox-AI
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""End-to-end streaming TTS test script using ONNX Runtime.

This script demonstrates the full MOSS-TTS-Realtime ONNX pipeline by:

1. Loading four ONNX models (backbone LLM, local transformer, codec encoder,
   codec decoder) into ONNX Runtime ``InferenceSession`` instances.
2. Encoding a reference audio prompt for voice cloning.
3. Simulating a streaming LLM text source (character-by-character deltas).
4. Running the streaming TTS pipeline to produce audio chunks.
5. Writing the concatenated audio to a WAV file.

Usage (with INT8 codec decoder)::

    python test_basic_streaming-onnx.py \
        --tokenizer_vocab_path tokenizers/tokenizer.json \
        --tokenizer_config_path tokenizers/tokenizer_config.json \
        --backbone_llm_path onnx_models/backbone_f32/backbone_f32.onnx \
        --backbone_local_path onnx_models/local_transformer_f32/local_transformer_f32.onnx \
        --codec_decoder_path onnx_models_quantized/codec_decoder_int8/codec_decoder_int8.onnx \
        --codec_encoder_path onnx_models/codec_encoder/codec_encoder.onnx \
        --backbone_config_path configs/config_backbone.json \
        --codec_config_path configs/config_codec.json \
        --prompt_wav audio_ref/speaker.[wav|flac|mp3] \
        --out_wav output.wav
"""

import argparse
import json
import time
import wave
from pathlib import Path
from typing import Iterator, Tuple

import numpy as np
import numpy.typing as npt
import onnxruntime as ort

from inferencer_onnx import MossTTSRealtimeInferenceONNX
from moss_text_tokenizer import MOSSTextTokenizer

NDArrayInt = npt.NDArray[np.int64]
NDArrayFloat = npt.NDArray[np.floating]

# Native sample rate of the codec decoder output, in Hz.
CODEC_SAMPLE_RATE = 24000


def fake_llm_text_stream(
    text: str,
    chunk_chars: int = 1,
    delay_s: float = 0.0,
) -> Iterator[str]:
    """Simulate streaming text deltas from an LLM.

    Each iteration yields ``chunk_chars`` characters with a delay of
    ``delay_s`` seconds. In real-world usage, this can be replaced with
    streaming responses from models such as OpenAI or vLLM.

    Parameters
    ----------
    text : str
        Full text to stream character-by-character.
    chunk_chars : int, optional
        Number of characters per delta (default ``1``).
    delay_s : float, optional
        Simulated delay in seconds between deltas (default ``0.0``).

    Yields
    ------
    str
        A text delta of up to ``chunk_chars`` characters.
    """
    if not text:
        return
    step = max(1, chunk_chars)
    for idx in range(0, len(text), step):
        # No sleep before the very first delta: the delay simulates
        # inter-token latency, not time-to-first-token.
        if delay_s > 0 and idx > 0:
            time.sleep(delay_s)
        yield text[idx : idx + step]


def write_wav(out_path: Path, sample_rate: int, chunks: Iterator[np.ndarray]) -> None:
    """Collect audio chunks and write them to a 16-bit PCM WAV file.

    Parameters
    ----------
    out_path : Path
        Output file path.
    sample_rate : int
        Sample rate in Hz.
    chunks : Iterator[np.ndarray]
        Iterator of float32 audio chunks in ``[-1, 1]`` range. If ``chunks``
        is a lazy generator, the whole upstream pipeline runs here.

    Raises
    ------
    RuntimeError
        If the iterator yields no audio chunks at all.
    """
    all_chunks: list[np.ndarray] = []
    for chunk in chunks:
        all_chunks.append(chunk.astype(np.float32).reshape(-1))
    if not all_chunks:
        raise RuntimeError("No audio chunks produced.")
    audio = np.concatenate(all_chunks)

    # float32 in [-1, 1] → int16 PCM; clip first so out-of-range samples
    # saturate instead of wrapping around during the integer cast.
    audio = np.clip(audio, -1.0, 1.0)
    pcm16 = (audio * 32767.0).astype(np.int16)

    out_path.parent.mkdir(parents=True, exist_ok=True)
    with wave.open(str(out_path), "wb") as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)  # 2 bytes per sample == 16-bit PCM
        wf.setframerate(int(sample_rate))
        wf.writeframes(pcm16.tobytes())


def _sanitize_tokens(
    tokens: NDArrayInt,
    codebook_size: int,
    eos_audio_id: int,
) -> tuple[NDArrayInt, bool]:
    """Validate and truncate audio tokens at EOS or invalid code boundaries.

    Parameters
    ----------
    tokens : NDArrayInt
        Audio token array of shape ``(T,)`` or ``(T, C)``.
    codebook_size : int
        Valid code range is ``[0, codebook_size)``.
    eos_audio_id : int
        End-of-sequence audio token ID.

    Returns
    -------
    tuple[NDArrayInt, bool]
        Sanitized tokens and a flag indicating whether truncation occurred.
    """
    # Make sure tokens is 2D: (rows, codes).
    # NOTE(review): axis=0 turns a (T,) input into (1, T) — one row of T
    # codes — whereas the documented (T, C) layout suggests (T, 1) may have
    # been intended. The in-file caller (decode_audio_frames) only ever
    # passes 2-D arrays, so this branch is currently unexercised — confirm
    # the intended 1-D semantics before relying on it.
    if tokens.ndim == 1:
        tokens = np.expand_dims(tokens, axis=0)  # same as tokens[None, :]

    if tokens.size == 0:
        return tokens, False

    # Rows whose first code is the EOS marker (1-D index array).
    eos_rows = np.nonzero(tokens[:, 0] == eos_audio_id)[0]

    # Rows that contain any out-of-range code.
    invalid_rows = ((tokens < 0) | (tokens >= codebook_size)).any(axis=1)
    invalid_rows_idx = np.nonzero(invalid_rows)[0]

    # Truncate at the earliest of the two boundaries, if either exists.
    stop_idx = None
    if eos_rows.size > 0:
        stop_idx = int(eos_rows[0])
    if invalid_rows_idx.size > 0:
        invalid_idx = int(invalid_rows_idx[0])
        stop_idx = invalid_idx if stop_idx is None else min(stop_idx, invalid_idx)

    if stop_idx is not None:
        return tokens[:stop_idx], True
    return tokens, False


def decode_audio_frames(
    audio_frames: list[NDArrayInt],
    inferencer: MossTTSRealtimeInferenceONNX,
    codebook_size: int,
    eos_audio_id: int,
) -> Iterator[np.ndarray]:
    """Sanitize, buffer, and decode audio token frames into waveform chunks.

    Parameters
    ----------
    audio_frames : list[NDArrayInt]
        List of audio token arrays from the backbone.
    inferencer : MossTTSRealtimeInferenceONNX
        The ONNX inference engine (used for ``push_tokens`` / ``audio_chunks``).
    codebook_size : int
        Valid code range for sanitization.
    eos_audio_id : int
        End-of-sequence audio token ID.

    Yields
    ------
    np.ndarray
        Decoded waveform segments, flattened to 1-D.
    """
    # Tolerate a bare ndarray instead of a list of ndarrays.
    if isinstance(audio_frames, np.ndarray):
        audio_frames = [audio_frames]

    for frame in audio_frames:
        tokens = frame
        # Drop a leading batch dimension if present: (1, T, C) → (T, C).
        if tokens.ndim == 3:
            tokens = tokens[0]
        if tokens.ndim != 2:
            raise ValueError(f"Expected [T, C] audio tokens, got {tuple(tokens.shape)}")

        print(f"tokens before sanitize {tokens} {tokens.shape}")
        tokens, _ = _sanitize_tokens(tokens, codebook_size, eos_audio_id)
        print(f"tokens after sanitize {tokens} {tokens.shape}")
        if tokens.size == 0:
            continue

        # Feed sanitized tokens to the codec decoder and yield whatever
        # waveform it has ready so far.
        inferencer.push_tokens(tokens)
        for wav in inferencer.audio_chunks():
            if wav.size == 0:
                continue
            print(f"decode_audio_frames wav {wav} {wav.shape}")
            yield wav.reshape(-1)


def flush_decoder(inferencer: MossTTSRealtimeInferenceONNX) -> Iterator[np.ndarray]:
    """Flush the codec decoder buffer and yield any remaining audio.

    Parameters
    ----------
    inferencer : MossTTSRealtimeInferenceONNX
        The ONNX inference engine.

    Yields
    ------
    np.ndarray
        Final waveform segment, if any.
    """
    final_chunk = inferencer.flush()
    if final_chunk is not None and final_chunk.size > 0:
        print(f"final_chunk flush {final_chunk} {final_chunk.shape}")
        yield final_chunk.reshape(-1)


# Core: Streaming generation: text delta → push_text → audio
def run_streaming_tts(
    inferencer: MossTTSRealtimeInferenceONNX,
    text_deltas: Iterator[str],
) -> Iterator[np.ndarray]:
    """Receive streaming text deltas and produce playable WAV chunks in real time.

    The pipeline matches the Gradio demo:

        codec.streaming → push_text → decode_frames → end_text → drain → flush

    Parameters
    ----------
    inferencer : MossTTSRealtimeInferenceONNX
        A fully initialized ONNX inferencer with ``reset_turn`` already called.
    text_deltas : Iterator[str]
        An iterator of text deltas (simulating LLM streaming output).

    Yields
    ------
    np.ndarray
        Decoded waveform chunks suitable for playback or concatenation.
    """
    codebook_size = inferencer.codebook_size
    eos_audio_id = inferencer.eos_audio_id

    # Phase 1: feed each text delta and decode any frames it produced.
    for delta in text_deltas:
        # print(delta, end="", flush=True)
        print(f"delta {delta}")
        audio_frames = inferencer.push_text(delta)
        if len(audio_frames) > 0:
            print(f"audio_frames {audio_frames} {len(audio_frames)} {audio_frames[0].shape}")
        yield from decode_audio_frames(audio_frames, inferencer, codebook_size, eos_audio_id)

    # Phase 2: signal end of text and decode the frames that releases.
    audio_frames = inferencer.end_text()
    if len(audio_frames) > 0:
        print(f"audio_frames end_text {audio_frames} {len(audio_frames)} {audio_frames[0].shape}")
    yield from decode_audio_frames(audio_frames, inferencer, codebook_size, eos_audio_id)

    # Phase 3: drain remaining frames one step at a time until the
    # inferencer reports completion or stops producing frames.
    while True:
        audio_frames = inferencer.drain(max_steps=1)
        if not audio_frames:
            break
        print(f"audio_frames drain {audio_frames} {len(audio_frames)} {audio_frames[0].shape}")
        yield from decode_audio_frames(audio_frames, inferencer, codebook_size, eos_audio_id)
        if inferencer.is_finished:
            break

    # Phase 4: flush whatever is still buffered in the codec decoder.
    yield from flush_decoder(inferencer)


def main() -> None:
    """Entry point: parse arguments, load models, run streaming TTS, write WAV."""
    p = argparse.ArgumentParser(description="Simulated LLM streaming text → TTS streaming audio。")
    p.add_argument("--tokenizer_vocab_path", type=str, required=True)
    p.add_argument("--tokenizer_config_path", type=str, required=True)
    p.add_argument("--backbone_llm_path", type=str, required=True)
    p.add_argument("--backbone_local_path", type=str, required=True)
    p.add_argument("--codec_decoder_path", type=str, required=True)
    p.add_argument("--codec_encoder_path", type=str, required=True)
    p.add_argument("--backbone_config_path", type=str, required=True)
    p.add_argument("--codec_config_path", type=str, required=True)
    p.add_argument("--prompt_wav", type=str, required=True)
    p.add_argument("--out_wav", type=str, default="out_streaming.wav")
    p.add_argument("--sample_rate", type=int, default=CODEC_SAMPLE_RATE)
    p.add_argument("--temperature", type=float, default=0.725)
    p.add_argument("--top_p", type=float, default=0.6)
    p.add_argument("--top_k", type=int, default=34)
    p.add_argument("--repetition_penalty", type=float, default=1.9)
    p.add_argument("--repetition_window", type=int, default=50)
    p.add_argument("--max_length", type=int, default=5000)
    # Simulated LLM streaming parameters.
    p.add_argument(
        "--delta_chunk_chars",
        type=int,
        default=1,
        help="Number of characters to output at each delta (1 = verbatim)",
    )
    p.add_argument(
        "--delta_delay_s",
        type=float,
        default=0.0,
        help="Simulated delay in seconds between deltas, let 0 = no delay",
    )
    p.add_argument(
        "--assistant_text",
        type=str,
        default=(
            "в зависимости от времени не только точность, но и низкая задержка. Если это не мгновенно, то человеческое взаимодействие теряется. Мы наконец-то достигаем момента, когда технология достаточно быстра для того, чтобы люди просто общались, и это является огромным сдвигом для глобального бизнеса."
        ),
    )
    args = p.parse_args()

    tokenizer = MOSSTextTokenizer(args.tokenizer_vocab_path, args.tokenizer_config_path)
    print(f"tokenizer {tokenizer} {args.tokenizer_vocab_path} {args.tokenizer_config_path}")

    # All four ONNX sessions run on CPU.
    backbone_llm = ort.InferenceSession(
        args.backbone_llm_path,
        providers=["CPUExecutionProvider"],
    )
    print(f"backbone_llm {backbone_llm} {args.backbone_llm_path}")
    backbone_local = ort.InferenceSession(
        args.backbone_local_path,
        providers=["CPUExecutionProvider"],
    )
    print(f"backbone_local {backbone_local} {args.backbone_local_path}")
    codec_decoder = ort.InferenceSession(
        args.codec_decoder_path,
        providers=["CPUExecutionProvider"],
    )
    print(f"codec_decoder {codec_decoder} {args.codec_decoder_path}")
    codec_encoder = ort.InferenceSession(
        args.codec_encoder_path,
        providers=["CPUExecutionProvider"],
    )
    print(f"codec_encoder {codec_encoder} {args.codec_encoder_path}")

    # JSON is UTF-8 by specification; pin the encoding so the script does
    # not depend on the platform's locale default.
    with open(args.backbone_config_path, "r", encoding="utf-8") as f:
        backbone_config = json.load(f)
    print(f"backbone_config {backbone_config} {args.backbone_config_path}")
    with open(args.codec_config_path, "r", encoding="utf-8") as f:
        codec_config = json.load(f)
    print(f"codec_config {codec_config} {args.codec_config_path}")

    inferencer = MossTTSRealtimeInferenceONNX(
        tokenizer,
        backbone_llm,
        backbone_local,
        codec_decoder,
        codec_encoder,
        backbone_config,
        codec_config,
        max_length=args.max_length,
        codec_sample_rate=CODEC_SAMPLE_RATE,
        temperature=args.temperature,
        top_p=args.top_p,
        top_k=args.top_k,
        repetition_penalty=args.repetition_penalty,
        repetition_window=args.repetition_window,
    )
    print("Inferencer loaded.")

    print("Extracting audio prompt...")
    prompt_tokens = inferencer._encode_reference_audio(args.prompt_wav)
    print(f"prompt_tokens {prompt_tokens} {prompt_tokens.shape}")

    # ── Build input_ids without the user turn: system_prompt + assistant prefix ──
    print("Loading input ids...")
    input_ids = inferencer.processor.make_ensemble(prompt_tokens.squeeze(1))
    print(f"input_ids {input_ids} {input_ids.shape}")
    inferencer.reset_turn(
        input_ids=input_ids,
        include_system_prompt=False,
        reset_cache=True,
    )
    print("Input ids loaded.")

    text = args.assistant_text
    text_deltas = fake_llm_text_stream(
        text,
        chunk_chars=args.delta_chunk_chars,
        delay_s=args.delta_delay_s,
    )

    print("Running streaming tts simulation...")
    wav_chunks = run_streaming_tts(
        inferencer=inferencer,
        text_deltas=text_deltas,
    )
    # NOTE(review): run_streaming_tts is a lazy generator, so no synthesis
    # has actually happened yet — "Done." prints before generation, which
    # runs inside write_wav below when the generator is consumed.
    print("Done.")

    out_path = Path(args.out_wav).expanduser()
    write_wav(out_path, args.sample_rate, wav_chunks)
    print(f"\n[OK] Write complete: {out_path}")


if __name__ == "__main__":
    main()