File size: 6,424 Bytes
f38c279
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
"""
ONNX-based decoder for KaniTTS - CPU-optimized version with reduced lookback
Use this if CUDA is not working with ONNX Runtime
"""

import numpy as np
import onnxruntime as ort
from typing import List
from pathlib import Path


class ONNXKaniTTSDecoderOptimized:
    """
    CPU-optimized ONNX Runtime decoder for KaniTTS.

    Streams audio one codec frame (4 codebook tokens) at a time. A short
    sliding window of previous frames is passed to the decoder as context
    ("lookback") so consecutive frames join seamlessly; the lookback is
    deliberately small so per-frame inference stays fast on CPU.
    """

    def __init__(self, onnx_model_path="onnx/nano_codec_decoder.onnx", device="cuda"):
        """
        Args:
            onnx_model_path: Path to the exported nano codec decoder ONNX graph.
            device: "cuda" to try the CUDA execution provider first; falls
                back to CPU if session creation with CUDA fails.
        """
        self.sample_rate = 22050
        self.device = device

        # Decoder settings - OPTIMIZED FOR CPU
        self.num_codebooks = 4  # tokens expected per frame
        self.codebook_size = 4032
        self.chunk_size = 25

        # REDUCED lookback for CPU performance
        self.lookback_frames = 3  # Reduced from 15 to 3 (5x faster on CPU!)
        self.first_frame_lookback = 1  # Reduced from 3 to 1

        # Sliding-window buffer of recently decoded frames (context).
        self.history_codes = []
        self.frame_count = 0

        # Samples emitted per codec frame (80 ms at 22.05 kHz); hoisted here
        # so decode_frame does not recompute it on every call.
        self.samples_per_frame = int(self.sample_rate * 0.08)

        # Setup ONNX Runtime
        print(f"Loading ONNX decoder (CPU-optimized): {onnx_model_path}")
        self._setup_ort_session(onnx_model_path)

        # Pre-warm decoder
        print("Pre-warming ONNX decoder...")
        self._prewarm_decoder()

    def _setup_ort_session(self, onnx_model_path):
        """Create the ONNX Runtime session, preferring CUDA with CPU fallback."""
        providers_to_try = []

        if self.device == "cuda":
            # CUDA provider with explicit options; if creation fails below,
            # we retry with CPU only.
            providers_to_try.append(("CUDAExecutionProvider", {
                "device_id": 0,
                "arena_extend_strategy": "kNextPowerOfTwo",
                "gpu_mem_limit": 2 * 1024 * 1024 * 1024,
                "cudnn_conv_algo_search": "EXHAUSTIVE",
                "do_copy_in_default_stream": True,
            }))

        # Always include CPU as fallback
        providers_to_try.append("CPUExecutionProvider")

        # Session options optimized for CPU
        sess_options = ort.SessionOptions()
        sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
        sess_options.intra_op_num_threads = 4  # Use multiple CPU threads
        sess_options.inter_op_num_threads = 4

        # Create session; on any failure retry with CPU-only providers.
        try:
            self.session = ort.InferenceSession(
                onnx_model_path,
                sess_options=sess_options,
                providers=providers_to_try
            )
        except Exception as e:
            print(f"Warning: Failed to create session with CUDA, using CPU only: {e}")
            self.session = ort.InferenceSession(
                onnx_model_path,
                sess_options=sess_options,
                providers=["CPUExecutionProvider"]
            )

        # Report which providers actually became active.
        active_providers = self.session.get_providers()
        print(f"Active providers: {active_providers}")

        if 'CUDAExecutionProvider' not in active_providers:
            print("⚠️  WARNING: Running on CPU (slow!)")
            print("   Using reduced lookback (3 frames) for better CPU performance")
            print("   See PERFORMANCE_FIX.md for GPU acceleration instructions")

        # Cache graph input/output names for session.run calls.
        self.input_names = [inp.name for inp in self.session.get_inputs()]
        self.output_names = [out.name for out in self.session.get_outputs()]

    def _prewarm_decoder(self):
        """Run one dummy frame through the graph, then reset decoder state."""
        dummy_codes = [0, 0, 0, 0]
        _ = self.decode_frame(dummy_codes)
        # Discard the dummy frame so real decoding starts from clean state.
        self.history_codes = []
        self.frame_count = 0
        print("ONNX decoder pre-warmed!")

    def decode_frame(self, codes: List[int]) -> np.ndarray:
        """
        Decode a single frame (4 tokens) to audio.

        Args:
            codes: List of exactly `num_codebooks` (4) integer token ids,
                one per codebook.

        Returns:
            numpy array of int16 audio samples containing only the NEW
            audio for this frame (context audio is trimmed off).

        Raises:
            ValueError: If `codes` does not contain exactly 4 entries.
        """
        if len(codes) != self.num_codebooks:
            raise ValueError(f"Expected {self.num_codebooks} codes, got {len(codes)}")

        # Add to history
        self.history_codes.append(codes)
        self.frame_count += 1

        # The very first frame has no useful history, so use the shorter lookback.
        effective_lookback = (
            self.first_frame_lookback if self.frame_count == 1 else self.lookback_frames
        )

        # Keep only the most recent (lookback + current) frames.
        max_history = effective_lookback + 1
        if len(self.history_codes) > max_history:
            self.history_codes = self.history_codes[-max_history:]

        # Context window always contains at least the frame just appended.
        context_codes = self.history_codes if len(self.history_codes) > 1 else [codes]

        # Build (1, num_codebooks, num_frames) int64 tensor — the transpose
        # matches the layout the exported graph consumed in the original code.
        codes_np = np.asarray([context_codes], dtype=np.int64).transpose(0, 2, 1)

        num_frames = len(context_codes)
        tokens_len_np = np.array([num_frames], dtype=np.int64)

        # Run ONNX inference
        outputs = self.session.run(
            self.output_names,
            {'tokens': codes_np, 'tokens_len': tokens_len_np}
        )

        audio = outputs[0]

        # Drop the samples that belong to the context frames; keep only the
        # audio generated for the newest frame.
        if len(self.history_codes) > 1:
            start_idx = (len(self.history_codes) - 1) * self.samples_per_frame
            audio = audio[:, start_idx:]

        # Convert to int16. Clip to [-1, 1] first: scaling an out-of-range
        # float sample and casting would otherwise wrap around, producing
        # loud pops in the output audio.
        audio_np = audio.squeeze()
        audio_int16 = (np.clip(audio_np, -1.0, 1.0) * 32767).astype(np.int16)

        return audio_int16

    def reset_history(self):
        """Reset decoder history so the next frame starts a fresh utterance."""
        self.history_codes = []
        self.frame_count = 0


if __name__ == "__main__":
    print("Testing CPU-optimized ONNX decoder...")

    decoder = ONNXKaniTTSDecoderOptimized(
        onnx_model_path="nano_codec_decoder.onnx",
        device="cuda"  # Will fall back to CPU if CUDA not available
    )

    # Quick latency benchmark: time 20 random single-frame decodes.
    import time

    frame_ms = []
    for _ in range(20):
        random_codes = [np.random.randint(0, 500) for _ in range(4)]
        t0 = time.time()
        audio = decoder.decode_frame(random_codes)
        frame_ms.append((time.time() - t0) * 1000)

    avg_ms = np.mean(frame_ms)

    print(f"\n📊 Performance:")
    print(f"  Average: {avg_ms:.2f} ms/frame")
    print(f"  Min: {np.min(frame_ms):.2f} ms")
    print(f"  Max: {np.max(frame_ms):.2f} ms")
    print(f"\n✓ Decoder test completed!")

    if avg_ms < 100:
        print("✓ Performance is good!")
    else:
        print("⚠️  Performance is slow. See PERFORMANCE_FIX.md for solutions.")