#!/usr/bin/env python3
# License: CC-BY-NC-ND-4.0
# Created by: Patrick Lumbantobing, Vertox-AI
# Copyright (c) 2026 Vertox-AI. All rights reserved.
#
# This work is licensed under the Creative Commons
# Attribution-NonCommercial-NoDerivatives 4.0 International License.
# To view a copy of this license, visit
# http://creativecommons.org/licenses/by-nc-nd/4.0/
"""
Main entry point for the streaming speech translation server.

This script starts a WebSocket server that performs real-time
speech-to-speech translation using:

- ASR: ONNX-exported NVIDIA NeMo Conformer RNN-T
- NMT: TranslateGemma GGUF via llama-cpp
- TTS: XTTSv2 ONNX (GPT-2 AR + HiFi-GAN vocoder)

Typical usage (simplified):

    python app.py \\
        --asr-onnx-path /path/to/asr_dir \\
        --nmt-gguf-path /path/to/translategemma.gguf \\
        --tts-model-dir /path/to/xtts_dir \\
        --tts-vocab-path /path/to/vocab.json \\
        --tts-mel-norms-path /path/to/mel_stats.npy \\
        --tts-ref-audio-path /path/to/reference.wav
"""

from __future__ import annotations

import argparse
import asyncio
import logging

from src.pipeline.config import PipelineConfig
from src.server.websocket_server import TranslationServer

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)


def _build_arg_parser() -> argparse.ArgumentParser:
    """
    Create and configure the command-line argument parser.

    Returns
    -------
    argparse.ArgumentParser
        Parser configured with ASR, NMT, TTS, queue, and server options.
        Defaults are shown in ``--help`` output because the parser uses
        ``ArgumentDefaultsHelpFormatter``.
    """
    parser = argparse.ArgumentParser(
        description="Streaming Speech Translation Server (ASR → NMT → TTS)",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # ASR
    parser.add_argument(
        "--asr-onnx-path",
        required=True,
        help="Path to ASR ONNX model directory",
    )
    parser.add_argument(
        "--asr-chunk-ms",
        type=int,
        default=10,
        help="ASR audio chunk duration in milliseconds",
    )
    parser.add_argument(
        "--asr-sample-rate",
        type=int,
        default=16000,
        help="ASR expected input sample rate (Hz)",
    )

    # NMT
    parser.add_argument(
        "--nmt-gguf-path",
        required=True,
        help="Path to NMT GGUF model file",
    )
    parser.add_argument(
        "--nmt-n-threads",
        type=int,
        default=4,
        help="Number of CPU threads for NMT (llama-cpp)",
    )

    # TTS
    parser.add_argument(
        "--tts-model-dir",
        required=True,
        help="Path to TTS ONNX model directory (XTTSv2)",
    )
    parser.add_argument(
        "--tts-vocab-path",
        required=True,
        help="Path to TTS BPE vocab.json",
    )
    parser.add_argument(
        "--tts-mel-norms-path",
        required=True,
        help="Path to TTS mel_stats.npy (mel normalization statistics)",
    )
    parser.add_argument(
        "--tts-ref-audio-path",
        required=True,
        help="Path to TTS reference speaker audio file",
    )
    parser.add_argument(
        "--tts-language",
        default="ru",
        help="Target language code for TTS output (e.g., 'ru')",
    )
    parser.add_argument(
        "--tts-int8-gpt",
        action="store_true",
        help="Use INT8 quantized GPT",
    )
    parser.add_argument(
        "--tts-threads-gpt",
        type=int,
        default=2,
        help="Number of threads for TTS GPT ONNX inference",
    )
    parser.add_argument(
        "--tts-chunk-size",
        type=int,
        default=20,
        help="Number of AR tokens per vocoder chunk in streaming TTS",
    )

    # Pipeline queues
    parser.add_argument(
        "--audio-queue-max",
        type=int,
        default=256,
        help="Maximum size of the raw audio input queue",
    )
    parser.add_argument(
        "--text-queue-max",
        type=int,
        default=64,
        help="Maximum size of the ASR→NMT text queue",
    )
    parser.add_argument(
        "--tts-queue-max",
        type=int,
        default=16,
        help="Maximum size of the NMT→TTS text queue",
    )
    parser.add_argument(
        "--audio-out-queue-max",
        type=int,
        default=32,
        help="Maximum size of the synthesized audio output queue",
    )

    # Server
    # NOTE(review): the default host "0.0.0.0" is the wildcard address;
    # confirm that exposing the server on every interface is intended.
    parser.add_argument(
        "--host",
        default="0.0.0.0",
        help="Server bind host",
    )
    parser.add_argument(
        "--port",
        type=int,
        default=8765,
        help="Server port",
    )

    return parser


def main(argv: list[str] | None = None) -> None:
    """
    Parse CLI arguments, construct the pipeline configuration, and start the server.

    This function:

    1. Parses command-line options for ASR, NMT, TTS, queue, and server settings.
    2. Builds a `PipelineConfig` from the parsed arguments.
    3. Creates a `TranslationServer` and runs its ``start()`` coroutine with
       ``asyncio.run``.

    Parameters
    ----------
    argv : list[str] | None, optional
        Argument strings to parse instead of the process command line.
        When ``None`` (the default), ``argparse`` reads ``sys.argv[1:]``,
        which is the behaviour of the plain ``main()`` call.
    """
    parser = _build_arg_parser()
    # Passing None makes argparse fall back to sys.argv[1:].
    args = parser.parse_args(argv)

    # Map CLI option names onto the keyword names PipelineConfig expects.
    config = PipelineConfig(
        asr_onnx_path=args.asr_onnx_path,
        asr_chunk_duration_ms=args.asr_chunk_ms,
        asr_sample_rate=args.asr_sample_rate,
        nmt_gguf_path=args.nmt_gguf_path,
        nmt_n_threads=args.nmt_n_threads,
        tts_model_dir=args.tts_model_dir,
        tts_vocab_path=args.tts_vocab_path,
        tts_mel_norms_path=args.tts_mel_norms_path,
        tts_ref_audio_path=args.tts_ref_audio_path,
        tts_language=args.tts_language,
        tts_use_int8_gpt=args.tts_int8_gpt,
        tts_num_threads_gpt=args.tts_threads_gpt,
        tts_stream_chunk_size=args.tts_chunk_size,
        audio_queue_maxsize=args.audio_queue_max,
        text_queue_maxsize=args.text_queue_max,
        tts_queue_maxsize=args.tts_queue_max,
        audio_out_queue_maxsize=args.audio_out_queue_max,
        host=args.host,
        port=args.port,
    )

    server = TranslationServer(config)
    asyncio.run(server.start())


if __name__ == "__main__":
    main()