Text-to-Speech
ONNX
GGUF
speech-translation
streaming-speech-translation
speech
audio
speech-recognition
automatic-speech-recognition
streaming-asr
ASR
NeMo
ONNX
cache-aware ASR
FastConformer
RNNT
Parakeet
neural-machine-translation
NMT
gemma3
llama-cpp
GGUF
conversational
TTS
xtts
xttsv2
voice-clone
gpt2
hifigan
multilingual
vq
perceiver-encoder
websocket
#!/usr/bin/env python3
# License: CC-BY-NC-ND-4.0
# Created by: Patrick Lumbantobing, Vertox-AI
# Copyright (c) 2026 Vertox-AI. All rights reserved.
#
# This work is licensed under the Creative Commons
# Attribution-NonCommercial-NoDerivatives 4.0 International License.
# To view a copy of this license, visit
# http://creativecommons.org/licenses/by-nc-nd/4.0/
"""
Main entry point for the streaming speech translation server.

This script starts a WebSocket server that performs real-time
speech-to-speech translation using:

- ASR: ONNX-exported NVIDIA NeMo Conformer RNN-T
- NMT: TranslateGemma GGUF via llama-cpp
- TTS: XTTSv2 ONNX (GPT-2 AR + HiFi-GAN vocoder)

Typical usage (simplified):

    python app.py \\
        --asr-onnx-path /path/to/asr_dir \\
        --nmt-gguf-path /path/to/translategemma.gguf \\
        --tts-model-dir /path/to/xtts_dir \\
        --tts-vocab-path /path/to/vocab.json \\
        --tts-mel-norms-path /path/to/mel_stats.npy \\
        --tts-ref-audio-path /path/to/reference.wav
"""
from __future__ import annotations

import argparse
import asyncio
import logging

from src.pipeline.config import PipelineConfig
from src.server.websocket_server import TranslationServer

# Configure root logging once at import time so every pipeline module
# (ASR, NMT, TTS, server) inherits the same timestamped format.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
| def _build_arg_parser() -> argparse.ArgumentParser: | |
| """ | |
| Create and configure the command-line argument parser. | |
| Returns | |
| ------- | |
| argparse.ArgumentParser | |
| Parser configured with ASR, NMT, TTS, queue, and server options. | |
| """ | |
| parser = argparse.ArgumentParser( | |
| description="Streaming Speech Translation Server (ASR → NMT → TTS)", | |
| formatter_class=argparse.ArgumentDefaultsHelpFormatter, | |
| ) | |
| # ASR | |
| parser.add_argument( | |
| "--asr-onnx-path", | |
| required=True, | |
| help="Path to ASR ONNX model directory", | |
| ) | |
| parser.add_argument( | |
| "--asr-chunk-ms", | |
| type=int, | |
| default=10, | |
| help="ASR audio chunk duration in milliseconds", | |
| ) | |
| parser.add_argument( | |
| "--asr-sample-rate", | |
| type=int, | |
| default=16000, | |
| help="ASR expected input sample rate (Hz)", | |
| ) | |
| # NMT | |
| parser.add_argument( | |
| "--nmt-gguf-path", | |
| required=True, | |
| help="Path to NMT GGUF model file", | |
| ) | |
| parser.add_argument( | |
| "--nmt-n-threads", | |
| type=int, | |
| default=4, | |
| help="Number of CPU threads for NMT (llama-cpp)", | |
| ) | |
| # TTS | |
| parser.add_argument( | |
| "--tts-model-dir", | |
| required=True, | |
| help="Path to TTS ONNX model directory (XTTSv2)", | |
| ) | |
| parser.add_argument( | |
| "--tts-vocab-path", | |
| required=True, | |
| help="Path to TTS BPE vocab.json", | |
| ) | |
| parser.add_argument( | |
| "--tts-mel-norms-path", | |
| required=True, | |
| help="Path to TTS mel_stats.npy (mel normalization statistics)", | |
| ) | |
| parser.add_argument( | |
| "--tts-ref-audio-path", | |
| required=True, | |
| help="Path to TTS reference speaker audio file", | |
| ) | |
| parser.add_argument( | |
| "--tts-language", | |
| default="ru", | |
| help="Target language code for TTS output (e.g., 'ru')", | |
| ) | |
| parser.add_argument("--tts-int8-gpt", action="store_true", help="Use INT8 quantized GPT") | |
| parser.add_argument( | |
| "--tts-threads-gpt", | |
| type=int, | |
| default=2, | |
| help="Number of threads for TTS GPT ONNX inference", | |
| ) | |
| parser.add_argument( | |
| "--tts-chunk-size", | |
| type=int, | |
| default=20, | |
| help="Number of AR tokens per vocoder chunk in streaming TTS", | |
| ) | |
| # Pipeline queues | |
| parser.add_argument( | |
| "--audio-queue-max", | |
| type=int, | |
| default=256, | |
| help="Maximum size of the raw audio input queue", | |
| ) | |
| parser.add_argument( | |
| "--text-queue-max", | |
| type=int, | |
| default=64, | |
| help="Maximum size of the ASR→NMT text queue", | |
| ) | |
| parser.add_argument( | |
| "--tts-queue-max", | |
| type=int, | |
| default=16, | |
| help="Maximum size of the NMT→TTS text queue", | |
| ) | |
| parser.add_argument( | |
| "--audio-out-queue-max", | |
| type=int, | |
| default=32, | |
| help="Maximum size of the synthesized audio output queue", | |
| ) | |
| # Server | |
| parser.add_argument( | |
| "--host", | |
| default="0.0.0.0", | |
| help="Server bind host", | |
| ) | |
| parser.add_argument( | |
| "--port", | |
| type=int, | |
| default=8765, | |
| help="Server port", | |
| ) | |
| return parser | |
def main() -> None:
    """Parse CLI arguments, build the pipeline config, and run the server.

    Steps:
        1. Parse command-line options for ASR, NMT, TTS, queues, and server.
        2. Map the parsed namespace onto a ``PipelineConfig``.
        3. Start the ``TranslationServer`` event loop (blocks until shutdown).
    """
    args = _build_arg_parser().parse_args()

    # Explicit field-by-field mapping keeps the CLI flag names decoupled
    # from the PipelineConfig attribute names.
    config = PipelineConfig(
        asr_onnx_path=args.asr_onnx_path,
        asr_chunk_duration_ms=args.asr_chunk_ms,
        asr_sample_rate=args.asr_sample_rate,
        nmt_gguf_path=args.nmt_gguf_path,
        nmt_n_threads=args.nmt_n_threads,
        tts_model_dir=args.tts_model_dir,
        tts_vocab_path=args.tts_vocab_path,
        tts_mel_norms_path=args.tts_mel_norms_path,
        tts_ref_audio_path=args.tts_ref_audio_path,
        tts_language=args.tts_language,
        tts_use_int8_gpt=args.tts_int8_gpt,
        tts_num_threads_gpt=args.tts_threads_gpt,
        tts_stream_chunk_size=args.tts_chunk_size,
        audio_queue_maxsize=args.audio_queue_max,
        text_queue_maxsize=args.text_queue_max,
        tts_queue_maxsize=args.tts_queue_max,
        audio_out_queue_maxsize=args.audio_out_queue_max,
        host=args.host,
        port=args.port,
    )

    # asyncio.run owns the event loop for the server's whole lifetime.
    asyncio.run(TranslationServer(config).start())


if __name__ == "__main__":
    main()