| """Kokoro TTS CLI |
| Example usage: |
| python3 -m kokoro --text "The sky above the port was the color of television, tuned to a dead channel." -o file.wav --debug |
| |
| echo "Bom dia mundo, como vão vocês" > text.txt |
| python3 -m kokoro -i text.txt -l p --voice pm_alex -o audio.wav |
| |
| Common issues: |
| pip not installed: `uv pip install pip` |
| (Temporary workaround while https://github.com/explosion/spaCy/issues/13747 is not fixed) |
| |
| espeak not installed: `apt-get install espeak-ng` |
| """ |
|
|
| import argparse |
| import wave |
| from pathlib import Path |
| from typing import Generator, TYPE_CHECKING |
|
|
| import numpy as np |
| from loguru import logger |
|
|
# Single-letter language codes accepted by the --language option. The chosen
# code is forwarded to KPipeline as ``lang_code``; when --language is omitted,
# the first character of the voice name is used as the code instead.
languages = ["a", "b", "h", "e", "f", "i", "p", "j", "z"]
|
|
| if TYPE_CHECKING: |
| from . import KPipeline |
|
|
|
|
def generate_audio(
    text: str, kokoro_language: str, voice: str, speed=1
) -> Generator["KPipeline.Result", None, None]:
    """Run the Kokoro pipeline on ``text`` and yield its results lazily.

    This is a generator: nothing below (including the mismatch warning and
    the pipeline construction) runs until the caller starts iterating.

    Args:
        text: Text to synthesize. The pipeline is called with a split
            pattern matching runs of newlines.
        kokoro_language: Language code passed to ``KPipeline`` as
            ``lang_code``.
        voice: Voice identifier forwarded to the pipeline. A warning is
            logged when it does not start with ``kokoro_language``.
        speed: Speed value forwarded to the pipeline unchanged.

    Yields:
        Every result produced by the pipeline call, in order.
    """
    # Function-scope import (presumably to keep importing this module cheap
    # and to avoid a circular import with the package -- confirm).
    from . import KPipeline

    voice_matches_language = voice.startswith(kokoro_language)
    if not voice_matches_language:
        logger.warning(f"Voice {voice} is not made for language {kokoro_language}")

    tts = KPipeline(lang_code=kokoro_language)
    results = tts(text, voice=voice, speed=speed, split_pattern=r"\n+")
    yield from results
|
|
|
|
def generate_and_save_audio(
    output_file: Path, text: str, kokoro_language: str, voice: str, speed=1
) -> None:
    """Synthesize ``text`` and stream it into ``output_file`` as a WAV file.

    The file is written as mono, 16-bit PCM at 24000 Hz. Each result yielded
    by ``generate_audio`` is converted and appended as soon as it is
    available, so the whole utterance is never held in memory at once.

    Args:
        output_file: Destination path; it is resolved to an absolute path
            before opening and is created or overwritten.
        text: Text to synthesize.
        kokoro_language: Language code forwarded to ``generate_audio``.
        voice: Voice identifier forwarded to ``generate_audio``.
        speed: Speed value forwarded to ``generate_audio``.
    """
    with wave.open(str(output_file.resolve()), "wb") as wav_file:
        wav_file.setnchannels(1)  # mono
        wav_file.setsampwidth(2)  # 2 bytes per sample -> 16-bit PCM
        wav_file.setframerate(24000)

        for result in generate_audio(
            text, kokoro_language=kokoro_language, voice=voice, speed=speed
        ):
            logger.debug(result.phonemes)
            if result.audio is None:
                # Nothing was synthesized for this segment; skip it.
                continue

            # The 32767 scaling implies float samples nominally in [-1, 1]
            # (assumption based on the scale factor -- confirm).
            samples = result.audio.numpy()
            # Clip before scaling: a sample even slightly outside [-1, 1]
            # would otherwise overflow int16 and wrap around, which is
            # audible as a loud click.
            scaled = np.clip(samples, -1.0, 1.0) * 32767
            # WAV PCM data is little-endian; request that byte order
            # explicitly instead of relying on the platform's native one.
            wav_file.writeframes(scaled.astype("<i2").tobytes())
|
|
|
|
def main() -> None:
    """Command-line entry point: parse arguments, read text, write a WAV file.

    The text to synthesize is taken from ``--text``, from ``--input-file``,
    or from standard input when neither is given. The language defaults to
    the first character of the voice name when ``--language`` is omitted.

    Raises:
        SystemExit: Raised by argparse (exit status 2) when the arguments
            are invalid: both ``--text`` and ``--input-file`` given, an
            empty voice, or a language that cannot be inferred from the
            voice name.
    """
    import sys

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-m",
        "--voice",
        default="af_heart",
        help="Voice to use",
    )
    parser.add_argument(
        "-l",
        "--language",
        help="Language to use (defaults to the one corresponding to the voice)",
        choices=languages,
    )
    parser.add_argument(
        "-o",
        "--output-file",
        "--output_file",
        type=Path,
        help="Path to output WAV file",
        required=True,
    )
    parser.add_argument(
        "-i",
        "--input-file",
        "--input_file",
        type=Path,
        help="Path to input text file (default: stdin)",
    )
    parser.add_argument(
        "-t",
        "--text",
        help="Text to use instead of reading from stdin",
    )
    parser.add_argument(
        "-s",
        "--speed",
        type=float,
        default=1.0,
        help="Speech speed",
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        help="Print DEBUG messages to console",
    )
    args = parser.parse_args()

    if args.debug:
        # logger.level() only retrieves (or defines) a level; it does not
        # change which messages are emitted. Install a sink whose threshold
        # is DEBUG so the flag actually takes effect.
        logger.remove()
        logger.add(sys.stderr, level="DEBUG")
    logger.debug(args)

    # Report argument problems through argparse so the user gets the usage
    # text and exit status 2, like for every other invalid argument.
    if args.text is not None and args.input_file is not None:
        parser.error("You cannot specify both 'text' and 'input_file'")
    if not args.voice:
        parser.error("--voice must not be empty")

    lang = args.language or args.voice[0]
    if lang not in languages:
        # An explicit --language is already validated by ``choices``; a code
        # inferred from the voice name needs the same check.
        parser.error(
            f"Cannot infer a language from voice {args.voice!r}; "
            f"pass --language (one of: {', '.join(languages)})"
        )

    if args.text is not None:
        # ``is not None`` so that an explicit empty --text is honoured
        # instead of silently falling through to reading stdin.
        text = args.text
    elif args.input_file is not None:
        # Decode explicitly as UTF-8 rather than the locale's encoding, so
        # non-ASCII text reads the same on every platform.
        text = args.input_file.read_text(encoding="utf-8")
    else:
        if sys.stdin.isatty():
            # Only prompt an interactive user; piped input needs no hint.
            print(
                "Press Ctrl+D to stop reading input and start generating",
                file=sys.stderr,
                flush=True,
            )
        # Lines iterated from stdin keep their trailing newline, so joining
        # them with "\n" would double every line break; read it verbatim.
        text = sys.stdin.read()

    logger.debug(f"Input text: {text!r}")

    out_file: Path = args.output_file
    if out_file.suffix.lower() != ".wav":
        logger.warning("The output file name should end with .wav")
    generate_and_save_audio(
        output_file=out_file,
        text=text,
        kokoro_language=lang,
        voice=args.voice,
        speed=args.speed,
    )
|
|
|
|
# Run the CLI only when this file is executed as a script or via ``python -m``,
# not when it is imported.
if __name__ == "__main__":
    main()
|
|