|
|
""" |
|
|
Chiluka TTS - pip install Example

After installing via pip, model weights auto-download from HuggingFace
on first use and are cached locally.

Install:
    pip install chiluka
    sudo apt-get install espeak-ng

Usage:
    python pip_example.py --reference path/to/reference.wav
    python pip_example.py --reference ref.wav --model telugu --language te
|
|
""" |
|
|
|
|
|
import argparse
import os
|
|
|
|
|
|
|
|
def main():
    """Run the Chiluka TTS pip-install example from the command line.

    Steps, in order:

    1. Parse ``--reference`` (required), ``--model``, ``--text``,
       ``--language`` and ``--output``.
    2. Fill in a per-model default for ``--text`` and ``--language`` when
       they were not given.
    3. Print the models reported by ``list_models()``.
    4. Load the selected model with ``Chiluka.from_pretrained`` and
       synthesize the text using ``--reference`` as the reference audio.
    5. Save the result to ``--output`` and print its duration.
    6. For the ``hindi_english`` model only, synthesize an extra Hindi
       sample and save it next to ``--output`` with ``_hindi`` inserted
       before the file extension (``out.wav`` -> ``out_hindi.wav``).
    """
    parser = argparse.ArgumentParser(description="Chiluka TTS - pip Example")
    parser.add_argument("--reference", type=str, required=True, help="Path to reference audio file")
    parser.add_argument("--model", type=str, default="hindi_english", choices=["hindi_english", "telugu"],
                        help="Model variant (default: hindi_english)")
    parser.add_argument("--text", type=str, default=None, help="Text to synthesize")
    parser.add_argument("--language", type=str, default=None, help="Language code (en-us, hi, te)")
    parser.add_argument("--output", type=str, default="output_pip.wav", help="Output wav file path")
    args = parser.parse_args()

    # Imported only after the arguments have been parsed, so `--help` and
    # argument errors are handled before the package is imported.
    from chiluka import Chiluka, list_models

    # Per-model defaults; the keys mirror the `--model` choices above.
    if args.text is None:
        texts = {
            "hindi_english": "Hello, I am Chiluka, a text to speech system.",
            "telugu": "నమస్కారం, నేను చిలుక మాట్లాడుతున్నాను",
        }
        args.text = texts[args.model]

    if args.language is None:
        langs = {"hindi_english": "en-us", "telugu": "te"}
        args.language = langs[args.model]

    print("Available models:")
    for name, info in list_models().items():
        print(f"  {name}: {info['description']}")
    print()

    print(f"Loading '{args.model}' model...")
    tts = Chiluka.from_pretrained(model=args.model)

    print(f"Text: '{args.text}'")
    print(f"Language: {args.language}")
    print(f"Reference: {args.reference}")
    print()

    wav = tts.synthesize(
        text=args.text,
        reference_audio=args.reference,
        language=args.language,
        alpha=0.3,
        beta=0.7,
        diffusion_steps=5,
        embedding_scale=1.0,
    )

    # NOTE(review): the duration math assumes the model emits 24 kHz audio,
    # one sample per element of `wav` - confirm against the chiluka docs.
    sample_rate = 24000

    tts.save_wav(wav, args.output)
    print(f"Duration: {len(wav) / sample_rate:.2f} seconds")

    if args.model == "hindi_english":
        print("\n--- Bonus: Hindi synthesis with same model ---")
        hindi_wav = tts.synthesize(
            text="नमस्ते, मैं चिलुका बोल रहा हूं",
            reference_audio=args.reference,
            language="hi",
        )
        # Insert the suffix before the real extension instead of replacing
        # the substring ".wav": a plain str.replace leaves paths without a
        # literal ".wav" unchanged (so the Hindi sample would overwrite the
        # main output) and also rewrites ".wav" inside directory names.
        output_root, output_ext = os.path.splitext(args.output)
        hindi_output = f"{output_root}_hindi{output_ext}"
        tts.save_wav(hindi_wav, hindi_output)
        print(f"Duration: {len(hindi_wav) / sample_rate:.2f} seconds")
|
|
|
|
|
|
|
|
# Run the example only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|
|