"""
Telugu Voice Cloning with IndicF5

Usage:
1. Place your reference audio as 'reference.wav' (10-15 seconds, clean Telugu speech)
2. Edit REF_TEXT with the exact Telugu transcript of your reference audio
3. Edit GEN_TEXT with the Telugu text you want to generate
4. Run: source ~/indicf5-env/bin/activate && python telugu_voice_clone.py

Pipeline: load vocoder and model -> preprocess the reference clip -> run
inference -> optionally strip long silences -> normalise loudness -> write
the result to OUTPUT_FILE.
"""

import io
import time
from pathlib import Path

import numpy as np
import soundfile as sf
import torch
from huggingface_hub import hf_hub_download
from pydub import AudioSegment, silence

from f5_tts.infer.utils_infer import (
    infer_process,
    load_model,
    load_vocoder,
    preprocess_ref_audio_text,
)
from f5_tts.model import DiT

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Path to the reference recording whose voice should be cloned.
REF_AUDIO = "reference.wav"

# Exact transcript of what is spoken in REF_AUDIO.
REF_TEXT = "ఇది నా గొంతు నమూనా, నేను తెలుగులో మాట్లాడుతున్నాను."

# Text to synthesise in the cloned voice.
GEN_TEXT = "నమస్కారం, మీరు ఎలా ఉన్నారు? నేను మీతో తెలుగులో మాట్లాడుతున్నాను."

# Where the generated audio is written.
OUTPUT_FILE = "output_telugu.wav"

# Speed value forwarded to infer_process.
SPEED = 1.0
# When True, long silent stretches are cut out of the generated audio.
REMOVE_SILENCE = True

# ---------------------------------------------------------------------------
# Setup
# ---------------------------------------------------------------------------

# Fail fast: without the reference clip nothing below can succeed, so check
# before spending time on downloading and loading the models.
if not Path(REF_AUDIO).is_file():
    raise FileNotFoundError(
        f"Reference audio not found: {REF_AUDIO!r}. "
        "Place the reference recording there or update REF_AUDIO."
    )

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

print("Loading vocoder...")
vocoder = load_vocoder(vocoder_name="vocos", is_local=False, device=device)

print("Downloading IndicF5 model...")
repo_id = "ai4bharat/IndicF5"
vocab_path = hf_hub_download(repo_id, filename="checkpoints/vocab.txt")

# NOTE(review): only vocab.txt is fetched above and no checkpoint path is
# passed to load_model. Confirm that the installed f5_tts build loads the
# IndicF5 weights by itself; otherwise the model would run with untrained
# parameters and produce noise.
ema_model = load_model(
    DiT,
    dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4),
    mel_spec_type="vocos",
    vocab_file=vocab_path,
    device=device,
)
print("Model loaded!")

# ---------------------------------------------------------------------------
# Inference
# ---------------------------------------------------------------------------

print(f"Reference audio: {REF_AUDIO}")
ref_audio, ref_text = preprocess_ref_audio_text(REF_AUDIO, REF_TEXT)

print(f"Generating: {GEN_TEXT[:80]}...")
start = time.time()
audio, final_sample_rate, _ = infer_process(
    ref_audio,
    ref_text,
    GEN_TEXT,
    ema_model,
    vocoder,
    mel_spec_type="vocos",
    speed=SPEED,
    device=device,
)
print(f"Generated in {time.time() - start:.1f}s")

# ---------------------------------------------------------------------------
# Post-processing
# ---------------------------------------------------------------------------

# Round-trip through an in-memory WAV so pydub can operate on the audio.
# Use the sample rate reported by infer_process instead of a hard-coded value
# so the file header always matches the generated samples.
buffer = io.BytesIO()
sf.write(buffer, audio, samplerate=final_sample_rate, format="WAV")
buffer.seek(0)
audio_segment = AudioSegment.from_file(buffer, format="wav")

if REMOVE_SILENCE:
    non_silent_segs = silence.split_on_silence(
        audio_segment,
        min_silence_len=1000,
        silence_thresh=-50,
        keep_silence=500,
        seek_step=10,
    )
    # An empty result means nothing was detected as non-silent; keep the
    # original audio in that case rather than emitting an empty file.
    if non_silent_segs:
        audio_segment = sum(
            non_silent_segs,
            # Start the accumulator at the same frame rate as the audio so
            # concatenation never has to reconcile differing rates.
            AudioSegment.silent(duration=0, frame_rate=audio_segment.frame_rate),
        )

# Loudness normalisation towards a fixed target level.
target_dBFS = -20.0
current_dBFS = audio_segment.dBFS
if current_dBFS == float("-inf"):
    # Completely silent (or empty) audio has no finite level; applying the
    # resulting infinite gain would be meaningless, so leave it untouched.
    change_in_dBFS = 0.0
else:
    change_in_dBFS = target_dBFS - current_dBFS
    audio_segment = audio_segment.apply_gain(change_in_dBFS)

# ---------------------------------------------------------------------------
# Output
# ---------------------------------------------------------------------------

final_audio = np.array(audio_segment.get_array_of_samples())
if final_audio.dtype == np.int16:
    # Scale 16-bit PCM integers into the [-1.0, 1.0) float range.
    final_audio = final_audio.astype(np.float32) / 32768.0
sf.write(
    OUTPUT_FILE,
    final_audio.astype(np.float32),
    samplerate=audio_segment.frame_rate,
)
print(f"Saved to {OUTPUT_FILE}")
print("Done!")
|
|