""" Telugu Voice Cloning with IndicF5 Usage: 1. Place your reference audio as 'reference.wav' (10-15 seconds, clean Telugu speech) 2. Edit REF_TEXT with the exact Telugu transcript of your reference audio 3. Edit GEN_TEXT with the Telugu text you want to generate 4. Run: source ~/indicf5-env/bin/activate && python telugu_voice_clone.py """ import torch import numpy as np import soundfile as sf import io import time from pydub import AudioSegment, silence from huggingface_hub import hf_hub_download from f5_tts.infer.utils_infer import ( infer_process, load_model, load_vocoder, preprocess_ref_audio_text, ) from f5_tts.model import DiT # === CONFIGURE THESE === # Path to your reference voice recording (WAV, 10-15 seconds, Telugu) REF_AUDIO = "reference.wav" # Exact Telugu transcript of your reference audio REF_TEXT = "ఇది నా గొంతు నమూనా, నేను తెలుగులో మాట్లాడుతున్నాను." # Telugu text you want to generate in your cloned voice GEN_TEXT = "నమస్కారం, మీరు ఎలా ఉన్నారు? నేను మీతో తెలుగులో మాట్లాడుతున్నాను." # Output file OUTPUT_FILE = "output_telugu.wav" SPEED = 1.0 REMOVE_SILENCE = True # === END CONFIG === device = torch.device("cuda" if torch.cuda.is_available() else "cpu") print(f"Using device: {device}") # Load vocoder print("Loading vocoder...") vocoder = load_vocoder(vocoder_name="vocos", is_local=False, device=device) # Download vocab and load model print("Downloading IndicF5 model...") repo_id = "ai4bharat/IndicF5" vocab_path = hf_hub_download(repo_id, filename="checkpoints/vocab.txt") ema_model = load_model( DiT, dict(dim=1024, depth=22, heads=16, ff_mult=2, text_dim=512, conv_layers=4), mel_spec_type="vocos", vocab_file=vocab_path, device=device, ) print("Model loaded!") # Preprocess reference audio print(f"Reference audio: {REF_AUDIO}") ref_audio, ref_text = preprocess_ref_audio_text(REF_AUDIO, REF_TEXT) # Generate print(f"Generating: {GEN_TEXT[:80]}...") start = time.time() audio, final_sample_rate, _ = infer_process( ref_audio, ref_text, GEN_TEXT, ema_model, vocoder, mel_spec_type="vocos", speed=SPEED, device=device, ) print(f"Generated in {time.time() - start:.1f}s") # Post-process: remove silence and normalize buffer = io.BytesIO() sf.write(buffer, audio, samplerate=24000, format="WAV") buffer.seek(0) audio_segment = AudioSegment.from_file(buffer, format="wav") if REMOVE_SILENCE: non_silent_segs = silence.split_on_silence( audio_segment, min_silence_len=1000, silence_thresh=-50, keep_silence=500, seek_step=10, ) if non_silent_segs: audio_segment = sum(non_silent_segs, AudioSegment.silent(duration=0)) # Normalize loudness target_dBFS = -20.0 change_in_dBFS = target_dBFS - audio_segment.dBFS audio_segment = audio_segment.apply_gain(change_in_dBFS) # Save final_audio = np.array(audio_segment.get_array_of_samples()) if final_audio.dtype == np.int16: final_audio = final_audio.astype(np.float32) / 32768.0 sf.write(OUTPUT_FILE, final_audio.astype(np.float32), samplerate=24000) print(f"Saved to {OUTPUT_FILE}") print("Done!")