# azure-scripts / telugu_voice_clone.py
# Uploaded by vivekvar
# Commit message: azure home scripts: data gen, training, misc
# Commit a70eb3d (verified)
"""
Telugu Voice Cloning with IndicF5
Usage:
1. Place your reference audio as 'reference.wav' (10-15 seconds, clean Telugu speech)
2. Edit REF_TEXT with the exact Telugu transcript of your reference audio
3. Edit GEN_TEXT with the Telugu text you want to generate
4. Run: source ~/indicf5-env/bin/activate && python telugu_voice_clone.py
"""
import torch
import numpy as np
import soundfile as sf
import io
import time
from pydub import AudioSegment, silence
from huggingface_hub import hf_hub_download
from f5_tts.infer.utils_infer import (
infer_process,
load_model,
load_vocoder,
preprocess_ref_audio_text,
)
from f5_tts.model import DiT
# ---------------------------------------------------------------------------
# User configuration: edit the values below before running the script.
# ---------------------------------------------------------------------------

# Reference recording of the voice to clone (WAV, 10-15 seconds of Telugu speech).
REF_AUDIO: str = "reference.wav"

# Telugu transcript of REF_AUDIO; it has to match the recording exactly.
REF_TEXT: str = "ఇది నా గొంతు నమూనా, నేను తెలుగులో మాట్లాడుతున్నాను."

# Telugu text to synthesize in the cloned voice.
GEN_TEXT: str = "నమస్కారం, మీరు ఎలా ఉన్నారు? నేను మీతో తెలుగులో మాట్లాడుతున్నాను."

# Destination file for the generated audio.
OUTPUT_FILE: str = "output_telugu.wav"

# Value forwarded as the `speed` argument of infer_process.
SPEED: float = 1.0

# When True, the generated audio is split on long silences and re-joined
# before it is normalized and saved.
REMOVE_SILENCE: bool = True

# ---------------------------------------------------------------------------
# End of user configuration.
# ---------------------------------------------------------------------------
# Pick the compute device: CUDA when torch reports it as available, else CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Load the "vocos" vocoder; is_local=False means no local vocoder path is given.
print("Loading vocoder...")
vocoder = load_vocoder(vocoder_name="vocos", is_local=False, device=device)

# Fetch the vocabulary file from the Hugging Face Hub and build the DiT model.
print("Downloading IndicF5 model...")
repo_id = "ai4bharat/IndicF5"
vocab_path = hf_hub_download(repo_id=repo_id, filename="checkpoints/vocab.txt")

# Architecture hyper-parameters handed to load_model as the model config.
dit_config = {
    "dim": 1024,
    "depth": 22,
    "heads": 16,
    "ff_mult": 2,
    "text_dim": 512,
    "conv_layers": 4,
}

# NOTE(review): only vocab.txt is downloaded and no checkpoint path is passed
# to load_model here. Confirm that the installed f5_tts build loads the
# IndicF5 weights on its own; if it does not, the model below would be
# running without trained weights (or this call would fail on builds whose
# load_model requires a checkpoint argument).
ema_model = load_model(
    DiT,
    dit_config,
    mel_spec_type="vocos",
    vocab_file=vocab_path,
    device=device,
)
print("Model loaded!")
# Prepare the reference clip and its transcript in the form infer_process expects.
print(f"Reference audio: {REF_AUDIO}")
ref_audio, ref_text = preprocess_ref_audio_text(REF_AUDIO, REF_TEXT)

# Keyword options shared with the model/vocoder setup ("vocos" mel spectrograms).
infer_options = {
    "mel_spec_type": "vocos",
    "speed": SPEED,
    "device": device,
}

# Synthesize GEN_TEXT in the reference voice and report the wall-clock time.
# Only the first 80 characters of the text are echoed to keep the log short.
print(f"Generating: {GEN_TEXT[:80]}...")
started_at = time.time()
# infer_process returns three values; the third one is not used by this script.
audio, final_sample_rate, _ = infer_process(
    ref_audio,
    ref_text,
    GEN_TEXT,
    ema_model,
    vocoder,
    **infer_options,
)
elapsed_seconds = time.time() - started_at
print(f"Generated in {elapsed_seconds:.1f}s")
# Post-process: remove silence and normalize.
# The waveform is round-tripped through an in-memory WAV file so that pydub
# can operate on it. The sample rate reported by infer_process is used here
# instead of assuming a fixed 24 kHz.
with io.BytesIO() as buffer:
    sf.write(buffer, audio, samplerate=int(final_sample_rate), format="WAV")
    buffer.seek(0)
    # from_file reads the whole buffer, so it is safe to close it afterwards.
    audio_segment = AudioSegment.from_file(buffer, format="wav")

if REMOVE_SILENCE:
    # Split wherever the signal stays below -50 dBFS for at least 1000 ms,
    # keeping 500 ms of silence around every chunk.
    non_silent_segs = silence.split_on_silence(
        audio_segment,
        min_silence_len=1000,
        silence_thresh=-50,
        keep_silence=500,
        seek_step=10,
    )
    # If nothing was detected as non-silent, keep the audio untouched rather
    # than replacing it with an empty segment.
    if non_silent_segs:
        # Seed the concatenation with a zero-length segment at the same frame
        # rate as the audio, so joining never has to resample.
        empty_segment = AudioSegment.silent(
            duration=0, frame_rate=audio_segment.frame_rate
        )
        audio_segment = sum(non_silent_segs, empty_segment)

# Normalize loudness to the target level.
target_dBFS = -20.0
current_dBFS = audio_segment.dBFS
if np.isfinite(current_dBFS):
    change_in_dBFS = target_dBFS - current_dBFS
    audio_segment = audio_segment.apply_gain(change_in_dBFS)
else:
    # dBFS is -inf for empty or completely silent audio; applying the
    # resulting infinite gain would be meaningless, so skip normalization.
    print("Warning: generated audio is silent; skipping loudness normalization")

# Save.
# Convert the integer PCM samples to float32 in [-1, 1), scaling by the full
# range of the segment's actual sample width rather than assuming 16-bit.
final_audio = np.array(audio_segment.get_array_of_samples()).astype(np.float32)
final_audio /= audio_segment.max_possible_amplitude
if audio_segment.channels > 1:
    # pydub interleaves channels; soundfile expects (frames, channels).
    final_audio = final_audio.reshape(-1, audio_segment.channels)
sf.write(OUTPUT_FILE, final_audio, samplerate=audio_segment.frame_rate)
print(f"Saved to {OUTPUT_FILE}")
print("Done!")