We present a multilingual text-to-speech (TTS) system capable of synthesizing natural-sounding speech across eleven Indian languages while cloning the voice of an arbitrary unseen speaker from only a single reference audio sample. The system integrates a multilingual multispeaker VITS (Variational Inference with adversarial learning for end-to-end Text-to-Speech) model, trained on 50 hours of data for each of the 11 languages, with a one-shot voice conversion module based on FreeVC. The pipeline first generates linguistically accurate speech from text and then transfers the target speaker's timbre and tone onto the audio, without requiring phoneme models or multispeaker training data. We achieve Mel Cepstral Distortion (MCD) scores in the range of 5 to 6 and a Word Error Rate (WER) below 15% for all languages, indicating that the synthesized speech is acoustically close to and as intelligible as the ground truth. High cosine similarity scores further indicate strong speaker similarity and cross-lingual generalization of the cloned voice. The solution thus demonstrates a lightweight, deployable approach to universal voice personalization for real-world Indian-language applications.
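The complete inference pipeline is exposed as a FastAPI service that chains the multilingual VITS synthesizer and the FreeVC voice conversion module; the full server code is listed below.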
import io
import tempfile
from pathlib import Path

import torch
import torchaudio
import librosa
from fastapi import FastAPI, UploadFile, File, Form
from fastapi.responses import FileResponse

# Coqui TTS components for the multilingual VITS front end
from TTS.config import load_config
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.tts.utils.languages import LanguageManager
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.models.vits import Vits
from TTS.utils.io import load_fsspec

# FreeVC components for one-shot voice conversion
import utils
from models import SynthesizerTrn
from speaker_encoder.voice_encoder import SpeakerEncoder

from text_normalizer_v2 import TextNormalizer

normalizer = TextNormalizer()
app = FastAPI(title="VoiceTech4All-CDAC-SVNIT-Submission", version="1.0")
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Loading models... ")
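# --- Stage 1: multilingual multispeaker VITS TTS ---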
MODEL_DIR = Path("checkpoints")
CONFIG_PATH = MODEL_DIR / "config.json"
CHECKPOINT_PATH = MODEL_DIR / "checkpoint_140000.pth"
SPEAKER_MAP_PATH = MODEL_DIR / "speaker_ids.json"
LANGUAGE_MAP_PATH = MODEL_DIR / "language_ids.json"
# training config, tokenizer, and the speaker/language ID maps saved at training time
cfg = load_config(str(CONFIG_PATH))
tokenizer, _ = TTSTokenizer.init_from_config(cfg)
speaker_manager = SpeakerManager(speaker_id_file_path=str(SPEAKER_MAP_PATH))
language_manager = LanguageManager(language_ids_file_path=str(LANGUAGE_MAP_PATH))
tts_model = Vits(
    config=cfg,
    ap=None,  # AudioProcessor is not needed: inference returns the waveform directly
    tokenizer=tokenizer,
    speaker_manager=speaker_manager,
    language_manager=language_manager,
)
state = load_fsspec(CHECKPOINT_PATH, map_location=DEVICE)
tts_model.load_state_dict(state["model"], strict=True)
tts_model.to(DEVICE).eval()
print("Loaded VITS TTS model.")
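# --- Stage 2: FreeVC one-shot voice conversion ---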
HPS = utils.get_hparams_from_file("configs/freevc.json")
vc_model = SynthesizerTrn(
    HPS.data.filter_length // 2 + 1,
    HPS.train.segment_size // HPS.data.hop_length,
    **HPS.model,
).to(DEVICE)
_ = vc_model.eval()
_ = utils.load_checkpoint("checkpoints/freevc.pth", vc_model, None, True)
# load the WavLM content encoder via the FreeVC utility (uses GPU rank 0)
cmodel = utils.get_cmodel(0).to(DEVICE)
# load the pretrained speaker encoder that produces target-speaker d-vectors
spk_encoder = SpeakerEncoder("speaker_encoder/ckpt/pretrained_bak_5805000.pt")
print("Loaded Voice Conversion model.\n")
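# /Get_Inference pipeline:
#   1. normalize the input text and synthesize it with the multilingual VITS
#      model, using a fixed female voice for the requested language;
#   2. extract WavLM content features from the synthesized audio;
#   3. embed the uploaded reference recording with the speaker encoder;
#   4. run FreeVC to transfer the target speaker's timbre onto the content.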
@app.post("/Get_Inference")  # POST: the reference audio arrives as a multipart file upload
async def Inference(
    text: str = Form(...),
    lang: str = Form(...),
    speaker_wav: UploadFile = File(...),
):
    # synthesize with a fixed per-language female voice; the target timbre is
    # applied afterwards by the voice conversion stage
    speaker_name = f"{lang.strip()}_female"
    text = normalizer.normalize(text)
    token_ids = tokenizer.text_to_ids(text)
    x = torch.LongTensor(token_ids).unsqueeze(0).to(DEVICE)
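    # auxiliary inputs for VITS: sequence length plus speaker and language ID lookups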
    aux = {
        "x_lengths": torch.LongTensor([len(token_ids)]).to(DEVICE),
        "speaker_ids": torch.LongTensor(
            [speaker_manager.name_to_id[speaker_name]]
        ).to(DEVICE),
        "language_ids": torch.LongTensor(
            [language_manager.name_to_id[lang]]
        ).to(DEVICE),
        "d_vectors": None,
        "durations": None,
    }
    with torch.no_grad():
        tts_out = tts_model.inference(x, aux_input=aux)
    tts_wav = tts_out["model_outputs"].squeeze().cpu()
    # round-trip through a temporary WAV so librosa can resample the
    # synthesized audio to the sampling rate FreeVC expects
    synth_temp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
    torchaudio.save(synth_temp, tts_wav.unsqueeze(0), cfg.audio.sample_rate)
    wav_src, _ = librosa.load(synth_temp, sr=HPS.data.sampling_rate)
    wav_src = torch.from_numpy(wav_src).unsqueeze(0).to(DEVICE)
    # WavLM content features of the synthesized speech
    c = utils.get_content(cmodel, wav_src)
    # read the uploaded reference audio, trim leading/trailing silence, and
    # compute the target-speaker embedding (d-vector)
    audio_bytes = await speaker_wav.read()
    tgt_np, _ = librosa.load(io.BytesIO(audio_bytes), sr=HPS.data.sampling_rate)
    tgt_np, _ = librosa.effects.trim(tgt_np, top_db=20)
    g_tgt = spk_encoder.embed_utterance(tgt_np)
    g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(DEVICE)
    # re-render the extracted content in the target speaker's voice
    with torch.no_grad():
        vc_audio = vc_model.infer(c, g=g_tgt)[0][0].data.cpu().float().numpy()
    out_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
    torchaudio.save(out_path, torch.from_numpy(vc_audio).unsqueeze(0), HPS.data.sampling_rate)
    return FileResponse(out_path, media_type="audio/wav", filename="converted.wav")
if __name__ == "__main__":
    import uvicorn

    uvicorn.run("API_Server:app", host="0.0.0.0", port=8000)
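For reference, a minimal client sketch for calling the endpoint follows (it uses the requests library). The file name ref.wav, the sample text, and the language code are illustrative placeholders, not values from the system itself; the lang value must match an entry in checkpoints/language_ids.json, since it is used both for the language ID lookup and for selecting the "{lang}_female" synthesis voice.

# Illustrative client for the /Get_Inference endpoint (POST, multipart form).
# "ref.wav", the sample text, and the "hindi" language code are placeholders.
import requests

with open("ref.wav", "rb") as f:
    resp = requests.post(
        "http://localhost:8000/Get_Inference",
        data={"text": "नमस्ते, आप कैसे हैं?", "lang": "hindi"},
        files={"speaker_wav": ("ref.wav", f, "audio/wav")},
    )
resp.raise_for_status()
with open("converted.wav", "wb") as out:
    out.write(resp.content)  # cloned-voice WAV returned by the server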