You need to agree to share your contact information to access this model

This repository is publicly accessible, but you have to accept the conditions to access its files and content.

Log in or Sign Up to review the conditions and access this model content.

We present a multilingual text-to-speech (TTS) system capable of synthesizing natural-sounding speech across eleven Indian languages while cloning the voice of an arbitrary unseen speaker using only a single reference audio sample. The system integrates a multilingual multispeaker Variational Inference with adversarial learning for end-to-end Text-to-Speech (VITS) model trained on 50 hours × 11 languages with a one-shot voice conversion module based on FreeVC. The pipeline generates linguistically accurate speech from text and then transfers the target speaker’s timbre and tone into the audio without requiring phoneme models or multispeaker training data. We achieved Mel Cepstral Distortion (MCD) scores in the range of 5 to 6 and a Word Error Rate (WER) of less than 15% for all the languages, which indicates acoustic and intelligibility closeness to the ground truth. The high cosine similarity scores also indicate speaker similarity and strong cross-lingual generalization in the cloned voice. Thus, the solution demonstrates a lightweight, deployable approach to universal voice personalization in real-world Indian-language applications.


import io
import os
import tempfile
from pathlib import Path

import numpy as np
import torch
import torchaudio
import librosa
from fastapi import FastAPI, UploadFile, File, Form, HTTPException
from fastapi.responses import FileResponse
from starlette.background import BackgroundTask

from TTS.config import load_config
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.tts.utils.languages import LanguageManager
from TTS.tts.utils.speakers import SpeakerManager
from TTS.tts.models.vits import Vits
from TTS.utils.io import load_fsspec

import utils
from models import SynthesizerTrn
from wavlm import WavLM, WavLMConfig
from speaker_encoder.voice_encoder import SpeakerEncoder
from mel_processing import mel_spectrogram_torch

from text_normalizer_v2 import TextNormalizer

# Shared text normalizer applied to every request before tokenization.
normalizer = TextNormalizer()

app = FastAPI(title="VoiceTech4All-CDAC-SVNIT-Submission", version="1.0")

# Run on GPU when available; every model below is moved to this device.
DEVICE =  "cuda" if torch.cuda.is_available() else "cpu"

print("Loading models... ")


# ---------------------------------------------------------------------------
# Stage 1 model: multilingual VITS TTS (text -> speech in a per-language voice)
# ---------------------------------------------------------------------------
MODEL_DIR = Path("checkpoints")
CONFIG_PATH = MODEL_DIR / "config.json"
CHECKPOINT_PATH = MODEL_DIR / "checkpoint_140000.pth"
SPEAKER_MAP_PATH = MODEL_DIR / "speaker_ids.json"      # speaker name -> embedding id
LANGUAGE_MAP_PATH = MODEL_DIR / "language_ids.json"    # language name -> embedding id

cfg = load_config(CONFIG_PATH)

# Tokenizer turns normalized text into input ids; the managers resolve
# human-readable speaker/language names to the ids the model was trained with.
tokenizer, _ = TTSTokenizer.init_from_config(cfg)
speaker_manager = SpeakerManager(speaker_id_file_path=str(SPEAKER_MAP_PATH))
language_manager = LanguageManager(language_ids_file_path=str(LANGUAGE_MAP_PATH))

# ap=None: no audio processor is attached; only inference is performed here.
tts_model = Vits(
    config=cfg,
    ap=None,
    tokenizer=tokenizer,
    speaker_manager=speaker_manager,
    language_manager=language_manager
)

state = load_fsspec(CHECKPOINT_PATH, map_location=DEVICE)
tts_model.load_state_dict(state["model"], strict=True)
tts_model.to(DEVICE).eval()

print("Loaded VITS TTS model.")

# ---------------------------------------------------------------------------
# Stage 2 model: FreeVC one-shot voice conversion (transfers speaker timbre)
# ---------------------------------------------------------------------------
HPS = utils.get_hparams_from_file("configs/freevc.json")

vc_model = SynthesizerTrn(
    HPS.data.filter_length // 2 + 1,
    HPS.train.segment_size // HPS.data.hop_length,
    **HPS.model
).to(DEVICE)
_ = vc_model.eval()
_ = utils.load_checkpoint("checkpoints/freevc.pth", vc_model, None, True)

# load WavLM content encoder (speaker-independent content features for FreeVC).
# NOTE(review): get_cmodel(0) appears to hard-code device index 0 — confirm it
# still works on CPU-only hosts, since DEVICE may be "cpu".
cmodel = utils.get_cmodel(0).to(DEVICE)

# load speaker encoder (produces the target-speaker embedding consumed by FreeVC)
spk_encoder = SpeakerEncoder("speaker_encoder/ckpt/pretrained_bak_5805000.pt")

print("Loaded Voice Conversion model.\n")

@app.post("/Get_Inference")
async def Inference(text: str, lang: str, speaker_wav: UploadFile = File(...)):
    """Synthesize `text` in `lang` with the multilingual VITS model, then
    convert the result into the voice of the uploaded reference audio
    via FreeVC one-shot voice conversion.

    Args:
        text: Input text (query parameter); normalized before tokenization.
        lang: Language name as registered in language_ids.json (query parameter).
        speaker_wav: Reference audio of the target speaker (multipart upload).
            File uploads require a request body, so the route is POST —
            HTTP GET requests cannot reliably carry multipart form data.

    Returns:
        FileResponse streaming the converted WAV. Temporary files are
        deleted after the response has been sent.

    Raises:
        HTTPException: 400 if the language/speaker name is not registered.
    """
    lang = lang.strip()
    speaker_name = f"{lang}_female"
    # Validate up front: an unknown name would otherwise surface as a
    # KeyError -> opaque 500 instead of a clear client error.
    if (lang not in language_manager.name_to_id
            or speaker_name not in speaker_manager.name_to_id):
        raise HTTPException(status_code=400, detail=f"Unsupported language: {lang!r}")

    # --- Stage 1: multilingual TTS -------------------------------------
    text = normalizer.normalize(text)
    token_ids = tokenizer.text_to_ids(text)
    x = torch.LongTensor(token_ids).unsqueeze(0).to(DEVICE)

    aux = {
        "x_lengths": torch.LongTensor([len(token_ids)]).to(DEVICE),
        "speaker_ids": torch.LongTensor(
            [speaker_manager.name_to_id[speaker_name]]
        ).to(DEVICE),
        "language_ids": torch.LongTensor(
            [language_manager.name_to_id[lang]]
        ).to(DEVICE),
        "d_vectors": None,
        "durations": None,
    }

    with torch.no_grad():
        tts_out = tts_model.inference(x, aux_input=aux)
        tts_wav = tts_out["model_outputs"].squeeze().cpu()

    # Round-trip through disk so librosa resamples from the TTS sample rate
    # to the FreeVC sample rate; the temp file is removed immediately after
    # (the original implementation leaked one temp file per request here).
    synth_temp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
    try:
        torchaudio.save(synth_temp, tts_wav.unsqueeze(0), cfg.audio.sample_rate)
        wav_src, _ = librosa.load(synth_temp, sr=HPS.data.sampling_rate)
    finally:
        os.remove(synth_temp)

    wav_src = torch.from_numpy(wav_src).unsqueeze(0).to(DEVICE)
    c = utils.get_content(cmodel, wav_src)  # WavLM content features

    # --- Stage 2: one-shot voice conversion -----------------------------
    audio_bytes = await speaker_wav.read()
    tgt_np, sr = librosa.load(io.BytesIO(audio_bytes), sr=HPS.data.sampling_rate)
    tgt_np, _ = librosa.effects.trim(tgt_np, top_db=20)  # strip edge silence

    g_tgt = spk_encoder.embed_utterance(tgt_np)
    g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(DEVICE)

    with torch.no_grad():
        vc_audio = vc_model.infer(c, g=g_tgt)[0][0].data.cpu().float().numpy()

    out_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
    torchaudio.save(out_path, torch.tensor(vc_audio).unsqueeze(0), HPS.data.sampling_rate)

    # BackgroundTask deletes the output file only after it has been streamed
    # to the client (was previously leaked on every request).
    return FileResponse(
        out_path,
        media_type="audio/wav",
        filename="converted.wav",
        background=BackgroundTask(os.remove, out_path),
    )


def _serve() -> None:
    """Launch the API with uvicorn, listening on all interfaces at port 8000."""
    import uvicorn

    uvicorn.run("API_Server:app", host="0.0.0.0", port=8000)


if __name__ == "__main__":
    _serve()
Downloads last month

-

Downloads are not tracked for this model. How to track
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support