|
|
--- |
|
|
license: cc-by-4.0 |
|
|
--- |
|
|
We present a multilingual text-to-speech (TTS) system capable of synthesizing natural-sounding speech across eleven Indian languages |
|
|
while cloning the voice of an arbitrary unseen speaker using only a single reference audio sample. The system integrates a multilingual |
|
|
multispeaker Variational Inference with adversarial learning for end-to-end Text-to-Speech (VITS) model trained on 50 hours × 11 |
|
|
languages with a one-shot voice conversion module based on FreeVC. |
|
|
The pipeline generates linguistically accurate speech from text and then transfers the target speaker’s timbre and tone into the audio |
|
|
without requiring phoneme models or multispeaker training data. We achieved Mel Cepstral Distortion (MCD) scores in the range of
|
|
5 to 6 and Word Error Rate (WER) of less than 15% for all the languages, which indicates acoustic and intelligibility closeness to
|
|
the ground truth. Also, the high cosine similarity scores indicate speaker similarity and strong cross-lingual generalization in the cloned voice.
|
|
Thus, the solution demonstrates a lightweight, deployable approach for universal voice personalization in real-world Indian language
|
|
applications. |
|
|
|
|
|
```python |
|
|
|
|
|
import io |
|
|
import tempfile |
|
|
from pathlib import Path |
|
|
|
|
|
import numpy as np |
|
|
import torch |
|
|
import torchaudio |
|
|
import librosa |
|
|
from fastapi import FastAPI, File, Form, HTTPException, UploadFile
from fastapi.responses import FileResponse
|
|
|
|
|
from TTS.config import load_config |
|
|
from TTS.tts.utils.text.tokenizer import TTSTokenizer |
|
|
from TTS.tts.utils.languages import LanguageManager |
|
|
from TTS.tts.utils.speakers import SpeakerManager |
|
|
from TTS.tts.models.vits import Vits |
|
|
from TTS.utils.io import load_fsspec |
|
|
|
|
|
import utils |
|
|
from models import SynthesizerTrn |
|
|
from wavlm import WavLM, WavLMConfig |
|
|
from speaker_encoder.voice_encoder import SpeakerEncoder |
|
|
from mel_processing import mel_spectrogram_torch |
|
|
|
|
|
from text_normalizer_v2 import TextNormalizer |
|
|
# --- Module-level setup: runs once at import time --------------------------
# Loads the full inference stack: (1) a multilingual multispeaker VITS TTS
# model, and (2) a FreeVC voice-conversion model with its WavLM content
# encoder and speaker encoder.
# NOTE(review): everything below executes on import, so importing this module
# requires all checkpoint/config files to be present on disk.

# Project-local text normalizer applied to input text before tokenization.
normalizer = TextNormalizer()

app = FastAPI(title="VoiceTech4All-CDAC-SVNIT-Submission", version="1.0")

# Prefer GPU when available; every model below is moved to this device.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"  # set to "cpu" to force CPU

print("Loading models... ")

# Paths to the trained VITS checkpoint, its config, and the id-mapping files
# produced during training.
MODEL_DIR = Path("checkpoints")
CONFIG_PATH = MODEL_DIR / "config.json"
CHECKPOINT_PATH = MODEL_DIR / "checkpoint_140000.pth"
SPEAKER_MAP_PATH = MODEL_DIR / "speaker_ids.json"
LANGUAGE_MAP_PATH = MODEL_DIR / "language_ids.json"

cfg = load_config(CONFIG_PATH)

# Rebuild tokenizer / speaker / language managers from the training config so
# inference uses exactly the same id mappings as training.
tokenizer, _ = TTSTokenizer.init_from_config(cfg)
speaker_manager = SpeakerManager(speaker_id_file_path=str(SPEAKER_MAP_PATH))
language_manager = LanguageManager(language_ids_file_path=str(LANGUAGE_MAP_PATH))

tts_model = Vits(
    config=cfg,
    ap=None,  # no AudioProcessor: inference starts from token ids, not audio
    tokenizer=tokenizer,
    speaker_manager=speaker_manager,
    language_manager=language_manager
)

# strict=True: fail loudly if the checkpoint does not match the model config.
state = load_fsspec(CHECKPOINT_PATH, map_location=DEVICE)
tts_model.load_state_dict(state["model"], strict=True)
tts_model.to(DEVICE).eval()

print("Loaded VITS TTS model.")

# --- FreeVC voice-conversion stack -----------------------------------------
HPS = utils.get_hparams_from_file("configs/freevc.json")

vc_model = SynthesizerTrn(
    HPS.data.filter_length // 2 + 1,  # number of spectrogram frequency bins
    HPS.train.segment_size // HPS.data.hop_length,
    **HPS.model
).to(DEVICE)
_ = vc_model.eval()
_ = utils.load_checkpoint("checkpoints/freevc.pth", vc_model, None, True)

# load WavLM content encoder
cmodel = utils.get_cmodel(0).to(DEVICE)

# load speaker encoder
spk_encoder = SpeakerEncoder("speaker_encoder/ckpt/pretrained_bak_5805000.pt")

print("Loaded Voice Conversion model.\n")
|
|
|
|
@app.post("/Get_Inference")
# GET route kept for backward compatibility; a GET request carrying a
# multipart file body is nonstandard — clients should prefer the POST route.
@app.get("/Get_Inference")
async def Inference(text : str, lang : str, speaker_wav : UploadFile):
    """Synthesize `text` in language `lang`, then convert it to the voice of
    the speaker in the uploaded reference audio.

    Pipeline: VITS TTS (text -> speech in the stock per-language voice),
    then FreeVC one-shot voice conversion (stock voice -> target timbre
    extracted from `speaker_wav`).

    Returns the converted audio as a WAV `FileResponse`.
    Raises HTTP 400 when `lang` is not a known language id.
    """
    # Normalize once so the speaker-name build and the language-id lookup
    # agree (previously only the speaker name was stripped, so a padded
    # `lang` crashed with a KeyError on the language lookup).
    lang = lang.strip()
    if lang not in language_manager.name_to_id:
        raise HTTPException(status_code=400, detail=f"Unknown language: {lang!r}")

    # Training provides one stock female voice per language.
    speaker_name = f"{lang}_female"
    print(speaker_name)

    # ---- TTS: text -> token ids -> waveform -------------------------------
    text = normalizer.normalize(text)
    token_ids = tokenizer.text_to_ids(text)
    x = torch.LongTensor(token_ids).unsqueeze(0).to(DEVICE)

    aux = {
        "x_lengths": torch.LongTensor([len(token_ids)]).to(DEVICE),
        "speaker_ids": torch.LongTensor(
            [speaker_manager.name_to_id[speaker_name]]
        ).to(DEVICE),
        "language_ids": torch.LongTensor(
            [language_manager.name_to_id[lang]]
        ).to(DEVICE),
        "d_vectors": None,
        "durations": None,
    }

    with torch.no_grad():
        tts_out = tts_model.inference(x, aux_input=aux)
    tts_wav = tts_out["model_outputs"].squeeze().cpu()

    # Round-trip through a temp WAV so librosa resamples from the TTS sample
    # rate to the FreeVC rate; delete the file as soon as it is loaded
    # (previously leaked on every request).
    synth_temp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
    try:
        torchaudio.save(synth_temp, tts_wav.unsqueeze(0), cfg.audio.sample_rate)
        wav_src, _ = librosa.load(synth_temp, sr=HPS.data.sampling_rate)
    finally:
        Path(synth_temp).unlink(missing_ok=True)

    wav_src = torch.from_numpy(wav_src).unsqueeze(0).to(DEVICE)
    c = utils.get_content(cmodel, wav_src)  # WavLM content features

    # ---- Speaker embedding from the uploaded reference --------------------
    audio_bytes = await speaker_wav.read()
    tgt_np, _ = librosa.load(io.BytesIO(audio_bytes), sr=HPS.data.sampling_rate)
    tgt_np, _ = librosa.effects.trim(tgt_np, top_db=20)  # drop edge silence

    g_tgt = spk_encoder.embed_utterance(tgt_np)
    g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(DEVICE)

    # ---- Voice conversion --------------------------------------------------
    with torch.no_grad():
        vc_audio = vc_model.infer(c, g=g_tgt)[0][0].data.cpu().float().numpy()

    # NOTE(review): this temp file must outlive the function so FileResponse
    # can stream it; consider a starlette BackgroundTask to delete it after
    # the response is sent — as written it accumulates on disk.
    out_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
    torchaudio.save(out_path, torch.tensor(vc_audio).unsqueeze(0), HPS.data.sampling_rate)

    return FileResponse(out_path, media_type="audio/wav", filename="converted.wav")
|
|
|
|
|
|
|
|
# Run the API with uvicorn when executed directly. The "API_Server:app"
# import string assumes this file is saved as API_Server.py — TODO confirm,
# since the actual filename is not visible here.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run("API_Server:app", host="0.0.0.0", port=8000)
|
|
|