|
|
--- |
|
|
license: cc-by-4.0 |
|
|
--- |
|
|
We present a multilingual text-to-speech (TTS) system capable of synthesizing natural-sounding speech across eleven Indian languages |
|
|
while cloning the voice of an arbitrary unseen speaker using only a single reference audio sample. The system integrates a multilingual |
|
|
multispeaker Variational Inference with adversarial learning for end-to-end Text-to-Speech (VITS) model trained on 50 hours × 11 |
|
|
languages with a one-shot voice conversion module based on FreeVC. |
|
|
The pipeline generates linguistically accurate speech from text and then transfers the target speaker’s timbre and tone into the audio |
|
|
without requiring phoneme models or multispeaker training data. We achieved Mel Cepstral Distortion (MCD) scores in the range of
|
|
5 to 6 and Word Error Rate (WER) of less than 15% for all the languages, which indicates acoustic and intelligibility closeness to
|
|
the ground truth. Also, the high cosine similarity scores indicate speaker similarity and strong cross-lingual generalization in the cloned voice.
|
|
Thus, the solution demonstrates a lightweight, deployable approach for universal voice personalization in real-world Indian language
|
|
applications. |
|
|
|
|
|
```python |
|
|
|
|
|
import io |
|
|
import tempfile |
|
|
from pathlib import Path |
|
|
|
|
|
import numpy as np |
|
|
import torch |
|
|
import torchaudio |
|
|
import librosa |
|
|
from fastapi import FastAPI, File, Form, HTTPException, UploadFile
from fastapi.responses import FileResponse
|
|
|
|
|
from TTS.config import load_config |
|
|
from TTS.tts.utils.text.tokenizer import TTSTokenizer |
|
|
from TTS.tts.utils.languages import LanguageManager |
|
|
from TTS.tts.utils.speakers import SpeakerManager |
|
|
from TTS.tts.models.vits import Vits |
|
|
from TTS.utils.io import load_fsspec |
|
|
|
|
|
import utils |
|
|
from models import SynthesizerTrn |
|
|
from wavlm import WavLM, WavLMConfig |
|
|
from speaker_encoder.voice_encoder import SpeakerEncoder |
|
|
from mel_processing import mel_spectrogram_torch |
|
|
|
|
|
from text_normalizer_v2 import TextNormalizer |
|
|
# --- Module-level setup: runs once at import time --------------------------
# Loads the full inference stack: (1) a multilingual multispeaker VITS TTS
# model, and (2) a FreeVC voice-conversion model with its WavLM content
# encoder and speaker encoder.
# NOTE(review): everything below executes on import, so importing this module
# requires all checkpoint/config files to be present on disk.

# Project-local text normalizer applied to input text before tokenization.
normalizer = TextNormalizer()

app = FastAPI(title="VoiceTech4All-CDAC-SVNIT-Submission", version="1.0")

# Prefer GPU when available; every model below is moved to this device.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"  # set to "cpu" to force CPU

print("Loading models... ")

# Paths to the trained VITS checkpoint, its config, and the id-mapping files
# produced during training.
MODEL_DIR = Path("checkpoints")
CONFIG_PATH = MODEL_DIR / "config.json"
CHECKPOINT_PATH = MODEL_DIR / "checkpoint_140000.pth"
SPEAKER_MAP_PATH = MODEL_DIR / "speaker_ids.json"
LANGUAGE_MAP_PATH = MODEL_DIR / "language_ids.json"

cfg = load_config(CONFIG_PATH)

# Rebuild tokenizer / speaker / language managers from the training config so
# inference uses exactly the same id mappings as training.
tokenizer, _ = TTSTokenizer.init_from_config(cfg)
speaker_manager = SpeakerManager(speaker_id_file_path=str(SPEAKER_MAP_PATH))
language_manager = LanguageManager(language_ids_file_path=str(LANGUAGE_MAP_PATH))

tts_model = Vits(
    config=cfg,
    ap=None,  # no AudioProcessor: inference starts from token ids, not audio
    tokenizer=tokenizer,
    speaker_manager=speaker_manager,
    language_manager=language_manager
)

# strict=True: fail loudly if the checkpoint does not match the model config.
state = load_fsspec(CHECKPOINT_PATH, map_location=DEVICE)
tts_model.load_state_dict(state["model"], strict=True)
tts_model.to(DEVICE).eval()

print("Loaded VITS TTS model.")

# --- FreeVC voice-conversion stack -----------------------------------------
HPS = utils.get_hparams_from_file("configs/freevc.json")

vc_model = SynthesizerTrn(
    HPS.data.filter_length // 2 + 1,  # number of spectrogram frequency bins
    HPS.train.segment_size // HPS.data.hop_length,
    **HPS.model
).to(DEVICE)
_ = vc_model.eval()
_ = utils.load_checkpoint("checkpoints/freevc.pth", vc_model, None, True)

# load WavLM content encoder
cmodel = utils.get_cmodel(0).to(DEVICE)

# load speaker encoder
spk_encoder = SpeakerEncoder("speaker_encoder/ckpt/pretrained_bak_5805000.pt")

print("Loaded Voice Conversion model.\n")
|
|
|
|
@app.post("/Get_Inference")
# GET route kept for backward compatibility; a GET request carrying a
# multipart file body is nonstandard — clients should prefer the POST route.
@app.get("/Get_Inference")
async def Inference(text : str, lang : str, speaker_wav : UploadFile):
    """Synthesize `text` in language `lang`, then convert it to the voice of
    the speaker in the uploaded reference audio.

    Pipeline: VITS TTS (text -> speech in the stock per-language voice),
    then FreeVC one-shot voice conversion (stock voice -> target timbre
    extracted from `speaker_wav`).

    Returns the converted audio as a WAV `FileResponse`.
    Raises HTTP 400 when `lang` is not a known language id.
    """
    # Normalize once so the speaker-name build and the language-id lookup
    # agree (previously only the speaker name was stripped, so a padded
    # `lang` crashed with a KeyError on the language lookup).
    lang = lang.strip()
    if lang not in language_manager.name_to_id:
        raise HTTPException(status_code=400, detail=f"Unknown language: {lang!r}")

    # Training provides one stock female voice per language.
    speaker_name = f"{lang}_female"
    print(speaker_name)

    # ---- TTS: text -> token ids -> waveform -------------------------------
    text = normalizer.normalize(text)
    token_ids = tokenizer.text_to_ids(text)
    x = torch.LongTensor(token_ids).unsqueeze(0).to(DEVICE)

    aux = {
        "x_lengths": torch.LongTensor([len(token_ids)]).to(DEVICE),
        "speaker_ids": torch.LongTensor(
            [speaker_manager.name_to_id[speaker_name]]
        ).to(DEVICE),
        "language_ids": torch.LongTensor(
            [language_manager.name_to_id[lang]]
        ).to(DEVICE),
        "d_vectors": None,
        "durations": None,
    }

    with torch.no_grad():
        tts_out = tts_model.inference(x, aux_input=aux)
    tts_wav = tts_out["model_outputs"].squeeze().cpu()

    # Round-trip through a temp WAV so librosa resamples from the TTS sample
    # rate to the FreeVC rate; delete the file as soon as it is loaded
    # (previously leaked on every request).
    synth_temp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
    try:
        torchaudio.save(synth_temp, tts_wav.unsqueeze(0), cfg.audio.sample_rate)
        wav_src, _ = librosa.load(synth_temp, sr=HPS.data.sampling_rate)
    finally:
        Path(synth_temp).unlink(missing_ok=True)

    wav_src = torch.from_numpy(wav_src).unsqueeze(0).to(DEVICE)
    c = utils.get_content(cmodel, wav_src)  # WavLM content features

    # ---- Speaker embedding from the uploaded reference --------------------
    audio_bytes = await speaker_wav.read()
    tgt_np, _ = librosa.load(io.BytesIO(audio_bytes), sr=HPS.data.sampling_rate)
    tgt_np, _ = librosa.effects.trim(tgt_np, top_db=20)  # drop edge silence

    g_tgt = spk_encoder.embed_utterance(tgt_np)
    g_tgt = torch.from_numpy(g_tgt).unsqueeze(0).to(DEVICE)

    # ---- Voice conversion --------------------------------------------------
    with torch.no_grad():
        vc_audio = vc_model.infer(c, g=g_tgt)[0][0].data.cpu().float().numpy()

    # NOTE(review): this temp file must outlive the function so FileResponse
    # can stream it; consider a starlette BackgroundTask to delete it after
    # the response is sent — as written it accumulates on disk.
    out_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
    torchaudio.save(out_path, torch.tensor(vc_audio).unsqueeze(0), HPS.data.sampling_rate)

    return FileResponse(out_path, media_type="audio/wav", filename="converted.wav")
|
|
|
|
|
|
|
|
# Run the API with uvicorn when executed directly. The "API_Server:app"
# import string assumes this file is saved as API_Server.py — TODO confirm,
# since the actual filename is not visible here.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run("API_Server:app", host="0.0.0.0", port=8000)
|
|
|