|
|
|
|
|
"""Spaces entrypoint bootstrap: clone the private MahaTTSv2 repo and make it importable."""

import spaces

import sys, os, torch

import subprocess

# Hugging Face access token for the private model repository (must be set as
# a Space secret named TOKEN; KeyError here means the secret is missing).
token = os.environ["TOKEN"]

repo = "Dubverse/MahaTTSv2"

clone_dir = "MahaTTSv2"

# Authenticated clone URL. BUGFIX: the original had a stray ":" before "@"
# (".../__token__:{token}:@..."), which makes git send "<token>:" as the
# password and the clone fail with 401. The password must be exactly the token.
url = f"https://__token__:{token}@huggingface.co/{repo}"

# Use argument lists with shell=False so the token is never interpolated into
# a shell command line (avoids shell-injection / token-exposure via the shell).
subprocess.run(["git", "lfs", "install"])

subprocess.run(["git", "clone", "--recurse-submodules", url])

# Make the cloned repository importable (inference.py lives at its root).
sys.path.append("MahaTTSv2/")
|
|
import gradio as gr |
|
|
from inference import infer, prepare_inputs, load_t2s_model, load_cfm, create_wav_header |
|
|
from tqdm import tqdm |
|
|
|
|
|
|
|
|
# Directory where the inference code may write synthesized samples.
os.makedirs("generated_samples/", exist_ok=True)

# ZeroGPU Spaces expose CUDA inside @spaces.GPU-decorated functions, so the
# models are targeted at "cuda" unconditionally.
device = "cuda"

print("Using device", device)

# Pretrained checkpoint locations inside the cloned MahaTTSv2 repository.
m1_checkpoint = "MahaTTSv2/pretrained_checkpoint/m1_gemma_benchmark_1_latest_weights.pt"

m2_checkpoint = "MahaTTSv2/pretrained_checkpoint/m2.pt"

vocoder_checkpoint = 'MahaTTSv2/pretrained_checkpoint/700_580k_multilingual_infer_ready/'

# Load all models once at import time; generate_audio() reuses them for every
# request. (The original had a `global FM, vocoder, ...` statement here, which
# is a no-op at module scope — these assignments are already module-level — so
# it has been removed.)
FM, vocoder, m2, mu, std = load_cfm(m2_checkpoint, vocoder_checkpoint, device)

m1 = load_t2s_model(m1_checkpoint, device)
|
|
|
|
|
|
|
|
|
|
|
# Reference audio clips for each selectable speaker. Paths are built from a
# compact (folder, filename-prefix, clip ids) table; each speaker contributes
# three conditioning clips from the cloned MahaTTSv2 repository.
_SPEAKER_CLIPS = {
    "Female_Speaker1": ("female1", "hindifemale", ("02794", "04167", "02795")),
    "Male_Speaker1": ("male1", "hindimale", ("00016", "00017", "00018")),
}

speaker_refs = {
    speaker: [
        f"MahaTTSv2/speakers/{folder}/train_{prefix}_{clip_id}.wav"
        for clip_id in clip_ids
    ]
    for speaker, (folder, prefix, clip_ids) in _SPEAKER_CLIPS.items()
}
|
|
|
|
|
|
|
|
# Languages supported by the MahaTTSv2 checkpoints. Order is preserved from
# the original list (it is the order shown in the UI dropdown, and is not
# fully alphabetical: "odia" precedes "english").
available_languages = (
    "assamese bengali bhojpuri bodo dogri odia english french gujarati "
    "german hindi italian kannada malayalam marathi punjabi rajasthani "
    "sanskrit spanish tamil telugu"
).split()
|
|
|
|
|
|
|
|
@spaces.GPU(duration=60)
def generate_audio(text, speaker_name, language):
    """Synthesize speech for `text` conditioned on a reference speaker.

    Args:
        text: Input text to speak (lower-cased before tokenization).
        speaker_name: Key into `speaker_refs` selecting the reference clips.
        language: One of `available_languages`.

    Returns:
        A `(sample_rate, waveform)` tuple at 24 kHz for the gr.Audio output.

    Raises:
        gr.Error: If no reference clips exist for `speaker_name`. BUGFIX: the
            original returned a ("message", None) pair, which the single
            gr.Audio output cannot interpret as (sample_rate, data) and would
            crash on; raising gr.Error shows the message in the UI instead.
    """
    if speaker_name not in speaker_refs:
        raise gr.Error(f"Reference clips not available for {speaker_name}")

    ref_clips = speaker_refs[speaker_name]

    # Build model inputs; the same reference clips condition both stages
    # (text-to-semantic m1 and acoustic m2).
    text_ids, code_ids, language_code, ref_mels_m1, ref_mels_m2 = prepare_inputs(
        text.lower(),
        ref_clips_m1=ref_clips,
        ref_clips_m2=ref_clips,
        language=language,
        device=device
    )

    audio_wav = infer(m1, m2, vocoder, FM, mu, std, text_ids, code_ids, language_code, ref_mels_m1, ref_mels_m2, device)

    # Models emit 24 kHz audio; gr.Audio accepts a (rate, ndarray) pair.
    return 24000, audio_wav
|
|
|
|
|
|
|
|
# Assemble the Gradio UI: free text plus two dropdowns in, one audio clip out.
_demo_inputs = [
    gr.Textbox(label="Enter Text"),
    gr.Dropdown(choices=list(speaker_refs.keys()), label="Select Speaker"),
    gr.Dropdown(choices=available_languages, label="Select Language"),
]

interface = gr.Interface(
    fn=generate_audio,
    inputs=_demo_inputs,
    outputs=gr.Audio(label="Generated Speech"),
    title="MahaTTSv2 Demo",
    description="Enter text, choose a speaker and language to generate speech.",
)

# Start the web server (blocks until shutdown).
interface.launch()