# TTS_run / app.py
# nambn0321's picture
# Update app.py
# c09f7ba verified
import tempfile

import gradio as gr
import torch
import torchaudio
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
from transformers.models.speecht5 import SpeechT5HifiGan
# Run on the GPU when torch can see one; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the "microsoft/speecht5_tts" processor and model plus the
# "microsoft/speecht5_hifigan" vocoder, moving both networks to `device`.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)

# All-zero (1, 512) speaker embedding, created directly on `device`.
# NOTE(review): a zero vector is a placeholder rather than a real speaker
# vector -- confirm this is the intended voice for the demo.
speaker_embedding = torch.zeros(1, 512, device=device)
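# Disabled alternative setup: loads the "nambn0321/TTS_with_T5_4" checkpoint and
# pairs it with a hard-coded speaker embedding instead of the all-zero one.
# processor = SpeechT5Processor.from_pretrained("nambn0321/TTS_with_T5_4")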
# model = SpeechT5ForTextToSpeech.from_pretrained(
# "nambn0321/TTS_with_T5_4",
# use_safetensors=True,
# trust_remote_code=True
# )
# vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
# device = "cuda" if torch.cuda.is_available() else "cpu"
# model = model.to(device)
# vocoder = vocoder.to(device)
# speaker_embedding = torch.tensor([[-0.06632216, -0.02325863, 0.04376163, 0.01112046, -0.02864115,
# -0.03048201, -0.04865832, 0.00598873, 0.03105048, 0.01635859,
# -0.07552029, -0.09258246, 0.04839027, 0.04307159, 0.05019059,
# 0.05565156, 0.00533272, 0.0197331 , 0.01269842, 0.00576971,
# 0.02997943, 0.00765277, -0.01538683, -0.04164617, -0.05669912,
# -0.00767612, -0.05466911, 0.00988977, 0.05714991, 0.0216927 ,
# -0.00281803, 0.04948897, 0.04745187, -0.01738331, 0.03589115,
# -0.03788823, 0.03018526, 0.06933809, -0.01054026, -0.07338727,
# 0.01145766, -0.00347575, 0.02236829, 0.03353192, 0.01183521,
# -0.11246844, -0.01998361, 0.01333049, -0.08154028, 0.06184796,
# 0.04050031, 0.01181497, 0.0588 , 0.01634772, -0.11387676,
# -0.01355756, -0.01059065, 0.01194482, 0.03934296, 0.02436676,
# 0.00376559, -0.00813801, -0.01421188, -0.03595341, 0.02987706,
# 0.02612724, 0.03072971, -0.05161813, -0.06241557, -0.06545018,
# -0.00679519, 0.00900955, 0.03801987, 0.00294477, 0.02057374,
# 0.04256874, 0.00730863, -0.00282256, -0.05437343, -0.07569141,
# -0.07964483, -0.04049463, -0.06325456, -0.08040556, -0.03161319,
# -0.0557906 , -0.05558824, 0.05661038, 0.03932756, -0.00269612,
# 0.02999815, -0.05263155, 0.01048327, -0.05502405, 0.04730757,
# -0.03641531, 0.04466332, 0.04261209, -0.08965097, -0.06816243,
# 0.05328364, -0.0652955 , -0.09165341, 0.02487748, 0.04061233,
# 0.01143007, 0.04024159, 0.01869776, 0.02870329, 0.01503909,
# -0.07710361, 0.00802833, 0.07786133, -0.008355 , 0.02792075,
# 0.03834949, -0.07156748, 0.00127211, -0.05645351, 0.0293999 ,
# 0.03988929, -0.07301504, 0.01131906, 0.0415033 , -0.05863927,
# 0.0623733 , -0.07197598, 0.02887617, 0.03702732, 0.05255475,
# 0.03850314, 0.03016165, 0.04511765, 0.0400167 , 0.01042124,
# -0.08053102, -0.06103503, -0.02782067, -0.03948715, 0.00812866,
# -0.00215283, 0.00496819, -0.00270994, 0.04999355, -0.08324838,
# 0.01673055, -0.0224449 , -0.04158457, 0.03688109, -0.13497816,
# 0.02797874, -0.04349126, -0.06393341, 0.01634013, 0.00367471,
# 0.03441324, 0.00576339, -0.08563808, -0.08777589, 0.01206557,
# 0.01930428, 0.03046028, 0.00186808, 0.01118185, -0.06207091,
# 0.00285664, 0.04373416, 0.03865229, 0.02155851, 0.02963249,
# 0.03907783, -0.06465862, 0.00155482, -0.04207559, 0.02787214,
# 0.02055759, -0.05460549, -0.0024652 , 0.02217332, -0.07867457,
# 0.04810029, -0.0450572 , -0.01488631, 0.02080196, -0.07611465,
# -0.01182817, 0.03117848, 0.0593022 , -0.05042631, -0.06321163,
# 0.01080927, 0.03538311, -0.06461193, 0.02289902, 0.03690634,
# 0.02868471, 0.01077593, 0.00843379, 0.04739143, -0.03351105,
# 0.04080784, 0.01689551, -0.06830349, 0.01059405, 0.01843624,
# 0.01237972, 0.02619306, -0.02353077, 0.00792623, 0.02665057,
# 0.00471944, -0.08360166, -0.0301204 , 0.04510773, -0.03999252,
# 0.03273777, 0.02000749, -0.07822321, 0.04588151, 0.03334309,
# -0.09588112, 0.01911022, -0.06844518, -0.03093524, -0.02563222,
# 0.03301362, 0.03092113, 0.07978717, 0.03420616, 0.02481706,
# -0.03479896, 0.01136372, 0.02234516, -0.02502409, 0.02136666,
# -0.01978885, 0.01426617, 0.0336206 , 0.00164481, 0.05059334,
# -0.05926166, 0.01984084, -0.09437344, 0.00440842, -0.06748072,
# 0.04547653, 0.04531173, 0.02839815, 0.01182417, 0.01309258,
# 0.03345039, -0.0050239 , 0.00861029, -0.05667242, 0.01330826,
# 0.02976079, 0.03610174, 0.0056701 , -0.06830816, 0.07686577,
# 0.00055387, -0.07641901, 0.00479465, 0.0435739 , 0.00137714,
# 0.054296 , 0.02192332, 0.03526516, 0.03261713, -0.01711978,
# 0.05103486, 0.004091 , -0.04905723, 0.01632674, -0.04963868,
# 0.04549154, 0.05771144, 0.01438812, -0.08240737, -0.06134431,
# -0.03986251, 0.03224541, 0.00400033, -0.05963603, 0.02552675,
# 0.04327708, 0.00562372, 0.03411512, -0.11604068, 0.00232808,
# 0.02742139, 0.01270449, 0.02279026, -0.06613689, 0.00456405,
# 0.00770958, 0.01518244, -0.03575909, 0.05028789, 0.03181706,
# -0.02811741, 0.02930666, 0.02258663, -0.06209057, 0.01053006,
# 0.01761598, 0.02432001, -0.0141328 , 0.03561908, 0.03293756,
# 0.04713007, 0.02588944, 0.0185135 , 0.00973485, -0.09059389,
# -0.06192823, -0.0214373 , 0.02466835, -0.05554106, 0.03954491,
# -0.03995424, 0.03540933, -0.05664941, 0.00685676, 0.02727092,
# -0.06838219, 0.04708575, 0.06957678, -0.0574585 , -0.08372921,
# -0.06601643, -0.02683325, 0.02862075, 0.06086589, -0.05693608,
# 0.02700268, 0.03062632, -0.0449043 , -0.03139404, 0.01131762,
# 0.018201 , -0.05808553, 0.02667459, 0.02892675, -0.05436037,
# 0.02801878, 0.04307706, 0.0013432 , -0.06306062, -0.04901182,
# -0.05647411, 0.0226799 , -0.06727529, 0.10902219, 0.03856311,
# -0.04592182, -0.00500258, 0.00186311, -0.05330509, 0.05230814,
# -0.10676292, 0.01777823, 0.01183014, 0.05641989, 0.04702727,
# 0.00042184, -0.08117392, -0.00340278, 0.01055175, 0.02158776,
# 0.00645116, 0.05420727, -0.05439884, 0.02988858, -0.0155564 ,
# -0.00187941, 0.04348213, 0.02176837, 0.04492295, 0.05255244,
# -0.09009198, -0.12785755, 0.0270214 , 0.01281871, 0.03488814,
# 0.01032432, 0.03737413, -0.08046219, 0.03366841, 0.04788679,
# 0.02247225, 0.02758352, -0.05623886, 0.03350434, -0.03293617,
# 0.00674522, 0.02637025, -0.06836043, -0.03543041, 0.04120062,
# 0.04781871, -0.0528533 , 0.05126699, 0.01553862, 0.03617714,
# 0.0096033 , 0.01169565, -0.06753531, -0.05359954, -0.07725069,
# -0.0690423 , 0.00608264, 0.03367587, -0.01095485, 0.02317013,
# -0.03748006, -0.0396716 , -0.07376339, -0.15511133, -0.02377705,
# -0.0733289 , -0.02155393, 0.03737415, -0.00152944, -0.05182485,
# 0.0202742 , 0.04189592, 0.05077221, 0.02522502, -0.04805434,
# -0.03909 , -0.01301163, -0.02148154, 0.02039445, 0.02322994,
# 0.01821164, 0.03498985, 0.00654902, 0.00980544, -0.06337985,
# 0.00158023, 0.01253585, 0.05249537, 0.00056358, -0.03539167,
# 0.04533946, 0.02057356, 0.00598625, 0.00438659, -0.00444954,
# 0.04846435, 0.02074119, 0.00665891, 0.0347768 , -0.00355295,
# -0.00983169, 0.01239159, -0.06600927, -0.06987962, 0.04164324,
# -0.00596055, 0.01529142, 0.04804419, 0.04481226, -0.06791846,
# 0.04703787, -0.01586268, -0.06848218, 0.03964271, 0.03287267,
# -0.00166699, 0.05269769, 0.02563164, 0.00356486, -0.04681876,
# -0.05530458, 0.00568418, -0.00581932, 0.0229376 , 0.06235321,
# -0.03780747, -0.04042193, 0.01800834, 0.02682916, 0.05686411,
# 0.03996282, -0.05146077, 0.0312879 , -0.03907526, -0.01055358,
# -0.05896859, 0.02441409, -0.03880213, 0.03941878, 0.02211095,
# 0.00688374, -0.05528738, -0.01232414, -0.06249457, -0.07299529,
# 0.00938593, 0.05738097, -0.06533916, 0.03651554, 0.06204324,
# -0.01556815, -0.04757515, 0.0451969 , 0.03502326, -0.01376748,
# 0.02549847, -0.06043207]]).to(device)
def tts_generate(text: str) -> "str | None":
    """Synthesize speech for *text* and return the path of the WAV file.

    Used as the Gradio callback that feeds the demo's audio output.

    Args:
        text: The text to speak. ``None``, empty, or whitespace-only input
            short-circuits before the model is touched.

    Returns:
        Path to a newly written WAV file saved with a 16000 Hz sample rate,
        or ``None`` when there is nothing to synthesize.

    Raises:
        gr.Error: If tokenization, generation, or saving fails. The previous
            behavior returned an error *message* as the result, which an
            audio output configured with ``type="filepath"`` would treat as
            a file path; raising lets Gradio report the failure instead.
    """
    # Nothing to say: skip the model entirely rather than synthesizing
    # from an empty token sequence.
    if not text or not text.strip():
        return None

    try:
        # Tokenize the text and move the tensors to the model's device.
        inputs = processor(text=text, return_tensors="pt").to(device)

        # Generate the waveform directly (spectrogram -> audio via vocoder).
        with torch.no_grad():
            waveform = model.generate_speech(
                inputs["input_ids"],
                speaker_embedding,
                vocoder=vocoder,
            )

        # torchaudio.save is given a 2-D tensor; add a leading dimension
        # when the model returns a 1-D waveform.
        if waveform.dim() == 1:
            waveform = waveform.unsqueeze(0)

        # Reserve a unique path per call so concurrent requests cannot
        # overwrite each other's audio (a fixed "output.wav" was shared).
        # The file is intentionally kept (delete=False) so it still exists
        # after this function returns.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            output_path = tmp.name
        torchaudio.save(output_path, waveform.cpu(), sample_rate=16000)
    except Exception as e:
        # Keep the server-side log line, then surface the failure to the UI.
        print("Error during TTS generation:", e)
        raise gr.Error("Error during speech synthesis.") from e

    return output_path
# Single-textbox UI: the submitted text is passed to tts_generate and its
# return value drives the audio component.
# NOTE(review): the description says "my custom SpeechT5", but the active
# setup loads the "microsoft/speecht5_tts" checkpoint -- confirm the wording.
demo = gr.Interface(
    title="SpeechT5 Text-to-Speech",
    description="Enter text and hear it with my custom SpeechT5.",
    fn=tts_generate,
    inputs=gr.Textbox(label="Enter text"),
    outputs=gr.Audio(label="Generated Speech", type="filepath"),
)
# Start the web UI only when this file is executed as a script, so importing
# the module does not launch a server.
if __name__ == "__main__":
    print("Launching Gradio demo")
    demo.launch()