| import torch | |
| import gradio as gr | |
| import torchaudio | |
| from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech | |
| from transformers.models.speecht5 import SpeechT5HifiGan | |
# Hugging Face checkpoints used by this demo.
_TTS_CHECKPOINT = "microsoft/speecht5_tts"
_VOCODER_CHECKPOINT = "microsoft/speecht5_hifigan"

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Text preprocessor, acoustic model and vocoder; the two networks are moved
# to the selected device right after loading.
processor = SpeechT5Processor.from_pretrained(_TTS_CHECKPOINT)
model = SpeechT5ForTextToSpeech.from_pretrained(_TTS_CHECKPOINT).to(device)
vocoder = SpeechT5HifiGan.from_pretrained(_VOCODER_CHECKPOINT).to(device)

# All-zero 1x512 speaker embedding, created directly on the target device.
speaker_embedding = torch.zeros(1, 512, device=device)
| # processor = SpeechT5Processor.from_pretrained("nambn0321/TTS_with_T5_4") | |
| # model = SpeechT5ForTextToSpeech.from_pretrained( | |
| # "nambn0321/TTS_with_T5_4", | |
| # use_safetensors=True, | |
| # trust_remote_code=True | |
| # ) | |
| # vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan") | |
| # device = "cuda" if torch.cuda.is_available() else "cpu" | |
| # model = model.to(device) | |
| # vocoder = vocoder.to(device) | |
| # speaker_embedding = torch.tensor([[-0.06632216, -0.02325863, 0.04376163, 0.01112046, -0.02864115, | |
| # -0.03048201, -0.04865832, 0.00598873, 0.03105048, 0.01635859, | |
| # -0.07552029, -0.09258246, 0.04839027, 0.04307159, 0.05019059, | |
| # 0.05565156, 0.00533272, 0.0197331 , 0.01269842, 0.00576971, | |
| # 0.02997943, 0.00765277, -0.01538683, -0.04164617, -0.05669912, | |
| # -0.00767612, -0.05466911, 0.00988977, 0.05714991, 0.0216927 , | |
| # -0.00281803, 0.04948897, 0.04745187, -0.01738331, 0.03589115, | |
| # -0.03788823, 0.03018526, 0.06933809, -0.01054026, -0.07338727, | |
| # 0.01145766, -0.00347575, 0.02236829, 0.03353192, 0.01183521, | |
| # -0.11246844, -0.01998361, 0.01333049, -0.08154028, 0.06184796, | |
| # 0.04050031, 0.01181497, 0.0588 , 0.01634772, -0.11387676, | |
| # -0.01355756, -0.01059065, 0.01194482, 0.03934296, 0.02436676, | |
| # 0.00376559, -0.00813801, -0.01421188, -0.03595341, 0.02987706, | |
| # 0.02612724, 0.03072971, -0.05161813, -0.06241557, -0.06545018, | |
| # -0.00679519, 0.00900955, 0.03801987, 0.00294477, 0.02057374, | |
| # 0.04256874, 0.00730863, -0.00282256, -0.05437343, -0.07569141, | |
| # -0.07964483, -0.04049463, -0.06325456, -0.08040556, -0.03161319, | |
| # -0.0557906 , -0.05558824, 0.05661038, 0.03932756, -0.00269612, | |
| # 0.02999815, -0.05263155, 0.01048327, -0.05502405, 0.04730757, | |
| # -0.03641531, 0.04466332, 0.04261209, -0.08965097, -0.06816243, | |
| # 0.05328364, -0.0652955 , -0.09165341, 0.02487748, 0.04061233, | |
| # 0.01143007, 0.04024159, 0.01869776, 0.02870329, 0.01503909, | |
| # -0.07710361, 0.00802833, 0.07786133, -0.008355 , 0.02792075, | |
| # 0.03834949, -0.07156748, 0.00127211, -0.05645351, 0.0293999 , | |
| # 0.03988929, -0.07301504, 0.01131906, 0.0415033 , -0.05863927, | |
| # 0.0623733 , -0.07197598, 0.02887617, 0.03702732, 0.05255475, | |
| # 0.03850314, 0.03016165, 0.04511765, 0.0400167 , 0.01042124, | |
| # -0.08053102, -0.06103503, -0.02782067, -0.03948715, 0.00812866, | |
| # -0.00215283, 0.00496819, -0.00270994, 0.04999355, -0.08324838, | |
| # 0.01673055, -0.0224449 , -0.04158457, 0.03688109, -0.13497816, | |
| # 0.02797874, -0.04349126, -0.06393341, 0.01634013, 0.00367471, | |
| # 0.03441324, 0.00576339, -0.08563808, -0.08777589, 0.01206557, | |
| # 0.01930428, 0.03046028, 0.00186808, 0.01118185, -0.06207091, | |
| # 0.00285664, 0.04373416, 0.03865229, 0.02155851, 0.02963249, | |
| # 0.03907783, -0.06465862, 0.00155482, -0.04207559, 0.02787214, | |
| # 0.02055759, -0.05460549, -0.0024652 , 0.02217332, -0.07867457, | |
| # 0.04810029, -0.0450572 , -0.01488631, 0.02080196, -0.07611465, | |
| # -0.01182817, 0.03117848, 0.0593022 , -0.05042631, -0.06321163, | |
| # 0.01080927, 0.03538311, -0.06461193, 0.02289902, 0.03690634, | |
| # 0.02868471, 0.01077593, 0.00843379, 0.04739143, -0.03351105, | |
| # 0.04080784, 0.01689551, -0.06830349, 0.01059405, 0.01843624, | |
| # 0.01237972, 0.02619306, -0.02353077, 0.00792623, 0.02665057, | |
| # 0.00471944, -0.08360166, -0.0301204 , 0.04510773, -0.03999252, | |
| # 0.03273777, 0.02000749, -0.07822321, 0.04588151, 0.03334309, | |
| # -0.09588112, 0.01911022, -0.06844518, -0.03093524, -0.02563222, | |
| # 0.03301362, 0.03092113, 0.07978717, 0.03420616, 0.02481706, | |
| # -0.03479896, 0.01136372, 0.02234516, -0.02502409, 0.02136666, | |
| # -0.01978885, 0.01426617, 0.0336206 , 0.00164481, 0.05059334, | |
| # -0.05926166, 0.01984084, -0.09437344, 0.00440842, -0.06748072, | |
| # 0.04547653, 0.04531173, 0.02839815, 0.01182417, 0.01309258, | |
| # 0.03345039, -0.0050239 , 0.00861029, -0.05667242, 0.01330826, | |
| # 0.02976079, 0.03610174, 0.0056701 , -0.06830816, 0.07686577, | |
| # 0.00055387, -0.07641901, 0.00479465, 0.0435739 , 0.00137714, | |
| # 0.054296 , 0.02192332, 0.03526516, 0.03261713, -0.01711978, | |
| # 0.05103486, 0.004091 , -0.04905723, 0.01632674, -0.04963868, | |
| # 0.04549154, 0.05771144, 0.01438812, -0.08240737, -0.06134431, | |
| # -0.03986251, 0.03224541, 0.00400033, -0.05963603, 0.02552675, | |
| # 0.04327708, 0.00562372, 0.03411512, -0.11604068, 0.00232808, | |
| # 0.02742139, 0.01270449, 0.02279026, -0.06613689, 0.00456405, | |
| # 0.00770958, 0.01518244, -0.03575909, 0.05028789, 0.03181706, | |
| # -0.02811741, 0.02930666, 0.02258663, -0.06209057, 0.01053006, | |
| # 0.01761598, 0.02432001, -0.0141328 , 0.03561908, 0.03293756, | |
| # 0.04713007, 0.02588944, 0.0185135 , 0.00973485, -0.09059389, | |
| # -0.06192823, -0.0214373 , 0.02466835, -0.05554106, 0.03954491, | |
| # -0.03995424, 0.03540933, -0.05664941, 0.00685676, 0.02727092, | |
| # -0.06838219, 0.04708575, 0.06957678, -0.0574585 , -0.08372921, | |
| # -0.06601643, -0.02683325, 0.02862075, 0.06086589, -0.05693608, | |
| # 0.02700268, 0.03062632, -0.0449043 , -0.03139404, 0.01131762, | |
| # 0.018201 , -0.05808553, 0.02667459, 0.02892675, -0.05436037, | |
| # 0.02801878, 0.04307706, 0.0013432 , -0.06306062, -0.04901182, | |
| # -0.05647411, 0.0226799 , -0.06727529, 0.10902219, 0.03856311, | |
| # -0.04592182, -0.00500258, 0.00186311, -0.05330509, 0.05230814, | |
| # -0.10676292, 0.01777823, 0.01183014, 0.05641989, 0.04702727, | |
| # 0.00042184, -0.08117392, -0.00340278, 0.01055175, 0.02158776, | |
| # 0.00645116, 0.05420727, -0.05439884, 0.02988858, -0.0155564 , | |
| # -0.00187941, 0.04348213, 0.02176837, 0.04492295, 0.05255244, | |
| # -0.09009198, -0.12785755, 0.0270214 , 0.01281871, 0.03488814, | |
| # 0.01032432, 0.03737413, -0.08046219, 0.03366841, 0.04788679, | |
| # 0.02247225, 0.02758352, -0.05623886, 0.03350434, -0.03293617, | |
| # 0.00674522, 0.02637025, -0.06836043, -0.03543041, 0.04120062, | |
| # 0.04781871, -0.0528533 , 0.05126699, 0.01553862, 0.03617714, | |
| # 0.0096033 , 0.01169565, -0.06753531, -0.05359954, -0.07725069, | |
| # -0.0690423 , 0.00608264, 0.03367587, -0.01095485, 0.02317013, | |
| # -0.03748006, -0.0396716 , -0.07376339, -0.15511133, -0.02377705, | |
| # -0.0733289 , -0.02155393, 0.03737415, -0.00152944, -0.05182485, | |
| # 0.0202742 , 0.04189592, 0.05077221, 0.02522502, -0.04805434, | |
| # -0.03909 , -0.01301163, -0.02148154, 0.02039445, 0.02322994, | |
| # 0.01821164, 0.03498985, 0.00654902, 0.00980544, -0.06337985, | |
| # 0.00158023, 0.01253585, 0.05249537, 0.00056358, -0.03539167, | |
| # 0.04533946, 0.02057356, 0.00598625, 0.00438659, -0.00444954, | |
| # 0.04846435, 0.02074119, 0.00665891, 0.0347768 , -0.00355295, | |
| # -0.00983169, 0.01239159, -0.06600927, -0.06987962, 0.04164324, | |
| # -0.00596055, 0.01529142, 0.04804419, 0.04481226, -0.06791846, | |
| # 0.04703787, -0.01586268, -0.06848218, 0.03964271, 0.03287267, | |
| # -0.00166699, 0.05269769, 0.02563164, 0.00356486, -0.04681876, | |
| # -0.05530458, 0.00568418, -0.00581932, 0.0229376 , 0.06235321, | |
| # -0.03780747, -0.04042193, 0.01800834, 0.02682916, 0.05686411, | |
| # 0.03996282, -0.05146077, 0.0312879 , -0.03907526, -0.01055358, | |
| # -0.05896859, 0.02441409, -0.03880213, 0.03941878, 0.02211095, | |
| # 0.00688374, -0.05528738, -0.01232414, -0.06249457, -0.07299529, | |
| # 0.00938593, 0.05738097, -0.06533916, 0.03651554, 0.06204324, | |
| # -0.01556815, -0.04757515, 0.0451969 , 0.03502326, -0.01376748, | |
| # 0.02549847, -0.06043207]]).to(device) | |
def tts_generate(text):
    """Synthesize speech for ``text`` and return the path of a WAV file.

    The text is tokenized with the module-level ``processor``, turned into a
    waveform by ``model.generate_speech`` (using the module-level
    ``speaker_embedding`` and ``vocoder``), and saved as a WAV file.

    Args:
        text: The text to speak.

    Returns:
        Path of the ``.wav`` file that was written.

    Raises:
        gr.Error: If ``text`` is empty/blank, or if synthesis or saving
            fails. The original exception is chained as the cause.
    """
    import tempfile

    # Reject empty input up front instead of sending it through the model.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    try:
        # Preprocess input
        inputs = processor(text=text, return_tensors="pt").to(device)

        # Generate waveform directly (with vocoder); no gradients are needed.
        with torch.no_grad():
            waveform = model.generate_speech(
                inputs["input_ids"],
                speaker_embedding,
                vocoder=vocoder,
            )

        # A 1-D waveform gets a leading dimension before it is saved.
        if waveform.dim() == 1:
            waveform = waveform.unsqueeze(0)

        # Use a unique file per call so simultaneous requests cannot
        # overwrite each other's audio (a fixed "output.wav" would be shared).
        # The handle is closed before saving so the path can be reopened.
        # NOTE(review): the file is not deleted here; confirm whether a
        # cleanup step is wanted for long-running deployments.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            output_path = tmp.name

        torchaudio.save(output_path, waveform.cpu(), sample_rate=16000)
    except Exception as e:
        print("Error during TTS generation:", e)
        # The output component expects a file path, so an error sentence must
        # not be returned as if it were one; raise a Gradio error instead.
        raise gr.Error("Error during speech synthesis.") from e

    return output_path
# UI components: one text box in, one audio player (fed by a file path) out.
_text_input = gr.Textbox(label="Enter text")
_audio_output = gr.Audio(label="Generated Speech", type="filepath")

# Wire the synthesis function to the components.
demo = gr.Interface(
    title="SpeechT5 Text-to-Speech",
    description="Enter text and hear it with my custom SpeechT5.",
    fn=tts_generate,
    inputs=_text_input,
    outputs=_audio_output,
)

# Start the web UI only when this file is executed as a script.
if __name__ == "__main__":
    print("Launching Gradio demo")
    demo.launch()