File size: 9,673 Bytes
f3d10bd
d2b5676
8d7e20b
7c03bdc
 
 
f492d3b
 
5951a1a
 
 
 
 
 
f492d3b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1cd841e
8d7e20b
08d90fe
 
 
f3d10bd
2dc786b
08d90fe
2dc786b
 
 
 
 
0515d75
08d90fe
 
8aec1d9
2dc786b
 
0515d75
08d90fe
 
 
c577d87
08d90fe
d2b5676
8d7e20b
 
 
 
 
c09f7ba
808fe78
 
 
c577d87
5b18578
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
import tempfile

import torch
import gradio as gr
import torchaudio
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
from transformers.models.speecht5 import SpeechT5HifiGan

# Choose the compute device once, up front, so everything below can be placed
# on it as soon as it is created.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Text processor, acoustic model and vocoder, all loaded from the stock
# Microsoft checkpoints; the two networks are moved to `device` right away.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)

# Placeholder speaker embedding: an all-zero tensor of shape (1, 512),
# allocated directly on `device`.
speaker_embedding = torch.zeros(1, 512, device=device)

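# NOTE: disabled alternative configuration, kept for reference. It loads the
# "nambn0321/TTS_with_T5_4" checkpoint and uses a hard-coded speaker embedding
# instead of the all-zero one above.
# processor = SpeechT5Processor.from_pretrained("nambn0321/TTS_with_T5_4")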
# model = SpeechT5ForTextToSpeech.from_pretrained(
#     "nambn0321/TTS_with_T5_4",
#     use_safetensors=True,
#     trust_remote_code=True
# )
# vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# device = "cuda" if torch.cuda.is_available() else "cpu"
# model = model.to(device)
# vocoder = vocoder.to(device)

# speaker_embedding = torch.tensor([[-0.06632216, -0.02325863,  0.04376163,  0.01112046, -0.02864115,
#        -0.03048201, -0.04865832,  0.00598873,  0.03105048,  0.01635859,
#        -0.07552029, -0.09258246,  0.04839027,  0.04307159,  0.05019059,
#         0.05565156,  0.00533272,  0.0197331 ,  0.01269842,  0.00576971,
#         0.02997943,  0.00765277, -0.01538683, -0.04164617, -0.05669912,
#        -0.00767612, -0.05466911,  0.00988977,  0.05714991,  0.0216927 ,
#        -0.00281803,  0.04948897,  0.04745187, -0.01738331,  0.03589115,
#        -0.03788823,  0.03018526,  0.06933809, -0.01054026, -0.07338727,
#         0.01145766, -0.00347575,  0.02236829,  0.03353192,  0.01183521,
#        -0.11246844, -0.01998361,  0.01333049, -0.08154028,  0.06184796,
#         0.04050031,  0.01181497,  0.0588    ,  0.01634772, -0.11387676,
#        -0.01355756, -0.01059065,  0.01194482,  0.03934296,  0.02436676,
#         0.00376559, -0.00813801, -0.01421188, -0.03595341,  0.02987706,
#         0.02612724,  0.03072971, -0.05161813, -0.06241557, -0.06545018,
#        -0.00679519,  0.00900955,  0.03801987,  0.00294477,  0.02057374,
#         0.04256874,  0.00730863, -0.00282256, -0.05437343, -0.07569141,
#        -0.07964483, -0.04049463, -0.06325456, -0.08040556, -0.03161319,
#        -0.0557906 , -0.05558824,  0.05661038,  0.03932756, -0.00269612,
#         0.02999815, -0.05263155,  0.01048327, -0.05502405,  0.04730757,
#        -0.03641531,  0.04466332,  0.04261209, -0.08965097, -0.06816243,
#         0.05328364, -0.0652955 , -0.09165341,  0.02487748,  0.04061233,
#         0.01143007,  0.04024159,  0.01869776,  0.02870329,  0.01503909,
#        -0.07710361,  0.00802833,  0.07786133, -0.008355  ,  0.02792075,
#         0.03834949, -0.07156748,  0.00127211, -0.05645351,  0.0293999 ,
#         0.03988929, -0.07301504,  0.01131906,  0.0415033 , -0.05863927,
#         0.0623733 , -0.07197598,  0.02887617,  0.03702732,  0.05255475,
#         0.03850314,  0.03016165,  0.04511765,  0.0400167 ,  0.01042124,
#        -0.08053102, -0.06103503, -0.02782067, -0.03948715,  0.00812866,
#        -0.00215283,  0.00496819, -0.00270994,  0.04999355, -0.08324838,
#         0.01673055, -0.0224449 , -0.04158457,  0.03688109, -0.13497816,
#         0.02797874, -0.04349126, -0.06393341,  0.01634013,  0.00367471,
#         0.03441324,  0.00576339, -0.08563808, -0.08777589,  0.01206557,
#         0.01930428,  0.03046028,  0.00186808,  0.01118185, -0.06207091,
#         0.00285664,  0.04373416,  0.03865229,  0.02155851,  0.02963249,
#         0.03907783, -0.06465862,  0.00155482, -0.04207559,  0.02787214,
#         0.02055759, -0.05460549, -0.0024652 ,  0.02217332, -0.07867457,
#         0.04810029, -0.0450572 , -0.01488631,  0.02080196, -0.07611465,
#        -0.01182817,  0.03117848,  0.0593022 , -0.05042631, -0.06321163,
#         0.01080927,  0.03538311, -0.06461193,  0.02289902,  0.03690634,
#         0.02868471,  0.01077593,  0.00843379,  0.04739143, -0.03351105,
#         0.04080784,  0.01689551, -0.06830349,  0.01059405,  0.01843624,
#         0.01237972,  0.02619306, -0.02353077,  0.00792623,  0.02665057,
#         0.00471944, -0.08360166, -0.0301204 ,  0.04510773, -0.03999252,
#         0.03273777,  0.02000749, -0.07822321,  0.04588151,  0.03334309,
#        -0.09588112,  0.01911022, -0.06844518, -0.03093524, -0.02563222,
#         0.03301362,  0.03092113,  0.07978717,  0.03420616,  0.02481706,
#        -0.03479896,  0.01136372,  0.02234516, -0.02502409,  0.02136666,
#        -0.01978885,  0.01426617,  0.0336206 ,  0.00164481,  0.05059334,
#        -0.05926166,  0.01984084, -0.09437344,  0.00440842, -0.06748072,
#         0.04547653,  0.04531173,  0.02839815,  0.01182417,  0.01309258,
#         0.03345039, -0.0050239 ,  0.00861029, -0.05667242,  0.01330826,
#         0.02976079,  0.03610174,  0.0056701 , -0.06830816,  0.07686577,
#         0.00055387, -0.07641901,  0.00479465,  0.0435739 ,  0.00137714,
#         0.054296  ,  0.02192332,  0.03526516,  0.03261713, -0.01711978,
#         0.05103486,  0.004091  , -0.04905723,  0.01632674, -0.04963868,
#         0.04549154,  0.05771144,  0.01438812, -0.08240737, -0.06134431,
#        -0.03986251,  0.03224541,  0.00400033, -0.05963603,  0.02552675,
#         0.04327708,  0.00562372,  0.03411512, -0.11604068,  0.00232808,
#         0.02742139,  0.01270449,  0.02279026, -0.06613689,  0.00456405,
#         0.00770958,  0.01518244, -0.03575909,  0.05028789,  0.03181706,
#        -0.02811741,  0.02930666,  0.02258663, -0.06209057,  0.01053006,
#         0.01761598,  0.02432001, -0.0141328 ,  0.03561908,  0.03293756,
#         0.04713007,  0.02588944,  0.0185135 ,  0.00973485, -0.09059389,
#        -0.06192823, -0.0214373 ,  0.02466835, -0.05554106,  0.03954491,
#        -0.03995424,  0.03540933, -0.05664941,  0.00685676,  0.02727092,
#        -0.06838219,  0.04708575,  0.06957678, -0.0574585 , -0.08372921,
#        -0.06601643, -0.02683325,  0.02862075,  0.06086589, -0.05693608,
#         0.02700268,  0.03062632, -0.0449043 , -0.03139404,  0.01131762,
#         0.018201  , -0.05808553,  0.02667459,  0.02892675, -0.05436037,
#         0.02801878,  0.04307706,  0.0013432 , -0.06306062, -0.04901182,
#        -0.05647411,  0.0226799 , -0.06727529,  0.10902219,  0.03856311,
#        -0.04592182, -0.00500258,  0.00186311, -0.05330509,  0.05230814,
#        -0.10676292,  0.01777823,  0.01183014,  0.05641989,  0.04702727,
#         0.00042184, -0.08117392, -0.00340278,  0.01055175,  0.02158776,
#         0.00645116,  0.05420727, -0.05439884,  0.02988858, -0.0155564 ,
#        -0.00187941,  0.04348213,  0.02176837,  0.04492295,  0.05255244,
#        -0.09009198, -0.12785755,  0.0270214 ,  0.01281871,  0.03488814,
#         0.01032432,  0.03737413, -0.08046219,  0.03366841,  0.04788679,
#         0.02247225,  0.02758352, -0.05623886,  0.03350434, -0.03293617,
#         0.00674522,  0.02637025, -0.06836043, -0.03543041,  0.04120062,
#         0.04781871, -0.0528533 ,  0.05126699,  0.01553862,  0.03617714,
#         0.0096033 ,  0.01169565, -0.06753531, -0.05359954, -0.07725069,
#        -0.0690423 ,  0.00608264,  0.03367587, -0.01095485,  0.02317013,
#        -0.03748006, -0.0396716 , -0.07376339, -0.15511133, -0.02377705,
#        -0.0733289 , -0.02155393,  0.03737415, -0.00152944, -0.05182485,
#         0.0202742 ,  0.04189592,  0.05077221,  0.02522502, -0.04805434,
#        -0.03909   , -0.01301163, -0.02148154,  0.02039445,  0.02322994,
#         0.01821164,  0.03498985,  0.00654902,  0.00980544, -0.06337985,
#         0.00158023,  0.01253585,  0.05249537,  0.00056358, -0.03539167,
#         0.04533946,  0.02057356,  0.00598625,  0.00438659, -0.00444954,
#         0.04846435,  0.02074119,  0.00665891,  0.0347768 , -0.00355295,
#        -0.00983169,  0.01239159, -0.06600927, -0.06987962,  0.04164324,
#        -0.00596055,  0.01529142,  0.04804419,  0.04481226, -0.06791846,
#         0.04703787, -0.01586268, -0.06848218,  0.03964271,  0.03287267,
#        -0.00166699,  0.05269769,  0.02563164,  0.00356486, -0.04681876,
#        -0.05530458,  0.00568418, -0.00581932,  0.0229376 ,  0.06235321,
#        -0.03780747, -0.04042193,  0.01800834,  0.02682916,  0.05686411,
#         0.03996282, -0.05146077,  0.0312879 , -0.03907526, -0.01055358,
#        -0.05896859,  0.02441409, -0.03880213,  0.03941878,  0.02211095,
#         0.00688374, -0.05528738, -0.01232414, -0.06249457, -0.07299529,
#         0.00938593,  0.05738097, -0.06533916,  0.03651554,  0.06204324,
#        -0.01556815, -0.04757515,  0.0451969 ,  0.03502326, -0.01376748,
#         0.02549847, -0.06043207]]).to(device)

def tts_generate(text):
    """Synthesize speech for ``text`` and return the path of the WAV file.

    Uses the module-level ``processor``, ``model``, ``vocoder`` and
    ``speaker_embedding`` on ``device``.

    Args:
        text: The text to speak, as entered in the Gradio textbox.

    Returns:
        Path to a newly written 16 kHz WAV file containing the speech.

    Raises:
        gr.Error: If ``text`` is empty/whitespace, or if anything fails
            during synthesis or while writing the file. The output component
            is ``gr.Audio(type="filepath")``, so returning an error *string*
            would be treated as a (non-existent) file path; raising
            ``gr.Error`` lets Gradio show the message to the user instead.
    """
    # Reject empty input before touching the model.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    try:
        # Tokenize the text and move the resulting tensors to the model device.
        inputs = processor(text=text, return_tensors="pt").to(device)

        # Inference only: passing the vocoder makes generate_speech return a
        # waveform directly.
        with torch.no_grad():
            waveform = model.generate_speech(
                inputs["input_ids"],
                speaker_embedding,
                vocoder=vocoder,
            )

        # torchaudio.save expects a 2-D tensor, so give a 1-D waveform a
        # leading channel dimension.
        if waveform.dim() == 1:
            waveform = waveform.unsqueeze(0)

        # Write each result to its own temp file so concurrent requests do not
        # overwrite one another (the original reused a single "output.wav").
        # The handle is closed before saving so the path can be reopened on
        # every platform; delete=False keeps the file for Gradio to serve.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            output_path = tmp.name
        torchaudio.save(output_path, waveform.cpu(), sample_rate=16000)
    except Exception as e:
        # Top-level UI boundary: log the cause, then surface a readable error.
        print("Error during TTS generation:", e)
        raise gr.Error("Error during speech synthesis.") from e

    return output_path

# UI components: one textbox for the prompt, one audio player fed by the
# file path that tts_generate returns.
_text_input = gr.Textbox(label="Enter text")
_audio_output = gr.Audio(label="Generated Speech", type="filepath")

# Wire the synthesis function to the components.
demo = gr.Interface(
    fn=tts_generate,
    inputs=_text_input,
    outputs=_audio_output,
    title="SpeechT5 Text-to-Speech",
    description="Enter text and hear it with my custom SpeechT5.",
)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    print("Launching Gradio demo")
    demo.launch()