Update app.py
Browse files
app.py
CHANGED
|
@@ -5,33 +5,96 @@ import torchaudio
|
|
| 5 |
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
|
| 6 |
from transformers.models.speecht5 import SpeechT5HifiGan
|
| 7 |
|
| 8 |
-
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
|
| 9 |
-
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
|
| 10 |
-
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
|
| 11 |
-
|
| 12 |
-
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 13 |
-
model = model.to(device)
|
| 14 |
-
vocoder = vocoder.to(device)
|
| 15 |
-
|
| 16 |
-
speaker_embedding = torch.zeros(1, 512).to(device)
|
| 17 |
-
|
| 18 |
-
# Load model and processor
|
| 19 |
-
# processor = SpeechT5Processor.from_pretrained("nambn0321/TTS_with_T5")
|
| 20 |
-
# model = SpeechT5ForTextToSpeech.from_pretrained(
|
| 21 |
-
# "nambn0321/TTS_with_T5",
|
| 22 |
-
# use_safetensors=True,
|
| 23 |
-
# trust_remote_code=True
|
| 24 |
-
# )
|
| 25 |
# vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
|
| 26 |
|
| 27 |
-
# # Move to CUDA if available
|
| 28 |
# device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 29 |
# model = model.to(device)
|
| 30 |
# vocoder = vocoder.to(device)
|
| 31 |
|
| 32 |
-
# # Dummy speaker embedding (or load your real one here)
|
| 33 |
# speaker_embedding = torch.zeros(1, 512).to(device)
|
| 34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
def tts_generate(text):
|
| 36 |
print(f"📝 Input text: {text}")
|
| 37 |
try:
|
|
|
|
| 5 |
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
|
| 6 |
from transformers.models.speecht5 import SpeechT5HifiGan
|
| 7 |
|
| 8 |
+
# processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
|
| 9 |
+
# model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
# vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
|
| 11 |
|
|
|
|
| 12 |
# device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 13 |
# model = model.to(device)
|
| 14 |
# vocoder = vocoder.to(device)
|
| 15 |
|
|
|
|
| 16 |
# speaker_embedding = torch.zeros(1, 512).to(device)
|
| 17 |
|
| 18 |
+
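# Load the processor and model from nambn0321/TTS_with_T5 and the vocoder from microsoft/speecht5_hifigan.
# NOTE(review): trust_remote_code=True allows code shipped in the model repo to run - confirm it is actually needed.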
|
| 19 |
+
processor = SpeechT5Processor.from_pretrained("nambn0321/TTS_with_T5")
|
| 20 |
+
model = SpeechT5ForTextToSpeech.from_pretrained(
|
| 21 |
+
"nambn0321/TTS_with_T5",
|
| 22 |
+
use_safetensors=True,
|
| 23 |
+
trust_remote_code=True
|
| 24 |
+
)
|
| 25 |
+
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
|
| 26 |
+
|
| 27 |
+
# Move to CUDA if available
|
| 28 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 29 |
+
model = model.to(device)
|
| 30 |
+
vocoder = vocoder.to(device)
|
| 31 |
+
|
| 32 |
+
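# Precomputed 512-value speaker embedding (replaces the earlier all-zeros placeholder).
# NOTE(review): the literal is already 2-D (1, 512), so the trailing .unsqueeze(0) makes it (1, 1, 512);
# it is named `speaker_embeddings` (the removed line used `speaker_embedding`) and is not moved to
# `device` in the lines shown here - confirm tts_generate expects all three.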
|
| 33 |
+
speaker_embeddings = torch.tensor([[-0.0663, -0.0233, 0.0438, 0.0111, -0.0286, -0.0305, -0.0487, 0.0060,
|
| 34 |
+
0.0311, 0.0164, -0.0755, -0.0926, 0.0484, 0.0431, 0.0502, 0.0557,
|
| 35 |
+
0.0053, 0.0197, 0.0127, 0.0058, 0.0300, 0.0077, -0.0154, -0.0416,
|
| 36 |
+
-0.0567, -0.0077, -0.0547, 0.0099, 0.0571, 0.0217, -0.0028, 0.0495,
|
| 37 |
+
0.0475, -0.0174, 0.0359, -0.0379, 0.0302, 0.0693, -0.0105, -0.0734,
|
| 38 |
+
0.0115, -0.0035, 0.0224, 0.0335, 0.0118, -0.1125, -0.0200, 0.0133,
|
| 39 |
+
-0.0815, 0.0618, 0.0405, 0.0118, 0.0588, 0.0163, -0.1139, -0.0136,
|
| 40 |
+
-0.0106, 0.0119, 0.0393, 0.0244, 0.0038, -0.0081, -0.0142, -0.0360,
|
| 41 |
+
0.0299, 0.0261, 0.0307, -0.0516, -0.0624, -0.0655, -0.0068, 0.0090,
|
| 42 |
+
0.0380, 0.0029, 0.0206, 0.0426, 0.0073, -0.0028, -0.0544, -0.0757,
|
| 43 |
+
-0.0796, -0.0405, -0.0633, -0.0804, -0.0316, -0.0558, -0.0556, 0.0566,
|
| 44 |
+
0.0393, -0.0027, 0.0300, -0.0526, 0.0105, -0.0550, 0.0473, -0.0364,
|
| 45 |
+
0.0447, 0.0426, -0.0897, -0.0682, 0.0533, -0.0653, -0.0917, 0.0249,
|
| 46 |
+
0.0406, 0.0114, 0.0402, 0.0187, 0.0287, 0.0150, -0.0771, 0.0080,
|
| 47 |
+
0.0779, -0.0084, 0.0279, 0.0383, -0.0716, 0.0013, -0.0565, 0.0294,
|
| 48 |
+
0.0399, -0.0730, 0.0113, 0.0415, -0.0586, 0.0624, -0.0720, 0.0289,
|
| 49 |
+
0.0370, 0.0526, 0.0385, 0.0302, 0.0451, 0.0400, 0.0104, -0.0805,
|
| 50 |
+
-0.0610, -0.0278, -0.0395, 0.0081, -0.0022, 0.0050, -0.0027, 0.0500,
|
| 51 |
+
-0.0832, 0.0167, -0.0224, -0.0416, 0.0369, -0.1350, 0.0280, -0.0435,
|
| 52 |
+
-0.0639, 0.0163, 0.0037, 0.0344, 0.0058, -0.0856, -0.0878, 0.0121,
|
| 53 |
+
0.0193, 0.0305, 0.0019, 0.0112, -0.0621, 0.0029, 0.0437, 0.0387,
|
| 54 |
+
0.0216, 0.0296, 0.0391, -0.0647, 0.0016, -0.0421, 0.0279, 0.0206,
|
| 55 |
+
-0.0546, -0.0025, 0.0222, -0.0787, 0.0481, -0.0451, -0.0149, 0.0208,
|
| 56 |
+
-0.0761, -0.0118, 0.0312, 0.0593, -0.0504, -0.0632, 0.0108, 0.0354,
|
| 57 |
+
-0.0646, 0.0229, 0.0369, 0.0287, 0.0108, 0.0084, 0.0474, -0.0335,
|
| 58 |
+
0.0408, 0.0169, -0.0683, 0.0106, 0.0184, 0.0124, 0.0262, -0.0235,
|
| 59 |
+
0.0079, 0.0267, 0.0047, -0.0836, -0.0301, 0.0451, -0.0400, 0.0327,
|
| 60 |
+
0.0200, -0.0782, 0.0459, 0.0333, -0.0959, 0.0191, -0.0684, -0.0309,
|
| 61 |
+
-0.0256, 0.0330, 0.0309, 0.0798, 0.0342, 0.0248, -0.0348, 0.0114,
|
| 62 |
+
0.0223, -0.0250, 0.0214, -0.0198, 0.0143, 0.0336, 0.0016, 0.0506,
|
| 63 |
+
-0.0593, 0.0198, -0.0944, 0.0044, -0.0675, 0.0455, 0.0453, 0.0284,
|
| 64 |
+
0.0118, 0.0131, 0.0335, -0.0050, 0.0086, -0.0567, 0.0133, 0.0298,
|
| 65 |
+
0.0361, 0.0057, -0.0683, 0.0769, 0.0006, -0.0764, 0.0048, 0.0436,
|
| 66 |
+
0.0014, 0.0543, 0.0219, 0.0353, 0.0326, -0.0171, 0.0510, 0.0041,
|
| 67 |
+
-0.0491, 0.0163, -0.0496, 0.0455, 0.0577, 0.0144, -0.0824, -0.0613,
|
| 68 |
+
-0.0399, 0.0322, 0.0040, -0.0596, 0.0255, 0.0433, 0.0056, 0.0341,
|
| 69 |
+
-0.1160, 0.0023, 0.0274, 0.0127, 0.0228, -0.0661, 0.0046, 0.0077,
|
| 70 |
+
0.0152, -0.0358, 0.0503, 0.0318, -0.0281, 0.0293, 0.0226, -0.0621,
|
| 71 |
+
0.0105, 0.0176, 0.0243, -0.0141, 0.0356, 0.0329, 0.0471, 0.0259,
|
| 72 |
+
0.0185, 0.0097, -0.0906, -0.0619, -0.0214, 0.0247, -0.0555, 0.0395,
|
| 73 |
+
-0.0400, 0.0354, -0.0566, 0.0069, 0.0273, -0.0684, 0.0471, 0.0696,
|
| 74 |
+
-0.0575, -0.0837, -0.0660, -0.0268, 0.0286, 0.0609, -0.0569, 0.0270,
|
| 75 |
+
0.0306, -0.0449, -0.0314, 0.0113, 0.0182, -0.0581, 0.0267, 0.0289,
|
| 76 |
+
-0.0544, 0.0280, 0.0431, 0.0013, -0.0631, -0.0490, -0.0565, 0.0227,
|
| 77 |
+
-0.0673, 0.1090, 0.0386, -0.0459, -0.0050, 0.0019, -0.0533, 0.0523,
|
| 78 |
+
-0.1068, 0.0178, 0.0118, 0.0564, 0.0470, 0.0004, -0.0812, -0.0034,
|
| 79 |
+
0.0106, 0.0216, 0.0065, 0.0542, -0.0544, 0.0299, -0.0156, -0.0019,
|
| 80 |
+
0.0435, 0.0218, 0.0449, 0.0526, -0.0901, -0.1279, 0.0270, 0.0128,
|
| 81 |
+
0.0349, 0.0103, 0.0374, -0.0805, 0.0337, 0.0479, 0.0225, 0.0276,
|
| 82 |
+
-0.0562, 0.0335, -0.0329, 0.0067, 0.0264, -0.0684, -0.0354, 0.0412,
|
| 83 |
+
0.0478, -0.0529, 0.0513, 0.0155, 0.0362, 0.0096, 0.0117, -0.0675,
|
| 84 |
+
-0.0536, -0.0773, -0.0690, 0.0061, 0.0337, -0.0110, 0.0232, -0.0375,
|
| 85 |
+
-0.0397, -0.0738, -0.1551, -0.0238, -0.0733, -0.0216, 0.0374, -0.0015,
|
| 86 |
+
-0.0518, 0.0203, 0.0419, 0.0508, 0.0252, -0.0481, -0.0391, -0.0130,
|
| 87 |
+
-0.0215, 0.0204, 0.0232, 0.0182, 0.0350, 0.0065, 0.0098, -0.0634,
|
| 88 |
+
0.0016, 0.0125, 0.0525, 0.0006, -0.0354, 0.0453, 0.0206, 0.0060,
|
| 89 |
+
0.0044, -0.0044, 0.0485, 0.0207, 0.0067, 0.0348, -0.0036, -0.0098,
|
| 90 |
+
0.0124, -0.0660, -0.0699, 0.0416, -0.0060, 0.0153, 0.0480, 0.0448,
|
| 91 |
+
-0.0679, 0.0470, -0.0159, -0.0685, 0.0396, 0.0329, -0.0017, 0.0527,
|
| 92 |
+
0.0256, 0.0036, -0.0468, -0.0553, 0.0057, -0.0058, 0.0229, 0.0624,
|
| 93 |
+
-0.0378, -0.0404, 0.0180, 0.0268, 0.0569, 0.0400, -0.0515, 0.0313,
|
| 94 |
+
-0.0391, -0.0106, -0.0590, 0.0244, -0.0388, 0.0394, 0.0221, 0.0069,
|
| 95 |
+
-0.0553, -0.0123, -0.0625, -0.0730, 0.0094, 0.0574, -0.0653, 0.0365,
|
| 96 |
+
0.0620, -0.0156, -0.0476, 0.0452, 0.0350, -0.0138, 0.0255, -0.0604]]).unsqueeze(0)
|
| 97 |
+
|
| 98 |
def tts_generate(text):
|
| 99 |
print(f"📝 Input text: {text}")
|
| 100 |
try:
|