nambn0321 committed on
Commit
1cd841e
·
verified ·
1 Parent(s): 9a99cf5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +82 -19
app.py CHANGED
@@ -5,33 +5,96 @@ import torchaudio
5
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
6
  from transformers.models.speecht5 import SpeechT5HifiGan
7
 
8
- processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
9
- model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
10
- vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
11
-
12
- device = "cuda" if torch.cuda.is_available() else "cpu"
13
- model = model.to(device)
14
- vocoder = vocoder.to(device)
15
-
16
- speaker_embedding = torch.zeros(1, 512).to(device)
17
-
18
- # Load model and processor
19
- # processor = SpeechT5Processor.from_pretrained("nambn0321/TTS_with_T5")
20
- # model = SpeechT5ForTextToSpeech.from_pretrained(
21
- # "nambn0321/TTS_with_T5",
22
- # use_safetensors=True,
23
- # trust_remote_code=True
24
- # )
25
  # vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
26
 
27
- # # Move to CUDA if available
28
  # device = "cuda" if torch.cuda.is_available() else "cpu"
29
  # model = model.to(device)
30
  # vocoder = vocoder.to(device)
31
 
32
- # # Dummy speaker embedding (or load your real one here)
33
  # speaker_embedding = torch.zeros(1, 512).to(device)
34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  def tts_generate(text):
36
  print(f"📝 Input text: {text}")
37
  try:
 
5
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech
6
  from transformers.models.speecht5 import SpeechT5HifiGan
7
 
8
+ # processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
9
+ # model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  # vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
11
 
 
12
  # device = "cuda" if torch.cuda.is_available() else "cpu"
13
  # model = model.to(device)
14
  # vocoder = vocoder.to(device)
15
 
 
16
  # speaker_embedding = torch.zeros(1, 512).to(device)
17
 
18
+ # Load model and processor
19
+ processor = SpeechT5Processor.from_pretrained("nambn0321/TTS_with_T5")
20
+ model = SpeechT5ForTextToSpeech.from_pretrained(
21
+ "nambn0321/TTS_with_T5",
22
+ use_safetensors=True,
23
+ trust_remote_code=True
24
+ )
25
+ vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
26
+
27
+ # Move to CUDA if available
28
+ device = "cuda" if torch.cuda.is_available() else "cpu"
29
+ model = model.to(device)
30
+ vocoder = vocoder.to(device)
31
+
32
+ # Speaker embedding: hard-coded 512-value vector (or load your real one here)
33
+ speaker_embeddings = torch.tensor([[-0.0663, -0.0233, 0.0438, 0.0111, -0.0286, -0.0305, -0.0487, 0.0060,
34
+ 0.0311, 0.0164, -0.0755, -0.0926, 0.0484, 0.0431, 0.0502, 0.0557,
35
+ 0.0053, 0.0197, 0.0127, 0.0058, 0.0300, 0.0077, -0.0154, -0.0416,
36
+ -0.0567, -0.0077, -0.0547, 0.0099, 0.0571, 0.0217, -0.0028, 0.0495,
37
+ 0.0475, -0.0174, 0.0359, -0.0379, 0.0302, 0.0693, -0.0105, -0.0734,
38
+ 0.0115, -0.0035, 0.0224, 0.0335, 0.0118, -0.1125, -0.0200, 0.0133,
39
+ -0.0815, 0.0618, 0.0405, 0.0118, 0.0588, 0.0163, -0.1139, -0.0136,
40
+ -0.0106, 0.0119, 0.0393, 0.0244, 0.0038, -0.0081, -0.0142, -0.0360,
41
+ 0.0299, 0.0261, 0.0307, -0.0516, -0.0624, -0.0655, -0.0068, 0.0090,
42
+ 0.0380, 0.0029, 0.0206, 0.0426, 0.0073, -0.0028, -0.0544, -0.0757,
43
+ -0.0796, -0.0405, -0.0633, -0.0804, -0.0316, -0.0558, -0.0556, 0.0566,
44
+ 0.0393, -0.0027, 0.0300, -0.0526, 0.0105, -0.0550, 0.0473, -0.0364,
45
+ 0.0447, 0.0426, -0.0897, -0.0682, 0.0533, -0.0653, -0.0917, 0.0249,
46
+ 0.0406, 0.0114, 0.0402, 0.0187, 0.0287, 0.0150, -0.0771, 0.0080,
47
+ 0.0779, -0.0084, 0.0279, 0.0383, -0.0716, 0.0013, -0.0565, 0.0294,
48
+ 0.0399, -0.0730, 0.0113, 0.0415, -0.0586, 0.0624, -0.0720, 0.0289,
49
+ 0.0370, 0.0526, 0.0385, 0.0302, 0.0451, 0.0400, 0.0104, -0.0805,
50
+ -0.0610, -0.0278, -0.0395, 0.0081, -0.0022, 0.0050, -0.0027, 0.0500,
51
+ -0.0832, 0.0167, -0.0224, -0.0416, 0.0369, -0.1350, 0.0280, -0.0435,
52
+ -0.0639, 0.0163, 0.0037, 0.0344, 0.0058, -0.0856, -0.0878, 0.0121,
53
+ 0.0193, 0.0305, 0.0019, 0.0112, -0.0621, 0.0029, 0.0437, 0.0387,
54
+ 0.0216, 0.0296, 0.0391, -0.0647, 0.0016, -0.0421, 0.0279, 0.0206,
55
+ -0.0546, -0.0025, 0.0222, -0.0787, 0.0481, -0.0451, -0.0149, 0.0208,
56
+ -0.0761, -0.0118, 0.0312, 0.0593, -0.0504, -0.0632, 0.0108, 0.0354,
57
+ -0.0646, 0.0229, 0.0369, 0.0287, 0.0108, 0.0084, 0.0474, -0.0335,
58
+ 0.0408, 0.0169, -0.0683, 0.0106, 0.0184, 0.0124, 0.0262, -0.0235,
59
+ 0.0079, 0.0267, 0.0047, -0.0836, -0.0301, 0.0451, -0.0400, 0.0327,
60
+ 0.0200, -0.0782, 0.0459, 0.0333, -0.0959, 0.0191, -0.0684, -0.0309,
61
+ -0.0256, 0.0330, 0.0309, 0.0798, 0.0342, 0.0248, -0.0348, 0.0114,
62
+ 0.0223, -0.0250, 0.0214, -0.0198, 0.0143, 0.0336, 0.0016, 0.0506,
63
+ -0.0593, 0.0198, -0.0944, 0.0044, -0.0675, 0.0455, 0.0453, 0.0284,
64
+ 0.0118, 0.0131, 0.0335, -0.0050, 0.0086, -0.0567, 0.0133, 0.0298,
65
+ 0.0361, 0.0057, -0.0683, 0.0769, 0.0006, -0.0764, 0.0048, 0.0436,
66
+ 0.0014, 0.0543, 0.0219, 0.0353, 0.0326, -0.0171, 0.0510, 0.0041,
67
+ -0.0491, 0.0163, -0.0496, 0.0455, 0.0577, 0.0144, -0.0824, -0.0613,
68
+ -0.0399, 0.0322, 0.0040, -0.0596, 0.0255, 0.0433, 0.0056, 0.0341,
69
+ -0.1160, 0.0023, 0.0274, 0.0127, 0.0228, -0.0661, 0.0046, 0.0077,
70
+ 0.0152, -0.0358, 0.0503, 0.0318, -0.0281, 0.0293, 0.0226, -0.0621,
71
+ 0.0105, 0.0176, 0.0243, -0.0141, 0.0356, 0.0329, 0.0471, 0.0259,
72
+ 0.0185, 0.0097, -0.0906, -0.0619, -0.0214, 0.0247, -0.0555, 0.0395,
73
+ -0.0400, 0.0354, -0.0566, 0.0069, 0.0273, -0.0684, 0.0471, 0.0696,
74
+ -0.0575, -0.0837, -0.0660, -0.0268, 0.0286, 0.0609, -0.0569, 0.0270,
75
+ 0.0306, -0.0449, -0.0314, 0.0113, 0.0182, -0.0581, 0.0267, 0.0289,
76
+ -0.0544, 0.0280, 0.0431, 0.0013, -0.0631, -0.0490, -0.0565, 0.0227,
77
+ -0.0673, 0.1090, 0.0386, -0.0459, -0.0050, 0.0019, -0.0533, 0.0523,
78
+ -0.1068, 0.0178, 0.0118, 0.0564, 0.0470, 0.0004, -0.0812, -0.0034,
79
+ 0.0106, 0.0216, 0.0065, 0.0542, -0.0544, 0.0299, -0.0156, -0.0019,
80
+ 0.0435, 0.0218, 0.0449, 0.0526, -0.0901, -0.1279, 0.0270, 0.0128,
81
+ 0.0349, 0.0103, 0.0374, -0.0805, 0.0337, 0.0479, 0.0225, 0.0276,
82
+ -0.0562, 0.0335, -0.0329, 0.0067, 0.0264, -0.0684, -0.0354, 0.0412,
83
+ 0.0478, -0.0529, 0.0513, 0.0155, 0.0362, 0.0096, 0.0117, -0.0675,
84
+ -0.0536, -0.0773, -0.0690, 0.0061, 0.0337, -0.0110, 0.0232, -0.0375,
85
+ -0.0397, -0.0738, -0.1551, -0.0238, -0.0733, -0.0216, 0.0374, -0.0015,
86
+ -0.0518, 0.0203, 0.0419, 0.0508, 0.0252, -0.0481, -0.0391, -0.0130,
87
+ -0.0215, 0.0204, 0.0232, 0.0182, 0.0350, 0.0065, 0.0098, -0.0634,
88
+ 0.0016, 0.0125, 0.0525, 0.0006, -0.0354, 0.0453, 0.0206, 0.0060,
89
+ 0.0044, -0.0044, 0.0485, 0.0207, 0.0067, 0.0348, -0.0036, -0.0098,
90
+ 0.0124, -0.0660, -0.0699, 0.0416, -0.0060, 0.0153, 0.0480, 0.0448,
91
+ -0.0679, 0.0470, -0.0159, -0.0685, 0.0396, 0.0329, -0.0017, 0.0527,
92
+ 0.0256, 0.0036, -0.0468, -0.0553, 0.0057, -0.0058, 0.0229, 0.0624,
93
+ -0.0378, -0.0404, 0.0180, 0.0268, 0.0569, 0.0400, -0.0515, 0.0313,
94
+ -0.0391, -0.0106, -0.0590, 0.0244, -0.0388, 0.0394, 0.0221, 0.0069,
95
+ -0.0553, -0.0123, -0.0625, -0.0730, 0.0094, 0.0574, -0.0653, 0.0365,
96
+ 0.0620, -0.0156, -0.0476, 0.0452, 0.0350, -0.0138, 0.0255, -0.0604]]).unsqueeze(0)
97
+
98
  def tts_generate(text):
99
  print(f"📝 Input text: {text}")
100
  try: