Spaces:
Runtime error
Runtime error
Commit
·
d84903c
1
Parent(s):
1b59e3b
update tts
Browse files
app.py
CHANGED
|
@@ -84,6 +84,9 @@ from scipy.io import wavfile
|
|
| 84 |
|
| 85 |
import re
|
| 86 |
|
|
|
|
|
|
|
|
|
|
| 87 |
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
| 88 |
sample = ds[0]["audio"]
|
| 89 |
|
|
@@ -121,7 +124,30 @@ def text_to_speech_loc(text):
|
|
| 121 |
print("audio: ", audio)
|
| 122 |
return audio
|
| 123 |
|
| 124 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
|
| 126 |
class GPTRemote(LLM):
|
| 127 |
n: int
|
|
@@ -593,7 +619,7 @@ Text2Sound_tool2 = Tool(
|
|
| 593 |
Text2Sound_tool_loc = Tool(
|
| 594 |
name = "Text To Sound API 2",
|
| 595 |
# func = Text2Sound,
|
| 596 |
-
func =
|
| 597 |
description = "Useful when you need to convert text into sound file."
|
| 598 |
)
|
| 599 |
|
|
|
|
| 84 |
|
| 85 |
import re
|
| 86 |
|
| 87 |
+
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
|
| 88 |
+
import torch
|
| 89 |
+
|
| 90 |
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
| 91 |
sample = ds[0]["audio"]
|
| 92 |
|
|
|
|
| 124 |
print("audio: ", audio)
|
| 125 |
return audio
|
| 126 |
|
| 127 |
+
def text_to_speech_loc2(text):
    """Synthesize ``text`` with SpeechT5 and save it as a timestamped WAV file.

    The processor, the text-to-speech model and the HiFi-GAN vocoder are
    loaded from the ``microsoft/speecht5_tts`` and
    ``microsoft/speecht5_hifigan`` checkpoints on every call. The speaker
    voice is taken from entry 7306 of the ``Matthijs/cmu-arctic-xvectors``
    validation split.

    Args:
        text: The text to convert to speech.

    Returns:
        The file object the WAV data was written to. It is already closed
        when returned; its ``name`` attribute holds the relative path
        ``sample-<YYYYmmdd-HHMMSS>.wav``.
    """
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

    # Tokenize the caller's text. The previous version ignored ``text`` and
    # always synthesized a hard-coded test sentence.
    inputs = processor(text=text, return_tensors="pt")

    # Load the x-vector containing the speaker's voice characteristics.
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
    print("Type of speech: ", type(speech))

    # A per-second timestamp keeps successive outputs from overwriting each other.
    timestr = time.strftime("%Y%m%d-%H%M%S")
    with open('sample-' + timestr + '.wav', 'wb') as audio:
        # The waveform is written with a 16 kHz sample rate.
        sf.write(audio, speech.numpy(), samplerate=16000)

    # NOTE(review): ``audio`` is a closed file handle at this point; callers
    # that need the path should read ``audio.name``. Kept as-is so existing
    # callers see the same return type.
    print("audio: ", audio)
    return audio
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
# Smoke test executed at module import: calls text_to_speech_loc2 once and prints its return value.
# NOTE(review): this loads the SpeechT5 models and writes a WAV file every time the module is imported — confirm this is intended outside of debugging.
print("text to speech2: ", text_to_speech_loc2("Good morning."))
|
| 151 |
|
| 152 |
class GPTRemote(LLM):
|
| 153 |
n: int
|
|
|
|
| 619 |
# Tool entry "Text To Sound API 2": routes text-to-sound requests to the local
# text_to_speech_loc2 function (the earlier Text2Sound callable is no longer used).
Text2Sound_tool_loc = Tool(
    name="Text To Sound API 2",
    func=text_to_speech_loc2,
    description="Useful when you need to convert text into sound file.",
)
|
| 625 |
|