Update app.py
Browse files
app.py
CHANGED
|
@@ -19,9 +19,63 @@ model.enable_cpu_offload()
|
|
| 19 |
import numpy as np
|
| 20 |
from scipy.io.wavfile import write as write_wav
|
| 21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
def infer(text_prompt):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
text_prompt = text_prompt
|
|
|
|
| 25 |
inputs = processor(text_prompt).to(device)
|
| 26 |
|
| 27 |
with torch.inference_mode():
|
|
@@ -29,19 +83,21 @@ def infer(text_prompt):
|
|
| 29 |
|
| 30 |
audio_array = speech_output[0].cpu().numpy().squeeze()
|
| 31 |
print(f'AUDIO_ARRAY: {audio_array}')
|
| 32 |
-
|
| 33 |
-
# Assuming
|
| 34 |
sampling_rate = model.generation_config.sample_rate
|
| 35 |
print(f'sampling_rate: {sampling_rate}')
|
| 36 |
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
-
return (sampling_rate, audio_array)
|
| 45 |
|
| 46 |
with gr.Blocks() as demo:
|
| 47 |
with gr.Column():
|
|
|
|
| 19 |
import numpy as np
|
| 20 |
from scipy.io.wavfile import write as write_wav
|
| 21 |
|
| 22 |
+
def split_text_into_sentences(text):
    """Split *text* into sentence-sized chunks for piecewise TTS generation.

    A sentence ends with '.', '!' or '?'; any trailing words without a
    terminator are returned as a final chunk.

    Parameters:
        text: the full prompt string.

    Returns:
        A list of whitespace-stripped sentence strings (empty list for
        empty/blank input).
    """
    sentences = []
    current_words = []

    for word in text.split():
        current_words.append(word)
        # Splitting on '.' alone would lump '!'/'?' sentences into one
        # oversized chunk, which the TTS model handles poorly.
        if word.endswith(('.', '!', '?')):
            sentences.append(' '.join(current_words))
            current_words = []

    # Flush any trailing words that lack a sentence terminator.
    if current_words:
        sentences.append(' '.join(current_words))

    return sentences
|
| 37 |
+
|
| 38 |
+
def join_wav_files(input_files, output_file):
    """Concatenate several WAV files into a single WAV file.

    All inputs are assumed to share the audio parameters (channels,
    sample width, frame rate) of the first file; the output inherits
    those parameters.  Mismatched inputs would produce garbled audio —
    TODO confirm all chunks from generate() agree.

    Parameters:
        input_files: ordered list of paths to the WAV chunks.
        output_file: path the joined WAV is written to.

    Raises:
        ValueError: if *input_files* is empty (previously a bare
            IndexError from indexing input_files[0]).
    """
    if not input_files:
        raise ValueError("join_wav_files: no input files given")

    # Take the audio parameters from the first input file only.
    with wave.open(input_files[0], 'rb') as first_file:
        params = first_file.getparams()

    # Write every file's raw frames, in order, into the output.
    with wave.open(output_file, 'wb') as output:
        output.setparams(params)
        for input_file in input_files:
            # 'in_wav' avoids shadowing the builtin input().
            with wave.open(input_file, 'rb') as in_wav:
                output.writeframes(in_wav.readframes(in_wav.getnframes()))
|
| 52 |
+
|
| 53 |
|
| 54 |
def infer(text_prompt):
    """Render *text_prompt* to speech and return the output WAV path.

    The prompt is split into sentence-sized chunks (shorter inputs suit
    the underlying TTS model), each chunk is rendered to its own
    output_{i}.wav via generate(), and the pieces are concatenated into
    one file whose path is returned.

    Parameters:
        text_prompt: the full text to synthesize.

    Returns:
        Path of the joined WAV file ('full_story.wav').
    """
    print("""
    —
    Cutting text in chunks
    —
    """)
    input_waves = []

    text_chunks = split_text_into_sentences(text_prompt)
    for i, chunk in enumerate(text_chunks):
        print(chunk)
        # generate(..., "wav") writes output_{i}.wav and returns its path.
        result = generate(chunk, i, "wav")
        print(result)
        input_waves.append(result)

    output_wav = 'full_story.wav'

    join_wav_files(input_waves, output_wav)

    # Return the variable rather than re-hardcoding the filename, so the
    # written path and the returned path cannot drift apart.
    return output_wav
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def generate(text_prompt, i, out_type):
|
| 77 |
text_prompt = text_prompt
|
| 78 |
+
|
| 79 |
inputs = processor(text_prompt).to(device)
|
| 80 |
|
| 81 |
with torch.inference_mode():
|
|
|
|
| 83 |
|
| 84 |
audio_array = speech_output[0].cpu().numpy().squeeze()
|
| 85 |
print(f'AUDIO_ARRAY: {audio_array}')
|
| 86 |
+
|
| 87 |
+
# Assuming audio_array contains audio data and the sampling rate
|
| 88 |
sampling_rate = model.generation_config.sample_rate
|
| 89 |
print(f'sampling_rate: {sampling_rate}')
|
| 90 |
|
| 91 |
+
if out_type == "numpy":
|
| 92 |
+
return (sampling_rate, audio_array)
|
| 93 |
+
elif out_type == "wav":
|
| 94 |
+
#If you want to return a WAV file :
|
| 95 |
+
# Ensure the audio data is properly scaled (between -1 and 1 for 16-bit audio)
|
| 96 |
+
|
| 97 |
+
audio_data = np.int16(audio_array * 32767) # Scale for 16-bit signed integer
|
| 98 |
+
write_wav(f"output_{i}.wav", sampling_rate, audio_data)
|
| 99 |
+
return f"output_{i}.wav"
|
| 100 |
|
|
|
|
| 101 |
|
| 102 |
with gr.Blocks() as demo:
|
| 103 |
with gr.Column():
|