fffiloni committed on
Commit
c77ae43
·
1 Parent(s): 3b2c590

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +27 -23
app.py CHANGED
@@ -58,46 +58,50 @@ def infer(text_prompt):
58
  Cutting text in chunks
59
 
60
  """)
61
- input_waves = []
62
 
63
  text_chunks = split_text_into_sentences(text_prompt)
64
- for i, chunk in enumerate(text_chunks):
65
- print(chunk)
66
- result = generate(chunk, i, "wav")
67
- print(result)
68
- input_waves.append(result)
69
 
70
  output_wav = 'full_story.wav'
71
 
72
- join_wav_files(input_waves, output_wav)
73
 
74
  return 'full_story.wav'
75
 
76
 
77
- def generate(text_prompt, i, out_type):
78
  text_prompt = text_prompt
79
 
80
  inputs = processor(text_prompt).to(device)
81
 
82
  with torch.inference_mode():
83
  speech_output = model.generate(**inputs)
 
 
84
 
85
- audio_array = speech_output[0].cpu().numpy().squeeze()
86
- print(f'AUDIO_ARRAY: {audio_array}')
87
-
88
- # Assuming audio_array contains audio data and the sampling rate
89
- sampling_rate = model.generation_config.sample_rate
90
- print(f'sampling_rate: {sampling_rate}')
91
-
92
- if out_type == "numpy":
93
- return (sampling_rate, audio_array)
94
- elif out_type == "wav":
95
- #If you want to return a WAV file :
96
- # Ensure the audio data is properly scaled (between -1 and 1 for 16-bit audio)
97
 
98
- audio_data = np.int16(audio_array * 32767) # Scale for 16-bit signed integer
99
- write_wav(f"output_{i}.wav", sampling_rate, audio_data)
100
- return f"output_{i}.wav"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
 
102
 
103
  with gr.Blocks() as demo:
 
58
  Cutting text in chunks
59
 
60
  """)
61
+
62
 
63
  text_chunks = split_text_into_sentences(text_prompt)
64
+
65
+ result = generate(text_chunks, "wav")
66
+ print(result)
67
+
 
68
 
69
  output_wav = 'full_story.wav'
70
 
71
+ join_wav_files(result, output_wav)
72
 
73
  return 'full_story.wav'
74
 
75
 
76
def generate(text_prompt, out_type):
    """Generate speech for the given text and collect one result per output.

    Args:
        text_prompt: Text input handed unchanged to ``processor``. The caller
            in this file passes the list of sentence chunks.
        out_type: ``"numpy"`` to collect ``(sampling_rate, audio_array)``
            tuples, or ``"wav"`` to write each item to ``output_{i}.wav`` and
            collect the file paths.

    Returns:
        A list with one entry per item yielded by iterating the model output:
        tuples for ``"numpy"``, file paths for ``"wav"``.

    Raises:
        ValueError: If ``out_type`` is neither ``"numpy"`` nor ``"wav"``.
    """
    # Reject unsupported modes before paying for model inference; previously
    # such a call ran the model and then silently returned an empty list.
    if out_type not in ("numpy", "wav"):
        raise ValueError(
            f"Unsupported out_type {out_type!r}; expected 'numpy' or 'wav'"
        )

    inputs = processor(text_prompt).to(device)

    with torch.inference_mode():
        speech_output = model.generate(**inputs)

    # The sampling rate does not depend on the item, so read it once.
    sampling_rate = model.generation_config.sample_rate
    print(f'sampling_rate: {sampling_rate}')

    # NOTE(review): when several prompts are generated as one batch the
    # per-item outputs may carry trailing padding -- confirm against the
    # model's generate() documentation whether output lengths are needed.
    input_waves = []
    for i, speech_out in enumerate(speech_output):
        audio_array = speech_out.cpu().numpy().squeeze()
        print(f'AUDIO_ARRAY: {audio_array}')

        if out_type == "numpy":
            # list.append takes a single argument, so the pair must be
            # wrapped in a tuple (the two-argument call raised TypeError).
            input_waves.append((sampling_rate, audio_array))
        else:
            # Clip to [-1, 1] before scaling so out-of-range samples saturate
            # instead of wrapping around when cast to 16-bit signed integers.
            clipped = np.clip(audio_array, -1.0, 1.0)
            audio_data = np.int16(clipped * 32767)
            wav_path = f"output_{i}.wav"
            write_wav(wav_path, sampling_rate, audio_data)
            input_waves.append(wav_path)

    return input_waves
105
 
106
 
107
  with gr.Blocks() as demo: