Spaces:
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -8,6 +8,8 @@ import google.generativeai as genai
|
|
| 8 |
import re
|
| 9 |
import logging
|
| 10 |
import numpy as np
|
|
|
|
|
|
|
| 11 |
|
| 12 |
logging.basicConfig(level=logging.INFO)
|
| 13 |
logger = logging.getLogger(__name__)
|
|
@@ -133,6 +135,7 @@ def redistribute_codes(code_list, snac_model):
|
|
| 133 |
audio_hat = snac_model.decode(codes)
|
| 134 |
return audio_hat.detach().squeeze().cpu().numpy() # Always return CPU numpy array
|
| 135 |
|
|
|
|
| 136 |
@spaces.GPU()
|
| 137 |
@spaces.GPU()
|
| 138 |
def generate_speech(text, voice1, voice2, temperature, top_p, repetition_penalty, max_new_tokens, num_hosts, progress=gr.Progress()):
|
|
@@ -140,6 +143,9 @@ def generate_speech(text, voice1, voice2, temperature, top_p, repetition_penalty
|
|
| 140 |
return None
|
| 141 |
|
| 142 |
try:
|
|
|
|
|
|
|
|
|
|
| 143 |
progress(0.1, "Processing text...")
|
| 144 |
lines = text.split('\n')
|
| 145 |
audio_samples = []
|
|
@@ -179,12 +185,26 @@ def generate_speech(text, voice1, voice2, temperature, top_p, repetition_penalty
|
|
| 179 |
# Concatenate all audio samples
|
| 180 |
final_audio = np.concatenate(audio_samples)
|
| 181 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 182 |
# Add a check for 15-second limitation
|
| 183 |
max_samples = 24000 * 15 # 15 seconds at 24kHz sample rate
|
| 184 |
-
if len(final_audio) > max_samples:
|
| 185 |
-
final_audio = final_audio[:max_samples]
|
| 186 |
|
| 187 |
-
return (24000, final_audio)
|
| 188 |
except Exception as e:
|
| 189 |
print(f"Error generating speech: {e}")
|
| 190 |
return None
|
|
|
|
| 8 |
import re
|
| 9 |
import logging
|
| 10 |
import numpy as np
|
| 11 |
+
from pydub import AudioSegment
|
| 12 |
+
import io
|
| 13 |
|
| 14 |
logging.basicConfig(level=logging.INFO)
|
| 15 |
logger = logging.getLogger(__name__)
|
|
|
|
| 135 |
audio_hat = snac_model.decode(codes)
|
| 136 |
return audio_hat.detach().squeeze().cpu().numpy() # Always return CPU numpy array
|
| 137 |
|
| 138 |
+
@spaces.GPU()
|
| 139 |
@spaces.GPU()
|
| 140 |
@spaces.GPU()
|
| 141 |
def generate_speech(text, voice1, voice2, temperature, top_p, repetition_penalty, max_new_tokens, num_hosts, progress=gr.Progress()):
|
|
|
|
| 143 |
return None
|
| 144 |
|
| 145 |
try:
|
| 146 |
+
# Load the intro/outro music
|
| 147 |
+
music = AudioSegment.from_mp3("Maiko-intro-outro.mp3")
|
| 148 |
+
|
| 149 |
progress(0.1, "Processing text...")
|
| 150 |
lines = text.split('\n')
|
| 151 |
audio_samples = []
|
|
|
|
| 185 |
# Concatenate all audio samples
|
| 186 |
final_audio = np.concatenate(audio_samples)
|
| 187 |
|
| 188 |
+
# Convert numpy array to AudioSegment
|
| 189 |
+
speech_audio = AudioSegment(
|
| 190 |
+
final_audio.tobytes(),
|
| 191 |
+
frame_rate=24000,
|
| 192 |
+
sample_width=final_audio.dtype.itemsize,
|
| 193 |
+
channels=1
|
| 194 |
+
)
|
| 195 |
+
|
| 196 |
+
# Combine intro, speech, and outro
|
| 197 |
+
combined_audio = music + speech_audio + music
|
| 198 |
+
|
| 199 |
+
# Convert back to numpy array
|
| 200 |
+
combined_numpy = np.array(combined_audio.get_array_of_samples())
|
| 201 |
+
|
| 202 |
# Add a check for 15-second limitation
|
| 203 |
max_samples = 24000 * 15 # 15 seconds at 24kHz sample rate
|
| 204 |
+
if len(combined_numpy) > max_samples:
|
| 205 |
+
combined_numpy = combined_numpy[:max_samples]
|
| 206 |
|
| 207 |
+
return (24000, combined_numpy)
|
| 208 |
except Exception as e:
|
| 209 |
print(f"Error generating speech: {e}")
|
| 210 |
return None
|