Spaces:
Running on Zero
Running on Zero
Commit ·
b840b20
1
Parent(s): d96db07
Enhance audio processing in generate_response by prepending and appending silence to improve model response timing
Browse files
app.py
CHANGED
|
@@ -171,12 +171,20 @@ def generate_response(audio_input, persona: str, voice: str):
|
|
| 171 |
import sphn
|
| 172 |
audio = sphn.resample(audio, sr, mimi.sample_rate)
|
| 173 |
|
| 174 |
-
#
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 180 |
|
| 181 |
# Add channel dimension: (T,) -> (1, T)
|
| 182 |
if audio.ndim == 1:
|
|
@@ -207,6 +215,7 @@ def generate_response(audio_input, persona: str, voice: str):
|
|
| 207 |
# Process user audio frames
|
| 208 |
generated_frames = []
|
| 209 |
generated_text = []
|
|
|
|
| 210 |
|
| 211 |
for user_encoded in encode_from_sphn(
|
| 212 |
mimi,
|
|
@@ -216,10 +225,15 @@ def generate_response(audio_input, persona: str, voice: str):
|
|
| 216 |
for c in range(user_encoded.shape[-1]):
|
| 217 |
step_in = user_encoded[:, :, c:c+1]
|
| 218 |
tokens = lm_gen.step(step_in)
|
|
|
|
| 219 |
|
| 220 |
if tokens is None:
|
| 221 |
continue
|
| 222 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 223 |
# Decode agent audio
|
| 224 |
pcm = decode_tokens_to_pcm(mimi, other_mimi, tokens)
|
| 225 |
generated_frames.append(pcm)
|
|
|
|
| 171 |
import sphn
|
| 172 |
audio = sphn.resample(audio, sr, mimi.sample_rate)
|
| 173 |
|
| 174 |
+
# PREPEND SILENCE: Let model say its default greeting during this time (we'll discard this output)
|
| 175 |
+
prepend_silence_duration = 2 # seconds
|
| 176 |
+
prepend_silence = np.zeros(int(prepend_silence_duration * mimi.sample_rate), dtype=np.float32)
|
| 177 |
+
|
| 178 |
+
# APPEND SILENCE: Give model time to complete its response after user finishes speaking
|
| 179 |
+
append_silence_duration = 8 # seconds
|
| 180 |
+
append_silence = np.zeros(int(append_silence_duration * mimi.sample_rate), dtype=np.float32)
|
| 181 |
+
|
| 182 |
+
# Final audio: [prepend_silence] + [user_audio] + [append_silence]
|
| 183 |
+
audio = np.concatenate([prepend_silence, audio, append_silence])
|
| 184 |
+
|
| 185 |
+
# Calculate how many output frames to skip (corresponds to prepend silence)
|
| 186 |
+
# frame_rate is 12.5 Hz, so frames_to_skip = prepend_silence_duration * frame_rate
|
| 187 |
+
frames_to_skip = int(prepend_silence_duration * 12.5)
|
| 188 |
|
| 189 |
# Add channel dimension: (T,) -> (1, T)
|
| 190 |
if audio.ndim == 1:
|
|
|
|
| 215 |
# Process user audio frames
|
| 216 |
generated_frames = []
|
| 217 |
generated_text = []
|
| 218 |
+
frame_count = 0 # Track frame index to skip prepend silence output
|
| 219 |
|
| 220 |
for user_encoded in encode_from_sphn(
|
| 221 |
mimi,
|
|
|
|
| 225 |
for c in range(user_encoded.shape[-1]):
|
| 226 |
step_in = user_encoded[:, :, c:c+1]
|
| 227 |
tokens = lm_gen.step(step_in)
|
| 228 |
+
frame_count += 1
|
| 229 |
|
| 230 |
if tokens is None:
|
| 231 |
continue
|
| 232 |
|
| 233 |
+
# Skip frames generated during prepend silence (model's default greeting)
|
| 234 |
+
if frame_count <= frames_to_skip:
|
| 235 |
+
continue
|
| 236 |
+
|
| 237 |
# Decode agent audio
|
| 238 |
pcm = decode_tokens_to_pcm(mimi, other_mimi, tokens)
|
| 239 |
generated_frames.append(pcm)
|