MohamedRashad commited on
Commit
b840b20
·
1 Parent(s): d96db07

Enhance audio processing in generate_response by adding prepended and appended silence to improve model response timing

Browse files
Files changed (1) hide show
  1. app.py +20 -6
app.py CHANGED
@@ -171,12 +171,20 @@ def generate_response(audio_input, persona: str, voice: str):
171
  import sphn
172
  audio = sphn.resample(audio, sr, mimi.sample_rate)
173
 
174
- # PAD INPUT WITH SILENCE to give the model time to respond
175
- # This is critical because PersonaPlex output duration = input duration
176
- # Adding ~8 seconds of silence allows the model to complete its response
177
- silence_duration = 8 # seconds
178
- silence = np.zeros(int(silence_duration * mimi.sample_rate), dtype=np.float32)
179
- audio = np.concatenate([audio, silence])
 
 
 
 
 
 
 
 
180
 
181
  # Add channel dimension: (T,) -> (1, T)
182
  if audio.ndim == 1:
@@ -207,6 +215,7 @@ def generate_response(audio_input, persona: str, voice: str):
207
  # Process user audio frames
208
  generated_frames = []
209
  generated_text = []
 
210
 
211
  for user_encoded in encode_from_sphn(
212
  mimi,
@@ -216,10 +225,15 @@ def generate_response(audio_input, persona: str, voice: str):
216
  for c in range(user_encoded.shape[-1]):
217
  step_in = user_encoded[:, :, c:c+1]
218
  tokens = lm_gen.step(step_in)
 
219
 
220
  if tokens is None:
221
  continue
222
 
 
 
 
 
223
  # Decode agent audio
224
  pcm = decode_tokens_to_pcm(mimi, other_mimi, tokens)
225
  generated_frames.append(pcm)
 
171
  import sphn
172
  audio = sphn.resample(audio, sr, mimi.sample_rate)
173
 
174
+ # PREPEND SILENCE: Let model say its default greeting during this time (we'll discard this output)
175
+ prepend_silence_duration = 2 # seconds
176
+ prepend_silence = np.zeros(int(prepend_silence_duration * mimi.sample_rate), dtype=np.float32)
177
+
178
+ # APPEND SILENCE: Give model time to complete its response after user finishes speaking
179
+ append_silence_duration = 8 # seconds
180
+ append_silence = np.zeros(int(append_silence_duration * mimi.sample_rate), dtype=np.float32)
181
+
182
+ # Final audio: [prepend_silence] + [user_audio] + [append_silence]
183
+ audio = np.concatenate([prepend_silence, audio, append_silence])
184
+
185
+ # Calculate how many output frames to skip (corresponds to prepend silence)
186
+ # frame_rate is 12.5 Hz, so frames_to_skip = prepend_silence_duration * frame_rate
187
+ frames_to_skip = int(prepend_silence_duration * 12.5)
188
 
189
  # Add channel dimension: (T,) -> (1, T)
190
  if audio.ndim == 1:
 
215
  # Process user audio frames
216
  generated_frames = []
217
  generated_text = []
218
+ frame_count = 0 # Track frame index to skip prepend silence output
219
 
220
  for user_encoded in encode_from_sphn(
221
  mimi,
 
225
  for c in range(user_encoded.shape[-1]):
226
  step_in = user_encoded[:, :, c:c+1]
227
  tokens = lm_gen.step(step_in)
228
+ frame_count += 1
229
 
230
  if tokens is None:
231
  continue
232
 
233
+ # Skip frames generated during prepend silence (model's default greeting)
234
+ if frame_count <= frames_to_skip:
235
+ continue
236
+
237
  # Decode agent audio
238
  pcm = decode_tokens_to_pcm(mimi, other_mimi, tokens)
239
  generated_frames.append(pcm)