Spaces:
Running on Zero
Running on Zero
Commit ·
b840b20
1
Parent(s): d96db07
Enhance audio processing in generate_response by prepending and appending silence to improve model response timing
Browse files
app.py
CHANGED
|
@@ -171,12 +171,20 @@ def generate_response(audio_input, persona: str, voice: str):
|
|
| 171 |
import sphn
|
| 172 |
audio = sphn.resample(audio, sr, mimi.sample_rate)
|
| 173 |
|
| 174 |
-
#
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 180 |
|
| 181 |
# Add channel dimension: (T,) -> (1, T)
|
| 182 |
if audio.ndim == 1:
|
|
@@ -207,6 +215,7 @@ def generate_response(audio_input, persona: str, voice: str):
|
|
| 207 |
# Process user audio frames
|
| 208 |
generated_frames = []
|
| 209 |
generated_text = []
|
|
|
|
| 210 |
|
| 211 |
for user_encoded in encode_from_sphn(
|
| 212 |
mimi,
|
|
@@ -216,10 +225,15 @@ def generate_response(audio_input, persona: str, voice: str):
|
|
| 216 |
for c in range(user_encoded.shape[-1]):
|
| 217 |
step_in = user_encoded[:, :, c:c+1]
|
| 218 |
tokens = lm_gen.step(step_in)
|
|
|
|
| 219 |
|
| 220 |
if tokens is None:
|
| 221 |
continue
|
| 222 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 223 |
# Decode agent audio
|
| 224 |
pcm = decode_tokens_to_pcm(mimi, other_mimi, tokens)
|
| 225 |
generated_frames.append(pcm)
|
|
|
|
| 171 |
import sphn
|
| 172 |
audio = sphn.resample(audio, sr, mimi.sample_rate)
|
| 173 |
|
| 174 |
+
# PREPEND SILENCE: Let model say its default greeting during this time (we'll discard this output)
|
| 175 |
+
prepend_silence_duration = 2 # seconds
|
| 176 |
+
prepend_silence = np.zeros(int(prepend_silence_duration * mimi.sample_rate), dtype=np.float32)
|
| 177 |
+
|
| 178 |
+
# APPEND SILENCE: Give model time to complete its response after user finishes speaking
|
| 179 |
+
append_silence_duration = 8 # seconds
|
| 180 |
+
append_silence = np.zeros(int(append_silence_duration * mimi.sample_rate), dtype=np.float32)
|
| 181 |
+
|
| 182 |
+
# Final audio: [prepend_silence] + [user_audio] + [append_silence]
|
| 183 |
+
audio = np.concatenate([prepend_silence, audio, append_silence])
|
| 184 |
+
|
| 185 |
+
# Calculate how many output frames to skip (corresponds to prepend silence)
|
| 186 |
+
# frame_rate is 12.5 Hz, so frames_to_skip = prepend_silence_duration * frame_rate
|
| 187 |
+
frames_to_skip = int(prepend_silence_duration * 12.5)
|
| 188 |
|
| 189 |
# Add channel dimension: (T,) -> (1, T)
|
| 190 |
if audio.ndim == 1:
|
|
|
|
| 215 |
# Process user audio frames
|
| 216 |
generated_frames = []
|
| 217 |
generated_text = []
|
| 218 |
+
frame_count = 0 # Track frame index to skip prepend silence output
|
| 219 |
|
| 220 |
for user_encoded in encode_from_sphn(
|
| 221 |
mimi,
|
|
|
|
| 225 |
for c in range(user_encoded.shape[-1]):
|
| 226 |
step_in = user_encoded[:, :, c:c+1]
|
| 227 |
tokens = lm_gen.step(step_in)
|
| 228 |
+
frame_count += 1
|
| 229 |
|
| 230 |
if tokens is None:
|
| 231 |
continue
|
| 232 |
|
| 233 |
+
# Skip frames generated during prepend silence (model's default greeting)
|
| 234 |
+
if frame_count <= frames_to_skip:
|
| 235 |
+
continue
|
| 236 |
+
|
| 237 |
# Decode agent audio
|
| 238 |
pcm = decode_tokens_to_pcm(mimi, other_mimi, tokens)
|
| 239 |
generated_frames.append(pcm)
|