PersonaPlex

Running on Zero

App Files Files Community

MohamedRashad committed on Jan 18

Commit

d96db07

1 Parent(s): d3a0030

Add warmup function to initialize CUDA graphs and improve performance in get_models

Browse files

Files changed (1) hide show

app.py +29 -26

app.py CHANGED Viewed

@@ -89,6 +89,11 @@ def get_models():
         other_mimi.streaming_forever(1)
         lm_gen.streaming_forever(1)
         _model_cache.update({
             "mimi": mimi,
             "other_mimi": other_mimi,
@@ -101,6 +106,23 @@ def get_models():
     return _model_cache
 def wrap_with_system_tags(text: str) -> str:
     """Add system tags as PersonaPlex expects."""
@@ -149,6 +171,13 @@ def generate_response(audio_input, persona: str, voice: str):
         import sphn
         audio = sphn.resample(audio, sr, mimi.sample_rate)
     # Add channel dimension: (T,) -> (1, T)
     if audio.ndim == 1:
         audio = audio[None, :]
@@ -200,32 +229,6 @@ def generate_response(audio_input, persona: str, voice: str):
                 if text_token not in (0, 3):  # Skip special tokens
                     text_piece = text_tokenizer.id_to_piece(text_token).replace("▁", " ")
                     generated_text.append(text_piece)
-        # Continue generating with silence to let the model finish speaking
-        # Add extra frames (approximately 10 seconds of continuation)
-        extra_frames = int(10 * mimi.frame_rate)
-        # Use the correct SINE_TOKENS from lm.py for user audio (simulates silence/background)
-        # These represent a 440Hz sine wave encoded by Mimi - official PersonaPlex constants
-        SINE_TOKENS = [430, 1268, 381, 1611, 1095, 1495, 56, 472]
-        sine_input = torch.tensor(SINE_TOKENS, dtype=torch.long, device=DEVICE).view(1, 8, 1)
-        for _ in range(extra_frames):
-            # Pass sine tokens as user input to simulate silence on user side
-            tokens = lm_gen.step(sine_input)
-            if tokens is None:
-                continue
-            # Decode agent audio
-            pcm = decode_tokens_to_pcm(mimi, other_mimi, tokens)
-            generated_frames.append(pcm)
-            # Decode text token
-            text_token = tokens[0, 0, 0].item()
-            if text_token not in (0, 3):  # Skip special tokens
-                text_piece = text_tokenizer.id_to_piece(text_token).replace("▁", " ")
-                generated_text.append(text_piece)
     if not generated_frames:
         return None, "No audio generated. Try speaking more clearly."

         other_mimi.streaming_forever(1)
         lm_gen.streaming_forever(1)
+        # Run warmup to initialize CUDA graphs (improves performance)
+        print("Running warmup...")
+        _warmup_models(mimi, other_mimi, lm_gen, frame_size)
+        print("Warmup complete.")
         _model_cache.update({
             "mimi": mimi,
             "other_mimi": other_mimi,
     return _model_cache
+def _warmup_models(mimi, other_mimi, lm_gen, frame_size):
+    """Run warmup passes to initialize CUDA graphs.
+
+    Pushes four all-zero frames of ``frame_size`` samples through both
+    codecs and the LM generator, decodes whatever tokens come back, and
+    finally resets the streaming state of all three models so the warmup
+    leaves no residue for the first real request.
+
+    Args:
+        mimi: Codec used to encode the warmup frame and decode LM output.
+        other_mimi: Second codec instance, exercised the same way.
+        lm_gen: Generator whose ``step`` is called once per encoded frame.
+        frame_size: Number of samples in each warmup frame.
+    """
+    # Nothing in this function writes to the zero frame, so allocate it once
+    # instead of on every iteration.
+    chunk = torch.zeros(1, 1, frame_size, dtype=torch.float32, device=DEVICE)
+    for _ in range(4):
+        codes = mimi.encode(chunk)
+        _ = other_mimi.encode(chunk)
+        for c in range(codes.shape[-1]):
+            tokens = lm_gen.step(codes[:, :, c:c+1])
+            # step() can yield None; there is nothing to decode in that case.
+            if tokens is not None:
+                # NOTE(review): rows 1..8 are presumably the audio codebooks
+                # (row 0 being the text token) - confirm against lm_gen.
+                _ = mimi.decode(tokens[:, 1:9])
+                _ = other_mimi.decode(tokens[:, 1:9])
+    # Only wait on the GPU when a CUDA runtime exists; an unconditional
+    # synchronize raises on CPU-only hosts.
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+    # Reset after warmup
+    mimi.reset_streaming()
+    other_mimi.reset_streaming()
+    lm_gen.reset_streaming()
 def wrap_with_system_tags(text: str) -> str:
     """Add system tags as PersonaPlex expects."""
         import sphn
         audio = sphn.resample(audio, sr, mimi.sample_rate)
+    # PAD INPUT WITH SILENCE to give the model time to respond
+    # This is critical because PersonaPlex output duration = input duration
+    # Adding ~8 seconds of silence allows the model to complete its response
+    silence_duration = 8  # seconds
+    silence = np.zeros(int(silence_duration * mimi.sample_rate), dtype=np.float32)
+    audio = np.concatenate([audio, silence])
     # Add channel dimension: (T,) -> (1, T)
     if audio.ndim == 1:
         audio = audio[None, :]
                 if text_token not in (0, 3):  # Skip special tokens
                     text_piece = text_tokenizer.id_to_piece(text_token).replace("▁", " ")
                     generated_text.append(text_piece)
     if not generated_frames:
         return None, "No audio generated. Try speaking more clearly."