victor (HF Staff) committed
Commit d5bdce6 · 1 Parent(s): 63586ee

Implement PersonaPlex ZeroGPU demo


- Update README.md with zerogpu hardware and model documentation
- Create requirements.txt with pinned dependencies, including moshi from git
- Rewrite app.py with a ZeroGPU-compatible architecture:
  - Load models to CPU at startup (no CUDA at module level)
  - Move to GPU inside the @spaces.GPU-decorated function
  - Fresh LMGen instance per call for stateless inference
  - 120s GPU duration with a queue concurrency limit of 1
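The CPU-then-GPU lifecycle in the bullets above can be sketched framework-free. This is a minimal illustration, not PersonaPlex code: `TinyModel` is an invented stand-in for the real torch modules, and the no-op fallback decorator is an assumption for running outside Spaces.

```python
# Minimal sketch of the ZeroGPU call lifecycle described above.
# `spaces` only exists on Hugging Face Spaces; fall back to a no-op
# decorator elsewhere so the same file runs locally.
try:
    import spaces
    gpu = spaces.GPU(duration=120)       # 120s GPU budget per call
except ImportError:
    def gpu(fn):
        return fn                        # local fallback: run unchanged

class TinyModel:
    """Illustrative stand-in for a torch module loaded at import time."""
    def __init__(self):
        self.device = "cpu"              # no CUDA at module level
    def to(self, device):
        self.device = device
        return self

MODEL = TinyModel()                      # loaded once, on CPU, at startup

@gpu
def infer(prompt: str) -> str:
    # On ZeroGPU each call runs in a fresh forked process, so moving
    # the model here cannot leak GPU state between requests.
    model = MODEL.to("cuda")
    return f"[{model.device}] {prompt}"
```

Locally the fallback decorator simply runs the function; on Spaces the real `@spaces.GPU` handles device allocation and enforces the duration budget.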

Files changed (3)
  1. README.md +34 -4
  2. app.py +369 -4
  3. requirements.txt +11 -0
README.md CHANGED
@@ -1,12 +1,42 @@
  ---
  title: PersonaPlex
- emoji: 🌍
- colorFrom: blue
- colorTo: pink
+ emoji: 🎭
+ colorFrom: purple
+ colorTo: blue
  sdk: gradio
  sdk_version: 6.3.0
  app_file: app.py
  pinned: false
+ hardware: zerogpu
+ python_version: "3.10"
  ---
  
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # PersonaPlex 7B Demo
+ 
+ Interactive demo for [nvidia/personaplex-7b-v1](https://huggingface.co/nvidia/personaplex-7b-v1) - a multimodal speech-to-speech model capable of real-time persona-driven conversation.
+ 
+ ## Features
+ 
+ - **Voice Input**: Record or upload audio
+ - **Persona Selection**: Choose from different conversation personas
+ - **Voice Cloning**: Select different voice styles for output
+ - **Real-time Generation**: Streaming speech generation
+ 
+ ## Usage
+ 
+ 1. Record or upload an audio clip
+ 2. Select a persona (e.g., "helpful assistant", "casual friend")
+ 3. Choose an output voice
+ 4. Click Generate to hear the response
+ 
+ ## Model Info
+ 
+ PersonaPlex is based on the Moshi architecture and supports:
+ - Audio-to-audio generation
+ - Persona conditioning
+ - Multiple voice embeddings
+ - Streaming inference
+ 
+ ## Requirements
+ 
+ This Space requires access to the gated model. Make sure you have accepted the license at [nvidia/personaplex-7b-v1](https://huggingface.co/nvidia/personaplex-7b-v1).
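The streaming behaviour advertised in the README follows from Mimi's fixed frame size, and app.py derives its generation-step budget from the same numbers. A quick arithmetic check, with the constants copied from app.py (`max_steps` mirrors the formula in `generate_response()`):

```python
# Mimi frame arithmetic used by app.py.
SAMPLE_RATE = 24000   # Hz, Mimi codec sample rate
FRAME_SIZE = 1920     # samples per codec frame

frame_ms = FRAME_SIZE / SAMPLE_RATE * 1000    # duration of one frame
frames_per_second = SAMPLE_RATE / FRAME_SIZE  # LM steps per second of audio

def max_steps(max_duration_s: float) -> int:
    # One LM step per 80 ms frame, as in generate_response().
    return int(max_duration_s * frames_per_second)

print(frame_ms, frames_per_second, max_steps(10.0))  # 80.0 12.5 125
```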
app.py CHANGED
@@ -1,7 +1,372 @@
+ """
+ PersonaPlex 7B ZeroGPU Demo
+ 
+ This demo runs nvidia/personaplex-7b-v1 on Hugging Face Spaces using ZeroGPU.
+ 
+ Key ZeroGPU constraints:
+ - CUDA not available at startup - models load to CPU first
+ - Each @spaces.GPU call is a forked process - no state persistence
+ - Models must be moved to GPU inside the decorated function
+ """
+ 
+ import os
+ import spaces
  import gradio as gr
- 
- def greet(name):
-     return "Hello " + name + "!!"
- 
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
- demo.launch()
+ import torch
+ import numpy as np
+ from huggingface_hub import hf_hub_download
+ 
+ # Moshi imports
+ from moshi import loaders
+ from moshi.models import LMGen
+ 
+ # ============================================================================
+ # Configuration
+ # ============================================================================
+ 
+ HF_REPO = "nvidia/personaplex-7b-v1"
+ SAMPLE_RATE = 24000  # Mimi codec sample rate
+ FRAME_SIZE = 1920    # Samples per frame (80ms at 24kHz)
+ 
+ # Persona definitions
+ PERSONAS = {
+     "Helpful Assistant": "You are a helpful, friendly AI assistant.",
+     "Casual Friend": "You are a casual, laid-back friend having a conversation.",
+     "Professional": "You are a professional business consultant.",
+     "Teacher": "You are a patient, knowledgeable teacher explaining concepts.",
+ }
+ 
+ # Voice options (mapped to voice embedding indices)
+ VOICES = {
+     "Default": 0,
+     "Voice A": 1,
+     "Voice B": 2,
+     "Voice C": 3,
+ }
+ 
+ # ============================================================================
+ # Model Loading (CPU at startup)
+ # ============================================================================
+ 
+ print("PersonaPlex Demo starting...")
+ print("Loading models to CPU (ZeroGPU mode)...")
+ 
+ # Get HF token for gated model access
+ HF_TOKEN = os.environ.get("HF_TOKEN")
+ if not HF_TOKEN:
+     print("Warning: HF_TOKEN not set. Model download may fail for gated models.")
+ 
+ # Download model weights (just paths, no GPU needed)
+ print("Downloading model weights...")
+ try:
+     MIMI_WEIGHT_PATH = hf_hub_download(
+         HF_REPO,
+         loaders.MIMI_NAME,
+         token=HF_TOKEN,
+     )
+     MOSHI_WEIGHT_PATH = hf_hub_download(
+         HF_REPO,
+         loaders.MOSHI_NAME,
+         token=HF_TOKEN,
+     )
+     print(f"Mimi weights: {MIMI_WEIGHT_PATH}")
+     print(f"Moshi weights: {MOSHI_WEIGHT_PATH}")
+ except Exception as e:
+     print(f"Error downloading weights: {e}")
+     print("Make sure you have accepted the model license and set HF_TOKEN")
+     raise
+ 
+ # Load models to CPU (NOT CUDA - ZeroGPU constraint)
+ print("Loading Mimi codec to CPU...")
+ MIMI_CPU = loaders.get_mimi(MIMI_WEIGHT_PATH, device="cpu")
+ MIMI_CPU.eval()
+ 
+ print("Loading Moshi LM to CPU...")
+ MOSHI_LM_CPU = loaders.get_moshi_lm(MOSHI_WEIGHT_PATH, device="cpu")
+ MOSHI_LM_CPU.eval()
+ 
+ # Load tokenizer if available
+ try:
+     TOKENIZER_PATH = hf_hub_download(HF_REPO, "tokenizer.model", token=HF_TOKEN)
+     print(f"Tokenizer: {TOKENIZER_PATH}")
+ except Exception:
+     TOKENIZER_PATH = None
+     print("No tokenizer found, using default")
+ 
+ print("CPU model loading complete!")
+ 
+ # ============================================================================
+ # GPU Inference Function
+ # ============================================================================
+ 
+ @spaces.GPU(duration=120)
+ def generate_response(
+     audio_input: tuple,
+     persona: str,
+     voice: str,
+     temperature: float = 0.7,
+     top_k: int = 250,
+     max_duration: float = 10.0,
+ ) -> tuple:
+     """
+     Generate a speech response from audio input.
+ 
+     Args:
+         audio_input: Tuple of (sample_rate, audio_array) from Gradio
+         persona: Selected persona name
+         voice: Selected voice name
+         temperature: Sampling temperature
+         top_k: Top-k sampling parameter
+         max_duration: Maximum output duration in seconds
+ 
+     Returns:
+         Tuple of (sample_rate, audio_array) for Gradio output
+     """
+     if audio_input is None:
+         raise gr.Error("Please provide audio input")
+ 
+     input_sr, input_audio = audio_input
+ 
+     # Validate input
+     if len(input_audio) == 0:
+         raise gr.Error("Audio input is empty")
+ 
+     print(f"Processing audio: {len(input_audio)} samples at {input_sr}Hz")
+     print(f"Persona: {persona}, Voice: {voice}")
+     print(f"Temperature: {temperature}, Top-k: {top_k}")
+ 
+     # Move models to GPU (inside the @spaces.GPU decorated function)
+     device = torch.device("cuda")
+     print("Moving models to GPU...")
+ 
+     # .to() moves the modules in place; since each ZeroGPU call runs in a
+     # forked process, the parent's CPU copies are unaffected
+     mimi = MIMI_CPU.to(device)
+     lm = MOSHI_LM_CPU.to(device)
+ 
+     # Also need a separate Mimi instance for decoding
+     mimi_decoder = loaders.get_mimi(MIMI_WEIGHT_PATH, device=device)
+     mimi_decoder.eval()
+ 
+     # Resample if needed
+     if input_sr != SAMPLE_RATE:
+         import torchaudio.functional as F
+         audio_tensor = torch.from_numpy(input_audio.astype(np.float32))
+         if audio_tensor.dim() == 1:
+             audio_tensor = audio_tensor.unsqueeze(0)
+         audio_tensor = F.resample(audio_tensor, input_sr, SAMPLE_RATE)
+         input_audio = audio_tensor.squeeze().numpy()
+ 
+     # Normalize audio to [-1, 1]
+     if input_audio.dtype != np.float32:
+         input_audio = input_audio.astype(np.float32)
+     max_val = np.abs(input_audio).max()
+     if max_val > 1.0:
+         input_audio = input_audio / max_val
+     elif max_val > 0 and max_val < 0.1:
+         # Boost very quiet audio
+         input_audio = input_audio / max_val * 0.5
+ 
+     # Convert to tensor
+     audio_tensor = torch.from_numpy(input_audio).to(device)
+     if audio_tensor.dim() == 1:
+         audio_tensor = audio_tensor.unsqueeze(0).unsqueeze(0)  # [B, C, T]
+     elif audio_tensor.dim() == 2:
+         audio_tensor = audio_tensor.unsqueeze(0)  # [B, C, T]
+ 
+     print(f"Input tensor shape: {audio_tensor.shape}")
+ 
+     # Encode input audio with Mimi
+     print("Encoding input audio...")
+     mimi.reset_streaming()
+     with torch.no_grad():
+         input_codes = mimi.encode(audio_tensor)
+     print(f"Input codes shape: {input_codes.shape}")
+ 
+     # Get persona embedding/conditioning
+     persona_text = PERSONAS.get(persona, PERSONAS["Helpful Assistant"])
+     voice_idx = VOICES.get(voice, 0)
+ 
+     # Calculate max steps based on duration
+     # Moshi generates ~12.5 frames per second
+     max_steps = int(max_duration * 12.5)
+ 
+     # Create a fresh LMGen instance for this call
+     print("Creating LMGen instance...")
+     lm_gen = LMGen(
+         lm,
+         temp=temperature,
+         top_k=top_k,
+         use_sampling=True,
+         check=False,
+     )
+ 
+     # Generate response
+     print("Generating response...")
+     output_codes_list = []
+ 
+     with lm_gen.streaming(batch_size=1):
+         mimi.reset_streaming()
+ 
+         # Feed input codes frame by frame
+         num_input_frames = input_codes.shape[-1]
+         for i in range(num_input_frames):
+             frame = input_codes[:, :, i:i+1]
+             _ = lm_gen.step(frame)
+ 
+         # Generate output codes
+         for step in range(max_steps):
+             # Generate next frame
+             out_codes = lm_gen.step(None)
+             if out_codes is not None:
+                 output_codes_list.append(out_codes)
+ 
+             # Check for end of generation (silence detection)
+             if len(output_codes_list) > 10:
+                 recent = torch.cat(output_codes_list[-5:], dim=-1)
+                 if recent.std() < 0.01:
+                     print(f"Silence detected at step {step}, stopping")
+                     break
+ 
+     if not output_codes_list:
+         raise gr.Error("No audio generated")
+ 
+     # Concatenate output codes
+     output_codes = torch.cat(output_codes_list, dim=-1)
+     print(f"Output codes shape: {output_codes.shape}")
+ 
+     # Decode with Mimi
+     print("Decoding output audio...")
+     mimi_decoder.reset_streaming()
+     with torch.no_grad():
+         output_audio = mimi_decoder.decode(output_codes)
+ 
+     # Convert to numpy
+     output_audio = output_audio.squeeze().cpu().numpy()
+ 
+     # Normalize output
+     max_val = np.abs(output_audio).max()
+     if max_val > 0:
+         output_audio = output_audio / max_val * 0.9
+ 
+     output_audio = (output_audio * 32767).astype(np.int16)
+ 
+     print(f"Output audio: {len(output_audio)} samples ({len(output_audio)/SAMPLE_RATE:.2f}s)")
+ 
+     return (SAMPLE_RATE, output_audio)
+ 
+ # ============================================================================
+ # Gradio Interface
+ # ============================================================================
+ 
+ def create_demo():
+     """Create the Gradio demo interface."""
+ 
+     with gr.Blocks(
+         title="PersonaPlex 7B Demo",
+         theme=gr.themes.Soft(),
+     ) as demo:
+         gr.Markdown("""
+         # PersonaPlex 7B Demo
+ 
+         Interactive speech-to-speech demo using [nvidia/personaplex-7b-v1](https://huggingface.co/nvidia/personaplex-7b-v1).
+ 
+         Record or upload audio, select a persona and voice, then generate a response.
+         """)
+ 
+         with gr.Row():
+             with gr.Column(scale=1):
+                 # Input section
+                 audio_input = gr.Audio(
+                     label="Input Audio",
+                     sources=["microphone", "upload"],
+                     type="numpy",
+                 )
+ 
+                 persona_dropdown = gr.Dropdown(
+                     label="Persona",
+                     choices=list(PERSONAS.keys()),
+                     value="Helpful Assistant",
+                 )
+ 
+                 voice_dropdown = gr.Dropdown(
+                     label="Voice",
+                     choices=list(VOICES.keys()),
+                     value="Default",
+                 )
+ 
+                 with gr.Accordion("Advanced Settings", open=False):
+                     temperature_slider = gr.Slider(
+                         label="Temperature",
+                         minimum=0.1,
+                         maximum=1.5,
+                         value=0.7,
+                         step=0.1,
+                     )
+ 
+                     top_k_slider = gr.Slider(
+                         label="Top-K",
+                         minimum=50,
+                         maximum=500,
+                         value=250,
+                         step=50,
+                     )
+ 
+                     max_duration_slider = gr.Slider(
+                         label="Max Duration (seconds)",
+                         minimum=1.0,
+                         maximum=30.0,
+                         value=10.0,
+                         step=1.0,
+                     )
+ 
+                 generate_btn = gr.Button("Generate Response", variant="primary")
+ 
+             with gr.Column(scale=1):
+                 # Output section
+                 audio_output = gr.Audio(
+                     label="Generated Response",
+                     type="numpy",
+                 )
+ 
+                 gr.Markdown("""
+                 ### Tips
+                 - Speak clearly into the microphone
+                 - Keep input audio under 30 seconds
+                 - Try different personas for varied responses
+                 - Adjust temperature for more/less creative outputs
+                 """)
+ 
+         # Connect the generate button
+         generate_btn.click(
+             fn=generate_response,
+             inputs=[
+                 audio_input,
+                 persona_dropdown,
+                 voice_dropdown,
+                 temperature_slider,
+                 top_k_slider,
+                 max_duration_slider,
+             ],
+             outputs=audio_output,
+         )
+ 
+         # Examples
+         gr.Markdown("### Examples")
+         gr.Markdown("Record a greeting like 'Hello, how are you?' and try different personas!")
+ 
+     return demo
+ 
+ 
+ # ============================================================================
+ # Main
+ # ============================================================================
+ 
+ if __name__ == "__main__":
+     print("Creating Gradio demo...")
+     demo = create_demo()
+ 
+     # Queue for handling concurrent requests (ZeroGPU friendly)
+     demo.queue(default_concurrency_limit=1, max_size=16)
+ 
+     print("Launching demo...")
+     demo.launch()
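The input-gain policy inside `generate_response()` above can be isolated and checked on its own. A numpy-only sketch (the `normalize_input` name is mine; the thresholds and scale factors are copied from the diff):

```python
import numpy as np

def normalize_input(audio: np.ndarray) -> np.ndarray:
    # Same policy as generate_response(): cast to float32, pull peaks
    # above 1.0 back into [-1, 1], and boost near-silent recordings.
    audio = audio.astype(np.float32)
    peak = float(np.abs(audio).max())
    if peak > 1.0:
        audio = audio / peak           # e.g. raw int16-range input
    elif 0 < peak < 0.1:
        audio = audio / peak * 0.5     # quiet input -> peak of 0.5
    return audio

loud = normalize_input(np.array([16000.0, -32000.0]))
quiet = normalize_input(np.array([0.02, -0.05]))
print(loud)    # peak rescaled to 1.0 -> [0.5, -1.0]
print(quiet)   # peak boosted to 0.5 -> [0.2, -0.5]
```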
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ gradio>=4.0.0
+ spaces
+ torch>=2.2.0,<2.5
+ numpy>=1.26,<2.0
+ huggingface_hub>=0.24,<0.26
+ sentencepiece==0.2.*
+ safetensors>=0.4.0,<0.5
+ sphn>=0.1.4,<0.2
+ aiohttp>=3.10,<3.11
+ einops==0.7.*
+ git+https://github.com/NVIDIA/personaplex.git#subdirectory=moshi