Kailing-Leifang
/

PersonaFlow

+"""PersonaFlow - Interactive Audio Character Demo for Hugging Face Spaces."""
+import logging
+import os
+from pathlib import Path
+import gradio as gr
+import numpy as np
# Configure logging: INFO level, timestamp / logger name / level / message
LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
logger = logging.getLogger(__name__)

# Running on Hugging Face Spaces is detected via the SPACE_ID environment variable
IS_SPACES = "SPACE_ID" in os.environ

# The `spaces` package is imported only when running on Spaces
if IS_SPACES:
    import spaces
+# Import local modules
+from config.characters import get_character, get_all_characters, DEFAULT_CHARACTER_ID
# The pipeline is created lazily so models are not loaded at import time
_pipeline = None


def get_pipeline():
    """Return the shared AudioPipeline, constructing it on the first call.

    The instance is cached in the module-level ``_pipeline``; it is built
    with device "cuda" when IS_SPACES is true and "cpu" otherwise.
    """
    global _pipeline
    if _pipeline is not None:
        return _pipeline
    # Deferred import keeps src.pipeline out of module import
    from src.pipeline import AudioPipeline
    _pipeline = AudioPipeline(device="cuda" if IS_SPACES else "cpu")
    return _pipeline
+def _process_audio_impl(audio_tuple, character_id, conversation_history):
+    """Implementation of audio processing pipeline."""
+    if audio_tuple is None:
+        return None, "", "", "No audio recorded"
+    sample_rate, audio_data = audio_tuple
+    # Check for valid audio
+    if len(audio_data) == 0:
+        return None, "", "", "No audio detected"
+    # Get character
+    character = get_character(character_id)
+    if character is None:
+        character = get_character(DEFAULT_CHARACTER_ID)
+    logger.info(f"Processing audio for character: {character.name}")
+    try:
+        # Get pipeline and process
+        pipeline = get_pipeline()
+        audio_out, user_text, response_text, timings = pipeline.process(
+            audio_tuple=audio_tuple,
+            system_prompt=character.system_prompt,
+            voice=character.voice,
+            conversation_history=conversation_history,
+        )
+        # Format timing info
+        timing_str = f"STT: {timings['stt']*1000:.0f}ms | LLM: {timings['llm']*1000:.0f}ms | TTS: {timings['tts']*1000:.0f}ms | Total: {timings['total']*1000:.0f}ms"
+        return audio_out, user_text, response_text, timing_str
+    except Exception as e:
+        logger.error(f"Error processing audio: {e}", exc_info=True)
+        return None, "", f"Error: {str(e)}", ""
def process_audio_gpu(audio_tuple, character_id, conversation_history):
    """Process audio by delegating to ``_process_audio_impl``.

    Arguments are passed through unchanged and the implementation's
    return value is returned as-is.
    """
    return _process_audio_impl(audio_tuple, character_id, conversation_history)


# On Spaces the entry point is wrapped with the GPU decorator (duration=30);
# locally it stays undecorated.
if IS_SPACES:
    process_audio_gpu = spaces.GPU(duration=30)(process_audio_gpu)
def create_portrait_html(character):
    """Build the HTML snippet for a character's portrait and status badge.

    Reads ``character.id`` to pick the emoji ('visionary' and 'skeptic' have
    their own; any other id gets the star) and ``character.portrait_color``
    for the circle's background. Returns the markup as a string.
    """
    emoji_by_id = {'visionary': '🚀', 'skeptic': '🤔'}
    glyph = emoji_by_id.get(character.id, '🌟')
    fill = character.portrait_color
    markup = f"""
    <div class="portrait-container portrait-idle" style="
        width: 200px;
        height: 200px;
        border-radius: 50%;
        background: {fill};
        margin: 0 auto;
        display: flex;
        align-items: center;
        justify-content: center;
        box-shadow: 0 4px 20px rgba(0, 0, 0, 0.2);
        position: relative;
    ">
        <div class="portrait-placeholder" style="font-size: 80px;">
            {glyph}
        </div>
        <div class="mouth-overlay mouth-closed" style="
            position: absolute;
            bottom: 25%;
            left: 50%;
            transform: translateX(-50%);
            width: 40px;
            height: 8px;
            background: rgba(0, 0, 0, 0.2);
            border-radius: 4px;
        "></div>
    </div>
    <div class="status-indicator status-idle" style="
        display: flex;
        align-items: center;
        justify-content: center;
        gap: 8px;
        padding: 8px 16px;
        border-radius: 20px;
        margin: 15px auto;
        width: fit-content;
        background: #f3f4f6;
    ">
        <div class="status-dot" style="width: 8px; height: 8px; border-radius: 50%; background: #9ca3af;"></div>
        <span class="status-text">Ready to listen</span>
    </div>
    """
    return markup
def on_audio_record(audio, character_id, history):
    """Handle a finished recording and produce the updated UI values.

    ``history`` is a list of ``(user_text, assistant_text)`` pairs (``None``
    is treated as empty). Returns ``(audio_out, timing, history, history)``;
    the history is returned twice, once for the chat display and once for
    the stored state. A new pair is appended only when both the user text
    and the response text are non-empty.
    """
    turns = [] if history is None else history
    if audio is None:
        # Nothing recorded: leave the history untouched
        return None, "", turns, turns

    # Flatten the pairs into role/content messages for the LLM
    llm_messages = [
        {"role": speaker, "content": text}
        for user_msg, assistant_msg in turns
        for speaker, text in (("user", user_msg), ("assistant", assistant_msg))
    ]

    audio_out, user_text, response_text, timing = process_audio_gpu(
        audio, character_id, llm_messages
    )

    # Work on a copy so the incoming list is not mutated
    updated = list(turns)
    if user_text and response_text:
        updated.append((user_text, response_text))
    return audio_out, timing, updated, updated
def update_character_info(character_id):
    """Return the UI values for a newly selected character.

    Produces ``(info_markdown, portrait_html, [], [])``: the tagline and
    description as Markdown, the portrait markup, and two empty lists that
    reset the conversation. When the character lookup yields nothing, the
    first two values are empty strings.
    """
    selected = get_character(character_id)
    if not selected:
        return "", "", [], []
    summary = f"**{selected.tagline}**\n\n{selected.description}"
    return summary, create_portrait_html(selected), [], []
def clear_conversation():
    """Return two fresh empty lists used to reset the conversation."""
    empty_chat, empty_state = [], []
    return empty_chat, empty_state
# Load CSS from static/styles.css next to this file, if present
css_path = Path(__file__).parent / "static" / "styles.css"
# Decode explicitly as UTF-8 so the stylesheet reads the same regardless of
# the platform's default encoding; a missing file (or a non-file at that
# path) leaves the custom CSS empty.
custom_css = css_path.read_text(encoding="utf-8") if css_path.is_file() else ""
# Build the Gradio interface
with gr.Blocks(title="PersonaFlow", theme=gr.themes.Soft(), css=custom_css) as demo:
    # Sign in option to get rid of non-registered user GPU bug
    gr.LoginButton(value="Sign in to use your Pro Quota")

    # Conversation history state, starts empty
    conversation_state = gr.State([])

    # Page header
    gr.Markdown("""
    # 🎭 PersonaFlow
    ### Speak with AI characters that have distinct personalities and voices
    Select a character, then click the microphone to start talking!
    """)

    with gr.Row():
        # Left column: character picker and description
        with gr.Column(scale=1):
            gr.Markdown("### Choose Your Character")
            character_choices = [
                (persona.name, persona.id) for persona in get_all_characters()
            ]
            character_dropdown = gr.Dropdown(
                choices=character_choices,
                value=DEFAULT_CHARACTER_ID,
                label="Character",
                interactive=True,
            )

            # Description shown for the default character until the selection changes
            default_char = get_character(DEFAULT_CHARACTER_ID)
            default_blurb = f"**{default_char.tagline}**\n\n{default_char.description}"
            character_info = gr.Markdown(default_blurb)

        # Middle column: portrait, microphone input, spoken reply, timings
        with gr.Column(scale=2):
            initial_portrait = create_portrait_html(get_character(DEFAULT_CHARACTER_ID))
            portrait_html = gr.HTML(value=initial_portrait)

            audio_input = gr.Audio(
                sources=["microphone"],
                type="numpy",
                label="🎤 Click to speak",
                max_length=10,
            )

            audio_output = gr.Audio(label="Character Response", type="numpy", autoplay=True)

            timing_display = gr.Textbox(label="Processing Time", interactive=False)

        # Right column: chat transcript and reset button
        with gr.Column(scale=1):
            gr.Markdown("### Conversation")
            chatbot = gr.Chatbot(label="Chat History", height=400)
            clear_btn = gr.Button("🗑️ Clear Conversation", variant="secondary")

    # Event wiring

    # Changing the character refreshes its info and portrait and resets the chat
    character_dropdown.change(
        fn=update_character_info,
        inputs=[character_dropdown],
        outputs=[character_info, portrait_html, chatbot, conversation_state],
    )

    # Finishing a recording runs the audio handler
    audio_input.stop_recording(
        fn=on_audio_record,
        inputs=[audio_input, character_dropdown, conversation_state],
        outputs=[audio_output, timing_display, chatbot, conversation_state],
    )

    # The clear button empties both the chat display and the stored history
    clear_btn.click(fn=clear_conversation, outputs=[chatbot, conversation_state])
if __name__ == "__main__":
    # Launch the app only when this file is run as a script
    launch_kwargs = {"show_api": False}
    demo.launch(**launch_kwargs)