File size: 8,920 Bytes

fb33bb7

"""PersonaFlow - Interactive Audio Character Demo for Hugging Face Spaces."""
import logging
import os
from pathlib import Path

import gradio as gr
import numpy as np

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Check if running on Hugging Face Spaces
IS_SPACES = os.environ.get("SPACE_ID") is not None

# Import spaces conditionally
if IS_SPACES:
    import spaces

# Import local modules
from config.characters import get_character, get_all_characters, DEFAULT_CHARACTER_ID

# Lazy import pipeline to avoid loading models at import time
_pipeline = None


def get_pipeline():
    """Get the audio pipeline, creating it if needed."""
    global _pipeline
    if _pipeline is None:
        from src.pipeline import AudioPipeline
        device = "cuda" if IS_SPACES else "cpu"
        _pipeline = AudioPipeline(device=device)
    return _pipeline


def _process_audio_impl(audio_tuple, character_id, conversation_history):
    """Implementation of audio processing pipeline."""
    if audio_tuple is None:
        return None, "", "", "No audio recorded"

    sample_rate, audio_data = audio_tuple

    # Check for valid audio
    if len(audio_data) == 0:
        return None, "", "", "No audio detected"

    # Get character
    character = get_character(character_id)
    if character is None:
        character = get_character(DEFAULT_CHARACTER_ID)

    logger.info(f"Processing audio for character: {character.name}")

    try:
        # Get pipeline and process
        pipeline = get_pipeline()
        audio_out, user_text, response_text, timings = pipeline.process(
            audio_tuple=audio_tuple,
            system_prompt=character.system_prompt,
            voice=character.voice,
            conversation_history=conversation_history,
        )

        # Format timing info
        timing_str = f"STT: {timings['stt']*1000:.0f}ms | LLM: {timings['llm']*1000:.0f}ms | TTS: {timings['tts']*1000:.0f}ms | Total: {timings['total']*1000:.0f}ms"

        return audio_out, user_text, response_text, timing_str

    except Exception as e:
        logger.error(f"Error processing audio: {e}", exc_info=True)
        return None, "", f"Error: {str(e)}", ""


# Define the GPU-decorated function conditionally
if IS_SPACES:
    @spaces.GPU(duration=30)
    def process_audio_gpu(audio_tuple, character_id, conversation_history):
        """Process audio with GPU acceleration on Spaces."""
        return _process_audio_impl(audio_tuple, character_id, conversation_history)
else:
    def process_audio_gpu(audio_tuple, character_id, conversation_history):
        """Process audio locally (no GPU decorator)."""
        return _process_audio_impl(audio_tuple, character_id, conversation_history)


def create_portrait_html(character):
    """Create HTML for the animated portrait."""
    emoji = '🚀' if character.id == 'visionary' else '🤔' if character.id == 'skeptic' else '🌟'
    return f"""

    <div class="portrait-container portrait-idle" style="

        width: 200px;

        height: 200px;

        border-radius: 50%;

        background: {character.portrait_color};

        margin: 0 auto;

        display: flex;

        align-items: center;

        justify-content: center;

        box-shadow: 0 4px 20px rgba(0, 0, 0, 0.2);

        position: relative;

    ">

        <div class="portrait-placeholder" style="font-size: 80px;">

            {emoji}

        </div>

        <div class="mouth-overlay mouth-closed" style="

            position: absolute;

            bottom: 25%;

            left: 50%;

            transform: translateX(-50%);

            width: 40px;

            height: 8px;

            background: rgba(0, 0, 0, 0.2);

            border-radius: 4px;

        "></div>

    </div>

    <div class="status-indicator status-idle" style="

        display: flex;

        align-items: center;

        justify-content: center;

        gap: 8px;

        padding: 8px 16px;

        border-radius: 20px;

        margin: 15px auto;

        width: fit-content;

        background: #f3f4f6;

    ">

        <div class="status-dot" style="width: 8px; height: 8px; border-radius: 50%; background: #9ca3af;"></div>

        <span class="status-text">Ready to listen</span>

    </div>

    """


def on_audio_record(audio, character_id, history):
    """Handle audio recording completion."""
    if history is None:
        history = []

    if audio is None:
        return None, "", history, history

    # Convert history (list of tuples) to format expected by LLM
    conversation_history = []
    for user_msg, assistant_msg in history:
        conversation_history.append({"role": "user", "content": user_msg})
        conversation_history.append({"role": "assistant", "content": assistant_msg})

    # Process audio
    audio_out, user_text, response_text, timing = process_audio_gpu(
        audio, character_id, conversation_history
    )

    # Update history (Gradio 4.x uses list of tuples)
    new_history = list(history)
    if user_text and response_text:
        new_history.append((user_text, response_text))

    return audio_out, timing, new_history, new_history


def update_character_info(character_id):
    """Update character info when selection changes."""
    char = get_character(character_id)
    if char:
        return f"**{char.tagline}**\n\n{char.description}", create_portrait_html(char), [], []
    return "", "", [], []


def clear_conversation():
    """Clear the conversation history."""
    return [], []


# Load CSS
css_path = Path(__file__).parent / "static" / "styles.css"
custom_css = ""
if css_path.exists():
    custom_css = css_path.read_text()


# Build the Gradio interface
with gr.Blocks(
    title="PersonaFlow",
    theme=gr.themes.Soft(),
    css=custom_css,
) as demo:
    # Sign in option to get rid of non-registered user GPU bug
    gr.LoginButton(value="Sign in to use your Pro Quota")
    
    # State
    conversation_state = gr.State([])

    # Header
    gr.Markdown("""

    # 🎭 PersonaFlow

    ### Speak with AI characters that have distinct personalities and voices



    Select a character, then click the microphone to start talking!

    """)

    with gr.Row():
        # Left column: Character selection
        with gr.Column(scale=1):
            gr.Markdown("### Choose Your Character")

            character_dropdown = gr.Dropdown(
                choices=[(c.name, c.id) for c in get_all_characters()],
                value=DEFAULT_CHARACTER_ID,
                label="Character",
                interactive=True,
            )

            # Character info
            default_char = get_character(DEFAULT_CHARACTER_ID)
            character_info = gr.Markdown(
                f"**{default_char.tagline}**\n\n{default_char.description}"
            )

        # Middle column: Portrait and audio
        with gr.Column(scale=2):
            # Portrait display
            portrait_html = gr.HTML(
                value=create_portrait_html(get_character(DEFAULT_CHARACTER_ID)),
            )

            # Audio input
            audio_input = gr.Audio(
                sources=["microphone"],
                type="numpy",
                label="🎤 Click to speak",
                max_length=10,
            )

            # Audio output
            audio_output = gr.Audio(
                label="Character Response",
                type="numpy",
                autoplay=True,
            )

            # Timing display
            timing_display = gr.Textbox(
                label="Processing Time",
                interactive=False,
            )

        # Right column: Conversation
        with gr.Column(scale=1):
            gr.Markdown("### Conversation")

            chatbot = gr.Chatbot(
                label="Chat History",
                height=400,
            )

            clear_btn = gr.Button("🗑️ Clear Conversation", variant="secondary")

    # Event handlers
    character_dropdown.change(
        fn=update_character_info,
        inputs=[character_dropdown],
        outputs=[character_info, portrait_html, chatbot, conversation_state],
    )

    # Audio processing
    audio_input.stop_recording(
        fn=on_audio_record,
        inputs=[audio_input, character_dropdown, conversation_state],
        outputs=[audio_output, timing_display, chatbot, conversation_state],
    )

    # Clear conversation
    clear_btn.click(
        fn=clear_conversation,
        outputs=[chatbot, conversation_state],
    )

if __name__ == "__main__":
    demo.launch(show_api=False)