"""PersonaFlow - Interactive Audio Character Demo for Hugging Face Spaces."""

import logging
import os
from pathlib import Path

import gradio as gr
import numpy as np

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Check if running on Hugging Face Spaces
IS_SPACES = os.environ.get("SPACE_ID") is not None

# Import spaces conditionally (only needed for the GPU decorator below)
if IS_SPACES:
    import spaces

# Import local modules
from config.characters import get_character, get_all_characters, DEFAULT_CHARACTER_ID

# Lazy import pipeline to avoid loading models at import time
_pipeline = None

# Portrait emoji per character id; any other id falls back to the default.
_PORTRAIT_EMOJI = {
    'visionary': '🚀',
    'skeptic': '🤔',
}
_DEFAULT_PORTRAIT_EMOJI = '🌟'


def get_pipeline():
    """Get the audio pipeline, creating it if needed.

    The pipeline is built on first use and cached in the module-level
    ``_pipeline`` so later calls reuse the same instance. The device is
    "cuda" when running on Spaces and "cpu" otherwise.
    """
    global _pipeline
    if _pipeline is None:
        # Imported here so that importing this module does not load models.
        from src.pipeline import AudioPipeline

        device = "cuda" if IS_SPACES else "cpu"
        _pipeline = AudioPipeline(device=device)
    return _pipeline


def _process_audio_impl(audio_tuple, character_id, conversation_history):
    """Run one recorded utterance through the audio pipeline.

    Args:
        audio_tuple: ``(sample_rate, audio_data)`` as produced by the
            microphone component, or ``None`` when nothing was recorded.
        character_id: Id of the selected character; an unknown id falls
            back to ``DEFAULT_CHARACTER_ID``.
        conversation_history: Prior turns as a list of
            ``{"role": ..., "content": ...}`` dicts.

    Returns:
        A 4-tuple ``(audio_out, user_text, response_text, status)``. On
        success ``status`` is the formatted timing line. When there is no
        audio, ``status`` carries the reason and the texts are empty. On a
        pipeline error, ``response_text`` carries ``"Error: ..."`` and
        ``status`` is empty.
    """
    if audio_tuple is None:
        return None, "", "", "No audio recorded"

    _sample_rate, audio_data = audio_tuple

    # Check for valid audio
    if len(audio_data) == 0:
        return None, "", "", "No audio detected"

    # Get character, falling back to the default for unknown ids
    character = get_character(character_id)
    if character is None:
        character = get_character(DEFAULT_CHARACTER_ID)

    logger.info("Processing audio for character: %s", character.name)

    try:
        # Get pipeline and process
        pipeline = get_pipeline()
        audio_out, user_text, response_text, timings = pipeline.process(
            audio_tuple=audio_tuple,
            system_prompt=character.system_prompt,
            voice=character.voice,
            conversation_history=conversation_history,
        )

        # Format timing info (pipeline timings are multiplied by 1000 for ms)
        timing_str = (
            f"STT: {timings['stt']*1000:.0f}ms | "
            f"LLM: {timings['llm']*1000:.0f}ms | "
            f"TTS: {timings['tts']*1000:.0f}ms | "
            f"Total: {timings['total']*1000:.0f}ms"
        )
        return audio_out, user_text, response_text, timing_str
    except Exception as e:
        # Top-level boundary for the UI callback: log with traceback and
        # report the failure to the caller instead of crashing the handler.
        logger.exception("Error processing audio: %s", e)
        return None, "", f"Error: {str(e)}", ""


# Define the GPU-decorated function conditionally
if IS_SPACES:
    @spaces.GPU(duration=30)
    def process_audio_gpu(audio_tuple, character_id, conversation_history):
        """Process audio with GPU acceleration on Spaces."""
        return _process_audio_impl(audio_tuple, character_id, conversation_history)
else:
    def process_audio_gpu(audio_tuple, character_id, conversation_history):
        """Process audio locally (no GPU decorator)."""
        return _process_audio_impl(audio_tuple, character_id, conversation_history)


def create_portrait_html(character):
    """Create HTML for the animated portrait.

    Args:
        character: Character object; only its ``id`` is read here.

    Returns:
        The portrait snippet containing the character's emoji and the
        "Ready to listen" caption.
    """
    emoji = _PORTRAIT_EMOJI.get(character.id, _DEFAULT_PORTRAIT_EMOJI)
    # NOTE(review): the template below contains no HTML tags; the wrapping
    # markup may have been lost upstream - confirm against static/styles.css.
    return f"\n{emoji}\nReady to listen\n"


def on_audio_record(audio, character_id, history):
    """Handle audio recording completion.

    Args:
        audio: ``(sample_rate, audio_data)`` from the microphone, or ``None``.
        character_id: Id of the currently selected character.
        history: Conversation so far as a list of
            ``(user_text, response_text)`` tuples, or ``None``.

    Returns:
        ``(audio_out, status, chatbot_history, state_history)``; the last
        two entries are the same updated history list.
    """
    if history is None:
        history = []

    if audio is None:
        return None, "", history, history

    # Convert history (list of tuples) to format expected by LLM
    conversation_history = []
    for user_msg, assistant_msg in history:
        conversation_history.extend((
            {"role": "user", "content": user_msg},
            {"role": "assistant", "content": assistant_msg},
        ))

    # Process audio
    audio_out, user_text, response_text, timing = process_audio_gpu(
        audio, character_id, conversation_history
    )

    # Update history (Gradio 4.x uses list of tuples)
    new_history = list(history)
    if user_text and response_text:
        new_history.append((user_text, response_text))

    # On a pipeline failure the timing string is empty and the error text
    # arrives in response_text; show it instead of silently dropping it.
    status = timing or response_text

    return audio_out, status, new_history, new_history


def update_character_info(character_id):
    """Update character info when selection changes.

    Returns the info Markdown, the portrait HTML, and two empty lists that
    reset the chatbot and the conversation state. An unknown id yields
    empty strings for the first two values.
    """
    char = get_character(character_id)
    if char:
        return f"**{char.tagline}**\n\n{char.description}", create_portrait_html(char), [], []
    return "", "", [], []


def clear_conversation():
    """Clear the conversation history (chatbot and state)."""
    return [], []


# Load CSS
css_path = Path(__file__).parent / "static" / "styles.css"
custom_css = css_path.read_text(encoding="utf-8") if css_path.exists() else ""

# Build the Gradio interface
with gr.Blocks(
    title="PersonaFlow",
    theme=gr.themes.Soft(),
    css=custom_css,
) as demo:
    # Sign in option to get rid of non-registered user GPU bug
    gr.LoginButton(value="Sign in to use your Pro Quota")

    # State
    conversation_state = gr.State([])

    # Header (each Markdown heading must start its own line to render)
    gr.Markdown(
        "# 🎭 PersonaFlow\n"
        "### Speak with AI characters that have distinct personalities and voices\n"
        "\n"
        "Select a character, then click the microphone to start talking!\n"
    )

    with gr.Row():
        # Left column: Character selection
        with gr.Column(scale=1):
            gr.Markdown("### Choose Your Character")

            character_dropdown = gr.Dropdown(
                choices=[(c.name, c.id) for c in get_all_characters()],
                value=DEFAULT_CHARACTER_ID,
                label="Character",
                interactive=True,
            )

            # Character info
            default_char = get_character(DEFAULT_CHARACTER_ID)
            character_info = gr.Markdown(
                f"**{default_char.tagline}**\n\n{default_char.description}"
            )

        # Middle column: Portrait and audio
        with gr.Column(scale=2):
            # Portrait display
            portrait_html = gr.HTML(
                value=create_portrait_html(default_char),
            )

            # Audio input
            audio_input = gr.Audio(
                sources=["microphone"],
                type="numpy",
                label="🎤 Click to speak",
                max_length=10,
            )

            # Audio output
            audio_output = gr.Audio(
                label="Character Response",
                type="numpy",
                autoplay=True,
            )

            # Timing display
            timing_display = gr.Textbox(
                label="Processing Time",
                interactive=False,
            )

        # Right column: Conversation
        with gr.Column(scale=1):
            gr.Markdown("### Conversation")

            chatbot = gr.Chatbot(
                label="Chat History",
                height=400,
            )

            clear_btn = gr.Button("🗑️ Clear Conversation", variant="secondary")

    # Event handlers
    character_dropdown.change(
        fn=update_character_info,
        inputs=[character_dropdown],
        outputs=[character_info, portrait_html, chatbot, conversation_state],
    )

    # Audio processing
    audio_input.stop_recording(
        fn=on_audio_record,
        inputs=[audio_input, character_dropdown, conversation_state],
        outputs=[audio_output, timing_display, chatbot, conversation_state],
    )

    # Clear conversation
    clear_btn.click(
        fn=clear_conversation,
        outputs=[chatbot, conversation_state],
    )


if __name__ == "__main__":
    demo.launch(show_api=False)