File size: 8,920 Bytes
fb33bb7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
"""PersonaFlow - Interactive Audio Character Demo for Hugging Face Spaces."""
import logging
import os
from pathlib import Path

import gradio as gr
import numpy as np

# Configure logging once at import time: INFO level, with timestamp, logger
# name and level prefixed to every message.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Check if running on Hugging Face Spaces: the app treats a set SPACE_ID
# environment variable as the signal.
IS_SPACES = os.environ.get("SPACE_ID") is not None

# Import spaces conditionally: the package is only referenced from code paths
# that are themselves guarded by IS_SPACES.
if IS_SPACES:
    import spaces

# Import local modules
from config.characters import get_character, get_all_characters, DEFAULT_CHARACTER_ID

# Module-level cache for the audio pipeline. It starts as None and is filled
# in by get_pipeline() on first use, so importing this module does not import
# src.pipeline (presumably to avoid loading models at import time).
_pipeline = None


def get_pipeline():
    """Return the shared AudioPipeline, building it on the first call.

    The pipeline class is imported inside the function so that importing this
    module does not import ``src.pipeline``. The instance is stored in the
    module-level ``_pipeline`` and handed back on every later call. The
    pipeline is created with device "cuda" when ``IS_SPACES`` is true and
    "cpu" otherwise.
    """
    global _pipeline
    if _pipeline is not None:
        return _pipeline

    from src.pipeline import AudioPipeline

    target_device = "cpu"
    if IS_SPACES:
        target_device = "cuda"
    _pipeline = AudioPipeline(device=target_device)
    return _pipeline


def _process_audio_impl(audio_tuple, character_id, conversation_history):
    """Implementation of audio processing pipeline."""
    if audio_tuple is None:
        return None, "", "", "No audio recorded"

    sample_rate, audio_data = audio_tuple

    # Check for valid audio
    if len(audio_data) == 0:
        return None, "", "", "No audio detected"

    # Get character
    character = get_character(character_id)
    if character is None:
        character = get_character(DEFAULT_CHARACTER_ID)

    logger.info(f"Processing audio for character: {character.name}")

    try:
        # Get pipeline and process
        pipeline = get_pipeline()
        audio_out, user_text, response_text, timings = pipeline.process(
            audio_tuple=audio_tuple,
            system_prompt=character.system_prompt,
            voice=character.voice,
            conversation_history=conversation_history,
        )

        # Format timing info
        timing_str = f"STT: {timings['stt']*1000:.0f}ms | LLM: {timings['llm']*1000:.0f}ms | TTS: {timings['tts']*1000:.0f}ms | Total: {timings['total']*1000:.0f}ms"

        return audio_out, user_text, response_text, timing_str

    except Exception as e:
        logger.error(f"Error processing audio: {e}", exc_info=True)
        return None, "", f"Error: {str(e)}", ""


# Single entry point for audio processing. The function itself is a plain
# pass-through; the Spaces GPU decorator is applied only when on Spaces.
def process_audio_gpu(audio_tuple, character_id, conversation_history):
    """Forward to ``_process_audio_impl`` and return its 4-tuple result."""
    return _process_audio_impl(audio_tuple, character_id, conversation_history)


if IS_SPACES:
    # Same effect as decorating the definition with @spaces.GPU(duration=30).
    process_audio_gpu = spaces.GPU(duration=30)(process_audio_gpu)


def create_portrait_html(character):
    """Build the HTML snippet for a character's portrait and status badge.

    Args:
        character: Object exposing ``id`` (used to pick the emoji) and
            ``portrait_color`` (inserted as the CSS background of the circle).

    Returns:
        An HTML string with a circular portrait containing the character's
        emoji and a mouth overlay, followed by a status indicator that reads
        "Ready to listen". The ``portrait-*``, ``mouth-*`` and ``status-*``
        class names are emitted for styling elsewhere (presumably the custom
        stylesheet).
    """
    # Emoji per known character id; any other id gets the glowing star.
    emoji_by_id = {
        'visionary': '🚀',
        'skeptic': '🤔',
    }
    emoji = emoji_by_id.get(character.id, '🌟')
    return f"""

    <div class="portrait-container portrait-idle" style="

        width: 200px;

        height: 200px;

        border-radius: 50%;

        background: {character.portrait_color};

        margin: 0 auto;

        display: flex;

        align-items: center;

        justify-content: center;

        box-shadow: 0 4px 20px rgba(0, 0, 0, 0.2);

        position: relative;

    ">

        <div class="portrait-placeholder" style="font-size: 80px;">

            {emoji}

        </div>

        <div class="mouth-overlay mouth-closed" style="

            position: absolute;

            bottom: 25%;

            left: 50%;

            transform: translateX(-50%);

            width: 40px;

            height: 8px;

            background: rgba(0, 0, 0, 0.2);

            border-radius: 4px;

        "></div>

    </div>

    <div class="status-indicator status-idle" style="

        display: flex;

        align-items: center;

        justify-content: center;

        gap: 8px;

        padding: 8px 16px;

        border-radius: 20px;

        margin: 15px auto;

        width: fit-content;

        background: #f3f4f6;

    ">

        <div class="status-dot" style="width: 8px; height: 8px; border-radius: 50%; background: #9ca3af;"></div>

        <span class="status-text">Ready to listen</span>

    </div>

    """


def on_audio_record(audio, character_id, history):
    """React to a finished recording and produce the next UI values.

    Args:
        audio: Recorded audio value, or None when there is nothing to process.
        character_id: Identifier of the currently selected character.
        history: List of ``(user_text, response_text)`` pairs, or None.

    Returns:
        ``(audio_out, timing, history, history)``. The history list appears
        twice: the event wiring sends one copy to the chatbot and one to the
        state. Without audio the incoming history is returned unchanged.
    """
    turns = [] if history is None else history

    if audio is None:
        return None, "", turns, turns

    # Flatten (user, assistant) pairs into role-tagged messages for the LLM.
    llm_messages = [
        {"role": role, "content": text}
        for user_msg, assistant_msg in turns
        for role, text in (("user", user_msg), ("assistant", assistant_msg))
    ]

    audio_out, user_text, response_text, timing = process_audio_gpu(
        audio, character_id, llm_messages
    )

    # Work on a copy so the incoming list is never mutated; a turn is recorded
    # only when both sides of the exchange are non-empty.
    updated_turns = [*turns]
    if not (user_text and response_text):
        return audio_out, timing, updated_turns, updated_turns

    updated_turns.append((user_text, response_text))
    return audio_out, timing, updated_turns, updated_turns


def update_character_info(character_id):
    """Produce the UI values for a newly selected character.

    Returns a 4-tuple: the character's info markdown (bold tagline, blank
    line, description), its portrait HTML, and two fresh empty lists that
    reset the conversation. When the id does not resolve to a character, both
    strings are empty.
    """
    selected = get_character(character_id)
    if not selected:
        return "", "", [], []

    info_markdown = f"**{selected.tagline}**\n\n{selected.description}"
    portrait = create_portrait_html(selected)
    return info_markdown, portrait, [], []


def clear_conversation():
    """Return two fresh empty lists to reset the conversation."""
    empty_chat = []
    empty_state = []
    return empty_chat, empty_state


# Load the optional custom stylesheet that sits in static/ next to this file.
# It is decoded as UTF-8 explicitly so the result does not depend on the
# platform's default encoding; when the file is absent no custom CSS is used.
css_path = Path(__file__).parent / "static" / "styles.css"
custom_css = css_path.read_text(encoding="utf-8") if css_path.is_file() else ""


# Build the Gradio interface: three columns (character picker, portrait and
# audio, conversation) plus the event wiring that connects them.
with gr.Blocks(
    title="PersonaFlow",
    theme=gr.themes.Soft(),
    css=custom_css,
) as demo:
    # Sign-in option, added to work around the GPU issue seen by
    # non-registered users.
    gr.LoginButton(value="Sign in to use your Pro Quota")

    # Conversation history as a list of (user_text, response_text) pairs.
    conversation_state = gr.State([])

    # Header
    gr.Markdown("""

    # 🎭 PersonaFlow

    ### Speak with AI characters that have distinct personalities and voices



    Select a character, then click the microphone to start talking!

    """)

    with gr.Row():
        # Left column: Character selection
        with gr.Column(scale=1):
            gr.Markdown("### Choose Your Character")

            character_dropdown = gr.Dropdown(
                choices=[(c.name, c.id) for c in get_all_characters()],
                value=DEFAULT_CHARACTER_ID,
                label="Character",
                interactive=True,
            )

            # Character info, initialised from the default character.
            default_char = get_character(DEFAULT_CHARACTER_ID)
            character_info = gr.Markdown(
                f"**{default_char.tagline}**\n\n{default_char.description}"
            )

        # Middle column: Portrait and audio
        with gr.Column(scale=2):
            # Portrait display (reuses the default character looked up above)
            portrait_html = gr.HTML(
                value=create_portrait_html(default_char),
            )

            # Audio input
            audio_input = gr.Audio(
                sources=["microphone"],
                type="numpy",
                label="🎤 Click to speak",
                max_length=10,
            )

            # Audio output
            audio_output = gr.Audio(
                label="Character Response",
                type="numpy",
                autoplay=True,
            )

            # Timing display; also receives status messages such as
            # "No audio recorded" from the processing function.
            timing_display = gr.Textbox(
                label="Processing Time",
                interactive=False,
            )

        # Right column: Conversation
        with gr.Column(scale=1):
            gr.Markdown("### Conversation")

            chatbot = gr.Chatbot(
                label="Chat History",
                height=400,
            )

            clear_btn = gr.Button("🗑️ Clear Conversation", variant="secondary")

    # Event handlers
    # Switching character refreshes the info and portrait and resets the chat.
    character_dropdown.change(
        fn=update_character_info,
        inputs=[character_dropdown],
        outputs=[character_info, portrait_html, chatbot, conversation_state],
    )

    # Audio processing runs when the microphone recording stops.
    audio_input.stop_recording(
        fn=on_audio_record,
        inputs=[audio_input, character_dropdown, conversation_state],
        outputs=[audio_output, timing_display, chatbot, conversation_state],
    )

    # Clear conversation
    clear_btn.click(
        fn=clear_conversation,
        outputs=[chatbot, conversation_state],
    )

# Launch the app only when this file is run as a script, not when it is
# imported; show_api=False is passed through to Gradio's launch().
if __name__ == "__main__":
    demo.launch(show_api=False)