Kailing-Leifang committed on
Commit
fb33bb7
·
verified ·
1 Parent(s): 27df14a

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +282 -0
app.py ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""PersonaFlow - Interactive Audio Character Demo for Hugging Face Spaces."""
import logging
import os
from pathlib import Path

import gradio as gr
import numpy as np

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Check if running on Hugging Face Spaces.
# NOTE(review): SPACE_ID is assumed to be set only by the Spaces runtime,
# so its presence is used to detect the hosting environment.
IS_SPACES = os.environ.get("SPACE_ID") is not None

# Import spaces conditionally — presumably the `spaces` package (ZeroGPU
# helpers) is only installed on Spaces hardware, so an unconditional
# import would fail locally.
if IS_SPACES:
    import spaces

# Import local modules
from config.characters import get_character, get_all_characters, DEFAULT_CHARACTER_ID

# Lazy import pipeline to avoid loading models at import time.
# Populated by get_pipeline() on first use; None means "not built yet".
_pipeline = None
def get_pipeline():
    """Return the shared AudioPipeline, constructing it lazily on first call.

    The import of ``src.pipeline`` is deferred to this function so that
    model weights are not loaded at module import time.
    """
    global _pipeline
    if _pipeline is not None:
        return _pipeline

    from src.pipeline import AudioPipeline

    # On Spaces we expect GPU hardware; locally fall back to CPU.
    _pipeline = AudioPipeline(device="cuda" if IS_SPACES else "cpu")
    return _pipeline
def _process_audio_impl(audio_tuple, character_id, conversation_history):
    """Run the STT → LLM → TTS pipeline for one recording.

    Returns a 4-tuple ``(audio_out, user_text, response_text, timing_str)``;
    on any failure the audio slot is ``None`` and a message is surfaced in
    the remaining fields.
    """
    # Guard: nothing was recorded at all.
    if audio_tuple is None:
        return None, "", "", "No audio recorded"

    rate, samples = audio_tuple

    # Guard: recording exists but contains no samples.
    if len(samples) == 0:
        return None, "", "", "No audio detected"

    # Resolve the character, falling back to the default on unknown ids.
    character = get_character(character_id)
    if character is None:
        character = get_character(DEFAULT_CHARACTER_ID)

    logger.info(f"Processing audio for character: {character.name}")

    try:
        pipeline = get_pipeline()
        audio_out, user_text, response_text, timings = pipeline.process(
            audio_tuple=audio_tuple,
            system_prompt=character.system_prompt,
            voice=character.voice,
            conversation_history=conversation_history,
        )
    except Exception as e:
        logger.error(f"Error processing audio: {e}", exc_info=True)
        return None, "", f"Error: {str(e)}", ""

    # Human-readable per-stage timing summary (seconds → milliseconds).
    stages = (("STT", "stt"), ("LLM", "llm"), ("TTS", "tts"), ("Total", "total"))
    timing_str = " | ".join(
        f"{label}: {timings[key]*1000:.0f}ms" for label, key in stages
    )

    return audio_out, user_text, response_text, timing_str
# Define the processing entry point once; on Spaces it is wrapped with the
# ZeroGPU decorator so the call runs on allocated GPU hardware.
def process_audio_gpu(audio_tuple, character_id, conversation_history):
    """Process a recording (GPU-accelerated on Spaces, plain locally)."""
    return _process_audio_impl(audio_tuple, character_id, conversation_history)


if IS_SPACES:
    process_audio_gpu = spaces.GPU(duration=30)(process_audio_gpu)
def create_portrait_html(character):
    """Build the HTML snippet for a character's animated portrait.

    The markup carries CSS hook classes (``portrait-idle``,
    ``mouth-closed``, ``status-idle``) targeted by the external stylesheet.
    """
    # Emoji avatar per character id; unknown ids fall back to 🌟.
    emoji_by_id = {'visionary': '🚀', 'skeptic': '🤔'}
    emoji = emoji_by_id.get(character.id, '🌟')
    return f"""
    <div class="portrait-container portrait-idle" style="
        width: 200px;
        height: 200px;
        border-radius: 50%;
        background: {character.portrait_color};
        margin: 0 auto;
        display: flex;
        align-items: center;
        justify-content: center;
        box-shadow: 0 4px 20px rgba(0, 0, 0, 0.2);
        position: relative;
    ">
        <div class="portrait-placeholder" style="font-size: 80px;">
            {emoji}
        </div>
        <div class="mouth-overlay mouth-closed" style="
            position: absolute;
            bottom: 25%;
            left: 50%;
            transform: translateX(-50%);
            width: 40px;
            height: 8px;
            background: rgba(0, 0, 0, 0.2);
            border-radius: 4px;
        "></div>
    </div>
    <div class="status-indicator status-idle" style="
        display: flex;
        align-items: center;
        justify-content: center;
        gap: 8px;
        padding: 8px 16px;
        border-radius: 20px;
        margin: 15px auto;
        width: fit-content;
        background: #f3f4f6;
    ">
        <div class="status-dot" style="width: 8px; height: 8px; border-radius: 50%; background: #9ca3af;"></div>
        <span class="status-text">Ready to listen</span>
    </div>
    """
def on_audio_record(audio, character_id, history):
    """Handle completion of a microphone recording.

    Converts the tuple-based chat history into the role/content message
    list the LLM expects, runs the pipeline, and appends the new exchange.
    Returns ``(audio_out, timing, chatbot_history, state_history)``.
    """
    history = [] if history is None else history

    if audio is None:
        return None, "", history, history

    # Flatten (user, assistant) pairs into role/content message dicts.
    conversation_history = [
        {"role": role, "content": text}
        for user_msg, assistant_msg in history
        for role, text in (("user", user_msg), ("assistant", assistant_msg))
    ]

    audio_out, user_text, response_text, timing = process_audio_gpu(
        audio, character_id, conversation_history
    )

    # Append the new turn only when both sides produced text
    # (Gradio 4.x Chatbot uses a list of (user, assistant) tuples).
    new_history = list(history)
    if user_text and response_text:
        new_history.append((user_text, response_text))

    return audio_out, timing, new_history, new_history
def update_character_info(character_id):
    """Refresh the info panel and portrait for a newly selected character.

    Also returns two empty lists to reset the chatbot display and the
    conversation state.
    """
    char = get_character(character_id)
    if not char:
        # Unknown id: blank everything out.
        return "", "", [], []
    info_md = f"**{char.tagline}**\n\n{char.description}"
    return info_md, create_portrait_html(char), [], []
def clear_conversation():
    """Reset the chat: fresh empty lists for the Chatbot and the State."""
    empty_chat, empty_state = [], []
    return empty_chat, empty_state
# Load the optional external stylesheet. The encoding is pinned to UTF-8:
# a bare read_text() uses the platform's locale encoding, which can mangle
# or reject non-ASCII CSS (emoji, arrows) on non-UTF-8 systems.
css_path = Path(__file__).parent / "static" / "styles.css"
custom_css = css_path.read_text(encoding="utf-8") if css_path.exists() else ""
# Build the Gradio interface. Components must be created and event handlers
# wired inside the Blocks context manager.
with gr.Blocks(
    title="PersonaFlow",
    theme=gr.themes.Soft(),
    css=custom_css,
) as demo:
    # Sign in option to get rid of non-registered user GPU bug
    gr.LoginButton(value="Sign in to use your Pro Quota")

    # State: per-session conversation history as a list of
    # (user, assistant) tuples.
    conversation_state = gr.State([])

    # Header
    gr.Markdown("""
    # 🎭 PersonaFlow
    ### Speak with AI characters that have distinct personalities and voices

    Select a character, then click the microphone to start talking!
    """)

    with gr.Row():
        # Left column: Character selection
        with gr.Column(scale=1):
            gr.Markdown("### Choose Your Character")

            # (display name, id) pairs; the id is the value passed to handlers.
            character_dropdown = gr.Dropdown(
                choices=[(c.name, c.id) for c in get_all_characters()],
                value=DEFAULT_CHARACTER_ID,
                label="Character",
                interactive=True,
            )

            # Character info shown below the dropdown, pre-filled with
            # the default character's tagline/description.
            default_char = get_character(DEFAULT_CHARACTER_ID)
            character_info = gr.Markdown(
                f"**{default_char.tagline}**\n\n{default_char.description}"
            )

        # Middle column: Portrait and audio
        with gr.Column(scale=2):
            # Portrait display
            portrait_html = gr.HTML(
                value=create_portrait_html(get_character(DEFAULT_CHARACTER_ID)),
            )

            # Audio input (microphone only, capped at 10 seconds)
            audio_input = gr.Audio(
                sources=["microphone"],
                type="numpy",
                label="🎤 Click to speak",
                max_length=10,
            )

            # Audio output — autoplays the character's spoken response.
            audio_output = gr.Audio(
                label="Character Response",
                type="numpy",
                autoplay=True,
            )

            # Timing display (read-only pipeline-latency summary)
            timing_display = gr.Textbox(
                label="Processing Time",
                interactive=False,
            )

        # Right column: Conversation
        with gr.Column(scale=1):
            gr.Markdown("### Conversation")

            chatbot = gr.Chatbot(
                label="Chat History",
                height=400,
            )

            clear_btn = gr.Button("🗑️ Clear Conversation", variant="secondary")

    # Event handlers
    # Switching characters also clears the chat and state (the handler
    # returns empty lists for both).
    character_dropdown.change(
        fn=update_character_info,
        inputs=[character_dropdown],
        outputs=[character_info, portrait_html, chatbot, conversation_state],
    )

    # Audio processing: fires when the user stops recording.
    audio_input.stop_recording(
        fn=on_audio_record,
        inputs=[audio_input, character_dropdown, conversation_state],
        outputs=[audio_output, timing_display, chatbot, conversation_state],
    )

    # Clear conversation
    clear_btn.click(
        fn=clear_conversation,
        outputs=[chatbot, conversation_state],
    )

if __name__ == "__main__":
    demo.launch(show_api=False)