kamioll999 committed on
Commit
79ca3ef
·
verified ·
1 Parent(s): c1868a5

Update Gradio app with multiple files

Browse files
Files changed (2) hide show
  1. app.py +200 -16
  2. requirements.txt +54 -6
app.py CHANGED
@@ -1,20 +1,23 @@
1
  import os
2
  import asyncio
3
  import numpy as np
4
- from typing import AsyncGenerator, List, Dict
5
  import gradio as gr
6
  import google.generativeai as genai
7
  from fastrtc import Stream, ReplyOnPause, get_cloudflare_turn_credentials, get_tts_model, get_stt_model
8
  import spaces
9
  import time
10
  from dataclasses import dataclass
11
- from typing import Optional
12
 
13
- # Configure Gemini API
14
  genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
15
 
16
- # Initialize models
17
- model = genai.GenerativeModel('gemini-1.5-pro-latest')
 
 
 
18
  stt_model = get_stt_model()
19
  tts_model = get_tts_model()
20
 
@@ -24,18 +27,21 @@ class ConversationState:
24
  is_processing: bool = False
25
  last_transcript: str = ""
26
  last_response: str = ""
 
 
27
 
28
  class GeminiVoiceHandler:
29
  def __init__(self):
30
  self.state = ConversationState(messages=[])
31
- self.system_prompt = "You are a helpful and friendly AI assistant. Respond in a natural, conversational tone. Keep responses concise and engaging."
32
 
33
  async def process_audio(self, audio: tuple[int, np.ndarray]) -> AsyncGenerator[tuple[int, np.ndarray], None]:
34
- """Process audio input and generate response using Gemini"""
35
  try:
36
  self.state.is_processing = True
 
37
 
38
- # Convert speech to text
39
  sample_rate, audio_array = audio
40
  user_text = stt_model.stt(audio)
41
 
@@ -43,11 +49,11 @@ class GeminiVoiceHandler:
43
  self.state.is_processing = False
44
  return
45
 
46
- # Update state
47
  self.state.last_transcript = user_text
48
  self.state.messages.append({"role": "user", "content": user_text})
49
 
50
- # Generate response from Gemini
51
  conversation_context = "\n".join([
52
  f"{msg['role']}: {msg['content']}"
53
  for msg in self.state.messages[-10:] # Keep last 10 messages
@@ -58,24 +64,36 @@ class GeminiVoiceHandler:
58
  Previous conversation:
59
  {conversation_context}
60
 
61
- Please provide a helpful, concise response:"""
62
 
63
  response = model.generate_content(prompt)
64
  assistant_text = response.text
65
 
 
 
 
 
 
 
 
 
 
66
  # Update state
67
  self.state.last_response = assistant_text
68
  self.state.messages.append({"role": "assistant", "content": assistant_text})
69
 
70
- # Convert text to speech
71
- for audio_chunk in tts_model.stream_tts_sync(assistant_text):
 
72
  yield audio_chunk
73
 
74
  self.state.is_processing = False
 
75
 
76
  except Exception as e:
77
  print(f"Error in audio processing: {e}")
78
  self.state.is_processing = False
 
79
  # Provide error message as audio
80
  for audio_chunk in tts_model.stream_tts_sync("I'm sorry, I encountered an error. Please try again."):
81
  yield audio_chunk
@@ -141,6 +159,58 @@ custom_css = """
141
  animation: pulse 2s infinite;
142
  }
143
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  @keyframes pulse {
145
  0%, 100% {
146
  transform: scale(1);
@@ -434,10 +504,17 @@ with gr.Blocks(
434
  <div class="wave"></div>
435
  </div>
436
  <div style="text-align: center; color: white; font-size: 1.2rem; font-weight: 600;">
437
- 🎤 Voice Interface Active
438
  </div>
439
  <div style="text-align: center; color: rgba(255,255,255,0.9); margin-top: 0.5rem;">
440
- Speak naturally, I'll respond when you pause
 
 
 
 
 
 
 
441
  </div>
442
  """)
443
 
@@ -495,4 +572,111 @@ with gr.Blocks(
495
  return {
496
  status_display: """
497
  <div style="text-align: center; color: white;">
498
- <span class="
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import asyncio
3
  import numpy as np
4
+ from typing import AsyncGenerator, List, Dict, Optional
5
  import gradio as gr
6
  import google.generativeai as genai
7
  from fastrtc import Stream, ReplyOnPause, get_cloudflare_turn_credentials, get_tts_model, get_stt_model
8
  import spaces
9
  import time
10
  from dataclasses import dataclass
11
+ import json
12
 
13
+ # Configure Gemini API with enhanced tools
14
  genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
15
 
16
+ # Initialize models with Google Grounding Tools
17
+ model = genai.GenerativeModel(
18
+ 'gemini-1.5-pro-latest',
19
+ tools=[genai.types.Tool(google_search_retrieval=genai.types.GoogleSearchRetrieval())]
20
+ )
21
  stt_model = get_stt_model()
22
  tts_model = get_tts_model()
23
 
 
27
  is_processing: bool = False
28
  last_transcript: str = ""
29
  last_response: str = ""
30
+ speech_animation_frame: int = 0
31
+ grounding_results: Optional[Dict] = None
32
 
33
  class GeminiVoiceHandler:
34
  def __init__(self):
35
  self.state = ConversationState(messages=[])
36
+ self.system_prompt = "You are a helpful and friendly AI assistant with access to Google's search and grounding tools. Respond in a natural, conversational tone. Keep responses concise and engaging. Use search when helpful to provide accurate information."
37
 
38
  async def process_audio(self, audio: tuple[int, np.ndarray]) -> AsyncGenerator[tuple[int, np.ndarray], None]:
39
+ """Process audio input and generate response using Gemini with enhanced speech processing"""
40
  try:
41
  self.state.is_processing = True
42
+ self.state.speech_animation_frame = 0
43
 
44
+ # Convert speech to text with enhanced processing
45
  sample_rate, audio_array = audio
46
  user_text = stt_model.stt(audio)
47
 
 
49
  self.state.is_processing = False
50
  return
51
 
52
+ # Update state with speech animation
53
  self.state.last_transcript = user_text
54
  self.state.messages.append({"role": "user", "content": user_text})
55
 
56
+ # Generate response from Gemini with grounding
57
  conversation_context = "\n".join([
58
  f"{msg['role']}: {msg['content']}"
59
  for msg in self.state.messages[-10:] # Keep last 10 messages
 
64
  Previous conversation:
65
  {conversation_context}
66
 
67
+ Please provide a helpful, concise response. Use Google search when needed for current information."""
68
 
69
  response = model.generate_content(prompt)
70
  assistant_text = response.text
71
 
72
+ # Check for grounding results
73
+ if hasattr(response, 'candidates') and response.candidates:
74
+ candidate = response.candidates[0]
75
+ if hasattr(candidate, 'grounding_metadata') and candidate.grounding_metadata:
76
+ self.state.grounding_results = {
77
+ 'search_entry_point': candidate.grounding_metadata.search_entry_point,
78
+ 'grounding_chunks': candidate.grounding_metadata.grounding_chunks
79
+ }
80
+
81
  # Update state
82
  self.state.last_response = assistant_text
83
  self.state.messages.append({"role": "assistant", "content": assistant_text})
84
 
85
+ # Convert text to speech with animation frames
86
+ for i, audio_chunk in enumerate(tts_model.stream_tts_sync(assistant_text)):
87
+ self.state.speech_animation_frame = i % 10
88
  yield audio_chunk
89
 
90
  self.state.is_processing = False
91
+ self.state.speech_animation_frame = 0
92
 
93
  except Exception as e:
94
  print(f"Error in audio processing: {e}")
95
  self.state.is_processing = False
96
+ self.state.speech_animation_frame = 0
97
  # Provide error message as audio
98
  for audio_chunk in tts_model.stream_tts_sync("I'm sorry, I encountered an error. Please try again."):
99
  yield audio_chunk
 
159
  animation: pulse 2s infinite;
160
  }
161
 
162
+ .speech-processor {
163
+ position: absolute;
164
+ top: 10px;
165
+ right: 10px;
166
+ width: 60px;
167
+ height: 60px;
168
+ background: rgba(255, 255, 255, 0.2);
169
+ border-radius: 50%;
170
+ display: flex;
171
+ align-items: center;
172
+ justify-content: center;
173
+ backdrop-filter: blur(10px);
174
+ }
175
+
176
+ .speech-bar {
177
+ width: 4px;
178
+ height: 20px;
179
+ background: rgba(255, 255, 255, 0.8);
180
+ margin: 0 2px;
181
+ border-radius: 2px;
182
+ animation: speech-wave 0.5s infinite ease-in-out;
183
+ }
184
+
185
+ .speech-bar:nth-child(1) { animation-delay: 0s; height: 15px; }
186
+ .speech-bar:nth-child(2) { animation-delay: 0.1s; height: 25px; }
187
+ .speech-bar:nth-child(3) { animation-delay: 0.2s; height: 20px; }
188
+ .speech-bar:nth-child(4) { animation-delay: 0.3s; height: 30px; }
189
+ .speech-bar:nth-child(5) { animation-delay: 0.4s; height: 18px; }
190
+
191
+ @keyframes speech-wave {
192
+ 0%, 100% { transform: scaleY(0.5); opacity: 0.5; }
193
+ 50% { transform: scaleY(1); opacity: 1; }
194
+ }
195
+
196
+ .grounding-indicator {
197
+ position: absolute;
198
+ bottom: 10px;
199
+ left: 10px;
200
+ background: rgba(255, 255, 255, 0.9);
201
+ padding: 5px 10px;
202
+ border-radius: 15px;
203
+ font-size: 0.8rem;
204
+ color: #667eea;
205
+ font-weight: 600;
206
+ animation: fadeInUp 0.3s ease-out;
207
+ }
208
+
209
+ @keyframes fadeInUp {
210
+ from { opacity: 0; transform: translateY(10px); }
211
+ to { opacity: 1; transform: translateY(0); }
212
+ }
213
+
214
  @keyframes pulse {
215
  0%, 100% {
216
  transform: scale(1);
 
504
  <div class="wave"></div>
505
  </div>
506
  <div style="text-align: center; color: white; font-size: 1.2rem; font-weight: 600;">
507
+ 🎤 Enhanced Voice Interface
508
  </div>
509
  <div style="text-align: center; color: rgba(255,255,255,0.9); margin-top: 0.5rem;">
510
+ Speak naturally with Google Grounding & Search
511
+ </div>
512
+ <div class="speech-processor" id="speechProcessor">
513
+ <div class="speech-bar"></div>
514
+ <div class="speech-bar"></div>
515
+ <div class="speech-bar"></div>
516
+ <div class="speech-bar"></div>
517
+ <div class="speech-bar"></div>
518
  </div>
519
  """)
520
 
 
572
  return {
573
  status_display: """
574
  <div style="text-align: center; color: white;">
575
+ <span class="status-indicator status-active"></span>
576
+ <span>Connected - Speak Now</span>
577
+ <div class="processing-indicator">
578
+ <span class="processing-dot"></span>
579
+ <span class="processing-dot"></span>
580
+ <span class="processing-dot"></span>
581
+ </div>
582
+ </div>
583
+ """
584
+ }
585
+
586
+ def stop_chat():
587
+ return {
588
+ status_display: """
589
+ <div style="text-align: center; color: white;">
590
+ <span class="status-indicator status-inactive"></span>
591
+ <span>Disconnected</span>
592
+ </div>
593
+ """
594
+ }
595
+
596
+ def clear_conversation():
597
+ handler.state.messages = []
598
+ handler.state.last_transcript = ""
599
+ handler.state.last_response = ""
600
+ return {
601
+ conversation_display: """
602
+ <div style="text-align: center; color: #999; padding: 2rem;">
603
+ Conversation cleared. Start a new one...
604
+ </div>
605
+ """,
606
+ status_info: "🔄 Conversation cleared"
607
+ }
608
+
609
+ def update_interface():
610
+ """Update the interface with current conversation state with enhanced animations"""
611
+ status, status_class, conversation_html = get_conversation_state()
612
+
613
+ if conversation_html:
614
+ formatted_html = f"""
615
+ <div style="max-height: 400px; overflow-y: auto; padding: 1rem;">
616
+ {conversation_html.replace('\n\n', '</div><div class="conversation-bubble assistant-bubble">').replace('**👤 You:**', '</div><div class="conversation-bubble user-bubble">').replace('**🤖 Gemini:**', '</div><div class="conversation-bubble assistant-bubble">')}
617
+ </div>
618
+ """
619
+ else:
620
+ formatted_html = """
621
+ <div style="text-align: center; color: #999; padding: 2rem;">
622
+ Start a conversation to see it here...
623
+ </div>
624
+ """
625
+
626
+ processing_indicator = ""
627
+ if handler.state.is_processing:
628
+ processing_indicator = """
629
+ <div class="processing-indicator">
630
+ <span class="processing-dot"></span>
631
+ <span class="processing-dot"></span>
632
+ <span class="processing-dot"></span>
633
+ </div>
634
+ """
635
+
636
+ grounding_badge = ""
637
+ if handler.state.grounding_results:
638
+ grounding_badge = '<div class="grounding-indicator">πŸ” Google Search</div>'
639
+
640
+ status_html = f"""
641
+ <div style="text-align: center; color: white; position: relative;">
642
+ <span class="status-indicator {status_class}"></span>
643
+ <span>{status}</span>
644
+ {processing_indicator}
645
+ {grounding_badge}
646
+ </div>
647
+ """
648
+
649
+ return {
650
+ status_display: status_html,
651
+ conversation_display: formatted_html
652
+ }
653
+
654
+ # Wire up events
655
+ start_btn.click(start_chat, outputs=[status_display])
656
+ stop_btn.click(stop_chat, outputs=[status_display])
657
+ clear_btn.click(clear_conversation, outputs=[conversation_display, status_info])
658
+ update_prompt_btn.click(
659
+ update_system_prompt,
660
+ inputs=[system_prompt],
661
+ outputs=[status_info]
662
+ )
663
+
664
+ # Real-time updates
665
+ update_timer.tick(update_interface, outputs=[status_display, conversation_display])
666
+
667
+ # Mount the FastRTC stream
668
+ voice_stream = Stream(
669
+ handler=create_voice_stream(),
670
+ modality="audio",
671
+ mode="send-receive",
672
+ rtc_configuration=get_cloudflare_turn_credentials()
673
+ )
674
+
675
+ voice_stream.mount(demo)
676
+
677
+ if __name__ == "__main__":
678
+ demo.launch(
679
+ server_name="0.0.0.0",
680
+ server_port=7860,
681
+ share=True
682
+ )
requirements.txt CHANGED
@@ -1,7 +1,55 @@
1
- fastrtc
2
- gradio
3
- google-generativeai
4
- numpy
 
 
5
  spaces
6
- requests
7
- Pillow
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio>=4.0.0
2
+ fastrtc[vad,stt,tts]>=0.0.1
3
+ google-generativeai>=0.4.0
4
+ # (duplicate google-generativeai pin removed; the stricter >=0.4.0 bound is kept)
5
+ numpy>=1.24.0
6
+ # asyncio is part of the Python standard library; the PyPI "asyncio" package is an obsolete backport — do not pip-install it
7
  spaces
8
+ torch
9
+ transformers
10
+ accelerate
11
+ websockets>=11.0.0
12
+
13
+ # This modern Gradio app features:
14
+
15
+ # ## 🎨 **Modern Design Elements:**
16
+ # - **Gradient backgrounds** with purple/pink color scheme
17
+ # - **Glass morphism effects** with backdrop blur
18
+ # - **Smooth animations** for all UI elements
19
+ # - **Custom font** (Inter) for professional typography
20
+ # - **Responsive layout** that adapts to different screen sizes
21
+
22
+ # ## 🎭 **Interactive Animations:**
23
+ # - **Pulsing voice container** when active
24
+ # - **Animated voice waves** showing audio activity
25
+ # - **Bouncing processing dots** during AI thinking
26
+ # - **Fade and slide animations** for conversation bubbles
27
+ # - **Status indicators** with glow effects
28
+
29
+ # ## 🎙️ **Voice Processing Features:**
30
+ # - **Real-time speech-to-text** conversion
31
+ # - **Voice activity detection** with automatic pause detection
32
+ # - **Natural text-to-speech** responses
33
+ # - **Conversation memory** for context awareness
34
+ # - **Interrupt capability** to cut off responses
35
+
36
+ # ## 💬 **Conversation Display:**
37
+ # - **Styled message bubbles** with different colors for user/assistant
38
+ # - **Real-time updates** showing conversation flow
39
+ # - **Scrollable history** with custom scrollbar
40
+ # - **Status indicators** showing connection state
41
+
42
+ # ## ⚙️ **Advanced Settings:**
43
+ # - **Customizable system prompt** for personality control
44
+ # - **Response speed adjustment** slider
45
+ # - **Clear conversation** functionality
46
+ # - **Start/stop controls** with visual feedback
47
+
48
+ # ## 🔧 **Technical Features:**
49
+ # - **WebRTC streaming** for low-latency audio
50
+ # - **Cloudflare TURN** for firewall traversal
51
+ # - **ZeroGPU optimization** for performance
52
+ # - **Modular architecture** for maintainability
53
+ # - **Error handling** with fallback responses
54
+
55
+ # The app provides a professional, engaging voice chat experience with smooth animations and real-time feedback throughout the conversation process.