Spaces:
Sleeping
Sleeping
my latest updates…
Browse files- fastrtc_ui.py +265 -0
- japi_head.png +0 -0
fastrtc_ui.py
ADDED
|
@@ -0,0 +1,265 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
can_interrupt does not work!
|
| 3 |
+
|
| 4 |
+
## Setup
|
| 5 |
+
```
|
| 6 |
+
pip install fastrtc[vad] google-genai python-dotenv onnxruntime
|
| 7 |
+
```
|
| 8 |
+
|
| 9 |
+
On Windows, onnxruntime requires the Microsoft Visual C++ Redistributable: https://learn.microsoft.com/en-us/cpp/windows/latest-supported-vc-redist?view=msvc-170#latest-microsoft-visual-c-redistributable-version
|
| 10 |
+
|
| 11 |
+
To run the script:
|
| 12 |
+
```
|
| 13 |
+
python fastrtc_ui.py
|
| 14 |
+
```
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
import asyncio
|
| 18 |
+
import base64
|
| 19 |
+
import os
|
| 20 |
+
from typing import Literal
|
| 21 |
+
|
| 22 |
+
import gradio as gr
|
| 23 |
+
import numpy as np
|
| 24 |
+
from fastrtc import (
|
| 25 |
+
AsyncStreamHandler,
|
| 26 |
+
WebRTC,
|
| 27 |
+
wait_for_item,
|
| 28 |
+
ReplyOnPause,
|
| 29 |
+
)
|
| 30 |
+
from google import genai
|
| 31 |
+
from google.genai.types import (
|
| 32 |
+
LiveConnectConfig,
|
| 33 |
+
PrebuiltVoiceConfig,
|
| 34 |
+
SpeechConfig,
|
| 35 |
+
VoiceConfig,
|
| 36 |
+
Part,
|
| 37 |
+
Content,
|
| 38 |
+
)
|
| 39 |
+
|
| 40 |
+
# Load a local .env file (when python-dotenv is installed) *before* reading
# any environment variable, so a GOOGLE_API_KEY defined only in .env is seen.
try:
    from dotenv import load_dotenv
except ImportError:
    # python-dotenv is optional; fall back to the process environment.
    pass
else:
    load_dotenv()

# Default Gemini API key taken from the environment ("" when unset).
api_key = os.getenv("GOOGLE_API_KEY", "")
|
| 47 |
+
|
| 48 |
+
# Dropdown label -> system instruction text for the Gemini session.
SYSTEM_PROMPTS = {
    "Friendly Tutor": "You are Japi the friendly language tutor for English.",
    "Helpful Assistant": "You are a helpful assistant tasked with searching the web for information when you do not know the answer. Respond with 50 words.",
    "Pirate": "Talk like a pirate, respond with maximum 10 words.",
    "Interactive Tutor": "You are Japi the language tutor for English, tasked with guiding a language learner through an interactive session. Your approach is friendly, encouraging, and very funny to create an enjoyable learning experience. Do not answer your own question, wait for user to respond.",
}
|
| 54 |
+
|
| 55 |
+
# Dropdown label -> opening user turn sent to the model when a session starts.
PROMPTS = {
    "Hi": "Hi There!",
    # "Soccer Chat" previously held a copy of the "Business Meeting" prompt;
    # it now carries the football conversation prompt its label promises.
    "Soccer Chat": """When you do not know the answer perform a web search. When needed, correct and explain the student's grammar and vocabulary mistakes. Give positive reinforcement too. Ask one question at a time to avoid overwhelming the student. Adapt questions based on responses. Use a conversational, supportive tone and encourage user opinions in English. Be friendly, patient, humorous and knowledgeable about football. Discuss it with open-ended questions and stay focused on current football related topics. Start by asking what league the student is interested in, or which is their favourite team and player. Wait for the answer before the next question.""",
    "Travel": """Let's talk about traveling""",
    "Past Tense Practice": """You are an engaging and friendly English grammar tutor for A2-B1 level learners. Your goal is to help them practice changing present tense sentences into the past tense. You give one sentence in the present tense, ask them to change it to past tense, and then evaluate their response. Give helpful, fun feedback if the answer is wrong, and praise if it's right. Then continue with another sentence.""",
    "Difficult Sounds": """You are a fun and patient English pronunciation coach. Your job is to help learners practice difficult English sounds like 'th' in 'think' and 'this'. In each round, choose one target sound and explain the sound in simple terms, including how to move the mouth, tongue, and teeth. Give 2-3 example words that use this sound. Ask the learner to say each word out loud.Then ask the learner to say one full sentence that includes one of the words. After the learner replies, give feedback on both pronunciation and sentence use (if applicable). Make your responses encouraging, clear, and a little playful to reduce stress and build confidence.""",
    "Tongue Twisters": """You are a fun and supportive English pronunciation coach running a Tongue Twister Race. Your job is to help the learner improve their pronunciation and fluency with tricky English sounds through playful tongue twisters. Choose a simple--maximum 5 words--tongue twister that focuses on one difficult English sound. Show the tongue twister and say it slowly. Ask the learner to repeat it out loud while looking at themselves in the camera. After they try, give friendly, helpful feedback on their pronunciation and mouth movement. Then move to the next tongue twister with a new sound focus.""",
    # The "\n" escape reproduces the line break embedded in the original text.
    "Business Meeting": """Do not answer your own question. Wait for user's response before asking the next question. Role: You are "Japi", a friendly and funny AI Tutor for English. Your role is to teach Business English skills to an employee in Indonesia at B1 level which is lower-intermediate. Method: Use simple B1 level English. Define words in 6 words or less. Avoid complex structures, idioms, and Bahasa Indonesia. Use encouraging phrases like "Good job!" Provide grammar corrections in your feedback, rephrasing the user's response in correct English up to 30 words and by providing grammar explanations for the errors. Do not correct punctuation/capitalization. Interaction: Maintain a friendly, supportive tone and use jokes to make it funny. Use phrases like "Great work!" or "Let's practice another example." Topics: Describe basic business terms that might be used in a meeting with managers, using easy-to-understand language. Process: Present topic, emphasize key vocab. Ask a question. Provide feedback: Correct grammar/content issues by rephrasing the user's response in better English up to 30 words and explain the grammar errors. Ask follow up question. Provide feedback again with grammar correction. Ask another follow up question. Provide feedback again with grammar correction. Start a role-play scenario where you are having a meeting with your boss. Encourage the use of vocabulary and phrases from the topic. For the first four exchanges of the role-play, Japi does not provide any corrections or comments, allowing for uninterrupted interaction. After the first four exchanges, Japi then provides feedback on grammar and content. Follow up with another role-play situation, but this time offer three different personas for the boss: Demanding micromanager, Supportive, Hands-Off. Encourage the use of vocabulary and phrases from the topic. \nFor the first four exchanges of the role-play, Japi does not provide any corrections or comments, allowing for uninterrupted interaction. After the first four exchanges, Japi then provides feedback on grammar and content.""",
}
|
| 65 |
+
|
| 66 |
+
# Human-readable label -> Gemini model identifier.
MODELS = {
    "Gemini Flash Exp": "gemini-2.0-flash-exp",
    "Gemini Flash Live": "gemini-2.0-flash-live-001",
}
|
| 70 |
+
|
| 71 |
+
class GeminiHandler(AsyncStreamHandler):
    """Relay audio between a FastRTC stream and a Gemini Live session.

    Incoming frames are base64-encoded and placed on ``input_queue``;
    :meth:`start_up` feeds that queue to Gemini and places the audio it
    gets back on ``output_queue``; :meth:`emit` drains ``output_queue``.
    """

    def __init__(
        self,
        expected_layout: Literal["mono"] = "mono",
        output_sample_rate: int = 24000,
        output_frame_size: int = 480,
    ) -> None:
        """Initialise the base handler and the queues used for relaying.

        Args:
            expected_layout: Channel layout forwarded to the base handler.
            output_sample_rate: Sample rate attached to every emitted chunk.
            output_frame_size: Frame size forwarded to the base handler.
        """
        super().__init__(
            expected_layout,
            output_sample_rate,
            output_frame_size,
            input_sample_rate=16000,
        )
        # NOTE(review): nothing in this class reads ``self.handler``; audio
        # flows through receive()/emit(), which is presumably why the
        # ``can_interrupt`` flag has no visible effect - confirm.
        self.handler = ReplyOnPause(self.handle_response, can_interrupt=True)
        self.input_queue: asyncio.Queue = asyncio.Queue()
        self.output_queue: asyncio.Queue = asyncio.Queue()
        self.quit: asyncio.Event = asyncio.Event()

    @staticmethod
    def _encode_audio(array: np.ndarray) -> str:
        """Return the raw bytes of the squeezed *array* as base64 text."""
        return base64.b64encode(array.squeeze().tobytes()).decode("UTF-8")

    async def handle_response(self, audio):
        """Queue one audio chunk for Gemini and return the next reply item.

        Args:
            audio: A ``(sample_rate, array)`` pair; only the array is used.

        Returns:
            The item ``wait_for_item`` obtains from ``output_queue``, or
            ``None`` when the handler was shut down or the wait was
            cancelled.
        """
        _, array = audio
        self.input_queue.put_nowait(self._encode_audio(array))
        if self.quit.is_set():
            return None
        try:
            return await wait_for_item(self.output_queue)
        except asyncio.CancelledError:
            # Handle interruption gracefully
            return None

    def copy(self) -> "GeminiHandler":
        """Return a fresh handler with the same output audio settings."""
        return GeminiHandler(
            expected_layout="mono",
            output_sample_rate=self.output_sample_rate,
            output_frame_size=self.output_frame_size,
        )

    async def start_up(self):
        """Connect to Gemini Live and relay audio for the whole session.

        Waits for the UI arguments, opens a live session, sends the chosen
        opening prompt, then queues every audio payload Gemini returns.

        Raises:
            Exception: Any error is printed and re-raised unchanged.
        """
        try:
            await self.wait_for_args()
            # latest_args[0] is skipped; the rest are the five extra stream
            # inputs in declaration order.
            api_key, voice_name, prompt_system, prompt, model = self.latest_args[1:]

            # The dropdowns deliver the selected *label*, not the prompt
            # text, so resolve it here. Unknown values pass through
            # unchanged so literal prompt text keeps working.
            prompt_system = SYSTEM_PROMPTS.get(prompt_system, prompt_system)
            prompt = PROMPTS.get(prompt, prompt)

            client = genai.Client(
                api_key=api_key or os.getenv("GEMINI_API_KEY"),
                http_options={"api_version": "v1alpha"},
            )

            tools = [{'google_search': {}}]

            config = LiveConnectConfig(
                response_modalities=["AUDIO"],
                output_audio_transcription={},
                tools=tools,
                system_instruction=prompt_system,
                speech_config=SpeechConfig(
                    voice_config=VoiceConfig(
                        prebuilt_voice_config=PrebuiltVoiceConfig(
                            voice_name=voice_name,
                        )
                    )
                ),
            )
            try:
                async with client.aio.live.connect(
                    model=model, config=config
                ) as session:
                    print(f"Connected to model: {model}")  # Debug info
                    print(f"system prompt: {prompt_system}")  # Debug info
                    print(f"prompt: {prompt}")  # Debug info

                    await session.send_client_content(turns=Content(
                        role='user',
                        parts=[Part(text=prompt)]))

                    async for audio in session.start_stream(
                        stream=self.stream(), mime_type="audio/pcm"
                    ):
                        if audio.data:
                            array = np.frombuffer(audio.data, dtype=np.int16)
                            self.output_queue.put_nowait((self.output_sample_rate, array))
                            # NOTE(review): an argument-less send() after
                            # each chunk looks unintended - confirm it is
                            # needed and that it belongs inside this branch.
                            await session.send()  # Make sure to await the send
            except Exception as e:
                print(f"Error during session: {str(e)}")
                raise  # Re-raise the exception after printing
        except Exception as e:
            print(f"Error in start_up: {str(e)}")
            raise  # Re-raise the exception after printing

    async def stream(self):
        """Yield queued base64 audio chunks until :meth:`shutdown` is called."""
        while not self.quit.is_set():
            chunk = await wait_for_item(self.input_queue)
            # wait_for_item returns None when nothing arrived before its
            # timeout; skip it so no empty chunk is forwarded to Gemini.
            if chunk is not None:
                yield chunk

    async def receive(self, frame: tuple[int, np.ndarray]) -> None:
        """Encode an incoming ``(sample_rate, array)`` frame and queue it."""
        _, array = frame
        self.input_queue.put_nowait(self._encode_audio(array))

    async def emit(self) -> tuple[int, np.ndarray] | None:
        """Return the next item obtained from ``output_queue``."""
        return await wait_for_item(self.output_queue)

    def shutdown(self) -> None:
        """Set the quit event so the streaming loops stop."""
        self.quit.set()
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
# Gradio UI: prompt, model and voice pickers plus the WebRTC audio widget.
# NOTE(review): the nesting of rows/columns below is reconstructed - confirm
# it against the intended layout.
with gr.Blocks() as demo:

    # Page header.
    gr.HTML(
        """
<div style='text-align: center'>
<h1>Welcome to Japi Voice Chat</h1>
<p>Speak with Your friendly English tutor using real-time audio streaming</p>
</div>
"""
    )

    with gr.Column():
        with gr.Row():
            # Both dropdowns expose the dict *keys*, so the selected value
            # is a label rather than the prompt text itself.
            system_prompt_selector = gr.Dropdown(
                label="System Prompt",
                choices=list(SYSTEM_PROMPTS.keys()),
                value=list(SYSTEM_PROMPTS.keys())[0],
            )

            prompt_selector = gr.Dropdown(
                label="User Prompt",
                choices=list(PROMPTS.keys()),
                value=list(PROMPTS.keys())[0],
            )

        with gr.Row():

            # Show the full text of the selected system prompt; the value
            # is computed from the dropdown listed in `inputs`.
            gr.Markdown(
                value=lambda x: f"""
```
{SYSTEM_PROMPTS[x]}
```
""",
                inputs=[system_prompt_selector],
            )

            # Show the full text of the selected user prompt.
            gr.Markdown(
                value=lambda y: f"""
```
{PROMPTS[y]}
```
""",
                inputs=[prompt_selector],
            )

    api_key_state = gr.State(os.getenv("GOOGLE_API_KEY", ""))  # Wrap API key in gr.State

    # NOTE: bound as `row` although it is a Column; the name is not used below.
    with gr.Column() as row:

        with gr.Row():

            model_selector = gr.Dropdown(
                label="Model",
                choices=list(MODELS.values()),  # Use values instead of keys
                value=list(MODELS.values())[0],  # Use first value as default
            )

            voice = gr.Dropdown(
                label="Voice",
                choices=["Puck", "Charon", "Kore", "Fenrir", "Aoede"],
                value="Puck",
            )

        webrtc = WebRTC(
            label="Audio",
            modality="audio",
            mode="send-receive",
            pulse_color="rgb(35, 157, 225)",
            icon_button_color="rgb(255, 255, 255)",
            icon="japi_head.png",  # relative path, resolved against the working directory
            button_labels={"start": "Start", "stop": "Stop"},
        )

        # The inputs after `webrtc` are unpacked, in this order, by
        # GeminiHandler.start_up via `self.latest_args[1:]`.
        webrtc.stream(
            GeminiHandler(),
            inputs=[webrtc, api_key_state, voice, system_prompt_selector, prompt_selector, model_selector],
            outputs=[webrtc],
            time_limit=60,  # presumably seconds per stream - confirm
            concurrency_limit=1,  # presumably one active stream at a time - confirm
        )

if __name__ == "__main__":
    demo.launch()
|
japi_head.png
ADDED
|