fastrtc_ui / app.py
Zoltan100's picture
my latest updates.
3faf80b
"""
Known issue: can_interrupt does not work!
## Setup
```
pip install fastrtc[vad] google-genai python-dotenv onnxruntime
```
On Windows, onnxruntime requires the Microsoft Visual C++ Redistributable: https://learn.microsoft.com/en-us/cpp/windows/latest-supported-vc-redist?view=msvc-170#latest-microsoft-visual-c-redistributable-version
To run the script:
```
python fastrtc_ui.py
```
"""
import asyncio
import base64
import os
from typing import Literal
import gradio as gr
import numpy as np
from fastrtc import (
AsyncStreamHandler,
WebRTC,
wait_for_item,
ReplyOnPause,
get_cloudflare_turn_credentials_async,
)
from google import genai
from google.genai.types import (
LiveConnectConfig,
PrebuiltVoiceConfig,
SpeechConfig,
VoiceConfig,
Part,
Content,
)
from gradio.utils import get_space
import traceback
from datetime import datetime
# Load a local .env file first (python-dotenv is optional) so that the
# environment lookup below can see values defined there.
try:
    from dotenv import load_dotenv
except ImportError:
    # python-dotenv is not installed; rely on the process environment only.
    pass
else:
    load_dotenv()

# Default Google API key; empty string when the variable is not set.
api_key = os.getenv("GOOGLE_API_KEY", "")
# System instructions selectable in the UI, keyed by the label shown in the
# "System Prompt" dropdown. The values are passed to the model verbatim.
SYSTEM_PROMPTS = {
    "Japi": """Objective: Engage a language learner in a fun conversation to enhance their language skills and knowledge on a chosen topic at a given CEFR English level. Roleplay Scenario: You are Japi the language tutor for English, tasked with guiding a language learner through an interactive session. Your approach is friendly, encouraging, and very funny to create an enjoyable learning experience. Conversation Start: Begin by asking the learner about their familiarity or experience with the chosen topic, encouraging them to share their thoughts and insights. Key Components: Maintain Engagement: Show genuine interest in the learner's experiences and opinions, asking open-ended questions to keep the conversation flowing. Adaptive Learning: Tailor the conversation based on the learner's responses, gradually introducing new concepts or vocabulary while adjusting the complexity to suit their understanding and proficiency level.Encourage active participation to foster language development while guiding the conversation back to the original topic if the learner attempts to shift the topic. Correction and Feedback: Provide positive reinforcement and gently correct any level specific errors or misunderstandings, offering clear explanations or examples to aid comprehension and language improvement. Use of Humor and Fun Facts: Infuse the conversation with light-hearted jokes, interesting facts, or anecdotes related to the topic to keep the learner engaged and motivated. Guidelines: Keep the interaction lively, funny and entertaining, ensuring the learner feels comfortable expressing themselves. Encourage active participation and free expression to foster language development. Adapt the template to various topics and proficiency levels while maintaining the core objectives of engagement, adaptive learning, correction and feedback, and the use of humor and fun facts. Be short, maximum 10 words on each turn.""",
    "Friendly Tutor": "You are Japi the friendly language tutor for English.",
    "Helpful Assistant": "You are a helpful assistant tasked with searching the web for information when you do not know the answer. Respond with 50 words.",
    "Pirate": "Talk like a pirate, respond with maximum 10 words.",
    "Interactive Tutor": "You are Japi the language tutor for English, tasked with guiding a language learner through an interactive session. Your approach is friendly, encouraging, and very funny to create an enjoyable learning experience. Do not answer your own question, wait for user to repond.",
}

# Default system prompt: the first entry of the mapping above.
SYSTEM_PROMPT = next(iter(SYSTEM_PROMPTS.values()))
# Opening user messages selectable in the UI, keyed by the label shown in the
# "User Prompt" dropdown. The chosen text is sent as the first user turn.
PROMPTS = {
    "Hi": "Hi There!",
    "Travel": """Let's talk about traveling.""",
    "Past Tense Practice": """You are an engaging and friendly English grammar tutor for A2-B1 level learners. Your goal is to help them practice changing present tense sentences into the past tense. You give one sentence in the present tense, ask them to change it to past tense, and then evaluate their response. Give helpful, fun feedback if the answer is wrong, and praise if it's right. Then continue with another sentence.""",
    "Difficult Sounds": """You are a fun and patient English pronunciation coach. Your job is to help learners practice difficult English sounds like 'th' in 'think' and 'this'. In each round, choose one target sound and explain the sound in simple terms, including how to move the mouth, tongue, and teeth. Give 2-3 example words that use this sound. Ask the learner to say each word out loud.Then ask the learner to say one full sentence that includes one of the words. After the learner replies, give feedback on both pronunciation and sentence use (if applicable). Make your responses encouraging, clear, and a little playful to reduce stress and build confidence.""",
    "Tongue Twisters": """You are a fun and supportive English pronunciation coach running a Tongue Twister Race. Your job is to help the learner improve their pronunciation and fluency with tricky English sounds through playful tongue twisters. Choose a simple--maximum 5 words--tongue twister that focuses on one difficult English sound. Show the tongue twister and say it slowly. Ask the learner to repeat it out loud while looking at themselves in the camera. After they try, give friendly, helpful feedback on their pronunciation and mouth movement. Then move to the next tongue twister with a new sound focus.""",
    "Business Meeting": """Do not answer your own question. Wait for user's response before asking the next question. Role: You are "Japi", a friendly and funny AI Tutor for English. Your role is to teach Business English skills to an employee in Indonesia at B1 level which is lower-intermediate. Method: Use simple B1 level English. Define words in 6 words or less. Avoid complex structures, idioms, and Bahasa Indonesia. Use encouraging phrases like "Good job!" Provide grammar corrections in your feedback, rephrasing the user's response in correct English up to 30 words and by providing grammar explanations for the errors. Do not correct punctuation/capitalization. Interaction: Maintain a friendly, supportive tone and use jokes to make it funny. Use phrases like "Great work!" or "Let's practice another example." Topics: Describe basic business terms that might be used in a meeting with managers, using easy-to-understand language. Process: Present topic, emphasize key vocab. Ask a question. Provide feedback: Correct grammar/content issues by rephrasing the user's response in better English up to 30 words and explain the grammar errors. Ask follow up question. Provide feedback again with grammar correction. Ask another follow up question. Provide feedback again with grammar correction. Start a role-play scenario where you are having a meeting with your boss. Encourage the use of vocabulary and phrases from the topic. For the first four exchanges of the role-play, Japi does not provide any corrections or comments, allowing for uninterrupted interaction. After the first four exchanges, Japi then provides feedback on grammar and content. Follow up with another role-play situation, but this time offer three different personas for the boss: Demanding micromanager, Supportive, Hands-Off. Encourage the use of vocabulary and phrases from the topic. 
For the first four exchanges of the role-play, Japi does not provide any corrections or comments, allowing for uninterrupted interaction. After the first four exchanges, Japi then provides feedback on grammar and content.""",
    "Soccer Chat": """When you do not know the answer perform a web search. When needed, correct and explain the student's grammar and vocabulary mistakes. Give positive reinforcement too. Ask one question at a time to avoid overwhelming the student. Adapt questions based on responses. Use a conversational, supportive tone and encourage user opinions in English. Be friendly, patient, humorous and knowledgeable about football. Discuss it with open-ended questions and stay focused on current football related topics. Start by asking what league the student is interested in, or which is their favourite team and player. Wait for the answer before the next question.""",
}

# Default user prompt: the first entry of the mapping above.
PROMPT = next(iter(PROMPTS.values()))
# Gemini model identifiers, keyed by the label shown in the "Model" dropdown.
MODELS = {
    "Gemini Flash Exp": "gemini-2.0-flash-exp",
    "Gemini Flash Live": "gemini-2.0-flash-live-001",
}

# Default model identifier: the first entry of the mapping above.
MODEL = next(iter(MODELS.values()))
class GeminiHandler(AsyncStreamHandler):
    """Stream handler that relays audio between a WebRTC peer and Gemini.

    Incoming frames are base64-encoded and placed on ``input_queue``;
    ``start_up`` forwards them to a Gemini live session and places the
    audio it gets back on ``output_queue``, from which ``emit`` reads.
    """

    def __init__(
        self,
        expected_layout: Literal["mono"] = "mono",
        output_sample_rate: int = 24000,
        output_frame_size: int = 480,
    ) -> None:
        """Create the handler with empty queues and an unset quit event.

        Args:
            expected_layout: Channel layout forwarded to the base class.
            output_sample_rate: Sample rate used for audio sent to the peer.
            output_frame_size: Frame size forwarded to the base class.
        """
        super().__init__(
            expected_layout,
            output_sample_rate,
            output_frame_size,
            input_sample_rate=16000,
        )
        # NOTE(review): nothing in this class routes audio through this
        # wrapper; only its ``can_interrupt`` flag is read, in
        # ``handle_response``. Confirm whether it is still needed.
        self.handler = ReplyOnPause(self.handle_response, can_interrupt=True)
        self.input_queue: asyncio.Queue = asyncio.Queue()
        self.output_queue: asyncio.Queue = asyncio.Queue()
        self.quit: asyncio.Event = asyncio.Event()

    @staticmethod
    def _encode_frame(array: np.ndarray) -> str:
        """Return the raw bytes of the squeezed *array* as base64 text."""
        return base64.b64encode(array.squeeze().tobytes()).decode("UTF-8")

    async def handle_response(self, audio):
        """Queue one audio segment for upload and return one output item.

        This is the callback handed to ``ReplyOnPause`` in ``__init__``.

        Args:
            audio: A ``(sample_rate, samples)`` pair; only the samples are
                used.

        Returns:
            Whatever ``wait_for_item`` yields for ``output_queue``, or
            ``None`` when the quit event is already set or the wait is
            cancelled.
        """
        _, array = audio
        self.input_queue.put_nowait(self._encode_frame(array))

        if self.handler.can_interrupt:
            print("Interruption is enabled.")

        if self.quit.is_set():
            return None
        try:
            return await wait_for_item(self.output_queue)
        except asyncio.CancelledError:
            # Cancellation is deliberately swallowed here and reported as
            # "no output" instead of propagating.
            print("Task was cancelled, handling interruption.")
            return None

    def copy(self) -> "GeminiHandler":
        """Return a fresh handler that reuses this one's output settings."""
        return GeminiHandler(
            expected_layout="mono",
            output_sample_rate=self.output_sample_rate,
            output_frame_size=self.output_frame_size,
        )

    async def start_up(self):
        """Open a Gemini live session and pump audio until the stream ends.

        Waits for the UI arguments, connects with the selected model, voice
        and system prompt, sends the selected user prompt as the first
        turn, then forwards every audio chunk returned by the session to
        ``output_queue``.

        Raises:
            Exception: Any error is printed and then re-raised unchanged.
        """
        try:
            await self.wait_for_args()
            # The first argument is skipped; the remaining five are the
            # values selected in the UI.
            api_key, voice_name, prompt_system, prompt, model = self.latest_args[1:]
            client = genai.Client(
                api_key=api_key or os.getenv("GEMINI_API_KEY"),
                http_options={"api_version": "v1alpha"},
            )
            tools = [{'google_search': {}}]
            config = LiveConnectConfig(
                response_modalities=["AUDIO"],
                output_audio_transcription={},
                tools=tools,
                system_instruction=SYSTEM_PROMPTS[prompt_system],
                speech_config=SpeechConfig(
                    voice_config=VoiceConfig(
                        prebuilt_voice_config=PrebuiltVoiceConfig(
                            voice_name=voice_name,
                        )
                    )
                ),
            )
            try:
                async with client.aio.live.connect(
                    model=MODELS[model], config=config
                ) as session:
                    # Debug info about the session that was just opened.
                    print(f"\n{datetime.now()} ===")
                    print(f"model: {model} - {MODELS[model]}")
                    print(f"system prompt: {prompt_system} - {SYSTEM_PROMPTS[prompt_system]}")
                    print(f"prompt: {prompt} - {PROMPTS[prompt]}")
                    await session.send_client_content(
                        turns=Content(
                            role='user',
                            parts=[Part(text=PROMPTS[prompt])],
                        )
                    )
                    async for audio in session.start_stream(
                        stream=self.stream(), mime_type="audio/pcm"
                    ):
                        if audio.data:
                            array = np.frombuffer(audio.data, dtype=np.int16)
                            self.output_queue.put_nowait(
                                (self.output_sample_rate, array)
                            )
            except Exception:
                print("Error during session:")
                traceback.print_exc()
                raise  # Re-raise the exception after printing
        except Exception as e:
            print(f"Error in start_up: {e}")
            raise  # Re-raise the exception after printing

    async def stream(self):
        """Yield queued input messages until the quit event is set."""
        while not self.quit.is_set():
            item = await wait_for_item(self.input_queue)
            # Skip falsy results (e.g. None or an empty string).
            if item:
                yield item

    async def receive(self, frame: tuple[int, np.ndarray]) -> None:
        """Encode an incoming frame and queue it for upload."""
        _, array = frame
        self.input_queue.put_nowait(self._encode_frame(array))

    async def emit(self) -> tuple[int, np.ndarray] | None:
        """Return the next item obtained from ``output_queue``."""
        return await wait_for_item(self.output_queue)

    def shutdown(self) -> None:
        """Set the quit event so that ``stream`` stops yielding."""
        self.quit.set()
# Page header rendered at the top of the UI.
_HEADER_HTML = """
<div style='text-align: center'>
<h1>Welcome to Japi Voice Chat</h1>
<p>Speak with Your friendly English tutor using real-time audio streaming</p>
</div>
"""


def _fenced(text: str) -> str:
    """Return *text* wrapped in a Markdown code fence."""
    return f"\n```\n{text}\n```\n"


with gr.Blocks() as demo:
    gr.HTML(_HEADER_HTML)

    with gr.Column():
        with gr.Row():
            system_prompt_selector = gr.Dropdown(
                label="System Prompt",
                choices=list(SYSTEM_PROMPTS.keys()),
                value=list(SYSTEM_PROMPTS.keys())[0],
            )
            prompt_selector = gr.Dropdown(
                label="User Prompt",
                choices=list(PROMPTS.keys()),
                value=list(PROMPTS.keys())[0],
            )
        with gr.Row():
            # Display the text of the currently selected prompts.
            gr.Markdown(
                value=lambda x: _fenced(
                    SYSTEM_PROMPTS.get(x, "Please select the system prompt.")
                ),
                inputs=[system_prompt_selector],
            )
            gr.Markdown(
                value=lambda x: _fenced(
                    PROMPTS.get(x, "Please select the prompt.")
                ),
                inputs=[prompt_selector],
            )

    # Wrap the API key in gr.State so it can be passed as a stream input.
    api_key_state = gr.State(os.getenv("GOOGLE_API_KEY", ""))

    # make sure you don't commit your token to git!
    TOKEN = os.getenv('HF_TOKEN')

    # NOTE(review): this helper is not referenced anywhere else in this
    # file; the WebRTC component below uses
    # get_cloudflare_turn_credentials_async directly.
    async def get_credentials():
        return await get_cloudflare_turn_credentials_async(hf_token=TOKEN)

    with gr.Column() as row:
        with gr.Row():
            model_selector = gr.Dropdown(
                label="Model",
                choices=list(MODELS.keys()),  # labels; mapped to ids via MODELS
                value=list(MODELS.keys())[0],  # first label is the default
            )
            voice = gr.Dropdown(
                label="Voice",
                choices=["Puck", "Charon", "Kore", "Fenrir", "Aoede"],
                value="Puck",
            )
        webrtc = WebRTC(
            label="Audio",
            modality="audio",
            mode="send-receive",
            pulse_color="rgb(35, 157, 225)",
            icon_button_color="rgb(255, 255, 255)",
            icon="japi_head.png",
            button_labels={"start": "Start", "stop": "Stop"},
            rtc_configuration=get_cloudflare_turn_credentials_async if get_space() else None,
        )

    webrtc.stream(
        # GeminiHandler's first parameter is the channel layout, so the
        # handler is built with its defaults rather than being handed a
        # ReplyOnPause object in that position.
        GeminiHandler(),
        inputs=[webrtc, api_key_state, voice, system_prompt_selector, prompt_selector, model_selector],
        outputs=[webrtc],
        time_limit=120 if get_space() else None,
        concurrency_limit=1,
    )

if __name__ == "__main__":
    demo.launch()