import os
import io
import wave
import audioop
import numpy as np
from openai import AsyncOpenAI
import chainlit as cl
import requests
url = "http://127.0.0.1:8888/api/v1/run/03d6265d-87cc-48af-9457-709f7bb288d8"  # Replace with the link to your flow

# Fail fast before constructing the client, so a missing key is reported at
# startup rather than surfacing as an opaque error on the first API call.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise ValueError("OPENAI_API_KEY must be set")

openai_client = AsyncOpenAI(api_key=OPENAI_API_KEY)

CHUNK_SIZE = 1024  # bytes per chunk when streaming TTS audio

# Define a threshold for detecting silence and a timeout for ending a turn
SILENCE_THRESHOLD = (
    3500  # RMS energy; adjust based on your audio level (lower for quieter audio)
)
# NOTE: this value is compared against silent_duration_ms, so it is in
# MILLISECONDS (1.3 s), not seconds as the original comment claimed.
SILENCE_TIMEOUT = 1300.0  # Milliseconds of silence to consider the turn finished
@cl.step(type="tool")
async def speech_to_text(audio_file):
    """Transcribe the given audio file with the OpenAI Whisper API."""
    transcription = await openai_client.audio.transcriptions.create(
        file=audio_file,
        model="whisper-1",
    )
    return transcription.text
@cl.step(type="tool")
async def text_to_speech(text: str, mime_type: str):
    """Synthesize *text* to speech and return ``(filename, audio_bytes)``.

    Bug fix: the OpenAI TTS endpoint streams MP3 by default, but this
    function names the result ``output.wav`` and callers play it with an
    ``audio/wav`` MIME type — so WAV output must be requested explicitly.

    Args:
        text: The text to synthesize.
        mime_type: MIME type hint from the caller (currently unused; the
            output format is fixed to WAV to match the returned filename).
    """
    async with openai_client.audio.speech.with_streaming_response.create(
        model="gpt-4o-mini-tts",
        voice="coral",
        input=text,
        response_format="wav",  # default is mp3; see docstring
        instructions="Speak in a cheerful and positive tone.",
    ) as response:
        buffer = io.BytesIO()
        buffer.name = "output.wav"
        # Accumulate the streamed audio into an in-memory buffer.
        async for chunk in response.iter_bytes(chunk_size=CHUNK_SIZE):
            if chunk:
                buffer.write(chunk)
        buffer.seek(0)
        return buffer.name, buffer.read()
@cl.step(type="tool")
async def generate_text_answer(transcription):
    """Send the transcription to the Langflow endpoint and return the reply.

    Args:
        transcription: The user's transcribed speech.

    Returns:
        The answer text, or an empty string if the request or response
        parsing fails (errors are printed, not raised).
    """
    payload = {
        "input_value": transcription,
        "output_type": "chat",
        "input_type": "chat",
    }
    headers = {"Content-Type": "application/json"}
    text = ""
    try:
        # NOTE: requests is blocking; the timeout prevents the event loop
        # from hanging forever if the flow server is unresponsive.
        response = requests.post(url, json=payload, headers=headers, timeout=30)
        response.raise_for_status()  # Raise for bad status codes
        text = response.json()["outputs"][0]["outputs"][0]["results"]["message"]["text"]
    except requests.exceptions.RequestException as e:
        print(f"Error making API request: {e}")
    except (ValueError, KeyError, IndexError, TypeError) as e:
        # ValueError: body is not JSON; KeyError/IndexError/TypeError: the
        # response JSON does not have the expected nested shape. The
        # original only caught ValueError, so a shape change crashed here.
        print(f"Error parsing response: {e}")
    return text
@cl.on_chat_start
async def start():
    """Initialize an empty message history and greet the user."""
    cl.user_session.set("message_history", [])
    welcome = cl.Message(
        content="Welcome to Chainlit x Whisper example! Press `p` to talk!",
    )
    await welcome.send()
@cl.on_audio_start
async def on_audio_start():
    """Reset per-turn audio state; returning True accepts the audio stream."""
    initial_state = {
        "silent_duration_ms": 0,
        "is_speaking": False,
        "audio_chunks": [],
    }
    for key, value in initial_state.items():
        cl.user_session.set(key, value)
    return True
@cl.on_audio_chunk
async def on_audio_chunk(chunk: cl.InputAudioChunk):
    """Accumulate incoming audio and end the turn after sustained silence.

    Every chunk is appended to the session buffer. RMS energy decides
    whether the chunk is voiced; once silence has lasted SILENCE_TIMEOUT
    milliseconds while the user had been speaking, the collected audio is
    handed to ``process_audio``.
    """
    audio_chunks = cl.user_session.get("audio_chunks")
    if audio_chunks is not None:
        audio_chunks.append(np.frombuffer(chunk.data, dtype=np.int16))

    # First chunk of the stream: initialize timers and state, nothing to
    # measure yet.
    if chunk.isStart:
        cl.user_session.set("last_elapsed_time", chunk.elapsedTime)
        cl.user_session.set("is_speaking", True)
        return

    # (Fix: the original re-fetched "audio_chunks" here into an unused
    # local; that dead read has been removed.)
    last_elapsed_time = cl.user_session.get("last_elapsed_time")
    silent_duration_ms = cl.user_session.get("silent_duration_ms")
    is_speaking = cl.user_session.get("is_speaking")

    # Time elapsed since the previous chunk, in milliseconds.
    time_diff_ms = chunk.elapsedTime - last_elapsed_time
    cl.user_session.set("last_elapsed_time", chunk.elapsedTime)

    # RMS energy of the chunk; assumes 16-bit audio (2 bytes per sample).
    # NOTE(review): audioop is deprecated and removed in Python 3.13;
    # consider np.sqrt(np.mean(samples.astype(np.float64) ** 2)) instead.
    audio_energy = audioop.rms(chunk.data, 2)

    if audio_energy < SILENCE_THRESHOLD:
        # Silent chunk: extend the silence timer; finish the turn once the
        # timeout elapses while the user had been speaking.
        silent_duration_ms += time_diff_ms
        cl.user_session.set("silent_duration_ms", silent_duration_ms)
        if silent_duration_ms >= SILENCE_TIMEOUT and is_speaking:
            cl.user_session.set("is_speaking", False)
            await process_audio()
    else:
        # Voiced chunk: reset the silence timer and mark as speaking.
        cl.user_session.set("silent_duration_ms", 0)
        if not is_speaking:
            cl.user_session.set("is_speaking", True)
async def process_audio():
    """Package the buffered audio as WAV, transcribe it, and reply with TTS.

    Pipeline: concatenate buffered PCM chunks -> wrap in an in-memory WAV
    -> Whisper transcription -> Langflow answer -> TTS audio response.
    Turns shorter than the minimum duration are discarded.
    """
    # PCM capture parameters for the buffered chunks.
    sample_rate = 24000  # Hz — assumes the client captures 24 kHz PCM; TODO confirm
    sample_width = 2     # bytes per sample (16-bit)
    channels = 1         # mono

    audio_chunks = cl.user_session.get("audio_chunks")
    if not audio_chunks:
        return

    # Concatenate all chunks into one int16 array and clear the buffer so
    # the next turn starts fresh.
    concatenated = np.concatenate(list(audio_chunks))
    cl.user_session.set("audio_chunks", [])

    # Bug fix: the original read getnframes()/getframerate() from the wave
    # writer AFTER its `with` block had closed it. For mono 16-bit PCM the
    # frame count equals the sample count, so compute duration directly.
    duration = len(concatenated) / float(sample_rate)
    if duration <= 1.71:  # empirically chosen minimum turn length, seconds
        print("The audio is too short, please try again.")
        return

    # Wrap the raw PCM in a WAV container, entirely in memory.
    wav_buffer = io.BytesIO()
    with wave.open(wav_buffer, "wb") as wav_file:
        wav_file.setnchannels(channels)
        wav_file.setsampwidth(sample_width)
        wav_file.setframerate(sample_rate)
        wav_file.writeframes(concatenated.tobytes())
    wav_buffer.seek(0)

    audio_buffer = wav_buffer.getvalue()
    input_audio_el = cl.Audio(content=audio_buffer, mime="audio/wav")

    # Whisper accepts a (filename, bytes, mime_type) tuple.
    whisper_input = ("audio.wav", audio_buffer, "audio/wav")
    transcription = await speech_to_text(whisper_input)

    await cl.Message(
        author="You",
        type="user_message",
        content=transcription,
        elements=[input_audio_el],
    ).send()

    answer = await generate_text_answer(transcription)
    output_name, output_audio = await text_to_speech(answer, "audio/wav")
    output_audio_el = cl.Audio(
        auto_play=True,
        mime="audio/wav",
        content=output_audio,
    )
    await cl.Message(content=answer, elements=[output_audio_el]).send()
@cl.on_message
async def on_message(message: cl.Message):
    """Text input is not supported in this demo; point the user to voice."""
    reply = cl.Message(content="This is a voice demo, press P to start!")
    await reply.send()