Spaces:
Sleeping
Sleeping
Upload app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,206 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import io
|
| 3 |
+
import wave
|
| 4 |
+
import audioop
|
| 5 |
+
import numpy as np
|
| 6 |
+
|
| 7 |
+
from openai import AsyncOpenAI
|
| 8 |
+
import chainlit as cl
|
| 9 |
+
import requests
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
# Langflow endpoint that produces the chat answer. Replace with the link to your flow.
url = "http://127.0.0.1:7860/api/v1/run/03d6265d-87cc-48af-9457-709f7bb288d8"  # Replace with the link to your flow

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Fail fast with our own clear message BEFORE constructing the client; otherwise
# AsyncOpenAI raises its own (less actionable) error and this check is unreachable.
if not OPENAI_API_KEY:
    raise ValueError(
        "OPENAI_API_KEY must be set"
    )

openai_client = AsyncOpenAI(api_key=OPENAI_API_KEY)

# Bytes read per iteration when streaming TTS audio back from OpenAI.
CHUNK_SIZE = 1024

# Define a threshold for detecting silence and a timeout for ending a turn
SILENCE_THRESHOLD = (
    3500  # Adjust based on your audio level (e.g., lower for quieter audio)
)
# NOTE: this is compared against a millisecond accumulator (silent_duration_ms),
# so the unit is milliseconds, not seconds as previously documented.
SILENCE_TIMEOUT = 1300.0  # Milliseconds of silence to consider the turn finished
+
@cl.step(type="tool")
async def speech_to_text(audio_file):
    """Transcribe ``audio_file`` with OpenAI Whisper and return the text.

    ``audio_file`` is whatever the OpenAI SDK accepts as a file input,
    e.g. a (filename, bytes, mime) tuple.
    """
    transcription = await openai_client.audio.transcriptions.create(
        file=audio_file,
        model="whisper-1",
    )
    return transcription.text
| 38 |
+
|
| 39 |
+
|
| 40 |
+
@cl.step(type="tool")
async def text_to_speech(text: str, mime_type: str):
    """Synthesize ``text`` to speech with OpenAI TTS.

    Returns a ``(filename, audio_bytes)`` tuple. The requested ``mime_type``
    (e.g. ``"audio/wav"``) is now honored by asking the API for the matching
    ``response_format`` — previously the parameter was ignored and the API's
    default format (mp3) was saved under a ``.wav`` name.
    """
    # Map the MIME subtype onto an OpenAI response format; default to wav
    # for anything unrecognized so callers advertising audio/wav stay correct.
    subtype = mime_type.split("/")[-1].lower()
    supported = ("wav", "mp3", "opus", "aac", "flac", "pcm")
    response_format = subtype if subtype in supported else "wav"

    async with openai_client.audio.speech.with_streaming_response.create(
        model="gpt-4o-mini-tts",
        voice="coral",
        input=text,
        instructions="Speak in a cheerful and positive tone.",
        response_format=response_format,
    ) as response:
        buffer = io.BytesIO()
        buffer.name = f"output.{response_format}"

        # Stream the audio into memory chunk by chunk.
        async for chunk in response.iter_bytes(chunk_size=CHUNK_SIZE):
            if chunk:
                buffer.write(chunk)

        buffer.seek(0)
        return buffer.name, buffer.read()
| 57 |
+
|
| 58 |
+
|
| 59 |
+
@cl.step(type="tool")
async def generate_text_answer(transcription):
    """Send the transcription to the Langflow flow and return its text answer.

    Best-effort: returns an empty string when the request fails or the
    response cannot be parsed, so the caller can still proceed.
    """
    # Request payload configuration
    payload = {
        "input_value": transcription,
        "output_type": "chat",
        "input_type": "chat",
    }

    # Request headers
    headers = {
        "Content-Type": "application/json"
    }

    text = ""
    try:
        # requests is blocking — run it in a worker thread via cl.make_async
        # so the asyncio event loop stays responsive, and bound the call with
        # a timeout so an unreachable flow server cannot hang the handler.
        response = await cl.make_async(requests.post)(
            url, json=payload, headers=headers, timeout=30
        )
        response.raise_for_status()  # Raise exception for bad status codes

        # Drill into Langflow's nested response envelope.
        text = response.json()["outputs"][0]["outputs"][0]["results"]["message"]["text"]

    except requests.exceptions.RequestException as e:
        print(f"Error making API request: {e}")

    except (ValueError, KeyError, IndexError, TypeError) as e:
        # ValueError: body is not valid JSON.
        # KeyError/IndexError/TypeError: the envelope shape changed —
        # these were previously uncaught and crashed the handler.
        print(f"Error parsing response: {e}")

    return text
| 88 |
+
|
| 89 |
+
|
| 90 |
+
@cl.on_chat_start
async def start():
    """Initialize per-session state and greet the user."""
    cl.user_session.set("message_history", [])
    greeting = "Welcome to Chainlit x Whisper example! Press `p` to talk!"
    await cl.Message(content=greeting).send()
| 96 |
+
|
| 97 |
+
|
| 98 |
+
@cl.on_audio_start
async def on_audio_start():
    """Reset per-turn audio state and accept the incoming audio connection."""
    fresh_state = {
        "silent_duration_ms": 0,
        "is_speaking": False,
        "audio_chunks": [],
    }
    for key, value in fresh_state.items():
        cl.user_session.set(key, value)
    return True
| 104 |
+
|
| 105 |
+
|
| 106 |
+
@cl.on_audio_chunk
async def on_audio_chunk(chunk: cl.InputAudioChunk):
    """Accumulate incoming audio and end the user's turn after sustained silence.

    Each chunk is appended to the session buffer; the chunk's RMS energy
    decides whether the user is currently speaking. Once silence has lasted
    at least SILENCE_TIMEOUT (ms) while the user was speaking, the buffered
    audio is processed as one complete utterance.
    """
    audio_chunks = cl.user_session.get("audio_chunks")

    if audio_chunks is not None:
        audio_chunk = np.frombuffer(chunk.data, dtype=np.int16)
        audio_chunks.append(audio_chunk)

    # If this is the first chunk, initialize timers and state
    if chunk.isStart:
        cl.user_session.set("last_elapsed_time", chunk.elapsedTime)
        cl.user_session.set("is_speaking", True)
        return

    # (Removed a dead second `audio_chunks` fetch here — its value was never used.)
    last_elapsed_time = cl.user_session.get("last_elapsed_time")
    silent_duration_ms = cl.user_session.get("silent_duration_ms")
    is_speaking = cl.user_session.get("is_speaking")

    # Calculate the time difference between this chunk and the previous one
    time_diff_ms = chunk.elapsedTime - last_elapsed_time
    cl.user_session.set("last_elapsed_time", chunk.elapsedTime)

    # Compute the RMS (root mean square) energy of the audio chunk.
    # NOTE(review): audioop is deprecated and removed in Python 3.13 —
    # consider computing RMS with numpy instead.
    audio_energy = audioop.rms(
        chunk.data, 2
    )  # Assumes 16-bit audio (2 bytes per sample)

    if audio_energy < SILENCE_THRESHOLD:
        # Audio is considered silent: accumulate silence time.
        silent_duration_ms += time_diff_ms
        cl.user_session.set("silent_duration_ms", silent_duration_ms)
        if silent_duration_ms >= SILENCE_TIMEOUT and is_speaking:
            cl.user_session.set("is_speaking", False)
            await process_audio()
    else:
        # Audio is not silent, reset silence timer and mark as speaking
        cl.user_session.set("silent_duration_ms", 0)
        if not is_speaking:
            cl.user_session.set("is_speaking", True)
| 147 |
+
|
| 148 |
+
async def process_audio():
    """Turn the buffered recording into a full voice exchange.

    Packages the accumulated PCM chunks into an in-memory WAV, transcribes
    it, asks the Langflow flow for an answer, and replies with both the text
    and synthesized speech. No-op when nothing was recorded.
    """
    # Get the audio buffer from the session
    if audio_chunks := cl.user_session.get("audio_chunks"):
        # Concatenate all chunks into one contiguous int16 sample array
        concatenated = np.concatenate(list(audio_chunks))

        # Create an in-memory binary stream
        wav_buffer = io.BytesIO()

        # Create WAV file with proper parameters
        with wave.open(wav_buffer, "wb") as wav_file:
            wav_file.setnchannels(1)  # mono
            wav_file.setsampwidth(2)  # 2 bytes per sample (16-bit)
            # assumes the browser streams 24kHz PCM — TODO confirm against
            # the Chainlit audio capture configuration
            wav_file.setframerate(24000)  # sample rate (24kHz PCM)
            wav_file.writeframes(concatenated.tobytes())

        # Reset buffer position
        wav_buffer.seek(0)

        # Clear the per-turn buffer so the next utterance starts fresh
        cl.user_session.set("audio_chunks", [])

        # Wave_write retains its frame count and rate after close, so reading
        # them outside the `with` block is safe.
        frames = wav_file.getnframes()
        rate = wav_file.getframerate()

        # Skip very short recordings (likely noise or an accidental trigger);
        # 1.71 s is an empirically chosen cutoff.
        duration = frames / float(rate)
        if duration <= 1.71:
            print("The audio is too short, please try again.")
            return

        audio_buffer = wav_buffer.getvalue()

        # Echo the user's own recording back alongside the transcript
        input_audio_el = cl.Audio(content=audio_buffer, mime="audio/wav")

        # Whisper accepts a (filename, bytes, mime) tuple as file input
        whisper_input = ("audio.wav", audio_buffer, "audio/wav")
        transcription = await speech_to_text(whisper_input)

        await cl.Message(
            author="You",
            type="user_message",
            content=transcription,
            elements=[input_audio_el],
        ).send()

        # Ask the Langflow flow for the assistant's answer
        answer = await generate_text_answer(transcription)

        # Synthesize the answer so it can auto-play in the UI
        output_name, output_audio = await text_to_speech(answer, "audio/wav")

        output_audio_el = cl.Audio(
            auto_play=True,
            mime="audio/wav",
            content=output_audio,
        )

        await cl.Message(content=answer, elements=[output_audio_el]).send()
| 202 |
+
|
| 203 |
+
|
| 204 |
+
@cl.on_message
async def on_message(message: cl.Message):
    """Redirect typed input back to the voice flow — this demo is audio-driven."""
    reminder = "This is a voice demo, press P to start!"
    await cl.Message(content=reminder).send()