xaochNYU commited on
Commit
0b14e82
·
verified ·
1 Parent(s): 6efdd76

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +206 -0
app.py ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import io
3
+ import wave
4
+ import audioop
5
+ import numpy as np
6
+
7
+ from openai import AsyncOpenAI
8
+ import chainlit as cl
9
+ import requests
10
+
11
+
12
# URL of the Langflow flow endpoint used to generate chat answers.
url = "http://127.0.0.1:7860/api/v1/run/03d6265d-87cc-48af-9457-709f7bb288d8"  # Replace with the link to your flow

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# BUG FIX: validate the key BEFORE constructing the client, so a missing
# key fails fast at import time instead of after client creation.
if not OPENAI_API_KEY:
    raise ValueError("OPENAI_API_KEY must be set")

openai_client = AsyncOpenAI(api_key=OPENAI_API_KEY)

# Size (in bytes) of each chunk read from the TTS streaming response.
CHUNK_SIZE = 1024

# RMS energy below which an audio chunk is treated as silence.
SILENCE_THRESHOLD = (
    3500  # Adjust based on your audio level (e.g., lower for quieter audio)
)
# Milliseconds of continuous silence that end the user's turn.
# (Compared against silent_duration_ms, which accumulates millisecond
# deltas of chunk.elapsedTime — the original comment saying "Seconds"
# was wrong.)
SILENCE_TIMEOUT = 1300.0
29
+
30
+
31
@cl.step(type="tool")
async def speech_to_text(audio_file):
    """Transcribe *audio_file* with OpenAI Whisper and return the text."""
    transcription = await openai_client.audio.transcriptions.create(
        model="whisper-1",
        file=audio_file,
    )
    return transcription.text
38
+
39
+
40
@cl.step(type="tool")
async def text_to_speech(text: str, mime_type: str):
    """Synthesize *text* to speech and return ``(filename, audio_bytes)``.

    Args:
        text: The text to speak.
        mime_type: Target mime type requested by the caller (the caller
            passes ``"audio/wav"``); WAV output is always produced.

    Returns:
        Tuple of (file name, raw audio bytes).
    """
    async with openai_client.audio.speech.with_streaming_response.create(
        model="gpt-4o-mini-tts",
        voice="coral",
        input=text,
        instructions="Speak in a cheerful and positive tone.",
        # BUG FIX: the API default response format is MP3, but the caller
        # labels and plays this audio as WAV — request WAV explicitly so
        # the container matches the advertised mime type.
        response_format="wav",
    ) as response:
        buffer = io.BytesIO()
        buffer.name = "output.wav"

        # Stream the response body into memory in fixed-size chunks.
        async for chunk in response.iter_bytes(chunk_size=CHUNK_SIZE):
            if chunk:
                buffer.write(chunk)

        buffer.seek(0)
        return buffer.name, buffer.read()
57
+
58
+
59
@cl.step(type="tool")
async def generate_text_answer(transcription):
    """Send *transcription* to the Langflow flow and return its text reply.

    Returns an empty string if the request fails or the response cannot be
    parsed, so the calling pipeline can still proceed.
    """
    # Request payload configuration
    payload = {
        "input_value": transcription,
        "output_type": "chat",
        "input_type": "chat",
    }
    # Request headers
    headers = {"Content-Type": "application/json"}

    text = ""
    try:
        # requests.post is the idiomatic form of requests.request("POST", ...).
        # The timeout prevents this handler from hanging forever when the
        # flow server is unreachable.
        # NOTE(review): this is a blocking call inside an async function;
        # consider an async HTTP client or a thread executor.
        response = requests.post(url, json=payload, headers=headers, timeout=60)
        response.raise_for_status()  # Raise exception for bad status codes

        text = response.json()["outputs"][0]["outputs"][0]["results"]["message"]["text"]

    except requests.exceptions.RequestException as e:
        print(f"Error making API request: {e}")

    except (ValueError, KeyError, IndexError) as e:
        # ValueError: body is not JSON; KeyError/IndexError: the deep
        # indexing above met an unexpected response schema (the original
        # only caught ValueError, so schema changes crashed the handler).
        print(f"Error parsing response: {e}")

    return text
88
+
89
+
90
@cl.on_chat_start
async def start():
    """Initialize the per-session message history and greet the user."""
    cl.user_session.set("message_history", [])
    welcome = cl.Message(
        content="Welcome to Chainlit x Whisper example! Press `p` to talk!",
    )
    await welcome.send()
96
+
97
+
98
@cl.on_audio_start
async def on_audio_start():
    """Reset per-turn audio state; returning True accepts the audio stream."""
    for key, initial in (
        ("silent_duration_ms", 0),
        ("is_speaking", False),
        ("audio_chunks", []),
    ):
        cl.user_session.set(key, initial)
    return True
104
+
105
+
106
@cl.on_audio_chunk
async def on_audio_chunk(chunk: cl.InputAudioChunk):
    """Accumulate incoming audio and end the turn after sustained silence.

    Every chunk is appended to the session buffer as int16 samples; the
    chunk's RMS energy decides whether the user is currently speaking.
    Once silence has lasted SILENCE_TIMEOUT milliseconds, the buffered
    audio is handed to process_audio() as a complete utterance.
    """
    audio_chunks = cl.user_session.get("audio_chunks")

    if audio_chunks is not None:
        audio_chunk = np.frombuffer(chunk.data, dtype=np.int16)
        audio_chunks.append(audio_chunk)

    # If this is the first chunk, initialize timers and state
    if chunk.isStart:
        cl.user_session.set("last_elapsed_time", chunk.elapsedTime)
        cl.user_session.set("is_speaking", True)
        return

    # BUG FIX: the original re-fetched "audio_chunks" here into an unused
    # shadow variable — removed.
    last_elapsed_time = cl.user_session.get("last_elapsed_time")
    silent_duration_ms = cl.user_session.get("silent_duration_ms")
    is_speaking = cl.user_session.get("is_speaking")

    # Milliseconds elapsed since the previous chunk.
    time_diff_ms = chunk.elapsedTime - last_elapsed_time
    cl.user_session.set("last_elapsed_time", chunk.elapsedTime)

    # RMS energy of the chunk; width=2 assumes 16-bit audio (2 bytes/sample).
    # NOTE(review): audioop is deprecated and removed in Python 3.13 —
    # np.sqrt(np.mean(audio_chunk.astype(np.float64) ** 2)) is the drop-in
    # replacement when upgrading.
    audio_energy = audioop.rms(chunk.data, 2)

    if audio_energy < SILENCE_THRESHOLD:
        # Silent chunk: extend the running silence duration.
        silent_duration_ms += time_diff_ms
        cl.user_session.set("silent_duration_ms", silent_duration_ms)
        if silent_duration_ms >= SILENCE_TIMEOUT and is_speaking:
            cl.user_session.set("is_speaking", False)
            await process_audio()
    else:
        # Speech detected: reset the silence timer and mark as speaking.
        cl.user_session.set("silent_duration_ms", 0)
        if not is_speaking:
            cl.user_session.set("is_speaking", True)
146
+
147
+
148
async def process_audio():
    """Turn the buffered utterance into a transcription, answer, and spoken reply.

    Pipeline: wrap the buffered PCM in a WAV container -> Whisper STT ->
    Langflow answer -> TTS -> send both messages to the UI.
    """
    # Get the audio buffer from the session; nothing to do if it's empty.
    audio_chunks = cl.user_session.get("audio_chunks")
    if not audio_chunks:
        return

    # Concatenate all int16 chunks into a single utterance.
    concatenated = np.concatenate(list(audio_chunks))

    # Sample rate of the incoming PCM stream.
    # NOTE(review): assumes the browser delivers 24 kHz 16-bit mono —
    # confirm against the Chainlit audio capture settings.
    sample_rate = 24000

    # Create an in-memory WAV file with proper parameters.
    wav_buffer = io.BytesIO()
    with wave.open(wav_buffer, "wb") as wav_file:
        wav_file.setnchannels(1)  # mono
        wav_file.setsampwidth(2)  # 2 bytes per sample (16-bit)
        wav_file.setframerate(sample_rate)
        wav_file.writeframes(concatenated.tobytes())
    wav_buffer.seek(0)

    cl.user_session.set("audio_chunks", [])

    # BUG FIX: the original called wav_file.getnframes()/getframerate()
    # after the `with` block had already closed the writer, which only
    # works by accident in CPython. Derive the duration from the sample
    # count instead (one int16 sample per frame, mono).
    duration = len(concatenated) / float(sample_rate)
    if duration <= 1.71:
        print("The audio is too short, please try again.")
        return

    audio_buffer = wav_buffer.getvalue()

    input_audio_el = cl.Audio(content=audio_buffer, mime="audio/wav")

    # Whisper accepts a (filename, bytes, mime) tuple as the file argument.
    whisper_input = ("audio.wav", audio_buffer, "audio/wav")
    transcription = await speech_to_text(whisper_input)

    # Echo the user's utterance (text + playable audio) into the chat.
    await cl.Message(
        author="You",
        type="user_message",
        content=transcription,
        elements=[input_audio_el],
    ).send()

    answer = await generate_text_answer(transcription)

    output_name, output_audio = await text_to_speech(answer, "audio/wav")

    output_audio_el = cl.Audio(
        auto_play=True,
        mime="audio/wav",
        content=output_audio,
    )

    await cl.Message(content=answer, elements=[output_audio_el]).send()
202
+
203
+
204
@cl.on_message
async def on_message(message: cl.Message):
    """Typed input is unsupported; point the user back to the voice flow."""
    reply = cl.Message(content="This is a voice demo, press P to start!")
    await reply.send()