# NOTE: removed page-capture residue ("Spaces: Sleeping") left over from a
# Hugging Face Spaces status banner; it is not part of the program.
| import os | |
| import io | |
| import wave | |
| import audioop | |
| import numpy as np | |
| from openai import AsyncOpenAI | |
| import chainlit as cl | |
| import requests | |
# Langflow endpoint for generating chat answers.
url = "http://127.0.0.1:8888/api/v1/run/03d6265d-87cc-48af-9457-709f7bb288d8"  # Replace with the link to your flow

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# Fail fast BEFORE constructing the client, so a missing key is reported
# clearly instead of surfacing later inside the OpenAI SDK.
if not OPENAI_API_KEY:
    raise ValueError(
        "OPENAI_API_KEY must be set"
    )

openai_client = AsyncOpenAI(api_key=OPENAI_API_KEY)

# Streaming chunk size for the TTS response, in bytes.
CHUNK_SIZE = 1024

# Define a threshold for detecting silence and a timeout for ending a turn
SILENCE_THRESHOLD = (
    3500  # RMS level; adjust based on your audio (lower for quieter audio)
)
# NOTE: this is compared against a millisecond accumulator in on_audio_chunk,
# so the unit is milliseconds (the old comment wrongly said seconds).
SILENCE_TIMEOUT = 1300.0  # Milliseconds of silence to consider the turn finished
async def speech_to_text(audio_file):
    """Transcribe *audio_file* with OpenAI Whisper and return the text."""
    transcription = await openai_client.audio.transcriptions.create(
        file=audio_file,
        model="whisper-1",
    )
    return transcription.text
async def text_to_speech(text: str, mime_type: str):
    """Synthesize *text* with OpenAI TTS and return ``(filename, audio_bytes)``.

    ``mime_type`` is kept for interface compatibility with existing callers
    (it is not forwarded to the API). The response is explicitly requested as
    WAV: the API default is MP3, which would not match the ``output.wav``
    filename or the ``audio/wav`` mime type the caller plays it with.
    """
    async with openai_client.audio.speech.with_streaming_response.create(
        model="gpt-4o-mini-tts",
        voice="coral",
        input=text,
        instructions="Speak in a cheerful and positive tone.",
        response_format="wav",  # default is mp3 — mismatched container otherwise
    ) as response:
        buffer = io.BytesIO()
        buffer.name = "output.wav"
        # Stream the audio into memory chunk by chunk.
        async for chunk in response.iter_bytes(chunk_size=CHUNK_SIZE):
            if chunk:
                buffer.write(chunk)
        buffer.seek(0)
        return buffer.name, buffer.read()
async def generate_text_answer(transcription):
    """Send *transcription* to the Langflow endpoint and return the reply text.

    Returns ``""`` when the request or response parsing fails; errors are
    printed rather than raised so the voice loop keeps running.
    """
    # Request payload configuration
    payload = {
        "input_value": transcription,
        "output_type": "chat",
        "input_type": "chat",
    }
    # Request headers
    headers = {"Content-Type": "application/json"}
    text = ""
    try:
        # NOTE(review): requests is blocking and stalls the asyncio event loop
        # while waiting — acceptable for a local demo, but consider
        # httpx.AsyncClient for production use.
        # A timeout prevents a dead Langflow server from hanging the app.
        response = requests.post(url, json=payload, headers=headers, timeout=30)
        response.raise_for_status()  # Raise exception for bad status codes
        text = response.json()["outputs"][0]["outputs"][0]["results"]["message"]["text"]
    except requests.exceptions.RequestException as e:
        print(f"Error making API request: {e}")
    except (ValueError, KeyError, IndexError, TypeError) as e:
        # ValueError: body is not valid JSON; the others: the JSON does not
        # have the nested shape indexed above.
        print(f"Error parsing response: {e}")
    return text
async def start():
    """Greet the user and initialize an empty message history for the session."""
    cl.user_session.set("message_history", [])
    welcome = cl.Message(
        content="Welcome to Chainlit x Whisper example! Press `p` to talk!",
    )
    await welcome.send()
async def on_audio_start():
    """Reset per-turn audio state; returning True accepts the audio stream."""
    initial_state = (
        ("silent_duration_ms", 0),
        ("is_speaking", False),
        ("audio_chunks", []),
    )
    for key, value in initial_state:
        cl.user_session.set(key, value)
    return True
async def on_audio_chunk(chunk: cl.InputAudioChunk):
    """Accumulate incoming audio and end the turn after sustained silence.

    Each chunk's PCM data is appended to the session buffer. A chunk whose
    RMS energy falls below SILENCE_THRESHOLD extends a silence timer; once
    SILENCE_TIMEOUT ms of silence accumulates while the user was speaking,
    the buffered turn is handed to process_audio().
    """
    audio_chunks = cl.user_session.get("audio_chunks")
    if audio_chunks is not None:
        # Assumes 16-bit PCM samples from the client — TODO confirm format.
        audio_chunk = np.frombuffer(chunk.data, dtype=np.int16)
        audio_chunks.append(audio_chunk)

    # If this is the first chunk, initialize timers and state
    if chunk.isStart:
        cl.user_session.set("last_elapsed_time", chunk.elapsedTime)
        cl.user_session.set("is_speaking", True)
        return

    # (The original re-fetched audio_chunks here; that assignment was dead
    # code and has been removed.)
    last_elapsed_time = cl.user_session.get("last_elapsed_time")
    silent_duration_ms = cl.user_session.get("silent_duration_ms")
    is_speaking = cl.user_session.get("is_speaking")

    # Calculate the time difference between this chunk and the previous one
    time_diff_ms = chunk.elapsedTime - last_elapsed_time
    cl.user_session.set("last_elapsed_time", chunk.elapsedTime)

    # Compute the RMS (root mean square) energy of the audio chunk
    audio_energy = audioop.rms(
        chunk.data, 2
    )  # Assumes 16-bit audio (2 bytes per sample)

    if audio_energy < SILENCE_THRESHOLD:
        # Audio is considered silent: extend the silence timer and, on
        # timeout, finish the turn.
        silent_duration_ms += time_diff_ms
        cl.user_session.set("silent_duration_ms", silent_duration_ms)
        if silent_duration_ms >= SILENCE_TIMEOUT and is_speaking:
            cl.user_session.set("is_speaking", False)
            await process_audio()
    else:
        # Audio is not silent, reset silence timer and mark as speaking
        cl.user_session.set("silent_duration_ms", 0)
        if not is_speaking:
            cl.user_session.set("is_speaking", True)
async def process_audio():
    """Transcribe the buffered turn, fetch an answer, and reply with TTS audio."""
    # Get the audio buffer from the session; nothing to do if it is empty.
    audio_chunks = cl.user_session.get("audio_chunks")
    if not audio_chunks:
        return

    # Concatenate all chunks into one 16-bit PCM stream.
    concatenated = np.concatenate(list(audio_chunks))

    # Wrap the raw samples in an in-memory WAV container.
    wav_buffer = io.BytesIO()
    with wave.open(wav_buffer, "wb") as wav_file:
        wav_file.setnchannels(1)  # mono
        wav_file.setsampwidth(2)  # 2 bytes per sample (16-bit)
        wav_file.setframerate(24000)  # sample rate (24kHz PCM)
        wav_file.writeframes(concatenated.tobytes())
        # Read the duration while the writer is still open — the original
        # queried these getters on the closed Wave_write object, which only
        # works incidentally.
        duration = wav_file.getnframes() / float(wav_file.getframerate())

    # Reset buffer position and clear the session buffer for the next turn.
    wav_buffer.seek(0)
    cl.user_session.set("audio_chunks", [])

    if duration <= 1.71:  # minimum turn length; empirically chosen
        print("The audio is too short, please try again.")
        return

    audio_buffer = wav_buffer.getvalue()
    input_audio_el = cl.Audio(content=audio_buffer, mime="audio/wav")

    # Whisper expects a (filename, bytes, mime) tuple for in-memory uploads.
    whisper_input = ("audio.wav", audio_buffer, "audio/wav")
    transcription = await speech_to_text(whisper_input)

    # Echo the user's turn (transcript plus the recorded audio).
    await cl.Message(
        author="You",
        type="user_message",
        content=transcription,
        elements=[input_audio_el],
    ).send()

    answer = await generate_text_answer(transcription)

    # Speak the answer back; the returned filename is unused.
    _, output_audio = await text_to_speech(answer, "audio/wav")
    output_audio_el = cl.Audio(
        auto_play=True,
        mime="audio/wav",
        content=output_audio,
    )
    await cl.Message(content=answer, elements=[output_audio_el]).send()
async def on_message(message: cl.Message):
    """Redirect typed input back to the voice workflow."""
    reply = cl.Message(content="This is a voice demo, press P to start!")
    await reply.send()