import io
import os

import numpy as np
import soundfile as sf
from dotenv import load_dotenv
from openai import OpenAI, RateLimitError, APIError, APIConnectionError
from pydub import AudioSegment

load_dotenv()

client = OpenAI(
    base_url="https://integrate.api.nvidia.com/v1",
    api_key=os.environ["NVIDIA_API"]
)
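
# Assumes the NVIDIA API key is supplied via a .env file (or the environment),
# e.g. a line like NVIDIA_API=nvapi-... (key name taken from the lookup above).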

def chat_llm(conversation: list) -> tuple[Exception | None, str]:
    """Stream a chat completion, printing tokens as they arrive.

    Returns (error, response_text); error is None on success.
    """
    response_text = ""
    try:
        completion = client.chat.completions.create(
            model="meta/llama-3.1-405b-instruct",
            messages=conversation,
            temperature=0.2,
            top_p=0.7,
            max_tokens=4000,
            stream=True
        )
        # Print each token as it streams in and accumulate the full reply
        for chunk in completion:
            if chunk.choices[0].delta.content is not None:
                print(chunk.choices[0].delta.content, end="")
                response_text += chunk.choices[0].delta.content
        return None, response_text
    except (RateLimitError, APIConnectionError, APIError) as e:
        return e, response_text

if __name__ == "__main__":
    from _data_model import AppState

    state = AppState(llm_conversation=[
        {
            "role": "system",
            "content": "You are a voice assistant acting on my behalf. My name is Deepak and your name is Julia. You are there to tell the user what a good engineer I am."
        },
        {
            "role": "user",
            "content": "Hey, what can you tell me?"
        }
    ])
    e, msg = chat_llm(conversation=state.llm_conversation)
    print(e, msg)

def audio_to_bytes(audio_input) -> bytes:
    """
    Convert a Gradio audio input (numpy array or file path) to WAV bytes.

    Parameters:
        audio_input: tuple | str
            - If tuple: (sample_rate, numpy_array), as Gradio provides it
            - If str: path to an audio file

    Returns:
        bytes: The WAV file bytes.
    """
    if isinstance(audio_input, str):
        # audio_input is a file path; note sf.read returns (data, samplerate)
        data, samplerate = sf.read(audio_input)
    elif isinstance(audio_input, (tuple, list)) and len(audio_input) == 2:
        # audio_input is Gradio's (sample_rate, numpy_array) tuple
        samplerate, data = audio_input
    else:
        raise ValueError("Invalid audio input. Expected (sample_rate, numpy_array) or a file path string.")

    # Ensure mono (channel count = 1); keep the original dtype so integer
    # samples from Gradio are not silently promoted to float64
    if data.ndim > 1:
        data = np.mean(data, axis=1).astype(data.dtype)  # average channels to mono

    # Write to an in-memory buffer
    wav_buffer = io.BytesIO()
    sf.write(wav_buffer, data, samplerate, format='WAV')
    wav_bytes = wav_buffer.getvalue()
    wav_buffer.close()
    return wav_bytes
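
# Usage sketch (illustrative only, kept commented so nothing runs on import):
# round-trip a synthetic one-second 440 Hz tone through audio_to_bytes. The
# sample rate and tone parameters are arbitrary example values.
#
# sr = 16000
# t = np.arange(sr) / sr
# tone = (0.5 * np.sin(2 * np.pi * 440 * t)).astype(np.float32)
# wav_bytes = audio_to_bytes((sr, tone))
# print(len(wav_bytes))  # non-empty WAV payload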

def audio_bytes_to_gr_tuple(audio_bytes: bytes) -> tuple[int, np.ndarray]:
    """
    Convert a bytes object containing audio data into the (sample_rate, numpy_array)
    tuple expected by Gradio's Audio component.

    Supports any format recognized by pydub/ffmpeg.
    """
    # Load the bytes into an AudioSegment
    audio_segment = AudioSegment.from_file(io.BytesIO(audio_bytes))

    # Get the raw audio samples as a numpy array
    samples = np.array(audio_segment.get_array_of_samples())

    # If stereo, reshape to (n_frames, n_channels)
    if audio_segment.channels > 1:
        samples = samples.reshape((-1, audio_segment.channels))

    # Return in the (sample_rate, np.ndarray) format expected by Gradio
    return audio_segment.frame_rate, samples
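
# Usage sketch (illustrative only): decode the WAV bytes from the sketch above
# back into the (sample_rate, samples) tuple Gradio expects. Note that pydub
# needs ffmpeg on PATH for formats other than plain WAV.
#
# sr, samples = audio_bytes_to_gr_tuple(wav_bytes)
# print(sr, samples.shape, samples.dtype)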

'''deprecated'''
# def detect_pause(audio_array, sample_rate, silence_threshold=0.01, min_pause_ms=300) -> bool:
#     """
#     Detect if there is a pause in the audio.
#
#     Parameters:
#         audio_array (np.ndarray): Audio samples (mono or stereo)
#         sample_rate (int): Sampling rate
#         silence_threshold (float): Max amplitude considered silence
#         min_pause_ms (int): Minimum duration (ms) to count as a pause
#
#     Returns:
#         bool: True if a pause is detected, False otherwise
#     """
#     # Convert stereo to mono if needed
#     if audio_array.ndim > 1:
#         audio_array = np.mean(audio_array, axis=1)
#
#     # Absolute amplitude
#     amplitude = np.abs(audio_array)
#
#     # Boolean array: True where below threshold
#     silent = amplitude < silence_threshold
#
#     # Convert pause duration from ms to number of samples
#     min_silent_samples = int(sample_rate * (min_pause_ms / 1000.0))
#
#     # Find if there is a contiguous silent region of that length
#     count = 0
#     for s in silent:
#         if s:
#             count += 1
#             if count >= min_silent_samples:
#                 return True  # pause detected
#         else:
#             count = 0
#     return False  # no long enough silence

# def streaming(audio: tuple, state: AppState):
#     if state.stream is None:
#         state.stream = audio[1]
#         state.sampling_rate = audio[0]
#     else:
#         state.stream = np.concatenate((state.stream, audio[1]))
#     pause_detected = detect_pause(state.stream, state.sampling_rate)
#     state.pause_detected = pause_detected
#     if state.pause_detected and state.started_talking:
#         return gr.Audio(recording=False), state
#     return None, state