from faster_whisper import WhisperModel
import numpy as np
import scipy.signal
import spaces
# Name of the Whisper checkpoint handed to WhisperModel below.
model_size = "base.en"
# Speech-to-text model, constructed once at import time; it is configured
# for CPU execution with float32 compute and is used by transcribe().
model = WhisperModel(model_size, device="cpu", compute_type="float32")
def whisper_process_audio(audio_file):
    """Convert a ``(sample_rate, samples)`` pair into mono float32 audio at 16 kHz.

    Args:
        audio_file: Tuple of ``(sample_rate, audio_data)``. ``audio_data`` is
            array-like with shape ``(n,)`` or ``(n, channels)``. Integer data
            is treated as PCM and divided by its full-scale value (32768 for
            int16); floating-point data is assumed to already lie in
            ``[-1, 1]`` and is left unscaled.

    Returns:
        A 1-D ``float32`` NumPy array resampled to 16000 Hz. The array is
        empty when the input is empty or too short to yield a single
        output sample.
    """
    sample_rate, audio_data = audio_file
    audio_data = np.asarray(audio_data)
    # Remember the dtype before mixing: averaging channels promotes to float.
    source_dtype = audio_data.dtype

    if audio_data.ndim > 1:
        # Mix all channels down to mono; this also flattens (n, 1) input.
        audio_data = audio_data.mean(axis=1)

    samples = audio_data.astype(np.float32)
    if np.issubdtype(source_dtype, np.integer):
        # Only integer PCM needs normalising; dividing float input by 32768
        # would reduce it to near silence.
        samples /= float(np.iinfo(source_dtype).max) + 1.0

    if samples.size == 0 or sample_rate == 16000:
        # Nothing to resample (and an FFT resample at the same rate is wasted work).
        return samples

    target_len = int(len(samples) * 16000 / sample_rate)
    if target_len <= 0:
        return np.zeros(0, dtype=np.float32)
    resampled = scipy.signal.resample(samples, target_len)
    return resampled.astype(np.float32, copy=False)
def transcribe(audio):
    """Run speech-to-text on a ``(sample_rate, samples)`` recording.

    The recording is first passed through ``whisper_process_audio`` and then
    transcribed with the module-level ``model`` (beam size 5, English).

    Returns:
        The text of every returned segment, concatenated in order.
    """
    waveform = whisper_process_audio(audio)
    segments, _ = model.transcribe(waveform, beam_size=5, language='en')
    return "".join(segment.text for segment in segments)
from kokoro import KModel, KPipeline
import os
import random
import torch
import numpy as np
import kokoro
import misaki
# Kokoro TTS network, moved to CUDA and switched to eval mode at import time.
kkmodel = KModel().to('cuda').eval()
# Pipeline created with model=False; generate_tts() uses it to load voices and
# to iterate over the text, and calls kkmodel itself for the synthesis.
# NOTE(review): lang_code='a' is presumably American English - confirm against Kokoro docs.
pipeline = KPipeline(lang_code='a', model=False)
@spaces.GPU
def generate_tts(text, voice='af_heart', speed=1):
    """Synthesize speech for ``text`` and return it as a single waveform.

    Args:
        text: Text to speak.
        voice: Voice name passed to ``pipeline.load_voice`` and to the
            pipeline call.
        speed: Speed factor forwarded to the pipeline and to ``kkmodel``.

    Returns:
        A ``(24000, samples)`` tuple in the ``(sample_rate, data)`` layout
        used for audio elsewhere in this file. ``samples`` is the
        concatenation of every chunk that was synthesized successfully, or
        an empty array when no chunk succeeded.
    """
    pack = pipeline.load_voice(voice)
    audio_chunks = []
    for _, ps, _ in pipeline(text, voice, speed):
        # The voice pack is indexed by phoneme-sequence length.
        ref_s = pack[len(ps) - 1]
        try:
            audio = kkmodel(ps, ref_s, speed)
            # detach()/cpu() make the conversion safe wherever the tensor lives.
            audio_chunks.append(audio.detach().cpu().numpy())
        except Exception as exc:
            # Best effort: drop the failing chunk but say what went wrong.
            print(f"TTS synthesis failed for a chunk, skipping it: {exc!r}")

    if not audio_chunks:
        return 24000, np.array([])

    concatenated_audio = np.concatenate(audio_chunks)
    print(concatenated_audio.shape)
    return 24000, concatenated_audio
import io
import os
import time
from dataclasses import dataclass, field
from multiprocessing import freeze_support
# import groq
import gradio as gr
import numpy as np
from vllm import LLM
@spaces.GPU
def initialize_model():
    """Build and return the vLLM engine for the GPTQ-quantized Llama 3.2 1B Instruct model."""
    engine_options = {
        "model": "shuyuej/Llama-3.2-1B-Instruct-GPTQ",
        "quantization": "gptq",
        "gpu_memory_utilization": 0.5,
        "max_model_len": 1024,
    }
    return LLM(**engine_options)
# Module-level slot for the vLLM engine that llama_QA() talks to.
# It is None until an engine is stored here.
llama3_pipe = None
# Initial value of the system-prompt textbox built in create_demo().
default_sys_prompt = """You are a helpful chatbot. You respond very conversationally, and help the end user as best as you can."""
def llama_QA(message_history, system_prompt: str):
    """Ask the Llama chat model for the next assistant reply.

    Args:
        message_history: Chat messages so far, as ``{"role", "content"}``
            dicts in order. The list is not modified.
        system_prompt: Text sent as the leading ``system`` message.

    Returns:
        The generated text of the first completion, capped at 512 new tokens.
    """
    global llama3_pipe
    # The module-level handle starts out as None, so build the engine on
    # first use instead of failing with AttributeError on None.
    # NOTE(review): initialize_model is itself @spaces.GPU-decorated; confirm
    # that calling it from inside another GPU-decorated handler is supported.
    if llama3_pipe is None:
        llama3_pipe = initialize_model()

    sampling_params = llama3_pipe.get_default_sampling_params()
    sampling_params.max_tokens = 512

    input_message_history = [
        {"role": "system", "content": system_prompt},
        *message_history,
    ]
    completions = llama3_pipe.chat(input_message_history, sampling_params)
    return completions[0].outputs[0].text
@dataclass
class AppState:
    """Per-session state carried through the Gradio event handlers."""

    # Chat messages in order, each a {"role": ..., "content": ...} dict.
    conversation: list = field(default_factory=list)
    # Defaults to False; no visible code in this file reads or sets it.
    stopped: bool = False
    # Free-form slot, None by default. Annotated as ``object`` because the
    # builtin ``any`` is a function, not a type.
    model_outs: object = None
def process_audio(audio: tuple, state: AppState):
    """Hand the recording and the session state back unchanged."""
    unchanged = (audio, state)
    return unchanged
@spaces.GPU
def response(state: AppState, audio: tuple, system_prompt):
    """Handle one finished recording: transcribe it, get a reply, and speak it.

    Args:
        state: Session state; its ``conversation`` list is extended in place
            with the user turn and the assistant turn.
        audio: ``(sample_rate, samples)`` recording, or a falsy value when
            nothing was recorded.
        system_prompt: System prompt forwarded to ``llama_QA``.

    Returns:
        ``(state, state.conversation, response_audio)``. ``response_audio``
        is the ``(sample_rate, samples)`` tuple from ``generate_tts``, or
        ``None`` when there is nothing to play.
    """
    if not audio:
        return state, state.conversation, None

    # Transcribe the audio file
    transcription = transcribe(audio)
    if not transcription or not transcription.strip():
        # Nothing intelligible was heard: leave the conversation untouched
        # and still return all three outputs.
        return state, state.conversation, None
    transcription = transcription.strip()

    # Append the user's message in the proper format
    state.conversation.append({"role": "user", "content": transcription})

    # Generate assistant response
    assistant_message = llama_QA(state.conversation, system_prompt)

    # Append the assistant's message in the proper format
    state.conversation.append({"role": "assistant", "content": assistant_message})

    # Generate TTS audio
    response_audio = generate_tts(assistant_message)
    if len(response_audio[1]) == 0:
        # An empty waveform has nothing to play; clear the output instead.
        response_audio = None

    print(state.conversation)
    return state, state.conversation, response_audio
def start_recording_user(state: AppState):
    """Return ``None`` regardless of ``state``.

    create_demo() wires this as the follow-up to a reply, with the input
    audio component as its only output.
    """
    # The state argument is accepted but deliberately not consulted.
    return None
# Front-end script passed to gr.Blocks(js=...) in create_demo().
# It loads onnxruntime-web, then the vad-web bundle, from jsDelivr. Once the
# bundle is loaded it relabels the '.record-button' element and starts a
# MicVAD instance: on speech start it clicks the record button unless the
# '#streaming-out' audio player is still playing; on speech end it clicks
# the '.stop-button' element if one exists.
js = """
async function main() {
const script1 = document.createElement("script");
script1.src = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.14.0/dist/ort.js";
document.head.appendChild(script1)
const script2 = document.createElement("script");
script2.onload = async () => {
console.log("vad loaded") ;
var record = document.querySelector('.record-button');
record.textContent = "Just Start Talking!"
record.style = "width: fit-content; padding-right: 0.5vw;"
const myvad = await vad.MicVAD.new({
onSpeechStart: () => {
var record = document.querySelector('.record-button');
var player = document.querySelector('#streaming-out audio');
if (record != null && (player == null || player.paused || player.ended)) {
console.log("Starting recording", record);
record.click();
} else {
console.log("Audio still playing, not starting recording");
}
},
onSpeechEnd: (audio) => {
var stop = document.querySelector('.stop-button');
if (stop != null) {
console.log("Stopping recording", stop);
stop.click();
}
}
})
myvad.start()
}
script2.src = "https://cdn.jsdelivr.net/npm/@ricky0123/vad-web@0.0.7/dist/bundle.min.js";
script1.onload = () => {
console.log("onnx loaded")
document.head.appendChild(script2)
};
}
"""
# Front-end snippet attached (via js=...) to the last step of the response
# chain in create_demo(); it restores the label and inline style of the
# '.record-button' element.
js_reset = """
() => {
var record = document.querySelector('.record-button');
record.textContent = "Just Start Talking!"
record.style = "width: fit-content; padding-right: 0.5vw;"
}
"""
def create_demo():
    """Assemble the voice-chat Blocks UI, wire its events, and return the app."""
    with gr.Blocks(js=js) as demo:
        # --- layout -------------------------------------------------------
        with gr.Row():
            system_prompt = gr.Textbox(value=default_sys_prompt, interactive=True)
        with gr.Row():
            input_audio = gr.Audio(
                label="Input Audio",
                sources=["microphone"],
                type="numpy",
                streaming=False,
            )
        with gr.Row():
            chatbot = gr.Chatbot(label="Conversation", type="messages")
        with gr.Row():
            output_audio = gr.Audio(
                label="Assistant Audio",
                interactive=False,
                autoplay=True,
                elem_id="streaming-out",
            )
        state = gr.State(value=AppState())

        # --- events -------------------------------------------------------
        # Recording started: pass the audio and state straight through.
        input_audio.start_recording(
            process_audio,
            inputs=[input_audio, state],
            outputs=[input_audio, state],
        )
        # Recording stopped: transcribe, answer, and speak.
        respond = input_audio.stop_recording(
            response,
            inputs=[state, input_audio, system_prompt],
            outputs=[state, chatbot, output_audio],
        )
        # After the reply: reset the input audio, then run the button-reset JS.
        input_cleared = respond.then(
            start_recording_user,
            inputs=[state],
            outputs=[input_audio],
        )
        restart = input_cleared.then(
            lambda state: state,
            inputs=state,
            outputs=state,
            js=js_reset,
        )

        # "New Conversation" swaps in a fresh state and cancels pending work.
        cancel = gr.Button("New Conversation", variant="stop")
        cancel.click(
            lambda: (AppState(), gr.Audio(recording=False)),
            inputs=None,
            outputs=[state, input_audio],
            cancels=[respond, restart],
        )
    return demo
# Build the interface at import time and start serving it.
demo = create_demo()
demo.launch()