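"""sambot: an audio chatbot served as a Gradio app.

Pipeline: Whisper-small for speech-to-text, Gemma 2 9B via the Hugging Face
Inference API for the chat turns, and Kokoro-82M for text-to-speech, with
voice-activity detection handled on the frontend by vad-web.
"""
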
import os
import logging

import numpy as np
import spaces
import librosa
import gradio as gr

from dataclasses import dataclass, field
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from huggingface_hub import InferenceClient
from kokoro import KPipeline

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
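
# The three pipeline stages below (STT, LLM client, TTS) are created once at
# import time, so request handlers only run inference.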

# Speech-to-text: Whisper-small, with the decoder forced to French transcription.
modelcard = "openai/whisper-small"
processor = WhisperProcessor.from_pretrained(modelcard)
model = WhisperForConditionalGeneration.from_pretrained(modelcard)
forced_decoder_ids = processor.get_decoder_prompt_ids(language="french", task="transcribe")

# LLM: Gemma 2 9B Instruct via the Hugging Face Inference API (Groq provider).
# HF_API_KEY must be set in the environment (e.g. as a Space secret).
hf = InferenceClient(
    model="google/gemma-2-9b-it",
    provider="groq",
    api_key=os.environ.get("HF_API_KEY"),
)

# Text-to-speech: Kokoro-82M with a French voice.
tts_pipeline = KPipeline(
    repo_id="hexgrad/Kokoro-82M",
    lang_code="f",
)
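
# The system prompt framing every conversation is read from a file shipped
# alongside the app.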
with open("system_prompt.txt", "r", encoding="utf-8") as f:
    SYSTEM_PROMPT = f.read().strip()

# Frontend voice-activity detection: load onnxruntime-web, then vad-web, from a
# CDN, and auto-click Gradio's record/stop buttons when speech starts and ends.
js = """
async function main() {
  const script1 = document.createElement("script");
  script1.src = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.14.0/dist/ort.js";
  document.head.appendChild(script1);
  const script2 = document.createElement("script");
  script2.onload = async () => {
    console.log("vad loaded");
    var record = document.querySelector('.record-button');
    record.textContent = "Just Start Talking!";
    record.style = "width: fit-content; padding-right: 0.5vw;";
    const myvad = await vad.MicVAD.new({
      model: "v5",
      positiveSpeechThreshold: 0.3,
      negativeSpeechThreshold: 0.3,
      minSpeechFrames: 10,
      preSpeechPadFrames: 150,
      onSpeechStart: () => {
        console.log("Speech start detected");
        var record = document.querySelector('.record-button');
        var play_button = document.getElementById("streaming_out").querySelector(".play-pause-button");
        var playing = play_button && (play_button.ariaLabel === "Pause");
        // Only start recording when the bot is not currently speaking.
        if (record != null && !playing) {
          console.log(record);
          record.click();
        }
      },
      onSpeechEnd: (audio) => {
        console.log("Speech end detected");
        var stop = document.querySelector('.stop-button');
        if (stop != null) {
          console.log(stop);
          stop.click();
        }
      }
    });
    myvad.start();
  };
  script2.src = "https://cdn.jsdelivr.net/npm/@ricky0123/vad-web@0.0.22/dist/bundle.min.js";
  script1.onload = () => {
    console.log("onnx loaded");
    document.head.appendChild(script2);
  };
}
"""

# After each turn, restore the record button's label and width.
js_reset = """
() => {
  var record = document.querySelector('.record-button');
  record.textContent = "Just Start Talking!";
  record.style = "width: fit-content; padding-right: 0.5vw;";
}
"""


@spaces.GPU
def transcribe(audio_path):
    """
    Transcribe an audio file to text using the Whisper model.

    Args:
        audio_path (str): Path to the audio file.

    Returns:
        str: Transcribed text.
    """
    logging.info(f"audio path: {audio_path}")

    # Whisper expects 16 kHz mono input.
    audio_array, _ = librosa.load(audio_path, sr=16000, mono=True)

    input_features = processor(audio_array, sampling_rate=16000, return_tensors="pt").input_features

    predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)

    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    logging.info(f"transcription: {transcription[0]}")

    return transcription[0]
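
# Note: newer transformers releases deprecate forced_decoder_ids for Whisper in
# favor of model.generate(..., language="french", task="transcribe").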


def chat_with_llm(query, history):
    """
    Query the LLM with the conversation history.

    Args:
        query (str): The user's latest query (already part of history; logged here).
        history (list): Conversation history as a list of messages.

    Returns:
        str: The LLM's response.
    """
    # `history` already ends with the latest user message, so it is passed
    # unchanged after the system prompt.
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        *history,
    ]

    logging.info(f"user queried: {query}")

    answer = hf.chat_completion(messages=messages, max_tokens=512).choices[0].message.content

    logging.info(f"bot answered: {answer}")

    return answer


@spaces.GPU
def synthesize(text, voice="ff_siwis"):
    """
    Synthesize text to speech using the Kokoro TTS pipeline.

    Args:
        text (str): Text to synthesize.
        voice (str): Voice to use for synthesis.

    Returns:
        tuple: Sampling rate and audio data as a numpy array.
    """
    # Kokoro yields (graphemes, phonemes, audio) chunks; take the first one.
    gen = tts_pipeline(text, voice=voice)
    _, _, audio = next(gen)

    # Gradio expects a numpy array, so convert torch tensors and other
    # array-likes.
    if hasattr(audio, "detach"):
        audio = audio.detach().cpu().numpy()
    elif not isinstance(audio, np.ndarray):
        audio = np.array(audio)

    logging.info("voice synthesis ready")

    # Kokoro outputs audio at 24 kHz.
    return (24000, audio)


@dataclass
class AppState:
    conversation: list = field(default_factory=list)

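# gr.State keeps a separate AppState per browser session, so each visitor has
# an isolated conversation history.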
with gr.Blocks(js=js) as demo:

    state = gr.State(value=AppState())

    gr.Markdown(
        """# sambot 🤖

Running an audio chatbot on a consumer GPU.

The chatbot is based on a three-step pipeline:

* STT using the [Whisper-small](https://huggingface.co/openai/whisper-small) model
* LLM interaction through the [HuggingFace Inference API](https://huggingface.co/docs/inference-providers/providers/hf-inference)
* TTS using [Kokoro](https://huggingface.co/hexgrad/Kokoro-82M)

The UI is built with Gradio, with automatic VAD handled on the frontend using [vad-web](https://github.com/ricky0123/vad)."""
    )

    # Microphone input; recording is toggled by the frontend VAD script.
    input_audio = gr.Audio(
        sources=["microphone"],
        label="Speak",
        type="filepath",
        waveform_options=gr.WaveformOptions(waveform_color="#DB7FBF"),
    )
    # Conversation transcript, kept hidden (it only backs the message state).
    chatbot = gr.Chatbot(
        label="Conversation",
        type="messages",
        visible=False,
    )
    # Autoplaying TTS answer; the VAD script checks its play button to avoid
    # recording while the bot is speaking.
    output_audio = gr.Audio(
        label="TTS Response",
        autoplay=True,
        visible=True,
        elem_id="streaming_out",
    )

    def run_step(state: AppState, audio_path):
        """
        Process a single conversation turn.

        Args:
            state (AppState): Current application state.
            audio_path (str): Path to the recorded audio file.

        Yields:
            AppState: Updated application state.
            list: Conversation history.
            tuple: Audio tuple for the TTS response.
        """
        # Ignore empty recordings.
        if not audio_path:
            yield AppState(), [], None
            return

        # Step 1: transcribe the user's speech and surface it right away.
        user_text = transcribe(audio_path)
        state.conversation.append({"role": "user", "content": user_text})

        yield state, state.conversation, None

        # Steps 2 and 3: query the LLM, then synthesize its answer.
        bot_text = chat_with_llm(user_text, state.conversation)
        state.conversation.append({"role": "assistant", "content": bot_text})
        audio_tuple = synthesize(bot_text)

        yield state, state.conversation, audio_tuple

    # The VAD script clicks record/stop; pass the recording and state through.
    stream = input_audio.start_recording(
        lambda audio, state: (audio, state),
        [input_audio, state],
        [input_audio, state],
    )
    # When recording stops, run the STT -> LLM -> TTS turn.
    respond = input_audio.stop_recording(
        run_step,
        [state, input_audio],
        [state, chatbot, output_audio],
    )
    # Clear the input and restore the record button label for the next turn.
    restart = respond.then(
        lambda state: None, [state], [input_audio]
    ).then(
        lambda state: state, state, state, js=js_reset
    )

    # Reset the conversation and abort any in-flight turn.
    cancel = gr.Button("Restart Conversation", variant="stop")
    cancel.click(
        lambda: (AppState(), gr.Audio(recording=False)),
        None,
        [state, input_audio],
        cancels=[respond, restart],
    )


if __name__ == "__main__":
    demo.launch()