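"""sambot: an audio chatbot served as a Gradio app.

Pipeline: Whisper-small for speech-to-text, Gemma 2 9B via the Hugging Face
Inference API for the chat turns, and Kokoro-82M for text-to-speech, with
voice-activity detection handled on the frontend by vad-web.
"""
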
import os
import logging

import numpy as np
import spaces
import librosa
import gradio as gr

from dataclasses import dataclass, field
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from huggingface_hub import InferenceClient
from kokoro import KPipeline

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
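
# The three pipeline stages below (STT, LLM client, TTS) are created once at
# import time, so request handlers only run inference.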

# Speech-to-text: Whisper-small, with the decoder forced to French transcription.
modelcard = "openai/whisper-small"
processor = WhisperProcessor.from_pretrained(modelcard)
model = WhisperForConditionalGeneration.from_pretrained(modelcard)
forced_decoder_ids = processor.get_decoder_prompt_ids(language="french", task="transcribe")

# LLM: Gemma 2 9B Instruct via the Hugging Face Inference API (Groq provider).
# HF_API_KEY must be set in the environment (e.g. as a Space secret).
hf = InferenceClient(
    model="google/gemma-2-9b-it",
    provider="groq",
    api_key=os.environ.get("HF_API_KEY"),
)

# Text-to-speech: Kokoro-82M with a French voice.
tts_pipeline = KPipeline(
    repo_id="hexgrad/Kokoro-82M",
    lang_code="f",
)
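
# The system prompt framing every conversation is read from a file shipped
# alongside the app.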
with open("system_prompt.txt", "r", encoding="utf-8") as f:
    SYSTEM_PROMPT = f.read().strip()

# Frontend voice-activity detection: load onnxruntime-web, then vad-web, from a
# CDN, and auto-click Gradio's record/stop buttons when speech starts and ends.
js = """
async function main() {
  const script1 = document.createElement("script");
  script1.src = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.14.0/dist/ort.js";
  document.head.appendChild(script1);
  const script2 = document.createElement("script");
  script2.onload = async () => {
    console.log("vad loaded");
    var record = document.querySelector('.record-button');
    record.textContent = "Just Start Talking!";
    record.style = "width: fit-content; padding-right: 0.5vw;";
    const myvad = await vad.MicVAD.new({
      model: "v5",
      positiveSpeechThreshold: 0.3,
      negativeSpeechThreshold: 0.3,
      minSpeechFrames: 10,
      preSpeechPadFrames: 150,
      onSpeechStart: () => {
        console.log("Speech start detected");
        var record = document.querySelector('.record-button');
        var play_button = document.getElementById("streaming_out").querySelector(".play-pause-button");
        var playing = play_button && (play_button.ariaLabel === "Pause");
        // Only start recording when the bot is not currently speaking.
        if (record != null && !playing) {
          console.log(record);
          record.click();
        }
      },
      onSpeechEnd: (audio) => {
        console.log("Speech end detected");
        var stop = document.querySelector('.stop-button');
        if (stop != null) {
          console.log(stop);
          stop.click();
        }
      }
    });
    myvad.start();
  };
  script2.src = "https://cdn.jsdelivr.net/npm/@ricky0123/vad-web@0.0.22/dist/bundle.min.js";
  script1.onload = () => {
    console.log("onnx loaded");
    document.head.appendChild(script2);
  };
}
"""

# After each turn, restore the record button's label and width.
js_reset = """
() => {
  var record = document.querySelector('.record-button');
  record.textContent = "Just Start Talking!";
  record.style = "width: fit-content; padding-right: 0.5vw;";
}
"""


@spaces.GPU
def transcribe(audio_path):
    """
    Transcribe an audio file to text using the Whisper model.

    Args:
        audio_path (str): Path to the audio file.

    Returns:
        str: Transcribed text.
    """
    logging.info(f"audio path: {audio_path}")

    # Whisper expects 16 kHz mono input.
    audio_array, _ = librosa.load(audio_path, sr=16000, mono=True)

    input_features = processor(audio_array, sampling_rate=16000, return_tensors="pt").input_features

    predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)

    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    logging.info(f"transcription: {transcription[0]}")

    return transcription[0]
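
# Note: newer transformers releases deprecate forced_decoder_ids for Whisper in
# favor of model.generate(..., language="french", task="transcribe").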


def chat_with_llm(query, history):
    """
    Query the LLM with the conversation history.

    Args:
        query (str): The user's latest query (already part of history; logged here).
        history (list): Conversation history as a list of messages.

    Returns:
        str: The LLM's response.
    """
    # `history` already ends with the latest user message, so it is passed
    # unchanged after the system prompt.
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        *history,
    ]

    logging.info(f"user queried: {query}")

    answer = hf.chat_completion(messages=messages, max_tokens=512).choices[0].message.content

    logging.info(f"bot answered: {answer}")

    return answer


@spaces.GPU
def synthesize(text, voice="ff_siwis"):
    """
    Synthesize text to speech using the Kokoro TTS pipeline.

    Args:
        text (str): Text to synthesize.
        voice (str): Voice to use for synthesis.

    Returns:
        tuple: Sampling rate and audio data as a numpy array.
    """
    # Kokoro yields (graphemes, phonemes, audio) chunks; take the first one.
    gen = tts_pipeline(text, voice=voice)
    _, _, audio = next(gen)

    # Gradio expects a numpy array, so convert torch tensors and other
    # array-likes.
    if hasattr(audio, "detach"):
        audio = audio.detach().cpu().numpy()
    elif not isinstance(audio, np.ndarray):
        audio = np.array(audio)

    logging.info("voice synthesis ready")

    # Kokoro outputs audio at 24 kHz.
    return (24000, audio)


@dataclass
class AppState:
    conversation: list = field(default_factory=list)

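# gr.State keeps a separate AppState per browser session, so each visitor has
# an isolated conversation history.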
with gr.Blocks(js=js) as demo:

    state = gr.State(value=AppState())

    gr.Markdown(
        """# sambot 🤖

Running an audio chatbot on a consumer GPU.

The chatbot is based on a three-step pipeline:

* STT using the [Whisper-small](https://huggingface.co/openai/whisper-small) model
* LLM interaction through the [HuggingFace Inference API](https://huggingface.co/docs/inference-providers/providers/hf-inference)
* TTS using [Kokoro](https://huggingface.co/hexgrad/Kokoro-82M)

The UI is built with Gradio, with automatic VAD handled on the frontend using [vad-web](https://github.com/ricky0123/vad)."""
    )

    # Microphone input; recording is toggled by the frontend VAD script.
    input_audio = gr.Audio(
        sources=["microphone"],
        label="Speak",
        type="filepath",
        waveform_options=gr.WaveformOptions(waveform_color="#DB7FBF"),
    )
    # Conversation transcript, kept hidden (it only backs the message state).
    chatbot = gr.Chatbot(
        label="Conversation",
        type="messages",
        visible=False,
    )
    # Autoplaying TTS answer; the VAD script checks its play button to avoid
    # recording while the bot is speaking.
    output_audio = gr.Audio(
        label="TTS Response",
        autoplay=True,
        visible=True,
        elem_id="streaming_out",
    )

    def run_step(state: AppState, audio_path):
        """
        Process a single conversation turn.

        Args:
            state (AppState): Current application state.
            audio_path (str): Path to the recorded audio file.

        Yields:
            AppState: Updated application state.
            list: Conversation history.
            tuple: Audio tuple for the TTS response.
        """
        # Ignore empty recordings.
        if not audio_path:
            yield AppState(), [], None
            return

        # Step 1: transcribe the user's speech and surface it right away.
        user_text = transcribe(audio_path)
        state.conversation.append({"role": "user", "content": user_text})

        yield state, state.conversation, None

        # Steps 2 and 3: query the LLM, then synthesize its answer.
        bot_text = chat_with_llm(user_text, state.conversation)
        state.conversation.append({"role": "assistant", "content": bot_text})
        audio_tuple = synthesize(bot_text)

        yield state, state.conversation, audio_tuple

    # The VAD script clicks record/stop; pass the recording and state through.
    stream = input_audio.start_recording(
        lambda audio, state: (audio, state),
        [input_audio, state],
        [input_audio, state],
    )
    # When recording stops, run the STT -> LLM -> TTS turn.
    respond = input_audio.stop_recording(
        run_step,
        [state, input_audio],
        [state, chatbot, output_audio],
    )
    # Clear the input and restore the record button label for the next turn.
    restart = respond.then(
        lambda state: None, [state], [input_audio]
    ).then(
        lambda state: state, state, state, js=js_reset
    )

    # Reset the conversation and abort any in-flight turn.
    cancel = gr.Button("Restart Conversation", variant="stop")
    cancel.click(
        lambda: (AppState(), gr.Audio(recording=False)),
        None,
        [state, input_audio],
        cancels=[respond, restart],
    )


if __name__ == "__main__":
    demo.launch()