Spaces:

kaiozwald
/

ReachyMiniOpenModel

Build error

Abduallah Abuhassan

Add application file

3b627eb 2 days ago

18.1 kB

	"""Reachy Mini Open Conversation — Hugging Face Spaces App.

	Standalone conversation app using open-source models:
	Audio In → faster-whisper (STT) → Ollama (LLM) → edge-tts (TTS) → Audio Out

	No robot hardware dependencies — runs entirely in the browser via Gradio + FastRTC.
	"""

	import os
	import json
	import asyncio
	import logging
	from typing import Any, Final, Tuple
	from datetime import datetime

	import numpy as np
	import gradio as gr
	import edge_tts
	import miniaudio
	from ollama import AsyncClient as OllamaAsyncClient
	from fastrtc import AdditionalOutputs, AsyncStreamHandler, Stream, wait_for_item, audio_to_int16
	from numpy.typing import NDArray
	from scipy.signal import resample


	# ---------------------------------------------------------------------------
	# Logging
	# ---------------------------------------------------------------------------
	# Root logging configuration for the whole app. The separator before the
	# message is a plain pipe; the previous "\|" was an invalid escape sequence
	# that printed a stray backslash and warns on modern Python.
	logging.basicConfig(
	    level=logging.INFO,
	    format="%(asctime)s %(levelname)s %(name)s:%(lineno)d | %(message)s",
	)
	logger = logging.getLogger("reachy-mini-open")

	# Tame noisy libraries: only warnings and above from these loggers.
	for lib in ("aiortc", "aioice", "httpx", "websockets"):
	    logging.getLogger(lib).setLevel(logging.WARNING)

	# ---------------------------------------------------------------------------
	# Configuration (env vars — set as HF Space secrets)
	# ---------------------------------------------------------------------------
	# Each setting is read from the environment once at import time and falls
	# back to the literal default when the variable is not set.
	OLLAMA_BASE_URL = os.environ.get("OLLAMA_BASE_URL", "http://localhost:11434")
	MODEL_NAME = os.environ.get("MODEL_NAME", "llama3.2")
	STT_MODEL = os.environ.get("STT_MODEL", "base")
	TTS_VOICE = os.environ.get("TTS_VOICE", "en-US-AriaNeural")

	# ---------------------------------------------------------------------------
	# Audio constants
	# ---------------------------------------------------------------------------
	# Sample rates in Hz: the rate the stream handler sends/receives at, and
	# the rate audio is resampled to before being passed to the STT model.
	HANDLER_SAMPLE_RATE: Final[int] = 24_000
	WHISPER_SAMPLE_RATE: Final[int] = 16_000

	# Energy-based VAD tuning: RMS level (int16 scale) above which a frame
	# counts as speech, trailing silence (seconds) that ends an utterance, and
	# the minimum amount of speech (seconds) worth transcribing.
	SILENCE_RMS_THRESHOLD: Final[float] = 500.0
	SILENCE_DURATION_S: Final[float] = 0.8
	MIN_SPEECH_DURATION_S: Final[float] = 0.3

	# ---------------------------------------------------------------------------
	# System prompts
	# ---------------------------------------------------------------------------
	# System prompt for the default "Reachy Mini" persona. It seeds the
	# conversation history at start-up and is the fallback when an unknown
	# personality name is requested. The trailing backslash after the opening
	# quotes keeps the string from starting with a newline.
	DEFAULT_PROMPT = """\
	## IDENTITY
	You are Reachy Mini: a friendly, compact robot assistant with a calm voice and a subtle sense of humor.
	Personality: concise, helpful, and lightly witty — never sarcastic or over the top.
	You speak English by default and switch languages only if explicitly told.

	## CRITICAL RESPONSE RULES
	Respond in 1–2 sentences maximum.
	Be helpful first, then add a small touch of humor if it fits naturally.
	Avoid long explanations or filler words.
	Keep responses under 25 words when possible.

	## CORE TRAITS
	Warm, efficient, and approachable.
	Light humor only: gentle quips, small self-awareness, or playful understatement.
	No sarcasm, no teasing.
	If unsure, admit it briefly and offer help ("Not sure yet, but I can check!").

	## BEHAVIOR RULES
	Be helpful, clear, and respectful in every reply.
	Use humor sparingly — clarity comes first.
	Admit mistakes briefly and correct them.
	"""

	# Short system prompts for the alternative personas. Each one is assembled
	# from two adjacent literals; the first ends with a space so the sentences
	# do not run together.
	_FRIENDLY_PROMPT = (
	    "You are a warm, helpful assistant. Keep answers concise (1-2 sentences). "
	    "Be friendly and approachable."
	)
	_TECHNICAL_PROMPT = (
	    "You are a precise technical expert. Give clear, accurate answers in 1-2 sentences. "
	    "Use technical terms when appropriate but explain simply."
	)
	_STORYTELLER_PROMPT = (
	    "You are a creative storyteller. Keep responses short but vivid and imaginative. "
	    "Add a touch of wonder to your replies."
	)

	# Display name -> system prompt. Insertion order is the order in which the
	# names are offered in the personality dropdown.
	PERSONALITIES = {
	    "Default (Reachy Mini)": DEFAULT_PROMPT,
	    "Friendly Assistant": _FRIENDLY_PROMPT,
	    "Technical Expert": _TECHNICAL_PROMPT,
	    "Creative Storyteller": _STORYTELLER_PROMPT,
	}

	# ---------------------------------------------------------------------------
	# Available TTS voices
	# ---------------------------------------------------------------------------
	# Voice identifiers offered in the UI, grouped by locale. The order here is
	# the order shown in the voice dropdown.
	TTS_VOICES = [
	    # English (United States)
	    "en-US-AriaNeural", "en-US-GuyNeural",
	    "en-US-JennyNeural", "en-US-ChristopherNeural",
	    # English (United Kingdom)
	    "en-GB-SoniaNeural", "en-GB-RyanNeural",
	    # German
	    "de-DE-ConradNeural", "de-DE-KatjaNeural",
	    # French
	    "fr-FR-DeniseNeural", "fr-FR-HenriNeural",
	    # Italian
	    "it-IT-ElsaNeural", "it-IT-DiegoNeural",
	]


	# ---------------------------------------------------------------------------
	# Conversation Handler
	# ---------------------------------------------------------------------------
	class ConversationHandler(AsyncStreamHandler):
	    """Audio streaming handler: STT → Ollama LLM → edge-tts TTS.

	    Microphone frames arrive through ``receive``, where an energy-based VAD
	    buffers them until an utterance ends. Each finished utterance is handled
	    in a background task (transcribe → chat → synthesize) whose results are
	    pushed onto ``output_queue`` and handed out one at a time by ``emit``.
	    """

	    def __init__(self) -> None:
	        """Initialize the handler with empty buffers and no clients."""
	        super().__init__(
	            expected_layout="mono",
	            output_sample_rate=HANDLER_SAMPLE_RATE,
	            input_sample_rate=HANDLER_SAMPLE_RATE,
	        )

	        # Output queue: audio frames as (sample_rate, samples) tuples, or
	        # AdditionalOutputs carrying chat messages for the UI.
	        self.output_queue: asyncio.Queue[
	            Tuple[int, NDArray[np.int16]] | AdditionalOutputs
	        ] = asyncio.Queue()

	        # Clients (initialized in start_up)
	        self.ollama_client: OllamaAsyncClient | None = None
	        self.whisper_model: Any = None

	        # Conversation history; the system prompt is remembered separately
	        # so that copy() and start_up() can re-seed the history with it.
	        self._system_prompt: str = DEFAULT_PROMPT
	        self._messages: list[dict[str, Any]] = []

	        # Audio buffering for VAD
	        self._audio_buffer: list[NDArray[np.int16]] = []
	        self._is_speaking: bool = False
	        self._silence_frame_count: int = 0
	        self._speech_frame_count: int = 0

	        # TTS voice
	        self._tts_voice: str = TTS_VOICE

	        # Strong references to in-flight pipeline tasks. The event loop only
	        # keeps weak references, so an unreferenced task may be collected
	        # before it finishes.
	        self._tasks: set[asyncio.Task[None]] = set()

	        # Lifecycle
	        self._shutdown_requested: bool = False

	    def copy(self) -> "ConversationHandler":
	        """Create a fresh handler that inherits this one's voice and persona.

	        NOTE(review): FastRTC is understood to call ``copy`` once per new
	        connection — confirm against the FastRTC version in use. Carrying the
	        settings over lets choices made on the template handler reach
	        connections opened afterwards.
	        """
	        clone = ConversationHandler()
	        clone._tts_voice = self._tts_voice
	        clone._system_prompt = self._system_prompt
	        return clone

	    # ------------------------------------------------------------------ #
	    # Startup
	    # ------------------------------------------------------------------ #

	    async def start_up(self) -> None:
	        """Initialize the Ollama client and STT model, then idle until shutdown.

	        Failures to reach Ollama or to load the STT model are logged rather
	        than raised; ``receive`` ignores audio while the STT model is missing.
	        """
	        # 1. Ollama client
	        self.ollama_client = OllamaAsyncClient(host=OLLAMA_BASE_URL)
	        try:
	            await self.ollama_client.list()
	            logger.info("Connected to Ollama at %s", OLLAMA_BASE_URL)
	        except Exception as e:
	            logger.error("Cannot reach Ollama at %s: %s", OLLAMA_BASE_URL, e)
	            logger.warning("Proceeding — requests will fail until Ollama is available.")

	        # 2. faster-whisper STT (imported lazily so the import error, if any,
	        #    is caught and logged here)
	        try:
	            from faster_whisper import WhisperModel

	            self.whisper_model = WhisperModel(
	                STT_MODEL,
	                device="auto",
	                compute_type="int8",
	            )
	            logger.info("Loaded faster-whisper model: %s", STT_MODEL)
	        except Exception as e:
	            logger.error("Failed to load STT model '%s': %s", STT_MODEL, e)

	        # 3. System prompt
	        self._messages = [{"role": "system", "content": self._system_prompt}]

	        logger.info(
	            "Handler ready — model=%s stt=%s tts_voice=%s",
	            MODEL_NAME,
	            STT_MODEL,
	            self._tts_voice,
	        )

	        # Keep alive
	        while not self._shutdown_requested:
	            await asyncio.sleep(0.1)

	    # ------------------------------------------------------------------ #
	    # Audio receive → VAD → STT → LLM → TTS
	    # ------------------------------------------------------------------ #

	    @staticmethod
	    def _to_mono_int16(audio_frame: NDArray[Any], input_sample_rate: int) -> NDArray[np.int16]:
	        """Return ``audio_frame`` as 1-D int16 samples at HANDLER_SAMPLE_RATE."""
	        if audio_frame.size == 0:
	            return np.zeros(0, dtype=np.int16)

	        if audio_frame.ndim == 2:
	            # Orient as (samples, channels), then always keep channel 0 so a
	            # mono frame shaped (1, N) also ends up 1-D.
	            if audio_frame.shape[1] > audio_frame.shape[0]:
	                audio_frame = audio_frame.T
	            audio_frame = audio_frame[:, 0]

	        # Convert before resampling: resample() returns floats, and handing
	        # int16-range floats to audio_to_int16 afterwards would not give
	        # valid samples (presumably it expects floats in [-1, 1] — confirm).
	        audio_frame = audio_to_int16(audio_frame)

	        if input_sample_rate != HANDLER_SAMPLE_RATE:
	            target_len = int(len(audio_frame) * HANDLER_SAMPLE_RATE / input_sample_rate)
	            if target_len <= 0:
	                return np.zeros(0, dtype=np.int16)
	            resampled = resample(audio_frame.astype(np.float32), target_len)
	            # FFT resampling can overshoot; clip before the int16 cast so
	            # peaks do not wrap around.
	            audio_frame = np.clip(np.rint(resampled), -32768, 32767).astype(np.int16)

	        return audio_frame

	    def _reset_vad(self) -> None:
	        """Drop buffered audio and return the VAD to its idle state."""
	        self._audio_buffer = []
	        self._is_speaking = False
	        self._silence_frame_count = 0
	        self._speech_frame_count = 0

	    async def receive(self, frame: Tuple[int, NDArray[np.int16]]) -> None:
	        """Receive audio from mic, run VAD, kick off pipeline on speech end.

	        Args:
	            frame: ``(sample_rate, samples)`` as delivered by the stream.
	        """
	        if self._shutdown_requested or self.whisper_model is None:
	            return

	        input_sample_rate, audio_frame = frame
	        audio_frame = self._to_mono_int16(audio_frame, input_sample_rate)
	        if audio_frame.size == 0:
	            return

	        # Energy-based VAD
	        rms = float(np.sqrt(np.mean(audio_frame.astype(np.float32) ** 2)))
	        frame_duration = len(audio_frame) / HANDLER_SAMPLE_RATE

	        if rms > SILENCE_RMS_THRESHOLD:
	            if not self._is_speaking:
	                self._is_speaking = True
	                self._speech_frame_count = 0
	                logger.debug("Speech started (RMS=%.0f)", rms)
	            self._silence_frame_count = 0
	            self._speech_frame_count += 1
	            self._audio_buffer.append(audio_frame)
	            return

	        if not self._is_speaking:
	            return

	        # Quiet frame inside an utterance: keep it so pauses are preserved.
	        self._silence_frame_count += 1
	        self._audio_buffer.append(audio_frame)

	        # Durations are frame counts times the current frame's length, which
	        # assumes frames of roughly constant size.
	        silence_duration = self._silence_frame_count * frame_duration
	        if silence_duration < SILENCE_DURATION_S:
	            return

	        speech_duration = self._speech_frame_count * frame_duration
	        if speech_duration < MIN_SPEECH_DURATION_S:
	            # Too short to be real speech; discard it.
	            self._reset_vad()
	            return

	        logger.debug("Speech ended (%.1fs)", speech_duration)
	        full_audio = np.concatenate(self._audio_buffer)
	        self._reset_vad()

	        task = asyncio.create_task(self._process_speech(full_audio))
	        self._tasks.add(task)
	        task.add_done_callback(self._tasks.discard)

	    # ------------------------------------------------------------------ #
	    # Speech processing pipeline
	    # ------------------------------------------------------------------ #

	    async def _process_speech(self, audio_data: NDArray[np.int16]) -> None:
	        """Full pipeline: STT → LLM → TTS.

	        Any exception is logged and reported to the chat as an ``[error]``
	        assistant message instead of propagating.
	        """
	        try:
	            # 1. Speech-to-text
	            text = await self._transcribe(audio_data)
	            if not text:
	                return

	            logger.info("User: %s", text)
	            await self.output_queue.put(AdditionalOutputs({"role": "user", "content": text}))

	            # 2. LLM response
	            self._messages.append({"role": "user", "content": text})
	            response_text = await self._chat()

	            if response_text:
	                logger.info("Assistant: %s", response_text)
	                await self.output_queue.put(
	                    AdditionalOutputs({"role": "assistant", "content": response_text})
	                )

	                # 3. Text-to-speech
	                await self._synthesize_speech(response_text)

	        except Exception as e:
	            logger.exception("Speech processing error: %s", e)
	            await self.output_queue.put(
	                AdditionalOutputs({"role": "assistant", "content": f"[error] {e}"})
	            )

	    async def _transcribe(self, audio_data: NDArray[np.int16]) -> str:
	        """Run faster-whisper STT on raw PCM audio and return the text."""
	        float_audio = audio_data.astype(np.float32) / 32768.0
	        target_len = int(len(float_audio) * WHISPER_SAMPLE_RATE / HANDLER_SAMPLE_RATE)
	        if target_len <= 0:
	            return ""
	        whisper_audio = resample(float_audio, target_len).astype(np.float32)

	        def _run_stt() -> str:
	            # transcribe() returns a lazy segment generator, so the decoding
	            # happens while iterating; consume it here, inside the worker
	            # thread, so the event loop is not blocked.
	            segments, _info = self.whisper_model.transcribe(whisper_audio, beam_size=5)
	            return " ".join(seg.text for seg in segments).strip()

	        loop = asyncio.get_running_loop()
	        return await loop.run_in_executor(None, _run_stt)

	    async def _chat(self) -> str:
	        """Send conversation to Ollama and return response text.

	        On failure the error is logged and an apology string is returned, so
	        the caller always gets text back.
	        """
	        if self.ollama_client is None:
	            return "Ollama client not initialized."

	        try:
	            response = await self.ollama_client.chat(
	                model=MODEL_NAME,
	                messages=self._messages,
	            )

	            response_text = response["message"].get("content", "")
	            if response_text:
	                self._messages.append({"role": "assistant", "content": response_text})
	            return response_text

	        except Exception as e:
	            logger.error("Ollama chat error: %s", e)
	            return f"Sorry, I couldn't process that. Error: {e}"

	    # ------------------------------------------------------------------ #
	    # Text-to-speech
	    # ------------------------------------------------------------------ #

	    async def _synthesize_speech(self, text: str) -> None:
	        """Convert text to speech via edge-tts and queue audio output.

	        Errors are logged and swallowed; a failed synthesis just produces no
	        audio.
	        """
	        if not text.strip():
	            return
	        try:
	            communicate = edge_tts.Communicate(text, self._tts_voice)

	            mp3_chunks: list[bytes] = []
	            async for chunk in communicate.stream():
	                if chunk["type"] == "audio":
	                    mp3_chunks.append(chunk["data"])

	            if not mp3_chunks:
	                return

	            mp3_data = b"".join(mp3_chunks)

	            # Decode MP3 → raw PCM
	            decoded = miniaudio.decode(
	                mp3_data,
	                output_format=miniaudio.SampleFormat.SIGNED16,
	                nchannels=1,
	                sample_rate=HANDLER_SAMPLE_RATE,
	            )
	            samples = np.frombuffer(decoded.samples, dtype=np.int16)

	            # Stream in ~100ms chunks
	            chunk_size = HANDLER_SAMPLE_RATE // 10
	            for i in range(0, len(samples), chunk_size):
	                audio_chunk = samples[i : i + chunk_size]
	                await self.output_queue.put(
	                    (HANDLER_SAMPLE_RATE, audio_chunk.reshape(1, -1))
	                )

	        except Exception as e:
	            logger.error("TTS synthesis error: %s", e)

	    # ------------------------------------------------------------------ #
	    # Emit (speaker output)
	    # ------------------------------------------------------------------ #

	    async def emit(self) -> Tuple[int, NDArray[np.int16]] | AdditionalOutputs | None:
	        """Emit next audio frame or chat update."""
	        return await wait_for_item(self.output_queue)

	    # ------------------------------------------------------------------ #
	    # Personality management
	    # ------------------------------------------------------------------ #

	    async def apply_personality(self, name: str) -> str:
	        """Apply a personality by name, resetting conversation.

	        Unknown names fall back to ``DEFAULT_PROMPT``. Returns a status line
	        for the UI.
	        """
	        prompt = PERSONALITIES.get(name, DEFAULT_PROMPT)
	        self._system_prompt = prompt
	        self._messages = [{"role": "system", "content": prompt}]
	        logger.info("Applied personality: %s", name)
	        return f"✅ Applied personality: {name}"

	    def set_voice(self, voice: str) -> str:
	        """Change TTS voice and return a status line for the UI."""
	        self._tts_voice = voice
	        logger.info("Changed TTS voice to: %s", voice)
	        return f"✅ Voice set to: {voice}"

	    # ------------------------------------------------------------------ #
	    # Shutdown
	    # ------------------------------------------------------------------ #

	    async def shutdown(self) -> None:
	        """Stop the handler: cancel running pipelines and drop queued output."""
	        self._shutdown_requested = True

	        # Cancel in-flight pipelines so they stop queueing output.
	        for task in list(self._tasks):
	            task.cancel()

	        while not self.output_queue.empty():
	            try:
	                self.output_queue.get_nowait()
	            except asyncio.QueueEmpty:
	                break


	# ---------------------------------------------------------------------------
	# Chatbot update helper
	# ---------------------------------------------------------------------------
	def update_chatbot(chatbot, response):
	    """Append ``response`` to the chat history in place and return that history.

	    Used as the stream's additional-outputs handler: ``chatbot`` is the
	    current message list and ``response`` is the new entry to add.
	    """
	    history = chatbot
	    history.append(response)
	    return history


	# ---------------------------------------------------------------------------
	# Build Gradio UI
	# ---------------------------------------------------------------------------
	def create_app():
	    """Build the FastRTC audio stream together with its Gradio controls.

	    Creates one template ``ConversationHandler``, the chat/personality/voice/
	    status components, and a send-receive audio ``Stream`` that uses them as
	    additional inputs and the chatbot as additional output. Dropdown changes
	    are forwarded to the template handler, and the returned status text is
	    shown in the Markdown component.

	    Returns:
	        The configured ``Stream``; its ``ui`` attribute is the Gradio app.
	    """
	    handler = ConversationHandler()

	    # Components are created in the same order as before: chat history,
	    # personality picker, voice picker, status line.
	    chatbot = gr.Chatbot(type="messages", label="Conversation", height=400)
	    personality_dropdown = gr.Dropdown(
	        label="🎭 Personality",
	        choices=list(PERSONALITIES),
	        value="Default (Reachy Mini)",
	    )
	    voice_dropdown = gr.Dropdown(label="🎤 TTS Voice", choices=TTS_VOICES, value=TTS_VOICE)
	    status_md = gr.Markdown(value="", label="Status")

	    extra_inputs = [chatbot, personality_dropdown, voice_dropdown, status_md]
	    stream = Stream(
	        handler=handler,
	        mode="send-receive",
	        modality="audio",
	        additional_inputs=extra_inputs,
	        additional_outputs=[chatbot],
	        additional_outputs_handler=update_chatbot,
	        ui_args={"title": "🤖 Talk with Reachy Mini"},
	    )

	    # Callback names are kept as-is because the UI framework may derive
	    # endpoint names from them.
	    async def _apply_personality(selected: str) -> str:
	        return await handler.apply_personality(selected)

	    def _set_voice(selected: str) -> str:
	        return handler.set_voice(selected)

	    # Event listeners must be registered inside the stream's UI context.
	    with stream.ui:
	        personality_dropdown.change(
	            fn=_apply_personality,
	            inputs=[personality_dropdown],
	            outputs=[status_md],
	        )
	        voice_dropdown.change(
	            fn=_set_voice,
	            inputs=[voice_dropdown],
	            outputs=[status_md],
	        )

	    return stream


	# ---------------------------------------------------------------------------
	# Entrypoint
	# ---------------------------------------------------------------------------
	if __name__ == "__main__":
	    # Log the effective configuration, then serve the UI on all interfaces
	    # at port 7860.
	    logger.info("Starting Reachy Mini Open Conversation")
	    logger.info(
	        "Config: OLLAMA=%s MODEL=%s STT=%s TTS=%s",
	        OLLAMA_BASE_URL,
	        MODEL_NAME,
	        STT_MODEL,
	        TTS_VOICE,
	    )

	    create_app().ui.launch(server_name="0.0.0.0", server_port=7860)