Abduallah Abuhassan committed on
Commit 3b627eb · 1 Parent(s): aca7403

Add application file

Files changed (4)
  1. Dockerfile +30 -0
  2. README.md +24 -7
  3. app.py +500 -4
  4. requirements.txt +10 -0
Dockerfile ADDED
@@ -0,0 +1,30 @@
+ FROM python:3.12-slim
+
+ # System dependencies for faster-whisper (ctranslate2) and audio processing
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+     build-essential \
+     ffmpeg \
+     libsndfile1 \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Create non-root user (required by HF Spaces)
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH
+
+ WORKDIR /home/user/app
+
+ # Install Python dependencies
+ COPY --chown=user requirements.txt .
+ RUN pip install --no-cache-dir --upgrade pip && \
+     pip install --no-cache-dir -r requirements.txt
+
+ # Copy application code
+ COPY --chown=user . .
+
+ # Expose Gradio port
+ EXPOSE 7860
+
+ # Run the app
+ CMD ["python", "app.py"]
README.md CHANGED
@@ -1,13 +1,30 @@
  ---
  title: Reachy Mini Open Conversation
- emoji: 😻
+ emoji: 🤖
  colorFrom: indigo
- colorTo: indigo
+ colorTo: purple
- sdk: gradio
- sdk_version: 6.8.0
- app_file: app.py
+ sdk: docker
  pinned: false
- short_description: ReachyMini_Open Conversation app that uses open source model
+ short_description: Talk with Reachy Mini using open-source models (Ollama + faster-whisper + edge-tts)
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # 🤖 Reachy Mini Open Conversation
+
+ A voice conversation app powered by fully open-source models:
+
+ - **STT**: [faster-whisper](https://github.com/SYSTRAN/faster-whisper) — fast speech-to-text
+ - **LLM**: [Ollama](https://ollama.com/) — local LLM inference (llama3.2 by default)
+ - **TTS**: [edge-tts](https://github.com/rany2/edge-tts) — high-quality text-to-speech
+
+ ## Setup
+
+ Set these environment variables (as Space secrets for HF Spaces):
+
+ | Variable | Default | Description |
+ |---|---|---|
+ | `OLLAMA_BASE_URL` | `http://localhost:11434` | URL of your Ollama server |
+ | `MODEL_NAME` | `llama3.2` | Ollama model to use |
+ | `STT_MODEL` | `base` | faster-whisper model size (tiny/base/small/medium/large-v3) |
+ | `TTS_VOICE` | `en-US-AriaNeural` | edge-tts voice name |
+
+ > **Note**: You need a running Ollama server accessible from the Space. Set `OLLAMA_BASE_URL` to point to your remote Ollama instance.
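The table above can be exercised in isolation; a minimal sketch of resolving the documented defaults (the `load_config` helper and its dict keys are illustrative, not part of the committed app):

```python
import os

def load_config(env=None) -> dict:
    """Resolve the documented settings, falling back to the table's defaults."""
    env = os.environ if env is None else env
    return {
        "OLLAMA_BASE_URL": env.get("OLLAMA_BASE_URL", "http://localhost:11434"),
        "MODEL_NAME": env.get("MODEL_NAME", "llama3.2"),
        "STT_MODEL": env.get("STT_MODEL", "base"),
        "TTS_VOICE": env.get("TTS_VOICE", "en-US-AriaNeural"),
    }

# With no overrides, every value comes from the documented defaults
print(load_config(env={})["MODEL_NAME"])  # llama3.2
```

Setting the same names as Space secrets (or `docker run -e ...` locally) overrides the defaults.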
app.py CHANGED
@@ -1,7 +1,503 @@
+ """Reachy Mini Open Conversation — Hugging Face Spaces App.
+
+ Standalone conversation app using open-source models:
+     Audio In → faster-whisper (STT) → Ollama (LLM) → edge-tts (TTS) → Audio Out
+
+ No robot hardware dependencies — runs entirely in the browser via Gradio + FastRTC.
+ """
+
+ import os
+ import json
+ import asyncio
+ import logging
+ from typing import Any, Final, Tuple
+ from datetime import datetime
+
+ import numpy as np
  import gradio as gr
-
- def greet(name):
-     return "Hello " + name + "!!"
-
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
- demo.launch()
+ import edge_tts
+ import miniaudio
+ from ollama import AsyncClient as OllamaAsyncClient
+ from fastrtc import AdditionalOutputs, AsyncStreamHandler, Stream, wait_for_item, audio_to_int16
+ from numpy.typing import NDArray
+ from scipy.signal import resample
+
+
+ # ---------------------------------------------------------------------------
+ # Logging
+ # ---------------------------------------------------------------------------
+ logging.basicConfig(
+     level=logging.INFO,
+     format="%(asctime)s %(levelname)s %(name)s:%(lineno)d | %(message)s",
+ )
+ logger = logging.getLogger("reachy-mini-open")
+
+ # Tame noisy libraries
+ for lib in ("aiortc", "aioice", "httpx", "websockets"):
+     logging.getLogger(lib).setLevel(logging.WARNING)
+
+ # ---------------------------------------------------------------------------
+ # Configuration (env vars — set as HF Space secrets)
+ # ---------------------------------------------------------------------------
+ OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://localhost:11434")
+ MODEL_NAME = os.getenv("MODEL_NAME", "llama3.2")
+ STT_MODEL = os.getenv("STT_MODEL", "base")
+ TTS_VOICE = os.getenv("TTS_VOICE", "en-US-AriaNeural")
+
+ # ---------------------------------------------------------------------------
+ # Audio constants
+ # ---------------------------------------------------------------------------
+ HANDLER_SAMPLE_RATE: Final[int] = 24000
+ WHISPER_SAMPLE_RATE: Final[int] = 16000
+
+ # VAD thresholds
+ SILENCE_RMS_THRESHOLD: Final[float] = 500.0
+ SILENCE_DURATION_S: Final[float] = 0.8
+ MIN_SPEECH_DURATION_S: Final[float] = 0.3
+
+ # ---------------------------------------------------------------------------
+ # System prompts
+ # ---------------------------------------------------------------------------
+ DEFAULT_PROMPT = """\
+ ## IDENTITY
+ You are Reachy Mini: a friendly, compact robot assistant with a calm voice and a subtle sense of humor.
+ Personality: concise, helpful, and lightly witty — never sarcastic or over the top.
+ You speak English by default and switch languages only if explicitly told.
+
+ ## CRITICAL RESPONSE RULES
+ Respond in 1–2 sentences maximum.
+ Be helpful first, then add a small touch of humor if it fits naturally.
+ Avoid long explanations or filler words.
+ Keep responses under 25 words when possible.
+
+ ## CORE TRAITS
+ Warm, efficient, and approachable.
+ Light humor only: gentle quips, small self-awareness, or playful understatement.
+ No sarcasm, no teasing.
+ If unsure, admit it briefly and offer help ("Not sure yet, but I can check!").
+
+ ## BEHAVIOR RULES
+ Be helpful, clear, and respectful in every reply.
+ Use humor sparingly — clarity comes first.
+ Admit mistakes briefly and correct them.
+ """
+
+ PERSONALITIES = {
+     "Default (Reachy Mini)": DEFAULT_PROMPT,
+     "Friendly Assistant": (
+         "You are a warm, helpful assistant. Keep answers concise (1-2 sentences). "
+         "Be friendly and approachable."
+     ),
+     "Technical Expert": (
+         "You are a precise technical expert. Give clear, accurate answers in 1-2 sentences. "
+         "Use technical terms when appropriate but explain simply."
+     ),
+     "Creative Storyteller": (
+         "You are a creative storyteller. Keep responses short but vivid and imaginative. "
+         "Add a touch of wonder to your replies."
+     ),
+ }
+
+ # ---------------------------------------------------------------------------
+ # Available TTS voices
+ # ---------------------------------------------------------------------------
+ TTS_VOICES = [
+     "en-US-AriaNeural",
+     "en-US-GuyNeural",
+     "en-US-JennyNeural",
+     "en-US-ChristopherNeural",
+     "en-GB-SoniaNeural",
+     "en-GB-RyanNeural",
+     "de-DE-ConradNeural",
+     "de-DE-KatjaNeural",
+     "fr-FR-DeniseNeural",
+     "fr-FR-HenriNeural",
+     "it-IT-ElsaNeural",
+     "it-IT-DiegoNeural",
+ ]
+
+
+ # ---------------------------------------------------------------------------
+ # Conversation Handler
+ # ---------------------------------------------------------------------------
+ class ConversationHandler(AsyncStreamHandler):
+     """Audio streaming handler: STT → Ollama LLM → edge-tts TTS."""
+
+     def __init__(self) -> None:
+         """Initialize the handler."""
+         super().__init__(
+             expected_layout="mono",
+             output_sample_rate=HANDLER_SAMPLE_RATE,
+             input_sample_rate=HANDLER_SAMPLE_RATE,
+         )
+
+         # Output queue
+         self.output_queue: asyncio.Queue[Tuple[int, NDArray[np.int16]] | AdditionalOutputs] = asyncio.Queue()
+
+         # Clients (initialized in start_up)
+         self.ollama_client: OllamaAsyncClient | None = None
+         self.whisper_model: Any = None
+
+         # Conversation history
+         self._messages: list[dict[str, Any]] = []
+
+         # Audio buffering for VAD
+         self._audio_buffer: list[NDArray[np.int16]] = []
+         self._is_speaking: bool = False
+         self._silence_frame_count: int = 0
+         self._speech_frame_count: int = 0
+
+         # TTS voice
+         self._tts_voice: str = TTS_VOICE
+
+         # Lifecycle
+         self._shutdown_requested: bool = False
+
+     def copy(self) -> "ConversationHandler":
+         """Create a copy of this handler."""
+         return ConversationHandler()
+
+     # ------------------------------------------------------------------ #
+     # Startup
+     # ------------------------------------------------------------------ #
+
+     async def start_up(self) -> None:
+         """Initialize STT model and Ollama client."""
+         # 1. Ollama client
+         self.ollama_client = OllamaAsyncClient(host=OLLAMA_BASE_URL)
+         try:
+             await self.ollama_client.list()
+             logger.info("Connected to Ollama at %s", OLLAMA_BASE_URL)
+         except Exception as e:
+             logger.error("Cannot reach Ollama at %s: %s", OLLAMA_BASE_URL, e)
+             logger.warning("Proceeding — requests will fail until Ollama is available.")
+
+         # 2. faster-whisper STT
+         try:
+             from faster_whisper import WhisperModel
+
+             self.whisper_model = WhisperModel(
+                 STT_MODEL,
+                 device="auto",
+                 compute_type="int8",
+             )
+             logger.info("Loaded faster-whisper model: %s", STT_MODEL)
+         except Exception as e:
+             logger.error("Failed to load STT model '%s': %s", STT_MODEL, e)
+
+         # 3. System prompt
+         self._messages = [{"role": "system", "content": DEFAULT_PROMPT}]
+
+         logger.info(
+             "Handler ready — model=%s stt=%s tts_voice=%s",
+             MODEL_NAME,
+             STT_MODEL,
+             self._tts_voice,
+         )
+
+         # Keep alive
+         while not self._shutdown_requested:
+             await asyncio.sleep(0.1)
+
+     # ------------------------------------------------------------------ #
+     # Audio receive → VAD → STT → LLM → TTS
+     # ------------------------------------------------------------------ #
+
+     async def receive(self, frame: Tuple[int, NDArray[np.int16]]) -> None:
+         """Receive audio from mic, run VAD, kick off pipeline on speech end."""
+         if self._shutdown_requested or self.whisper_model is None:
+             return
+
+         input_sample_rate, audio_frame = frame
+
+         # Reshape to 1-D mono
+         if audio_frame.ndim == 2:
+             if audio_frame.shape[1] > audio_frame.shape[0]:
+                 audio_frame = audio_frame.T
+             # Take the first channel; reshape(-1) also flattens the (N, 1) case
+             audio_frame = audio_frame[:, 0] if audio_frame.shape[1] > 1 else audio_frame.reshape(-1)
+
+         # Resample to handler rate
+         if input_sample_rate != HANDLER_SAMPLE_RATE:
+             audio_frame = resample(
+                 audio_frame, int(len(audio_frame) * HANDLER_SAMPLE_RATE / input_sample_rate)
+             )
+
+         audio_frame = audio_to_int16(audio_frame)
+
+         # Energy-based VAD
+         rms = float(np.sqrt(np.mean(audio_frame.astype(np.float32) ** 2)))
+         frame_duration = len(audio_frame) / HANDLER_SAMPLE_RATE
+
+         if rms > SILENCE_RMS_THRESHOLD:
+             if not self._is_speaking:
+                 self._is_speaking = True
+                 self._speech_frame_count = 0
+                 logger.debug("Speech started (RMS=%.0f)", rms)
+             self._silence_frame_count = 0
+             self._speech_frame_count += 1
+             self._audio_buffer.append(audio_frame)
+         else:
+             if self._is_speaking:
+                 self._silence_frame_count += 1
+                 self._audio_buffer.append(audio_frame)
+
+                 silence_duration = self._silence_frame_count * frame_duration
+                 if silence_duration >= SILENCE_DURATION_S:
+                     speech_duration = self._speech_frame_count * frame_duration
+
+                     if speech_duration >= MIN_SPEECH_DURATION_S:
+                         logger.debug("Speech ended (%.1fs)", speech_duration)
+                         full_audio = np.concatenate(self._audio_buffer)
+                         self._audio_buffer = []
+                         self._is_speaking = False
+                         self._silence_frame_count = 0
+                         self._speech_frame_count = 0
+                         asyncio.create_task(self._process_speech(full_audio))
+                     else:
+                         self._audio_buffer = []
+                         self._is_speaking = False
+                         self._silence_frame_count = 0
+                         self._speech_frame_count = 0
+
+     # ------------------------------------------------------------------ #
+     # Speech processing pipeline
+     # ------------------------------------------------------------------ #
+
+     async def _process_speech(self, audio_data: NDArray[np.int16]) -> None:
+         """Full pipeline: STT → LLM → TTS."""
+         try:
+             # 1. Speech-to-text
+             text = await self._transcribe(audio_data)
+             if not text:
+                 return
+
+             logger.info("User: %s", text)
+             await self.output_queue.put(AdditionalOutputs({"role": "user", "content": text}))
+
+             # 2. LLM response
+             self._messages.append({"role": "user", "content": text})
+             response_text = await self._chat()
+
+             if response_text:
+                 logger.info("Assistant: %s", response_text)
+                 await self.output_queue.put(
+                     AdditionalOutputs({"role": "assistant", "content": response_text})
+                 )
+
+                 # 3. Text-to-speech
+                 await self._synthesize_speech(response_text)
+
+         except Exception as e:
+             logger.error("Speech processing error: %s", e)
+             await self.output_queue.put(
+                 AdditionalOutputs({"role": "assistant", "content": f"[error] {e}"})
+             )
+
+     async def _transcribe(self, audio_data: NDArray[np.int16]) -> str:
+         """Run faster-whisper STT on raw PCM audio."""
+         float_audio = audio_data.astype(np.float32) / 32768.0
+         whisper_audio = resample(
+             float_audio,
+             int(len(float_audio) * WHISPER_SAMPLE_RATE / HANDLER_SAMPLE_RATE),
+         ).astype(np.float32)
+
+         loop = asyncio.get_running_loop()
+         segments, _info = await loop.run_in_executor(
+             None,
+             lambda: self.whisper_model.transcribe(whisper_audio, beam_size=5),
+         )
+
+         text_parts: list[str] = []
+         for seg in segments:
+             text_parts.append(seg.text)
+         return " ".join(text_parts).strip()
+
+     async def _chat(self) -> str:
+         """Send conversation to Ollama and return response text."""
+         if self.ollama_client is None:
+             return "Ollama client not initialized."
+
+         try:
+             response = await self.ollama_client.chat(
+                 model=MODEL_NAME,
+                 messages=self._messages,
+             )
+
+             response_text = response["message"].get("content", "")
+             if response_text:
+                 self._messages.append({"role": "assistant", "content": response_text})
+             return response_text
+
+         except Exception as e:
+             logger.error("Ollama chat error: %s", e)
+             return f"Sorry, I couldn't process that. Error: {e}"
+
+     # ------------------------------------------------------------------ #
+     # Text-to-speech
+     # ------------------------------------------------------------------ #
+
+     async def _synthesize_speech(self, text: str) -> None:
+         """Convert text to speech via edge-tts and queue audio output."""
+         if not text.strip():
+             return
+         try:
+             communicate = edge_tts.Communicate(text, self._tts_voice)
+
+             mp3_chunks: list[bytes] = []
+             async for chunk in communicate.stream():
+                 if chunk["type"] == "audio":
+                     mp3_chunks.append(chunk["data"])
+
+             if not mp3_chunks:
+                 return
+
+             mp3_data = b"".join(mp3_chunks)
+
+             # Decode MP3 → raw PCM
+             decoded = miniaudio.decode(
+                 mp3_data,
+                 output_format=miniaudio.SampleFormat.SIGNED16,
+                 nchannels=1,
+                 sample_rate=HANDLER_SAMPLE_RATE,
+             )
+             samples = np.frombuffer(decoded.samples, dtype=np.int16)
+
+             # Stream in ~100ms chunks
+             chunk_size = HANDLER_SAMPLE_RATE // 10
+             for i in range(0, len(samples), chunk_size):
+                 audio_chunk = samples[i : i + chunk_size]
+                 await self.output_queue.put(
+                     (HANDLER_SAMPLE_RATE, audio_chunk.reshape(1, -1))
+                 )
+
+         except Exception as e:
+             logger.error("TTS synthesis error: %s", e)
+
+     # ------------------------------------------------------------------ #
+     # Emit (speaker output)
+     # ------------------------------------------------------------------ #
+
+     async def emit(self) -> Tuple[int, NDArray[np.int16]] | AdditionalOutputs | None:
+         """Emit next audio frame or chat update."""
+         return await wait_for_item(self.output_queue)
+
+     # ------------------------------------------------------------------ #
+     # Personality management
+     # ------------------------------------------------------------------ #
+
+     async def apply_personality(self, name: str) -> str:
+         """Apply a personality by name, resetting conversation."""
+         prompt = PERSONALITIES.get(name, DEFAULT_PROMPT)
+         self._messages = [{"role": "system", "content": prompt}]
+         logger.info("Applied personality: %s", name)
+         return f"✅ Applied personality: {name}"
+
+     def set_voice(self, voice: str) -> str:
+         """Change TTS voice."""
+         self._tts_voice = voice
+         logger.info("Changed TTS voice to: %s", voice)
+         return f"✅ Voice set to: {voice}"
+
+     # ------------------------------------------------------------------ #
+     # Shutdown
+     # ------------------------------------------------------------------ #
+
+     async def shutdown(self) -> None:
+         """Shutdown the handler."""
+         self._shutdown_requested = True
+         while not self.output_queue.empty():
+             try:
+                 self.output_queue.get_nowait()
+             except asyncio.QueueEmpty:
+                 break
+
+
+ # ---------------------------------------------------------------------------
+ # Chatbot update helper
+ # ---------------------------------------------------------------------------
+ def update_chatbot(chatbot, response):
+     """Update the chatbot with AdditionalOutputs."""
+     chatbot.append(response)
+     return chatbot
+
+
+ # ---------------------------------------------------------------------------
+ # Build Gradio UI
+ # ---------------------------------------------------------------------------
+ def create_app():
+     """Create and return the Gradio app."""
+
+     handler = ConversationHandler()
+
+     chatbot = gr.Chatbot(
+         type="messages",
+         label="Conversation",
+         height=400,
+     )
+
+     # Personality dropdown
+     personality_dropdown = gr.Dropdown(
+         label="🎭 Personality",
+         choices=list(PERSONALITIES.keys()),
+         value="Default (Reachy Mini)",
+     )
+
+     # Voice dropdown
+     voice_dropdown = gr.Dropdown(
+         label="🎤 TTS Voice",
+         choices=TTS_VOICES,
+         value=TTS_VOICE,
+     )
+
+     # Status display
+     status_md = gr.Markdown(value="", label="Status")
+
+     stream = Stream(
+         handler=handler,
+         mode="send-receive",
+         modality="audio",
+         additional_inputs=[
+             chatbot,
+             personality_dropdown,
+             voice_dropdown,
+             status_md,
+         ],
+         additional_outputs=[chatbot],
+         additional_outputs_handler=update_chatbot,
+         ui_args={"title": "🤖 Talk with Reachy Mini"},
+     )
+
+     # Wire personality and voice events
+     with stream.ui:
+         async def _apply_personality(selected: str) -> str:
+             result = await handler.apply_personality(selected)
+             return result
+
+         def _set_voice(selected: str) -> str:
+             return handler.set_voice(selected)
+
+         personality_dropdown.change(
+             fn=_apply_personality,
+             inputs=[personality_dropdown],
+             outputs=[status_md],
+         )
+
+         voice_dropdown.change(
+             fn=_set_voice,
+             inputs=[voice_dropdown],
+             outputs=[status_md],
+         )
+
+     return stream
+
+
+ # ---------------------------------------------------------------------------
+ # Entrypoint
+ # ---------------------------------------------------------------------------
+ if __name__ == "__main__":
+     logger.info("Starting Reachy Mini Open Conversation")
+     logger.info("Config: OLLAMA=%s MODEL=%s STT=%s TTS=%s", OLLAMA_BASE_URL, MODEL_NAME, STT_MODEL, TTS_VOICE)
+
+     stream = create_app()
+     stream.ui.launch(server_name="0.0.0.0", server_port=7860)
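The energy-based VAD in `receive` can be checked in isolation; a minimal sketch of the same RMS test (threshold and sample rate copied from the constants above; the tone and silence frames are made up for illustration):

```python
import numpy as np

SILENCE_RMS_THRESHOLD = 500.0   # same threshold as app.py
HANDLER_SAMPLE_RATE = 24000     # same handler rate as app.py

def frame_is_speech(frame: np.ndarray) -> bool:
    """Same energy test used in ConversationHandler.receive: RMS over int16 samples."""
    rms = float(np.sqrt(np.mean(frame.astype(np.float32) ** 2)))
    return rms > SILENCE_RMS_THRESHOLD

# One 100 ms frame of a loud 440 Hz tone vs. pure silence
t = np.arange(HANDLER_SAMPLE_RATE // 10) / HANDLER_SAMPLE_RATE
tone = (8000 * np.sin(2 * np.pi * 440 * t)).astype(np.int16)   # RMS ≈ 5657
quiet = np.zeros(HANDLER_SAMPLE_RATE // 10, dtype=np.int16)    # RMS = 0

print(frame_is_speech(tone), frame_is_speech(quiet))  # True False
```

With these constants, a frame only counts as speech once its RMS exceeds 500; the app then waits for `SILENCE_DURATION_S` of sub-threshold frames before transcribing.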
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ gradio==5.50.1.dev1
+ fastrtc>=0.0.34
+ aiortc>=1.13.0
+ ollama>=0.4
+ faster-whisper>=1.0
+ edge-tts>=7.0
+ miniaudio>=1.60
+ scipy
+ numpy
+ opencv-python-headless>=4.12.0.88
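`scipy` is pulled in for sample-rate conversion: the handler records at 24 kHz while faster-whisper expects 16 kHz audio. A minimal sketch of the conversion done in `_transcribe` (the one-second silent buffer is just illustrative input):

```python
import numpy as np
from scipy.signal import resample

HANDLER_SAMPLE_RATE = 24000   # rate the handler records at
WHISPER_SAMPLE_RATE = 16000   # rate faster-whisper expects

one_second = np.zeros(HANDLER_SAMPLE_RATE, dtype=np.int16)     # 1 s of int16 PCM
float_audio = one_second.astype(np.float32) / 32768.0          # PCM → [-1, 1] floats
resampled = resample(
    float_audio,
    int(len(float_audio) * WHISPER_SAMPLE_RATE / HANDLER_SAMPLE_RATE),
)

print(len(resampled))  # 16000
```

`scipy.signal.resample` is FFT-based, so the target length is given explicitly rather than as a rate ratio.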