Spaces:

onitsche
/

talk

Running

App Files Files Community

onitsche committed on 19 days ago

Commit

8db182c

1 Parent(s): 6fba46b

Switch to edge-tts neural voice + gestures while speaking

Browse files

Files changed (3) hide show

pyproject.toml +1 -1
talk/main.py +38 -8
talk/tts.py +43 -64

pyproject.toml CHANGED Viewed

@@ -11,7 +11,7 @@ readme = "README.md"
 requires-python = ">=3.10"
 dependencies = [
     "reachy-mini",
-    "piper-tts",
 ]
 keywords = ["reachy-mini-app", "reachy-mini"]

 requires-python = ">=3.10"
 dependencies = [
     "reachy-mini",
+    "edge-tts",
 ]
 keywords = ["reachy-mini-app", "reachy-mini"]

talk/main.py CHANGED Viewed

@@ -4,15 +4,18 @@ State machine:
   SLEEPING → (antenna press) → SPEAKING → SLEEPING
 The robot sleeps with antennas folded. Pressing either antenna wakes it up,
-it announces the current date and time in German, then goes back to sleep.
 """
 import logging
 import threading
 import time
 from datetime import datetime
 from enum import Enum, auto
 from reachy_mini import ReachyMini, ReachyMiniApp
 from talk.tts import speak
@@ -23,8 +26,8 @@ logger = logging.getLogger(__name__)
 ANTENNA_PRESS_THRESHOLD = 0.15
 # Sleep position mirrors SLEEP_ANTENNAS_JOINT_POSITIONS in the SDK.
 SLEEP_ANTENNAS = [-3.05, 3.05]
-# Ignore antenna input for this many seconds right after waking (debounce).
-DEBOUNCE_AFTER_WAKE = 2.0
 WEEKDAYS_DE = [
     "Montag", "Dienstag", "Mittwoch", "Donnerstag",
@@ -40,13 +43,28 @@ def _datetime_text_de() -> str:
     now = datetime.now()
     weekday = WEEKDAYS_DE[now.weekday()]
     month = MONTHS_DE[now.month - 1]
-    minute = now.strftime("%M")  # "05", "30", etc. — espeak-ng -v de reads these correctly
     return (
         f"Heute ist {weekday}, der {now.day}. {month} {now.year}. "
         f"Es ist {now.hour} Uhr {minute}."
     )
 class State(Enum):
     SLEEPING = auto()
     SPEAKING = auto()
@@ -54,8 +72,7 @@ class State(Enum):
 class Talk(ReachyMiniApp):
     custom_app_url: str | None = "http://0.0.0.0:8042"
-    # No video needed — saves CPU on the wireless CM4.
-    request_media_backend: str | None = "gstreamer_no_video"
     def run(self, reachy_mini: ReachyMini, stop_event: threading.Event) -> None:
         _lock = threading.Lock()
@@ -78,7 +95,7 @@ class Talk(ReachyMiniApp):
                     _shared["state"] = "sleeping"
                 antennas = reachy_mini.get_present_antenna_joint_positions()
-                if time.time() - last_spoke_at > DEBOUNCE_AFTER_WAKE:
                     right_dev = abs(antennas[0] - SLEEP_ANTENNAS[0])
                     left_dev = abs(antennas[1] - SLEEP_ANTENNAS[1])
                     if right_dev > ANTENNA_PRESS_THRESHOLD or left_dev > ANTENNA_PRESS_THRESHOLD:
@@ -99,9 +116,22 @@ class Talk(ReachyMiniApp):
                 logger.info("Speaking: %s", text)
                 reachy_mini.wake_up()
                 speak(text, reachy_mini)
-                reachy_mini.goto_sleep()
                 last_spoke_at = time.time()
                 state = State.SLEEPING

   SLEEPING → (antenna press) → SPEAKING → SLEEPING
 The robot sleeps with antennas folded. Pressing either antenna wakes it up,
+it announces the current date and time in German with expressive gestures,
+then goes back to sleep.
 """
 import logging
+import math
 import threading
 import time
 from datetime import datetime
 from enum import Enum, auto
+import numpy as np
 from reachy_mini import ReachyMini, ReachyMiniApp
 from talk.tts import speak
 ANTENNA_PRESS_THRESHOLD = 0.15
 # Sleep position mirrors SLEEP_ANTENNAS_JOINT_POSITIONS in the SDK.
 SLEEP_ANTENNAS = [-3.05, 3.05]
+# Ignore antenna input for this many seconds right after speaking (debounce).
+DEBOUNCE_AFTER_SPEAK = 2.0
 WEEKDAYS_DE = [
     "Montag", "Dienstag", "Mittwoch", "Donnerstag",
     now = datetime.now()
     weekday = WEEKDAYS_DE[now.weekday()]
     month = MONTHS_DE[now.month - 1]
+    minute = now.strftime("%M")
     return (
         f"Heute ist {weekday}, der {now.day}. {month} {now.year}. "
         f"Es ist {now.hour} Uhr {minute}."
     )
+def _gesture_loop(reachy_mini: ReachyMini, stop: threading.Event) -> None:
+    """Gently moves head and antennas while the robot speaks."""
+    t0 = time.time()
+    while not stop.is_set():
+        t = time.time() - t0
+        # Slow look side-to-side and small nod — gives the impression of engagement
+        y = 0.25 * math.sin(2 * math.pi * 0.12 * t)   # gentle left/right
+        z = 0.05 * math.sin(2 * math.pi * 0.22 * t)    # subtle up/down
+        head_pose = reachy_mini.look_at_world(1.0, y, z, perform_movement=False)
+        # Antennas wiggle in opposite phase — like excited "ears"
+        ant = math.radians(20) * math.sin(2 * math.pi * 0.5 * t)
+        reachy_mini.set_target(head=head_pose, antennas=[ant, -ant])
+        time.sleep(0.05)  # 20 Hz
 class State(Enum):
     SLEEPING = auto()
     SPEAKING = auto()
 class Talk(ReachyMiniApp):
     custom_app_url: str | None = "http://0.0.0.0:8042"
+    request_media_backend: str | None = None
     def run(self, reachy_mini: ReachyMini, stop_event: threading.Event) -> None:
         _lock = threading.Lock()
                     _shared["state"] = "sleeping"
                 antennas = reachy_mini.get_present_antenna_joint_positions()
+                if time.time() - last_spoke_at > DEBOUNCE_AFTER_SPEAK:
                     right_dev = abs(antennas[0] - SLEEP_ANTENNAS[0])
                     left_dev = abs(antennas[1] - SLEEP_ANTENNAS[1])
                     if right_dev > ANTENNA_PRESS_THRESHOLD or left_dev > ANTENNA_PRESS_THRESHOLD:
                 logger.info("Speaking: %s", text)
                 reachy_mini.wake_up()
+                # Gesture thread runs in parallel while TTS plays.
+                gesture_stop = threading.Event()
+                gesture_thread = threading.Thread(
+                    target=_gesture_loop,
+                    args=(reachy_mini, gesture_stop),
+                    daemon=True,
+                )
+                gesture_thread.start()
                 speak(text, reachy_mini)
+                gesture_stop.set()
+                gesture_thread.join(timeout=1.0)
+                reachy_mini.goto_sleep()
                 last_spoke_at = time.time()
                 state = State.SLEEPING

talk/tts.py CHANGED Viewed

@@ -1,105 +1,84 @@
-"""Text-to-speech via piper-tts (neural, offline) → WAV → Reachy Mini audio.
-The German model (de_DE-thorsten-high, ~65 MB) is downloaded from Hugging Face
-on first run and cached in talk/models/. Fully offline thereafter.
-Falls back to espeak-ng if piper-tts is not installed.
 """
 import logging
 import os
 import time
-import wave
-from pathlib import Path
 from typing import Optional
 logger = logging.getLogger(__name__)
-_MODELS_DIR = Path(__file__).parent / "models"
-_MODEL_NAME = "de_DE-thorsten-high"
-_MODEL_BASE_URL = (
-    "https://huggingface.co/rhasspy/piper-voices/resolve/v1.0.0"
-    "/de/de_DE/thorsten/high/"
-)
-_voice = None
-_voice_loaded = False
-def _load_voice():
-    global _voice, _voice_loaded
-    if _voice_loaded:
-        return _voice
-    _voice_loaded = True
-    try:
-        import urllib.request
-        from piper.voice import PiperVoice
-        _MODELS_DIR.mkdir(exist_ok=True)
-        onnx_path = _MODELS_DIR / f"{_MODEL_NAME}.onnx"
-        json_path = _MODELS_DIR / f"{_MODEL_NAME}.onnx.json"
-        if not onnx_path.exists():
-            logger.info("Downloading piper model %s (~65 MB) …", _MODEL_NAME)
-            urllib.request.urlretrieve(_MODEL_BASE_URL + f"{_MODEL_NAME}.onnx", onnx_path)
-            urllib.request.urlretrieve(_MODEL_BASE_URL + f"{_MODEL_NAME}.onnx.json", json_path)
-            logger.info("Piper model downloaded.")
-        _voice = PiperVoice.load(str(onnx_path), config_path=str(json_path))
-        logger.info("Piper TTS ready (%s)", _MODEL_NAME)
-    except ImportError:
-        logger.warning("piper-tts not installed — falling back to espeak-ng")
-    except Exception as exc:
-        logger.warning("Failed to load piper: %s", exc)
-    return _voice
-def speak(text: str, reachy_mini, words_per_minute: int = 120, lang: str = "de") -> None:
     """Synthesize *text* and play it through the robot's speakers.
-    Uses piper-tts (neural) when available, espeak-ng otherwise.
     Blocks until playback should be complete.
     """
-    import tempfile
-    voice = _load_voice()
-    wav_path: Optional[str] = None
     try:
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
-            wav_path = f.name
-        if voice is not None:
-            with wave.open(wav_path, "wb") as wav_file:
-                voice.synthesize(text, wav_file)
-        else:
-            import shutil
-            import subprocess
             cmd = shutil.which("espeak-ng") or shutil.which("espeak")
             if cmd is None:
-                logger.warning("No TTS engine available. Install piper-tts or espeak-ng.")
                 return
             subprocess.run(
-                [cmd, "-v", lang, "-s", str(words_per_minute), "-w", wav_path, "--", text],
                 check=True, timeout=15, capture_output=True,
             )
         try:
-            reachy_mini.media.play_sound(wav_path)
         except Exception as exc:
             logger.warning("play_sound failed: %s", exc)
             return
-        # play_sound() is async — wait for estimated playback duration.
         wps = words_per_minute / 60.0
-        estimated = len(text.split()) / wps + 1.0
-        time.sleep(max(estimated, 1.5))
     except Exception as exc:
         logger.warning("TTS error: %s", exc)
     finally:
-        if wav_path:
             try:
-                os.unlink(wav_path)
             except OSError:
                 pass

+"""Text-to-speech via edge-tts (Microsoft neural, online) → MP3 → Reachy Mini audio.
+Uses Microsoft Edge's TTS API (no API key, no model download, requires internet).
+German voice: de-DE-KatjaNeural.
+Falls back to espeak-ng if edge-tts fails (network error, not installed, etc.).
 """
+import asyncio
 import logging
 import os
+import shutil
+import subprocess
+import tempfile
 import time
 from typing import Optional
 logger = logging.getLogger(__name__)
+EDGE_VOICE = "de-DE-KatjaNeural"
+EDGE_RATE = "-5%"   # slightly slower for clarity
+async def _edge_synthesize(text: str, path: str) -> None:
+    import edge_tts
+    communicate = edge_tts.Communicate(text, EDGE_VOICE, rate=EDGE_RATE)
+    await communicate.save(path)
+def speak(text: str, reachy_mini, words_per_minute: int = 130, lang: str = "de") -> None:
     """Synthesize *text* and play it through the robot's speakers.
+    Tries edge-tts first (neural quality), falls back to espeak-ng.
     Blocks until playback should be complete.
     """
+    audio_path: Optional[str] = None
     try:
+        # edge-tts outputs MP3; GStreamer playbin handles it natively.
+        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
+            audio_path = f.name
+        success = False
+        try:
+            asyncio.run(_edge_synthesize(text, audio_path))
+            success = True
+        except ImportError:
+            logger.warning("edge-tts not installed — falling back to espeak-ng")
+        except Exception as exc:
+            logger.warning("edge-tts failed (%s) — falling back to espeak-ng", exc)
+        if not success:
+            # Fallback: espeak-ng to WAV
+            os.unlink(audio_path)
+            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
+                audio_path = f.name
             cmd = shutil.which("espeak-ng") or shutil.which("espeak")
             if cmd is None:
+                logger.warning("No TTS engine available. Install edge-tts or espeak-ng.")
                 return
             subprocess.run(
+                [cmd, "-v", lang, "-s", str(words_per_minute), "-w", audio_path, "--", text],
                 check=True, timeout=15, capture_output=True,
             )
         try:
+            reachy_mini.media.play_sound(audio_path)
         except Exception as exc:
             logger.warning("play_sound failed: %s", exc)
             return
+        # play_sound() is async — wait for estimated playback to finish.
         wps = words_per_minute / 60.0
+        estimated = len(text.split()) / wps + 1.5
+        time.sleep(max(estimated, 2.0))
     except Exception as exc:
         logger.warning("TTS error: %s", exc)
     finally:
+        if audio_path and os.path.exists(audio_path):
             try:
+                os.unlink(audio_path)
             except OSError:
                 pass