Spaces:

aankitdas
/

tts-eval-framework

Sleeping

App Files Files Community

aankitdas committed on Apr 13

Commit

24a256c

1 Parent(s): c1f5502

upgraded for hf space

Browse files

Files changed (6) hide show

app/app.py +4 -4
app/engines/__init__.py +9 -7
app/engines/elevenlabs_engine.py +129 -0
app/engines/piper_engine.py +56 -12
app/engines/voxtral_engine.py +69 -0
app/evaluator.py +10 -13

app/app.py CHANGED Viewed

@@ -18,12 +18,12 @@ import pandas as pd
 import gradio as gr
 sys.path.insert(0, os.path.dirname(__file__))
 from engines import ENGINES, ENGINE_MAP
 from engines.kokoro_engine import KOKORO_VOICES, KOKORO_DEFAULT_VOICE
 from evaluator import evaluate
-from dotenv import load_dotenv
-load_dotenv(os.path.join(os.path.dirname(__file__), ".env"))
 # ── constants ─────────────────────────────────────────────────────────────────
 BANDS = ["K-2", "3-5", "6-8", "9-12"]
@@ -177,7 +177,7 @@ def build_business_chart(results: list[dict]):
         color = color_map.get(engine_type, "#bdc3c7")
         # bubble size: min size 15, scale with cost
-        size = max(15, cost * 5000 + 15)
         hover = (
             f"<b>{engine_name}</b><br>"

 import gradio as gr
 sys.path.insert(0, os.path.dirname(__file__))
+from dotenv import load_dotenv
+load_dotenv(os.path.join(os.path.dirname(__file__), ".env"))
 from engines import ENGINES, ENGINE_MAP
 from engines.kokoro_engine import KOKORO_VOICES, KOKORO_DEFAULT_VOICE
 from evaluator import evaluate
 # ── constants ─────────────────────────────────────────────────────────────────
 BANDS = ["K-2", "3-5", "6-8", "9-12"]
         color = color_map.get(engine_type, "#bdc3c7")
         # bubble size: min size 15, scale with cost
+        size = 20 #max(15, cost * 5000 + 15)
         hover = (
             f"<b>{engine_name}</b><br>"

app/engines/__init__.py CHANGED Viewed

@@ -4,25 +4,27 @@
 # import it here, and add it to ENGINES list.
 from engines.kokoro_engine import KokoroEngine
-from engines.edge_tts_engine import EdgeTTSEngine
-from engines.pyttsx3_engine import Pyttsx3Engine
-from engines.chirp_engine import ChirpEngine
 from engines.parler_engine import ParlerEngine
 from engines.piper_engine import PiperEngine
 from engines.chatterbox_runpod_engine import ChatterboxRunpodEngine
 # ordered list — determines dropdown order in UI
 # add new engines here when ready
 ENGINES = [
     KokoroEngine(),
-    EdgeTTSEngine(),
-    Pyttsx3Engine(),
     ParlerEngine(),
     PiperEngine(),
     ChatterboxRunpodEngine(),
     # ChirpEngine(),  # uncomment when API key is available
 ]

 # import it here, and add it to ENGINES list.
 from engines.kokoro_engine import KokoroEngine
+# from engines.edge_tts_engine import EdgeTTSEngine
+# from engines.pyttsx3_engine import Pyttsx3Engine
 from engines.parler_engine import ParlerEngine
 from engines.piper_engine import PiperEngine
 from engines.chatterbox_runpod_engine import ChatterboxRunpodEngine
+# from engines.voxtral_engine import VoxtralEngine
+# from engines.chirp_engine import ChirpEngine
+from engines.elevenlabs_engine import ElevenLabsEngine
 # ordered list — determines dropdown order in UI
 # add new engines here when ready
 ENGINES = [
     KokoroEngine(),
+    # EdgeTTSEngine(),
+    # Pyttsx3Engine(),
     ParlerEngine(),
     PiperEngine(),
     ChatterboxRunpodEngine(),
+    ElevenLabsEngine(),
+    # VoxtralEngine(),
     # ChirpEngine(),  # uncomment when API key is available
 ]

app/engines/elevenlabs_engine.py ADDED Viewed

	@@ -0,0 +1,129 @@

+# app/engines/elevenlabs_engine.py
+# ElevenLabs TTS engine — neural cloud, high naturalness ceiling.
+# Uses the official elevenlabs Python SDK.
+# Free tier: 10,000 chars/month, MP3 output only, no commercial license.
+# Paid tier: WAV output available, commercial license included.
+# Classification: neural-cloud-paid (free tier is eval-only, not production).
+import time
+import os
+from pathlib import Path
+from elevenlabs.client import ElevenLabs
+from elevenlabs import VoiceSettings
+from engines.base import TTSEngine
+# --- pricing ---
+# eleven_turbo_v2_5: $0.15 per 1K chars on Creator tier
+# using Flash rate as conservative estimate for free tier evaluation
+_COST_PER_MILLION_CHARS = 150.0  # $0.15 per 1K = $150 per 1M
+# --- model ---
+# eleven_turbo_v2_5: best quality/latency tradeoff for non-realtime coaching
+# upgrade to eleven_multilingual_v2 for highest quality (slower)
+_DEFAULT_MODEL = "eleven_turbo_v2_5"
+class ElevenLabsEngine(TTSEngine):
+    """
+    TTS engine that synthesizes speech through the ElevenLabs SDK client.
+    Reads ELEVENLABS_API_KEY from the environment at construction time and
+    writes MP3 audio (output_format "mp3_44100_128") for each request.
+    """
+    name = "ElevenLabs (Turbo)"
+    engine_type = "neural-cloud-paid"
+    cost_per_million_chars = _COST_PER_MILLION_CHARS
+    is_production_ready = True
+    requires_internet = True
+    # Per-band voice and VoiceSettings values.
+    # Voice IDs come from the ElevenLabs shared voice library; swap the
+    # voice_id values after listening to pick different voices.
+    BAND_CONFIG = {
+        # Jessica — playful, bright, warm
+        "K-2": {
+            "voice_id": "cgSgspJ2msm6clMCkdW9",
+            "voice_name": "Jessica",
+            "stability": 0.75,
+            "similarity_boost": 0.75,
+            "speed": 0.85,
+        },
+        # Matilda — knowledgeable, professional
+        "3-5": {
+            "voice_id": "XrExE9yKIg1WjnnlVkGX",
+            "voice_name": "Matilda",
+            "stability": 0.70,
+            "similarity_boost": 0.75,
+            "speed": 0.95,
+        },
+        # Sarah — mature, reassuring
+        "6-8": {
+            "voice_id": "EXAVITQu4vr4xnSDxMaL",
+            "voice_name": "Sarah",
+            "stability": 0.65,
+            "similarity_boost": 0.75,
+            "speed": 1.00,
+        },
+        # Brian — deep, resonant, comforting
+        "9-12": {
+            "voice_id": "nPczCjzI2devNBz1zQrb",
+            "voice_name": "Brian",
+            "stability": 0.60,
+            "similarity_boost": 0.80,
+            "speed": 1.10,
+        },
+    }
+    def __init__(self):
+        """
+        Build the ElevenLabs client from the ELEVENLABS_API_KEY env variable.
+        Raises:
+            ValueError: if the variable is unset or empty, so the failure
+                surfaces at construction rather than on the first request.
+        """
+        key = os.getenv("ELEVENLABS_API_KEY")
+        if key:
+            self._client = ElevenLabs(api_key=key)
+            return
+        raise ValueError(
+            "ELEVENLABS_API_KEY not set. "
+            "Add it to app/.env — see .env.example."
+        )
+    def synthesize(self, text: str, band: str, output_path: str) -> dict:
+        """
+        Synthesize text with the model named by _DEFAULT_MODEL and save it as MP3.
+        Args:
+            text:        coaching text to synthesize
+            band:        grade band — "K-2", "3-5", "6-8", "9-12"
+            output_path: path without extension — ".mp3" is appended
+        Returns:
+            dict with keys audio_path, latency_seconds, voice, speed, engine.
+            latency_seconds covers the API call plus writing the file.
+        """
+        band_cfg = self.get_band_config(band)
+        mp3_path = output_path + ".mp3"
+        settings = VoiceSettings(
+            stability=band_cfg["stability"],
+            similarity_boost=band_cfg["similarity_boost"],
+            speed=band_cfg["speed"],
+        )
+        # the clock starts before the request and stops only after the last
+        # chunk is on disk, because the chunks are consumed while writing
+        t0 = time.time()
+        # text_to_speech.convert returns a generator of audio chunks
+        stream = self._client.text_to_speech.convert(
+            text=text,
+            voice_id=band_cfg["voice_id"],
+            model_id=_DEFAULT_MODEL,
+            output_format="mp3_44100_128",
+            voice_settings=settings,
+        )
+        # make sure the destination directory exists, then drain the stream
+        Path(mp3_path).parent.mkdir(parents=True, exist_ok=True)
+        with open(mp3_path, "wb") as sink:
+            sink.writelines(stream)
+        elapsed = round(time.time() - t0, 3)
+        return {
+            "audio_path": mp3_path,
+            "latency_seconds": elapsed,
+            "voice": band_cfg["voice_name"],
+            "speed": band_cfg["speed"],
+            "engine": self.name,
+        }

app/engines/piper_engine.py CHANGED Viewed

@@ -1,32 +1,76 @@
 # app/engines/piper_engine.py
 # Piper TTS engine — fast ONNX-based neural TTS, fully offline.
-# Voices are small (~63MB) ONNX models downloaded separately.
 # Designed for low-latency, low-resource deployment (runs on Raspberry Pi).
 # Faster than Kokoro on CPU, lower naturalness ceiling.
-# Good fallback: offline, no API key, minimal VRAM.
 import wave
 import time
 from pathlib import Path
 from piper import PiperVoice
 from engines.base import TTSEngine
-# voice files live in voices/piper/ relative to project root
 _VOICES_DIR = Path(__file__).parent.parent.parent / "voices" / "piper"
 # cache loaded voices — loading ONNX takes ~0.5s, reuse across calls
 _voice_cache: dict[str, PiperVoice] = {}
 def _get_voice(voice_file: str) -> PiperVoice:
     if voice_file not in _voice_cache:
         model_path = _VOICES_DIR / voice_file
-        if not model_path.exists():
-            raise FileNotFoundError(
-                f"Piper voice not found: {model_path}\n"
-                f"Download it first — see voices/piper/ directory."
-            )
         _voice_cache[voice_file] = PiperVoice.load(
             str(model_path),
             use_cuda=False,  # ONNX CUDA provider requires separate install
@@ -40,12 +84,12 @@ class PiperEngine(TTSEngine):
     engine_type = "neural-local"
     cost_per_million_chars = 0.0
     is_production_ready = False  # lower naturalness than Kokoro, no band-tuned voices yet
-    requires_internet = False
     BAND_CONFIG = {
-        "K-2":  {"voice_file": "en_US-amy-medium.onnx",   "speed": 0.9},
-        "3-5":  {"voice_file": "en_US-amy-medium.onnx",   "speed": 1.0},
-        "6-8":  {"voice_file": "en_US-amy-medium.onnx",   "speed": 1.0},
         "9-12": {"voice_file": "en_US-lessac-medium.onnx", "speed": 1.1},
     }

 # app/engines/piper_engine.py
 # Piper TTS engine — fast ONNX-based neural TTS, fully offline.
+# Voices are downloaded on demand from rhasspy/piper-voices on HF Hub
+# and cached flat in voices/piper/ for subsequent runs.
 # Designed for low-latency, low-resource deployment (runs on Raspberry Pi).
 # Faster than Kokoro on CPU, lower naturalness ceiling.
+# Good fallback: offline after first download, no API key, minimal VRAM.
 import wave
 import time
+import shutil
 from pathlib import Path
 from piper import PiperVoice
+from huggingface_hub import hf_hub_download
 from engines.base import TTSEngine
+# voice files live flat in voices/piper/ relative to project root
 _VOICES_DIR = Path(__file__).parent.parent.parent / "voices" / "piper"
 # cache loaded voices — loading ONNX takes ~0.5s, reuse across calls
 _voice_cache: dict[str, PiperVoice] = {}
+def _ensure_model_downloaded(voice_file: str) -> None:
+    """
+    Make sure a Piper voice model and its JSON config exist in voices/piper/.
+    Each file is checked separately, so a partial download (one file
+    present, the other missing) is recovered by fetching only what is missing.
+    Missing files are fetched from rhasspy/piper-voices on HF Hub and then
+    moved to the flat voices/piper/ location.
+    The file name is parsed only when something actually has to be
+    downloaded, so a voice that is already present locally works even if
+    its name does not follow the Hub naming scheme.
+    Args:
+        voice_file: model file name, e.g. "en_US-amy-medium.onnx"
+    Raises:
+        ValueError: if a download is needed but voice_file does not look
+            like "<lang>-<speaker>-<quality>.onnx"
+    """
+    _VOICES_DIR.mkdir(parents=True, exist_ok=True)
+    # model first, then config — same order as the files are needed
+    missing = [
+        file_name
+        for file_name in (voice_file, f"{voice_file}.json")
+        if not (_VOICES_DIR / file_name).exists()
+    ]
+    if not missing:
+        return
+    # parse voice file name into HF Hub repo subfolder structure
+    # e.g. en_US-amy-medium.onnx -> en/en_US/amy/medium/
+    parts = voice_file.removesuffix(".onnx").split("-")
+    if len(parts) < 3:
+        raise ValueError(
+            f"Cannot derive HF Hub path from Piper voice file {voice_file!r}; "
+            "expected a name like 'en_US-amy-medium.onnx'."
+        )
+    lang_full, speaker, quality = parts[:3]     # "en_US", "amy", "medium"
+    lang_family = lang_full.split("_")[0]       # "en"
+    repo_subfolder = f"{lang_family}/{lang_full}/{speaker}/{quality}"
+    for file_name in missing:
+        target_path = _VOICES_DIR / file_name
+        print(f"[Piper] Downloading {file_name} from HF Hub...")
+        # NOTE(review): local_dir_use_symlinks appears to be deprecated in
+        # newer huggingface_hub releases — confirm against the pinned
+        # version before removing it; it is kept here to preserve behavior.
+        downloaded = hf_hub_download(
+            repo_id="rhasspy/piper-voices",
+            filename=f"{repo_subfolder}/{file_name}",
+            local_dir=str(_VOICES_DIR),
+            local_dir_use_symlinks=False,
+        )
+        # flatten: move the downloaded file directly into voices/piper/
+        shutil.move(downloaded, target_path)
+        print(f"[Piper] Saved to {target_path}")
 def _get_voice(voice_file: str) -> PiperVoice:
+    """Returns a cached PiperVoice, downloading the model first if needed."""
     if voice_file not in _voice_cache:
+        _ensure_model_downloaded(voice_file)
         model_path = _VOICES_DIR / voice_file
         _voice_cache[voice_file] = PiperVoice.load(
             str(model_path),
             use_cuda=False,  # ONNX CUDA provider requires separate install
     engine_type = "neural-local"
     cost_per_million_chars = 0.0
     is_production_ready = False  # lower naturalness than Kokoro, no band-tuned voices yet
+    requires_internet = False    # only on first run; fully offline after download
     BAND_CONFIG = {
+        "K-2":  {"voice_file": "en_US-amy-medium.onnx",    "speed": 0.9},
+        "3-5":  {"voice_file": "en_US-amy-medium.onnx",    "speed": 1.0},
+        "6-8":  {"voice_file": "en_US-amy-medium.onnx",    "speed": 1.0},
         "9-12": {"voice_file": "en_US-lessac-medium.onnx", "speed": 1.1},
     }

app/engines/voxtral_engine.py ADDED Viewed

	@@ -0,0 +1,69 @@

+# app/engines/voxtral_engine.py
+# Mistral Voxtral TTS via Mistral La Plateforme API.
+# Requires MISTRAL_API_KEY environment variable.
+# Pricing: $16.00 per 1M characters (Mistral, 2026)
+# Voices: paul (US), oliver (UK), marie (FR), nick (ES), jane_confident
+import os
+import time
+import requests
+from engines.base import TTSEngine
+MISTRAL_ENDPOINT = "https://api.mistral.ai/v1/audio/speech"
+class VoxtralEngine(TTSEngine):
+    """
+    TTS engine that posts text to the Mistral speech endpoint
+    (MISTRAL_ENDPOINT) and stores the returned audio as a .wav file.
+    Requires the MISTRAL_API_KEY environment variable at synthesis time.
+    """
+    name = "Voxtral (Mistral AI)"
+    engine_type = "neural-cloud-paid"
+    cost_per_million_chars = 16.0  # $16.00 USD
+    is_production_ready = True
+    requires_internet = True
+    # Maps each grade band to one of Mistral's preset voices
+    BAND_CONFIG = {
+        "K-2":  {"voice_id": "paul"},            # Friendly US male
+        "3-5":  {"voice_id": "oliver"},          # Energetic UK male
+        "6-8":  {"voice_id": "jane_confident"},  # Clear, professional female
+        "9-12": {"voice_id": "paul"},            # Mature US male
+    }
+    def synthesize(self, text: str, band: str, output_path: str) -> dict:
+        """
+        Synthesize text through the Mistral speech API and save it as WAV.
+        Args:
+            text:        text to synthesize
+            band:        grade band — "K-2", "3-5", "6-8", "9-12"
+            output_path: path without extension — ".wav" is appended
+        Returns:
+            dict with keys audio_path, latency_seconds, voice, speed, engine,
+            actual_cost_usd. latency_seconds covers the HTTP request only,
+            not the file write.
+        Raises:
+            ValueError: if MISTRAL_API_KEY is not set
+            requests.HTTPError: if the API responds with an error status
+        """
+        api_key = os.environ.get("MISTRAL_API_KEY")
+        if not api_key:
+            raise ValueError("MISTRAL_API_KEY not set in environment.")
+        config = self.get_band_config(band)
+        voice_id = config["voice_id"]
+        full_path = output_path + ".wav"
+        # create the destination directory BEFORE the paid request, so a
+        # missing folder cannot fail the write after the call has been billed
+        os.makedirs(os.path.dirname(full_path) or ".", exist_ok=True)
+        payload = {
+            "model": "voxtral-mini-tts-2603",
+            "input": text,
+            "voice": voice_id,
+            "format": "wav"
+        }
+        headers = {
+            "Authorization": f"Bearer {api_key}",
+            "Content-Type": "application/json",
+        }
+        start = time.time()
+        response = requests.post(MISTRAL_ENDPOINT, json=payload, headers=headers, timeout=60)
+        response.raise_for_status()
+        latency = round(time.time() - start, 3)
+        with open(full_path, "wb") as f:
+            f.write(response.content)
+        return {
+            "audio_path": full_path,
+            "latency_seconds": latency,
+            "voice": voice_id,
+            "speed": 1.0,  # no speed setting is sent in the payload
+            "engine": self.name,
+            "actual_cost_usd": self.estimate_cost(text),
+        }
+    def estimate_cost(self, text: str) -> float:
+        """
+        Estimate the USD cost of synthesizing text from its character count.
+        Args:
+            text: text that would be sent to the API
+        Returns:
+            len(text) / 1,000,000 * cost_per_million_chars, rounded to 6 places
+        """
+        char_count = len(text)
+        return round((char_count / 1_000_000) * self.cost_per_million_chars, 6)

app/evaluator.py CHANGED Viewed

@@ -72,24 +72,17 @@ def compute_wer(reference_text: str, audio_path: str) -> float:
 def compute_utmos(audio_path: str) -> float:
     """
     Predict MOS score using UTMOS (automated naturalness rating 1-5).
     Args:
-        audio_path: path to synthesized audio file (.wav only)
     Returns:
         predicted MOS score (float, higher = more natural)
     """
     model = _get_utmos()
-    # convert mp3 to wav if needed
-    if audio_path.endswith(".mp3"):
-        audio, sr = librosa.load(audio_path, sr=16000)
-    else:
-        audio, sr = sf.read(audio_path)
-        if audio.ndim > 1:
-            audio = audio.mean(axis=1)
-        if sr != 16000:
-            audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
     wav_tensor = torch.FloatTensor(audio).unsqueeze(0)
     with torch.no_grad():
@@ -97,11 +90,11 @@ def compute_utmos(audio_path: str) -> float:
     return round(float(score), 3)
 def compute_rtf(latency_seconds: float, audio_path: str) -> float:
     """
     Compute Real Time Factor: synthesis_time / audio_duration.
     RTF < 1.0 means faster than real time.
     Args:
         latency_seconds: wall-clock synthesis time from engine
@@ -110,7 +103,11 @@ def compute_rtf(latency_seconds: float, audio_path: str) -> float:
     Returns:
         RTF as float
     """
-    audio, sr = sf.read(audio_path)
     audio_duration = len(audio) / sr
     if audio_duration == 0:
         return 0.0

 def compute_utmos(audio_path: str) -> float:
     """
     Predict MOS score using UTMOS (automated naturalness rating 1-5).
+    Uses librosa for all formats (WAV + MP3) to avoid soundfile
+    subprocess issues in Gradio's hot-reload worker.
     Args:
+        audio_path: path to synthesized audio file
     Returns:
         predicted MOS score (float, higher = more natural)
     """
     model = _get_utmos()
+    audio, sr = librosa.load(audio_path, sr=16000, mono=True)
     wav_tensor = torch.FloatTensor(audio).unsqueeze(0)
     with torch.no_grad():
     return round(float(score), 3)
 def compute_rtf(latency_seconds: float, audio_path: str) -> float:
     """
     Compute Real Time Factor: synthesis_time / audio_duration.
     RTF < 1.0 means faster than real time.
+    Uses librosa for MP3 (sf.read may fail on MP3 depending on libsndfile version).
     Args:
         latency_seconds: wall-clock synthesis time from engine
     Returns:
         RTF as float
     """
+    if audio_path.endswith(".mp3"):
+        audio, sr = librosa.load(audio_path, sr=None)
+    else:
+        audio, sr = sf.read(audio_path)
     audio_duration = len(audio) / sr
     if audio_duration == 0:
         return 0.0