aankitdas committed on
Commit
24a256c
·
1 Parent(s): c1f5502

upgraded for hf space

Browse files
app/app.py CHANGED
@@ -18,12 +18,12 @@ import pandas as pd
18
  import gradio as gr
19
 
20
  sys.path.insert(0, os.path.dirname(__file__))
21
-
 
22
  from engines import ENGINES, ENGINE_MAP
23
  from engines.kokoro_engine import KOKORO_VOICES, KOKORO_DEFAULT_VOICE
24
  from evaluator import evaluate
25
- from dotenv import load_dotenv
26
- load_dotenv(os.path.join(os.path.dirname(__file__), ".env"))
27
  # ── constants ─────────────────────────────────────────────────────────────────
28
 
29
  BANDS = ["K-2", "3-5", "6-8", "9-12"]
@@ -177,7 +177,7 @@ def build_business_chart(results: list[dict]):
177
  color = color_map.get(engine_type, "#bdc3c7")
178
 
179
  # bubble size: min size 15, scale with cost
180
- size = max(15, cost * 5000 + 15)
181
 
182
  hover = (
183
  f"<b>{engine_name}</b><br>"
 
18
  import gradio as gr
19
 
20
  sys.path.insert(0, os.path.dirname(__file__))
21
+ from dotenv import load_dotenv
22
+ load_dotenv(os.path.join(os.path.dirname(__file__), ".env"))
23
  from engines import ENGINES, ENGINE_MAP
24
  from engines.kokoro_engine import KOKORO_VOICES, KOKORO_DEFAULT_VOICE
25
  from evaluator import evaluate
26
+
 
27
  # ── constants ─────────────────────────────────────────────────────────────────
28
 
29
  BANDS = ["K-2", "3-5", "6-8", "9-12"]
 
177
  color = color_map.get(engine_type, "#bdc3c7")
178
 
179
  # bubble size: min size 15, scale with cost
180
+ size = 20 #max(15, cost * 5000 + 15)
181
 
182
  hover = (
183
  f"<b>{engine_name}</b><br>"
app/engines/__init__.py CHANGED
@@ -4,25 +4,27 @@
4
  # import it here, and add it to ENGINES list.
5
 
6
  from engines.kokoro_engine import KokoroEngine
7
- from engines.edge_tts_engine import EdgeTTSEngine
8
- from engines.pyttsx3_engine import Pyttsx3Engine
9
- from engines.chirp_engine import ChirpEngine
10
  from engines.parler_engine import ParlerEngine
11
  from engines.piper_engine import PiperEngine
12
  from engines.chatterbox_runpod_engine import ChatterboxRunpodEngine
13
-
14
-
 
15
 
16
 
17
  # ordered list — determines dropdown order in UI
18
  # add new engines here when ready
19
  ENGINES = [
20
  KokoroEngine(),
21
- EdgeTTSEngine(),
22
- Pyttsx3Engine(),
23
  ParlerEngine(),
24
  PiperEngine(),
25
  ChatterboxRunpodEngine(),
 
 
26
  # ChirpEngine(), # uncomment when API key is available
27
  ]
28
 
 
4
  # import it here, and add it to ENGINES list.
5
 
6
  from engines.kokoro_engine import KokoroEngine
7
+ # from engines.edge_tts_engine import EdgeTTSEngine
8
+ # from engines.pyttsx3_engine import Pyttsx3Engine
 
9
  from engines.parler_engine import ParlerEngine
10
  from engines.piper_engine import PiperEngine
11
  from engines.chatterbox_runpod_engine import ChatterboxRunpodEngine
12
+ # from engines.voxtral_engine import VoxtralEngine
13
+ # from engines.chirp_engine import ChirpEngine
14
+ from engines.elevenlabs_engine import ElevenLabsEngine
15
 
16
 
17
  # ordered list — determines dropdown order in UI
18
  # add new engines here when ready
19
  ENGINES = [
20
  KokoroEngine(),
21
+ # EdgeTTSEngine(),
22
+ # Pyttsx3Engine(),
23
  ParlerEngine(),
24
  PiperEngine(),
25
  ChatterboxRunpodEngine(),
26
+ ElevenLabsEngine(),
27
+ # VoxtralEngine(),
28
  # ChirpEngine(), # uncomment when API key is available
29
  ]
30
 
app/engines/elevenlabs_engine.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app/engines/elevenlabs_engine.py
2
+ # ElevenLabs TTS engine — neural cloud, high naturalness ceiling.
3
+ # Uses the official elevenlabs Python SDK.
4
+ # Free tier: 10,000 chars/month, MP3 output only, no commercial license.
5
+ # Paid tier: WAV output available, commercial license included.
6
+ # Classification: neural-cloud-paid (free tier is eval-only, not production).
7
+
8
+ import time
9
+ import os
10
+ from pathlib import Path
11
+ from elevenlabs.client import ElevenLabs
12
+ from elevenlabs import VoiceSettings
13
+
14
+ from engines.base import TTSEngine
15
+
16
+ # --- pricing ---
17
+ # eleven_turbo_v2_5: $0.15 per 1K chars on Creator tier
18
+ # using Flash rate as conservative estimate for free tier evaluation
19
+ _COST_PER_MILLION_CHARS = 150.0 # $0.15 per 1K = $150 per 1M
20
+
21
+ # --- model ---
22
+ # eleven_turbo_v2_5: best quality/latency tradeoff for non-realtime coaching
23
+ # upgrade to eleven_multilingual_v2 for highest quality (slower)
24
+ _DEFAULT_MODEL = "eleven_turbo_v2_5"
25
+
26
+
27
class ElevenLabsEngine(TTSEngine):
    """
    ElevenLabs cloud TTS engine (Turbo v2.5) with one voice per grade band.

    Audio is requested as MP3 and written to disk; see the module header for
    the free-tier / paid-tier notes that motivate the MP3 output format.
    """

    name = "ElevenLabs (Turbo)"
    engine_type = "neural-cloud-paid"
    cost_per_million_chars = _COST_PER_MILLION_CHARS
    is_production_ready = True
    requires_internet = True

    # Voice IDs from ElevenLabs shared voice library (free tier accessible)
    # swap voice_id values after listening — IDs are stable across accounts
    BAND_CONFIG = {
        "K-2": {
            "voice_id": "cgSgspJ2msm6clMCkdW9",  # Jessica — playful, bright, warm
            "voice_name": "Jessica",
            "stability": 0.75,
            "similarity_boost": 0.75,
            "speed": 0.85,
        },
        "3-5": {
            "voice_id": "XrExE9yKIg1WjnnlVkGX",  # Matilda — knowledgeable, professional
            "voice_name": "Matilda",
            "stability": 0.70,
            "similarity_boost": 0.75,
            "speed": 0.95,
        },
        "6-8": {
            "voice_id": "EXAVITQu4vr4xnSDxMaL",  # Sarah — mature, reassuring
            "voice_name": "Sarah",
            "stability": 0.65,
            "similarity_boost": 0.75,
            "speed": 1.00,
        },
        "9-12": {
            "voice_id": "nPczCjzI2devNBz1zQrb",  # Brian — deep, resonant, comforting
            "voice_name": "Brian",
            "stability": 0.60,
            "similarity_boost": 0.80,
            "speed": 1.10,
        },
    }

    def __init__(self):
        """
        Initializes the ElevenLabs client using ELEVENLABS_API_KEY from env.

        Raises:
            ValueError: if ELEVENLABS_API_KEY is unset or empty. Raised here,
                at construction time, so the misconfiguration is reported
                before any synthesis is attempted.
        """
        api_key = os.getenv("ELEVENLABS_API_KEY")
        if not api_key:
            raise ValueError(
                "ELEVENLABS_API_KEY not set. "
                "Add it to app/.env — see .env.example."
            )
        self._client = ElevenLabs(api_key=api_key)

    def synthesize(self, text: str, band: str, output_path: str) -> dict:
        """
        Synthesize text using ElevenLabs Turbo v2.5 and save it as MP3.

        The audio stream is fully received before the output file is created,
        so a failed or interrupted request never leaves an empty or truncated
        .mp3 behind at the returned path.

        Args:
            text: coaching text to synthesize
            band: grade band — "K-2", "3-5", "6-8", "9-12"
            output_path: path without extension — .mp3 will be appended

        Returns:
            standard TTSEngine dict: audio_path, latency_seconds, voice, speed, engine
        """
        config = self.get_band_config(band)
        full_path = output_path + ".mp3"

        voice_settings = VoiceSettings(
            stability=config["stability"],
            similarity_boost=config["similarity_boost"],
            speed=config["speed"],
        )

        # perf_counter is monotonic; time.time() can jump if the wall clock
        # is adjusted mid-request and would then report a bogus latency
        start = time.perf_counter()

        # text_to_speech.convert returns a generator of audio chunks; the
        # chunks are only produced while the generator is being consumed
        audio_chunks = self._client.text_to_speech.convert(
            text=text,
            voice_id=config["voice_id"],
            model_id=_DEFAULT_MODEL,
            output_format="mp3_44100_128",  # highest quality available on free tier
            voice_settings=voice_settings,
        )

        # drain the stream BEFORE touching the disk: any API/network error
        # surfaces here, while no partial file exists yet
        audio_bytes = b"".join(audio_chunks)

        target = Path(full_path)
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_bytes(audio_bytes)

        # latency covers request + streaming + file write, as before
        latency = round(time.perf_counter() - start, 3)

        return {
            "audio_path": full_path,
            "latency_seconds": latency,
            "voice": config["voice_name"],
            "speed": config["speed"],
            "engine": self.name,
        }
app/engines/piper_engine.py CHANGED
@@ -1,32 +1,76 @@
1
  # app/engines/piper_engine.py
2
  # Piper TTS engine — fast ONNX-based neural TTS, fully offline.
3
- # Voices are small (~63MB) ONNX models downloaded separately.
 
4
  # Designed for low-latency, low-resource deployment (runs on Raspberry Pi).
5
  # Faster than Kokoro on CPU, lower naturalness ceiling.
6
- # Good fallback: offline, no API key, minimal VRAM.
7
 
8
  import wave
9
  import time
 
10
  from pathlib import Path
11
  from piper import PiperVoice
 
12
 
13
  from engines.base import TTSEngine
14
 
15
- # voice files live in voices/piper/ relative to project root
16
  _VOICES_DIR = Path(__file__).parent.parent.parent / "voices" / "piper"
17
 
18
  # cache loaded voices — loading ONNX takes ~0.5s, reuse across calls
19
  _voice_cache: dict[str, PiperVoice] = {}
20
 
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  def _get_voice(voice_file: str) -> PiperVoice:
 
23
  if voice_file not in _voice_cache:
 
24
  model_path = _VOICES_DIR / voice_file
25
- if not model_path.exists():
26
- raise FileNotFoundError(
27
- f"Piper voice not found: {model_path}\n"
28
- f"Download it first — see voices/piper/ directory."
29
- )
30
  _voice_cache[voice_file] = PiperVoice.load(
31
  str(model_path),
32
  use_cuda=False, # ONNX CUDA provider requires separate install
@@ -40,12 +84,12 @@ class PiperEngine(TTSEngine):
40
  engine_type = "neural-local"
41
  cost_per_million_chars = 0.0
42
  is_production_ready = False # lower naturalness than Kokoro, no band-tuned voices yet
43
- requires_internet = False
44
 
45
  BAND_CONFIG = {
46
- "K-2": {"voice_file": "en_US-amy-medium.onnx", "speed": 0.9},
47
- "3-5": {"voice_file": "en_US-amy-medium.onnx", "speed": 1.0},
48
- "6-8": {"voice_file": "en_US-amy-medium.onnx", "speed": 1.0},
49
  "9-12": {"voice_file": "en_US-lessac-medium.onnx", "speed": 1.1},
50
  }
51
 
 
1
  # app/engines/piper_engine.py
2
  # Piper TTS engine — fast ONNX-based neural TTS, fully offline.
3
+ # Voices are downloaded on demand from rhasspy/piper-voices on HF Hub
4
+ # and cached flat in voices/piper/ for subsequent runs.
5
  # Designed for low-latency, low-resource deployment (runs on Raspberry Pi).
6
  # Faster than Kokoro on CPU, lower naturalness ceiling.
7
+ # Good fallback: offline after first download, no API key, minimal VRAM.
8
 
9
  import wave
10
  import time
11
+ import shutil
12
  from pathlib import Path
13
  from piper import PiperVoice
14
+ from huggingface_hub import hf_hub_download
15
 
16
  from engines.base import TTSEngine
17
 
18
+ # voice files live flat in voices/piper/ relative to project root
19
  _VOICES_DIR = Path(__file__).parent.parent.parent / "voices" / "piper"
20
 
21
  # cache loaded voices — loading ONNX takes ~0.5s, reuse across calls
22
  _voice_cache: dict[str, PiperVoice] = {}
23
 
24
 
25
def _ensure_model_downloaded(voice_file: str) -> None:
    """
    Ensure a Piper voice model and its JSON config exist flat in voices/piper/.

    Any file that is missing is downloaded from rhasspy/piper-voices on HF Hub
    and moved to the flat location. The .onnx model and its .onnx.json config
    are checked independently so a partial download can be recovered.

    Args:
        voice_file: model file name, e.g. "en_US-amy-medium.onnx"

    Raises:
        ValueError: if a download is required but voice_file does not follow
            the "<lang>_<REGION>-<speaker>-<quality>.onnx" naming scheme, so
            no HF Hub path can be derived from it.
    """
    _VOICES_DIR.mkdir(parents=True, exist_ok=True)

    # model first, then config — same order in which they are fetched
    missing = [
        file_name
        for file_name in (voice_file, f"{voice_file}.json")
        if not (_VOICES_DIR / file_name).exists()
    ]
    if not missing:
        # both files are already on disk: nothing to parse, nothing to fetch
        return

    # parse voice file name into HF Hub repo subfolder structure
    # e.g. en_US-amy-medium.onnx -> en/en_US/amy/medium
    # (done only when a download is needed, so voices that are already
    # present locally are never rejected because of their file name)
    parts = voice_file.removesuffix(".onnx").split("-")
    if len(parts) < 3 or not all(parts[:3]):
        raise ValueError(
            f"Cannot derive HF Hub path for Piper voice '{voice_file}': "
            "expected a name like 'en_US-amy-medium.onnx'."
        )
    lang_full, speaker, quality = parts[:3]  # "en_US", "amy", "medium"
    lang_family = lang_full.split("_")[0]    # "en"
    repo_subfolder = f"{lang_family}/{lang_full}/{speaker}/{quality}"

    for file_name in missing:
        target_path = _VOICES_DIR / file_name
        print(f"[Piper] Downloading {file_name} from HF Hub...")
        downloaded = hf_hub_download(
            repo_id="rhasspy/piper-voices",
            filename=f"{repo_subfolder}/{file_name}",
            local_dir=str(_VOICES_DIR),
            local_dir_use_symlinks=False,
        )
        # hf_hub_download mirrors the repo's nested folders under local_dir;
        # move the file up so it sits flat next to the other voices
        shutil.move(downloaded, target_path)
        print(f"[Piper] Saved to {target_path}")
67
+
68
+
69
  def _get_voice(voice_file: str) -> PiperVoice:
70
+ """Returns a cached PiperVoice, downloading the model first if needed."""
71
  if voice_file not in _voice_cache:
72
+ _ensure_model_downloaded(voice_file)
73
  model_path = _VOICES_DIR / voice_file
 
 
 
 
 
74
  _voice_cache[voice_file] = PiperVoice.load(
75
  str(model_path),
76
  use_cuda=False, # ONNX CUDA provider requires separate install
 
84
  engine_type = "neural-local"
85
  cost_per_million_chars = 0.0
86
  is_production_ready = False # lower naturalness than Kokoro, no band-tuned voices yet
87
+ requires_internet = False # only on first run; fully offline after download
88
 
89
  BAND_CONFIG = {
90
+ "K-2": {"voice_file": "en_US-amy-medium.onnx", "speed": 0.9},
91
+ "3-5": {"voice_file": "en_US-amy-medium.onnx", "speed": 1.0},
92
+ "6-8": {"voice_file": "en_US-amy-medium.onnx", "speed": 1.0},
93
  "9-12": {"voice_file": "en_US-lessac-medium.onnx", "speed": 1.1},
94
  }
95
 
app/engines/voxtral_engine.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app/engines/voxtral_engine.py
2
+ # Mistral Voxtral TTS via Mistral La Plateforme API.
3
+ # Requires MISTRAL_API_KEY environment variable.
4
+ # Pricing: $16.00 per 1M characters (Mistral, 2026)
5
+ # Voices: paul (US), oliver (UK), marie (FR), nick (ES), jane_confident
6
+
7
+ import os
8
+ import time
9
+ import requests
10
+ from engines.base import TTSEngine
11
+
12
+ MISTRAL_ENDPOINT = "https://api.mistral.ai/v1/audio/speech"
13
+
14
class VoxtralEngine(TTSEngine):
    """
    Mistral Voxtral TTS engine, called over HTTPS at MISTRAL_ENDPOINT.

    The API key is read from MISTRAL_API_KEY on every synthesize() call, so
    the engine can be constructed without a key being configured.
    """

    name = "Voxtral (Mistral AI)"
    engine_type = "neural-cloud-paid"
    cost_per_million_chars = 16.0  # $16.00 USD
    is_production_ready = True
    requires_internet = True

    # Mapping Mistral's 2026 preset voices to your pedagogical bands
    BAND_CONFIG = {
        "K-2": {"voice_id": "paul"},             # Friendly US male
        "3-5": {"voice_id": "oliver"},           # Energetic UK male
        "6-8": {"voice_id": "jane_confident"},   # Clear, professional female
        "9-12": {"voice_id": "paul"},            # Mature US male
    }

    def synthesize(self, text: str, band: str, output_path: str) -> dict:
        """
        Synthesize text with Voxtral and save the response body as a .wav file.

        Args:
            text: text to synthesize
            band: grade band key used to look up the voice in BAND_CONFIG
            output_path: path without extension — .wav will be appended

        Returns:
            dict with audio_path, latency_seconds, voice, speed, engine and
            actual_cost_usd (an estimate derived from the character count)

        Raises:
            ValueError: if MISTRAL_API_KEY is unset or empty
            requests.HTTPError: if the API answers with an error status
        """
        api_key = os.environ.get("MISTRAL_API_KEY")
        if not api_key:
            raise ValueError("MISTRAL_API_KEY not set in environment.")

        config = self.get_band_config(band)
        voice_id = config["voice_id"]
        full_path = output_path + ".wav"

        payload = {
            "model": "voxtral-mini-tts-2603",
            "input": text,
            "voice": voice_id,
            "format": "wav",
        }
        headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        }

        # perf_counter is monotonic; time.time() can jump if the wall clock
        # is adjusted mid-request and would then report a bogus latency
        start = time.perf_counter()
        response = requests.post(MISTRAL_ENDPOINT, json=payload, headers=headers, timeout=60)
        response.raise_for_status()
        latency = round(time.perf_counter() - start, 3)  # request time only

        # create the output directory if needed so the write below cannot
        # fail with FileNotFoundError on a fresh results folder
        os.makedirs(os.path.dirname(full_path) or ".", exist_ok=True)

        # NOTE(review): assumes the response body is the raw audio bytes —
        # confirm against the Mistral API documentation
        with open(full_path, "wb") as f:
            f.write(response.content)

        return {
            "audio_path": full_path,
            "latency_seconds": latency,
            "voice": voice_id,
            "speed": 1.0,
            "engine": self.name,
            "actual_cost_usd": self.estimate_cost(text),
        }

    def estimate_cost(self, text: str) -> float:
        """Return the estimated USD cost of synthesizing text, rounded to 6 decimals."""
        # $16 per 1,000,000 characters
        char_count = len(text)
        return round((char_count / 1_000_000) * self.cost_per_million_chars, 6)
app/evaluator.py CHANGED
@@ -72,24 +72,17 @@ def compute_wer(reference_text: str, audio_path: str) -> float:
72
  def compute_utmos(audio_path: str) -> float:
73
  """
74
  Predict MOS score using UTMOS (automated naturalness rating 1-5).
 
 
75
 
76
  Args:
77
- audio_path: path to synthesized audio file (.wav only)
78
 
79
  Returns:
80
  predicted MOS score (float, higher = more natural)
81
  """
82
  model = _get_utmos()
83
- # convert mp3 to wav if needed
84
- if audio_path.endswith(".mp3"):
85
- audio, sr = librosa.load(audio_path, sr=16000)
86
- else:
87
- audio, sr = sf.read(audio_path)
88
- if audio.ndim > 1:
89
- audio = audio.mean(axis=1)
90
- if sr != 16000:
91
- audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
92
-
93
  wav_tensor = torch.FloatTensor(audio).unsqueeze(0)
94
 
95
  with torch.no_grad():
@@ -97,11 +90,11 @@ def compute_utmos(audio_path: str) -> float:
97
 
98
  return round(float(score), 3)
99
 
100
-
101
  def compute_rtf(latency_seconds: float, audio_path: str) -> float:
102
  """
103
  Compute Real Time Factor: synthesis_time / audio_duration.
104
  RTF < 1.0 means faster than real time.
 
105
 
106
  Args:
107
  latency_seconds: wall-clock synthesis time from engine
@@ -110,7 +103,11 @@ def compute_rtf(latency_seconds: float, audio_path: str) -> float:
110
  Returns:
111
  RTF as float
112
  """
113
- audio, sr = sf.read(audio_path)
 
 
 
 
114
  audio_duration = len(audio) / sr
115
  if audio_duration == 0:
116
  return 0.0
 
72
  def compute_utmos(audio_path: str) -> float:
73
  """
74
  Predict MOS score using UTMOS (automated naturalness rating 1-5).
75
+ Uses librosa for all formats (WAV + MP3) to avoid soundfile
76
+ subprocess issues in Gradio's hot-reload worker.
77
 
78
  Args:
79
+ audio_path: path to synthesized audio file
80
 
81
  Returns:
82
  predicted MOS score (float, higher = more natural)
83
  """
84
  model = _get_utmos()
85
+ audio, sr = librosa.load(audio_path, sr=16000, mono=True)
 
 
 
 
 
 
 
 
 
86
  wav_tensor = torch.FloatTensor(audio).unsqueeze(0)
87
 
88
  with torch.no_grad():
 
90
 
91
  return round(float(score), 3)
92
 
 
93
  def compute_rtf(latency_seconds: float, audio_path: str) -> float:
94
  """
95
  Compute Real Time Factor: synthesis_time / audio_duration.
96
  RTF < 1.0 means faster than real time.
97
+ Uses librosa for MP3 (sf.read may fail on MP3 depending on libsndfile version).
98
 
99
  Args:
100
  latency_seconds: wall-clock synthesis time from engine
 
103
  Returns:
104
  RTF as float
105
  """
106
+ if audio_path.endswith(".mp3"):
107
+ audio, sr = librosa.load(audio_path, sr=None)
108
+ else:
109
+ audio, sr = sf.read(audio_path)
110
+
111
  audio_duration = len(audio) / sr
112
  if audio_duration == 0:
113
  return 0.0