Spaces: Sleeping
Commit · bf9e8b9
1 Parent(s): 15307bb
Switch back to Supertonic 2 TTS for CPU compatibility
Browse files
- app/ora_server.py +11 -14
app/ora_server.py
CHANGED
|
@@ -240,7 +240,7 @@ async def detect_emotion(req: EmotionRequest):
|
|
| 240 |
|
| 241 |
|
| 242 |
|
| 243 |
-
# TTS endpoint using
|
| 244 |
tts_model = None
|
| 245 |
tts_processor = None
|
| 246 |
|
|
@@ -248,16 +248,17 @@ tts_processor = None
|
|
| 248 |
async def load_tts():
|
| 249 |
global tts_model, tts_processor
|
| 250 |
try:
|
| 251 |
-
print("Loading
|
| 252 |
-
from transformers import AutoProcessor,
|
| 253 |
|
| 254 |
-
|
| 255 |
-
|
|
|
|
| 256 |
|
| 257 |
if device == "cuda":
|
| 258 |
tts_model = tts_model.to("cuda")
|
| 259 |
|
| 260 |
-
print("✓
|
| 261 |
except Exception as e:
|
| 262 |
print(f"Could not load TTS model: {e}")
|
| 263 |
print("Voice will fall back to browser TTS.")
|
|
@@ -273,24 +274,20 @@ async def text_to_speech(req: TTSRequest):
|
|
| 273 |
raise HTTPException(status_code=503, detail="TTS model not loaded, use browser fallback")
|
| 274 |
|
| 275 |
try:
|
| 276 |
-
#
|
| 277 |
-
inputs = tts_processor(
|
| 278 |
-
text=req.text,
|
| 279 |
-
voice_preset="v2/en_speaker_6", # Warm, natural female voice
|
| 280 |
-
return_tensors="pt"
|
| 281 |
-
)
|
| 282 |
|
| 283 |
if device == "cuda":
|
| 284 |
inputs = {k: v.to("cuda") for k, v in inputs.items()}
|
| 285 |
|
| 286 |
with torch.no_grad():
|
| 287 |
-
|
| 288 |
|
| 289 |
# Convert to WAV format
|
| 290 |
import io
|
| 291 |
import wave
|
| 292 |
|
| 293 |
-
audio_np =
|
| 294 |
|
| 295 |
# Normalize to 16-bit PCM
|
| 296 |
audio_np = (audio_np * 32767).astype('int16')
|
|
|
|
| 240 |
|
| 241 |
|
| 242 |
|
| 243 |
+
# TTS endpoint using Supertonic 2 (CPU-friendly)
|
| 244 |
tts_model = None
|
| 245 |
tts_processor = None
|
| 246 |
|
|
|
|
| 248 |
async def load_tts():
|
| 249 |
global tts_model, tts_processor
|
| 250 |
try:
|
| 251 |
+
print("Loading Supertonic 2 TTS...")
|
| 252 |
+
from transformers import AutoProcessor, AutoModelForTextToWaveform
|
| 253 |
|
| 254 |
+
model_id = "Supertone/supertonic-2"
|
| 255 |
+
tts_processor = AutoProcessor.from_pretrained(model_id)
|
| 256 |
+
tts_model = AutoModelForTextToWaveform.from_pretrained(model_id)
|
| 257 |
|
| 258 |
if device == "cuda":
|
| 259 |
tts_model = tts_model.to("cuda")
|
| 260 |
|
| 261 |
+
print("✓ Supertonic 2 TTS loaded successfully!")
|
| 262 |
except Exception as e:
|
| 263 |
print(f"Could not load TTS model: {e}")
|
| 264 |
print("Voice will fall back to browser TTS.")
|
|
|
|
| 274 |
raise HTTPException(status_code=503, detail="TTS model not loaded, use browser fallback")
|
| 275 |
|
| 276 |
try:
|
| 277 |
+
# Process text with Supertonic 2
|
| 278 |
+
inputs = tts_processor(text=req.text, return_tensors="pt", sampling_rate=24000)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 279 |
|
| 280 |
if device == "cuda":
|
| 281 |
inputs = {k: v.to("cuda") for k, v in inputs.items()}
|
| 282 |
|
| 283 |
with torch.no_grad():
|
| 284 |
+
audio_values = tts_model.generate(**inputs)
|
| 285 |
|
| 286 |
# Convert to WAV format
|
| 287 |
import io
|
| 288 |
import wave
|
| 289 |
|
| 290 |
+
audio_np = audio_values.cpu().numpy().squeeze()
|
| 291 |
|
| 292 |
# Normalize to 16-bit PCM
|
| 293 |
audio_np = (audio_np * 32767).astype('int16')
|