Spaces:

Abdalkaderdev
/

ORA

Paused

App Files Community

Abdalkaderdev committed on Jan 13

Commit

bf9e8b9

1 Parent(s): 15307bb

Switch back to Supertonic 2 TTS for CPU compatibility

Browse files

Files changed (1) hide show

app/ora_server.py +11 -14

app/ora_server.py CHANGED Viewed

@@ -240,7 +240,7 @@ async def detect_emotion(req: EmotionRequest):
-# TTS endpoint using Bark (Natural, Expressive Voice)
 tts_model = None
 tts_processor = None
@@ -248,16 +248,17 @@ tts_processor = None
 async def load_tts():
     global tts_model, tts_processor
     try:
-        print("Loading Bark TTS for natural voice...")
-        from transformers import AutoProcessor, BarkModel
-        tts_processor = AutoProcessor.from_pretrained("suno/bark-small")
-        tts_model = BarkModel.from_pretrained("suno/bark-small")
         if device == "cuda":
             tts_model = tts_model.to("cuda")
-        print("✓ Bark TTS loaded - Natural voice ready!")
     except Exception as e:
         print(f"Could not load TTS model: {e}")
         print("Voice will fall back to browser TTS.")
@@ -273,24 +274,20 @@ async def text_to_speech(req: TTSRequest):
         raise HTTPException(status_code=503, detail="TTS model not loaded, use browser fallback")
     try:
-        # Use Bark with natural voice preset
-        inputs = tts_processor(
-            text=req.text,
-            voice_preset="v2/en_speaker_6",  # Warm, natural female voice
-            return_tensors="pt"
-        )
         if device == "cuda":
             inputs = {k: v.to("cuda") for k, v in inputs.items()}
         with torch.no_grad():
-            audio_array = tts_model.generate(**inputs)
         # Convert to WAV format
         import io
         import wave
-        audio_np = audio_array.cpu().numpy().squeeze()
         # Normalize to 16-bit PCM
         audio_np = (audio_np * 32767).astype('int16')

+# TTS endpoint using Supertonic 2 (CPU-friendly)
 tts_model = None
 tts_processor = None
 async def load_tts():
     global tts_model, tts_processor
     try:
+        print("Loading Supertonic 2 TTS...")
+        from transformers import AutoProcessor, AutoModelForTextToWaveform
+        model_id = "Supertone/supertonic-2"
+        tts_processor = AutoProcessor.from_pretrained(model_id)
+        tts_model = AutoModelForTextToWaveform.from_pretrained(model_id)
         if device == "cuda":
             tts_model = tts_model.to("cuda")
+        print("✓ Supertonic 2 TTS loaded successfully!")
     except Exception as e:
         print(f"Could not load TTS model: {e}")
         print("Voice will fall back to browser TTS.")
         raise HTTPException(status_code=503, detail="TTS model not loaded, use browser fallback")
     try:
+        # Process text with Supertonic 2
+        inputs = tts_processor(text=req.text, return_tensors="pt", sampling_rate=24000)
         if device == "cuda":
             inputs = {k: v.to("cuda") for k, v in inputs.items()}
         with torch.no_grad():
+            audio_values = tts_model.generate(**inputs)
         # Convert to WAV format
         import io
         import wave
+        audio_np = audio_values.cpu().numpy().squeeze()
         # Normalize to 16-bit PCM
         audio_np = (audio_np * 32767).astype('int16')