Spaces:

Abdalkaderdev
/

ORA

Paused

App Files Community

Abdalkaderdev committed on Jan 12

Commit

b062b38

1 Parent(s): aae5a7b

Implement Supertonic 2 TTS properly

Browse files

Files changed (1) hide show

app/ora_server.py +47 -29

app/ora_server.py CHANGED Viewed

@@ -108,51 +108,69 @@ async def chat_endpoint(req: ChatRequest):
     return {"response": response_text}
-# TTS endpoint using ElevenLabs (most natural voice)
 @app.on_event("startup")
 async def load_tts():
-    print("TTS: Using ElevenLabs for natural voice synthesis")
-    # ElevenLabs doesn't require model loading, uses API
 class TTSRequest(BaseModel):
     text: str
 @app.post("/api/tts")
 async def text_to_speech(req: TTSRequest):
     try:
-        # Use ElevenLabs free tier with their best voice
-        import requests
-        # Rachel voice (warm, natural female voice)
-        voice_id = "21m00Tcm4TlvDq8ikWAM"
-        url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}"
-        headers = {
-            "Content-Type": "application/json",
-        }
-        data = {
-            "text": req.text,
-            "model_id": "eleven_monolingual_v1",
-            "voice_settings": {
-                "stability": 0.5,
-                "similarity_boost": 0.75
-            }
-        }
-        # Try with API key from environment if available
-        api_key = os.getenv("ELEVENLABS_API_KEY")
-        if api_key:
-            headers["xi-api-key"] = api_key
-        response = requests.post(url, json=data, headers=headers)
-        if response.status_code == 200:
-            return Response(content=response.content, media_type="audio/mpeg")
-        else:
-            # Fallback to browser TTS if ElevenLabs fails
-            raise HTTPException(status_code=503, detail="TTS service unavailable, use browser fallback")
     except Exception as e:
         print(f"TTS error: {e}")

     return {"response": response_text}
+# TTS endpoint using Supertonic 2
+tts_model = None
+tts_processor = None
 @app.on_event("startup")
 async def load_tts():
+    global tts_model, tts_processor
+    try:
+        print("Loading Supertonic 2 TTS...")
+        from transformers import AutoProcessor, AutoModelForTextToWaveform
+        model_id = "Supertone/supertonic-2"
+        tts_processor = AutoProcessor.from_pretrained(model_id)
+        tts_model = AutoModelForTextToWaveform.from_pretrained(model_id)
+        if device == "cuda":
+            tts_model = tts_model.to("cuda")
+        print("Supertonic 2 TTS loaded successfully!")
+    except Exception as e:
+        print(f"Could not load TTS model: {e}")
+        print("Voice will fall back to browser TTS.")
 class TTSRequest(BaseModel):
     text: str
 @app.post("/api/tts")
 async def text_to_speech(req: TTSRequest):
+    global tts_model, tts_processor
+    if tts_model is None or tts_processor is None:
+        raise HTTPException(status_code=503, detail="TTS model not loaded, use browser fallback")
     try:
+        # Process text with Supertonic 2
+        inputs = tts_processor(text=req.text, return_tensors="pt", sampling_rate=24000)
+        if device == "cuda":
+            inputs = {k: v.to("cuda") for k, v in inputs.items()}
+        with torch.no_grad():
+            audio_values = tts_model.generate(**inputs)
+        # Convert to WAV format
+        import io
+        import wave
+        audio_np = audio_values.cpu().numpy().squeeze()
+        # Normalize to 16-bit PCM
+        audio_np = (audio_np * 32767).astype('int16')
+        # Create WAV in memory
+        wav_io = io.BytesIO()
+        with wave.open(wav_io, 'wb') as wav_file:
+            wav_file.setnchannels(1)  # Mono
+            wav_file.setsampwidth(2)  # 16-bit
+            wav_file.setframerate(24000)  # 24kHz
+            wav_file.writeframes(audio_np.tobytes())
+        wav_io.seek(0)
+        return Response(content=wav_io.read(), media_type="audio/wav")
     except Exception as e:
         print(f"TTS error: {e}")