Abdalkaderdev committed on
Commit
5058539
·
1 Parent(s): d2505af

Switch to ElevenLabs TTS for natural voice

Browse files
Files changed (2) hide show
  1. app/ora_server.py +32 -34
  2. frontend/app/voice/page.tsx +28 -20
app/ora_server.py CHANGED
@@ -110,57 +110,55 @@ async def chat_endpoint(req: ChatRequest):
110
 
111
  return {"response": response_text}
112
 
113
- # TTS endpoint using Supertonic 2
114
- tts_model = None
115
- tts_processor = None
116
 
 
117
  @app.on_event("startup")
118
  async def load_tts():
119
- global tts_model, tts_processor
120
- try:
121
- print("Loading Supertonic 2 TTS...")
122
- from transformers import AutoProcessor, AutoModel
123
- tts_processor = AutoProcessor.from_pretrained("Supertone/supertonic-2")
124
- tts_model = AutoModel.from_pretrained("Supertone/supertonic-2")
125
- if device == "cuda":
126
- tts_model = tts_model.to("cuda")
127
- print("TTS Model loaded successfully!")
128
- except Exception as e:
129
- print(f"Could not load TTS model: {e}")
130
- print("Voice will not be available.")
131
 
132
  class TTSRequest(BaseModel):
133
  text: str
134
 
135
  @app.post("/api/tts")
136
  async def text_to_speech(req: TTSRequest):
137
- global tts_model, tts_processor
138
-
139
- if tts_model is None or tts_processor is None:
140
- raise HTTPException(status_code=503, detail="TTS model not loaded")
141
-
142
  try:
143
- inputs = tts_processor(text=req.text, return_tensors="pt")
144
- if device == "cuda":
145
- inputs = {k: v.to("cuda") for k, v in inputs.items()}
 
 
 
 
146
 
147
- with torch.no_grad():
148
- audio = tts_model.generate(**inputs)
 
149
 
150
- # Convert to numpy and create WAV
151
- audio_np = audio.cpu().numpy().squeeze()
 
 
 
 
 
 
152
 
153
- # Normalize audio
154
- audio_np = np.int16(audio_np / np.max(np.abs(audio_np)) * 32767)
 
 
155
 
156
- # Create WAV file in memory
157
- wav_io = io.BytesIO()
158
- wavfile.write(wav_io, 22050, audio_np)
159
- wav_io.seek(0)
160
 
161
- return Response(content=wav_io.read(), media_type="audio/wav")
 
 
 
 
162
 
163
  except Exception as e:
 
164
  raise HTTPException(status_code=500, detail=f"TTS generation failed: {str(e)}")
165
 
166
  # Mount Static Frontend (Must be last)
 
110
 
111
  return {"response": response_text}
112
 
 
 
 
113
 
114
+ # TTS endpoint using ElevenLabs (most natural voice)
115
  @app.on_event("startup")
116
  async def load_tts():
117
+ print("TTS: Using ElevenLabs for natural voice synthesis")
118
+ # ElevenLabs doesn't require model loading, uses API
 
 
 
 
 
 
 
 
 
 
119
 
120
  class TTSRequest(BaseModel):
121
  text: str
122
 
123
  @app.post("/api/tts")
124
  async def text_to_speech(req: TTSRequest):
 
 
 
 
 
125
  try:
126
+ # Use ElevenLabs free tier with their best voice
127
+ import requests
128
+
129
+ # Rachel voice (warm, natural female voice)
130
+ voice_id = "21m00Tcm4TlvDq8ikWAM"
131
+
132
+ url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}"
133
 
134
+ headers = {
135
+ "Content-Type": "application/json",
136
+ }
137
 
138
+ data = {
139
+ "text": req.text,
140
+ "model_id": "eleven_monolingual_v1",
141
+ "voice_settings": {
142
+ "stability": 0.5,
143
+ "similarity_boost": 0.75
144
+ }
145
+ }
146
 
147
+ # Try with API key from environment if available
148
+ api_key = os.getenv("ELEVENLABS_API_KEY")
149
+ if api_key:
150
+ headers["xi-api-key"] = api_key
151
 
152
+ response = requests.post(url, json=data, headers=headers)
 
 
 
153
 
154
+ if response.status_code == 200:
155
+ return Response(content=response.content, media_type="audio/mpeg")
156
+ else:
157
+ # Fallback to browser TTS if ElevenLabs fails
158
+ raise HTTPException(status_code=503, detail="TTS service unavailable, use browser fallback")
159
 
160
  except Exception as e:
161
+ print(f"TTS error: {e}")
162
  raise HTTPException(status_code=500, detail=f"TTS generation failed: {str(e)}")
163
 
164
  # Mount Static Frontend (Must be last)
frontend/app/voice/page.tsx CHANGED
@@ -66,37 +66,45 @@ export default function OraVoice() {
66
  setState("SPEAKING");
67
 
68
  try {
69
- // Call backend TTS API for professional voice
70
  const res = await fetch("/api/tts", {
71
  method: "POST",
72
  headers: { "Content-Type": "application/json" },
73
  body: JSON.stringify({ text }),
74
  });
75
 
76
- if (!res.ok) {
77
- throw new Error("TTS failed");
78
- }
79
-
80
- // Get audio blob and play it
81
- const audioBlob = await res.blob();
82
- const audioUrl = URL.createObjectURL(audioBlob);
83
- const audio = new Audio(audioUrl);
84
 
85
- audio.onended = () => {
86
- setState("IDLE");
87
- URL.revokeObjectURL(audioUrl);
88
- };
89
 
90
- audio.onerror = () => {
91
- setState("IDLE");
92
- URL.revokeObjectURL(audioUrl);
93
- };
94
 
95
- await audio.play();
 
 
 
 
96
 
97
  } catch (error) {
98
- console.error("TTS error:", error);
99
- setState("IDLE");
 
 
 
 
 
 
 
100
  }
101
  };
102
 
 
66
  setState("SPEAKING");
67
 
68
  try {
69
+ // Try backend TTS first (ElevenLabs - most natural)
70
  const res = await fetch("/api/tts", {
71
  method: "POST",
72
  headers: { "Content-Type": "application/json" },
73
  body: JSON.stringify({ text }),
74
  });
75
 
76
+ if (res.ok) {
77
+ // Backend TTS succeeded
78
+ const audioBlob = await res.blob();
79
+ const audioUrl = URL.createObjectURL(audioBlob);
80
+ const audio = new Audio(audioUrl);
 
 
 
81
 
82
+ audio.onended = () => {
83
+ setState("IDLE");
84
+ URL.revokeObjectURL(audioUrl);
85
+ };
86
 
87
+ audio.onerror = () => {
88
+ setState("IDLE");
89
+ URL.revokeObjectURL(audioUrl);
90
+ };
91
 
92
+ await audio.play();
93
+ } else {
94
+ // Fallback to browser TTS
95
+ throw new Error("Backend TTS unavailable");
96
+ }
97
 
98
  } catch (error) {
99
+ console.log("Using browser TTS fallback");
100
+
101
+ // Browser TTS fallback
102
+ const utterance = new SpeechSynthesisUtterance(text);
103
+ utterance.rate = 0.9;
104
+ utterance.pitch = 0.95;
105
+ utterance.onend = () => setState("IDLE");
106
+ utterance.onerror = () => setState("IDLE");
107
+ window.speechSynthesis.speak(utterance);
108
  }
109
  };
110