Abdalkaderdev committed on
Commit
b062b38
·
1 Parent(s): aae5a7b

Implement Supertonic 2 TTS properly

Browse files
Files changed (1) hide show
  1. app/ora_server.py +47 -29
app/ora_server.py CHANGED
@@ -108,51 +108,69 @@ async def chat_endpoint(req: ChatRequest):
108
  return {"response": response_text}
109
 
110
 
111
- # TTS endpoint using ElevenLabs (most natural voice)
 
 
 
 
112
@app.on_event("startup")
async def load_tts():
    """Startup hook for TTS: only logs which backend is in use.

    Nothing is loaded here; the function's sole effect is the log line below.
    """
    # ElevenLabs doesn't require model loading, uses API
    print("TTS: Using ElevenLabs for natural voice synthesis")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
 
117
class TTSRequest(BaseModel):
    """Request body for the /api/tts endpoint."""

    # Text to be converted to speech.
    text: str
119
 
120
  @app.post("/api/tts")
121
  async def text_to_speech(req: TTSRequest):
 
 
 
 
 
122
  try:
123
- # Use ElevenLabs free tier with their best voice
124
- import requests
125
 
126
- # Rachel voice (warm, natural female voice)
127
- voice_id = "21m00Tcm4TlvDq8ikWAM"
128
 
129
- url = f"https://api.elevenlabs.io/v1/text-to-speech/{voice_id}"
 
130
 
131
- headers = {
132
- "Content-Type": "application/json",
133
- }
134
 
135
- data = {
136
- "text": req.text,
137
- "model_id": "eleven_monolingual_v1",
138
- "voice_settings": {
139
- "stability": 0.5,
140
- "similarity_boost": 0.75
141
- }
142
- }
143
 
144
- # Try with API key from environment if available
145
- api_key = os.getenv("ELEVENLABS_API_KEY")
146
- if api_key:
147
- headers["xi-api-key"] = api_key
148
 
149
- response = requests.post(url, json=data, headers=headers)
 
 
 
 
 
 
150
 
151
- if response.status_code == 200:
152
- return Response(content=response.content, media_type="audio/mpeg")
153
- else:
154
- # Fallback to browser TTS if ElevenLabs fails
155
- raise HTTPException(status_code=503, detail="TTS service unavailable, use browser fallback")
156
 
157
  except Exception as e:
158
  print(f"TTS error: {e}")
 
108
  return {"response": response_text}
109
 
110
 
111
+
112
# TTS endpoint using Supertonic 2
# Both globals stay None until load_tts() has fully succeeded; the /api/tts
# handler checks them for None and answers 503 so the client can fall back
# to browser TTS.
tts_model = None
tts_processor = None


@app.on_event("startup")
async def load_tts():
    """Load the Supertonic 2 processor and model into the module globals.

    The processor and model are built in local variables and published to
    ``tts_processor`` / ``tts_model`` only after every step (including the
    optional move to CUDA) has succeeded. A failure part-way through
    therefore never leaves a processor without a model, or a model on the
    wrong device, visible to the request handler.

    Any exception is logged and swallowed on purpose: the server still
    starts, and voice output falls back to browser TTS.
    """
    global tts_model, tts_processor
    try:
        print("Loading Supertonic 2 TTS...")
        # Imported lazily so a missing/broken transformers install only
        # disables TTS instead of preventing the app from starting.
        from transformers import AutoProcessor, AutoModelForTextToWaveform

        model_id = "Supertone/supertonic-2"
        processor = AutoProcessor.from_pretrained(model_id)
        model = AutoModelForTextToWaveform.from_pretrained(model_id)

        # `device` is a module-level name defined outside this section.
        if device == "cuda":
            model = model.to("cuda")
    except Exception as e:
        print(f"Could not load TTS model: {e}")
        print("Voice will fall back to browser TTS.")
    else:
        # Publish both objects together, only once loading fully succeeded.
        tts_processor = processor
        tts_model = model
        print("Supertonic 2 TTS loaded successfully!")
134
 
135
class TTSRequest(BaseModel):
    """Request body for the /api/tts endpoint."""

    # Text to be converted to speech.
    text: str
137
 
138
  @app.post("/api/tts")
139
  async def text_to_speech(req: TTSRequest):
140
+ global tts_model, tts_processor
141
+
142
+ if tts_model is None or tts_processor is None:
143
+ raise HTTPException(status_code=503, detail="TTS model not loaded, use browser fallback")
144
+
145
  try:
146
+ # Process text with Supertonic 2
147
+ inputs = tts_processor(text=req.text, return_tensors="pt", sampling_rate=24000)
148
 
149
+ if device == "cuda":
150
+ inputs = {k: v.to("cuda") for k, v in inputs.items()}
151
 
152
+ with torch.no_grad():
153
+ audio_values = tts_model.generate(**inputs)
154
 
155
+ # Convert to WAV format
156
+ import io
157
+ import wave
158
 
159
+ audio_np = audio_values.cpu().numpy().squeeze()
 
 
 
 
 
 
 
160
 
161
+ # Normalize to 16-bit PCM
162
+ audio_np = (audio_np * 32767).astype('int16')
 
 
163
 
164
+ # Create WAV in memory
165
+ wav_io = io.BytesIO()
166
+ with wave.open(wav_io, 'wb') as wav_file:
167
+ wav_file.setnchannels(1) # Mono
168
+ wav_file.setsampwidth(2) # 16-bit
169
+ wav_file.setframerate(24000) # 24kHz
170
+ wav_file.writeframes(audio_np.tobytes())
171
 
172
+ wav_io.seek(0)
173
+ return Response(content=wav_io.read(), media_type="audio/wav")
 
 
 
174
 
175
  except Exception as e:
176
  print(f"TTS error: {e}")