randusertry committed on
Commit
f67748b
·
verified ·
1 Parent(s): 6f11d60

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -36
app.py CHANGED
@@ -10,13 +10,11 @@ from fastapi import FastAPI, UploadFile, File, HTTPException
10
  import soundfile as sf
11
  import numpy as np
12
  import os
13
- from kokoro import generate
14
-
15
 
16
  md = MarkItDown()
17
 
18
- print("Converter initialized successfully with EasyOCR!")
19
-
20
  app = FastAPI()
21
 
22
 
@@ -188,14 +186,23 @@ async def export_epub(file: UploadFile = File(...)):
188
  os.remove(path)
189
 
190
  VOICE_MAP = {
191
- # We point "en" directly to the British English (bm/bf) models
192
- "en": {"male": "bm_lewis", "female": "bf_emma"},
193
- "en-gb": {"male": "bm_lewis", "female": "bf_emma"},
194
- "es": {"male": "em_alex", "female": "ef_dora"},
195
- "fr": {"male": "fr_male", "female": "fr_female"},
196
- "it": {"male": "im_nicola", "female": "if_sara"}
197
  }
198
 
 
 
 
 
 
 
 
 
 
 
199
  class TTSRequest(BaseModel):
200
  text: str
201
  language: str = "en"
@@ -204,34 +211,31 @@ class TTSRequest(BaseModel):
204
  @app.post("/generate-audio-from-text")
205
  async def generate_audio_text(data: TTSRequest):
206
  output_filename = "speech_output.wav"
207
-
208
  try:
209
- lang_key = data.language.lower()
210
- gender_key = data.gender.lower()
 
 
211
 
212
- # 1. Select the specific voice model
213
- # Default to British English ("en") if language is not in the map
214
- lang_config = VOICE_MAP.get(lang_key, VOICE_MAP["en"])
215
- voice_name = lang_config.get(gender_key, lang_config["male"])
216
-
217
- # 2. Setup Phonemizer language code
218
- # Kokoro expects 'b' for British English phonemes
219
- # We ensure 'en' calls use the British phoneme engine
220
- phoneme_lang = 'b' if lang_key.startswith('en') else lang_key[:2]
221
-
222
- # 3. Split text and generate
223
- paragraphs = [p.strip() for p in data.text.split("\n") if p.strip()]
224
- audio_chunks = []
225
 
226
- for p in paragraphs:
227
- # Note: we use phoneme_lang ('b') here for the accent logic
228
- audio, _ = generate(p, voice_name, lang=phoneme_lang, speed=1.1)
229
- audio_chunks.append(audio)
 
 
 
 
 
 
 
230
 
231
  if not audio_chunks:
232
- raise HTTPException(status_code=400, detail="Text was empty or unreadable")
233
 
234
- # 4. Save and return
235
  final_audio = np.concatenate(audio_chunks)
236
  sf.write(output_filename, final_audio, 24000)
237
 
@@ -242,10 +246,9 @@ async def generate_audio_text(data: TTSRequest):
242
  )
243
 
244
  except Exception as e:
245
- print(f"Error during TTS: {e}")
246
- raise HTTPException(status_code=500, detail=str(e))
247
-
248
- @app.get("/health")
249
  async def health():
250
  return {
251
  "status": "ok"
 
10
  import soundfile as sf
11
  import numpy as np
12
  import os
13
+ import torch
14
+ from kokoro import KPipeline
15
 
16
  md = MarkItDown()
17
 
 
 
18
  app = FastAPI()
19
 
20
 
 
186
  os.remove(path)
187
 
188
  VOICE_MAP = {
189
+ "en": {"male": "bm_lewis", "female": "bf_emma", "code": "b"},
190
+ "es": {"male": "em_alex", "female": "ef_dora", "code": "e"},
191
+ "fr": {"male": "fr_male", "female": "fr_female", "code": "f"},
192
+ "pt": {"male": "pm_santa", "female": "pf_dora", "code": "p"}, # Portuguese
193
+ "it": {"male": "im_nicola", "female": "if_sara", "code": "i"},
 
194
  }
195
 
196
+ print("Loading TTS Pipelines... please wait.")
197
+ PIPELINES = {
198
+ "b": KPipeline(lang_code='b'), # British English
199
+ "e": KPipeline(lang_code='e'), # Spanish
200
+ "f": KPipeline(lang_code='f'), # French
201
+ "p": KPipeline(lang_code='p'), # Portuguese
202
+ "i": KPipeline(lang_code='i'), # Italian
203
+ }
204
+ print("All pipelines loaded and ready!")
205
+
206
  class TTSRequest(BaseModel):
207
  text: str
208
  language: str = "en"
 
211
  @app.post("/generate-audio-from-text")
212
  async def generate_audio_text(data: TTSRequest):
213
  output_filename = "speech_output.wav"
 
214
  try:
215
+ # 1. Look up the language configuration
216
+ # Defaults to English (British) if the requested language isn't found
217
+ lang_config = VOICE_MAP.get(data.language.lower(), VOICE_MAP["en"])
218
+ phoneme_code = lang_config["code"]
219
 
220
+ # 2. Select the global pipeline
221
+ active_pipeline = PIPELINES.get(phoneme_code, PIPELINES["b"])
 
 
 
 
 
 
 
 
 
 
 
222
 
223
+ # 3. Select the voice (Male is the base default)
224
+ voice_name = lang_config.get(data.gender.lower(), lang_config["male"])
225
+
226
+ # 4. Generate audio chunks
227
+ generator = active_pipeline(
228
+ data.text,
229
+ voice=voice_name,
230
+ speed=1.1
231
+ )
232
+
233
+ audio_chunks = [audio for _, _, audio in generator if audio is not None]
234
 
235
  if not audio_chunks:
236
+ raise HTTPException(status_code=400, detail="TTS generation failed")
237
 
238
+ # 5. Concatenate and Save
239
  final_audio = np.concatenate(audio_chunks)
240
  sf.write(output_filename, final_audio, 24000)
241
 
 
246
  )
247
 
248
  except Exception as e:
249
+ print(f"Detailed Error: {e}")
250
+ raise HTTPException(status_code=500, detail=str(e))@app.get("/health")
251
+
 
252
  async def health():
253
  return {
254
  "status": "ok"