Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -10,13 +10,11 @@ from fastapi import FastAPI, UploadFile, File, HTTPException
|
|
| 10 |
import soundfile as sf
|
| 11 |
import numpy as np
|
| 12 |
import os
|
| 13 |
-
|
| 14 |
-
|
| 15 |
|
| 16 |
md = MarkItDown()
|
| 17 |
|
| 18 |
-
print("Converter initialized successfully with EasyOCR!")
|
| 19 |
-
|
| 20 |
app = FastAPI()
|
| 21 |
|
| 22 |
|
|
@@ -188,14 +186,23 @@ async def export_epub(file: UploadFile = File(...)):
|
|
| 188 |
os.remove(path)
|
| 189 |
|
| 190 |
VOICE_MAP = {
|
| 191 |
-
|
| 192 |
-
"
|
| 193 |
-
"
|
| 194 |
-
"
|
| 195 |
-
"
|
| 196 |
-
"it": {"male": "im_nicola", "female": "if_sara"}
|
| 197 |
}
|
| 198 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 199 |
class TTSRequest(BaseModel):
|
| 200 |
text: str
|
| 201 |
language: str = "en"
|
|
@@ -204,34 +211,31 @@ class TTSRequest(BaseModel):
|
|
| 204 |
@app.post("/generate-audio-from-text")
|
| 205 |
async def generate_audio_text(data: TTSRequest):
|
| 206 |
output_filename = "speech_output.wav"
|
| 207 |
-
|
| 208 |
try:
|
| 209 |
-
|
| 210 |
-
|
|
|
|
|
|
|
| 211 |
|
| 212 |
-
#
|
| 213 |
-
|
| 214 |
-
lang_config = VOICE_MAP.get(lang_key, VOICE_MAP["en"])
|
| 215 |
-
voice_name = lang_config.get(gender_key, lang_config["male"])
|
| 216 |
-
|
| 217 |
-
# 2. Setup Phonemizer language code
|
| 218 |
-
# Kokoro expects 'b' for British English phonemes
|
| 219 |
-
# We ensure 'en' calls use the British phoneme engine
|
| 220 |
-
phoneme_lang = 'b' if lang_key.startswith('en') else lang_key[:2]
|
| 221 |
-
|
| 222 |
-
# 3. Split text and generate
|
| 223 |
-
paragraphs = [p.strip() for p in data.text.split("\n") if p.strip()]
|
| 224 |
-
audio_chunks = []
|
| 225 |
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 230 |
|
| 231 |
if not audio_chunks:
|
| 232 |
-
raise HTTPException(status_code=400, detail="
|
| 233 |
|
| 234 |
-
#
|
| 235 |
final_audio = np.concatenate(audio_chunks)
|
| 236 |
sf.write(output_filename, final_audio, 24000)
|
| 237 |
|
|
@@ -242,10 +246,9 @@ async def generate_audio_text(data: TTSRequest):
|
|
| 242 |
)
|
| 243 |
|
| 244 |
except Exception as e:
|
| 245 |
-
print(f"
|
| 246 |
-
raise HTTPException(status_code=500, detail=str(e))
|
| 247 |
-
|
| 248 |
-
@app.get("/health")
|
| 249 |
async def health():
|
| 250 |
return {
|
| 251 |
"status": "ok"
|
|
|
|
| 10 |
import soundfile as sf
|
| 11 |
import numpy as np
|
| 12 |
import os
|
| 13 |
+
import torch
|
| 14 |
+
from kokoro import KPipeline
|
| 15 |
|
| 16 |
# Module-level MarkItDown instance, created once when the module is imported.
md = MarkItDown()
|
| 17 |
|
|
|
|
|
|
|
| 18 |
# FastAPI application object; the @app.post / @app.get decorators further down
# register their handlers on it.
app = FastAPI()
|
| 19 |
|
| 20 |
|
|
|
|
| 186 |
os.remove(path)
|
| 187 |
|
| 188 |
# Maps a request language to its Kokoro voice names and pipeline code.
# Every entry carries:
#   "male" / "female" - voice name passed to the pipeline as `voice=`
#   "code"            - single-letter key used to pick an entry of PIPELINES
VOICE_MAP = {
    "en": {"male": "bm_lewis", "female": "bf_emma", "code": "b"},
    "es": {"male": "em_alex", "female": "ef_dora", "code": "e"},
    # Kokoro-82M publishes a single French voice (ff_siwis); there is no
    # "fr_male" / "fr_female" voice pack, so both genders resolve to it
    # instead of failing when the voice is loaded.
    "fr": {"male": "ff_siwis", "female": "ff_siwis", "code": "f"},
    "pt": {"male": "pm_santa", "female": "pf_dora", "code": "p"},  # Portuguese
    "it": {"male": "im_nicola", "female": "if_sara", "code": "i"},
}
|
| 195 |
|
| 196 |
+
print("Loading TTS Pipelines... please wait.")
# One Kokoro pipeline per language code, constructed once at import time and
# looked up later by the "code" value of a VOICE_MAP entry.
# Codes: b = British English, e = Spanish, f = French, p = Portuguese,
# i = Italian.
# NOTE(review): each KPipeline is built independently here; whether they could
# share one underlying model to save memory should be checked against the
# kokoro API before changing this.
PIPELINES = {
    lang_code: KPipeline(lang_code=lang_code)
    for lang_code in ("b", "e", "f", "p", "i")
}
print("All pipelines loaded and ready!")
|
| 205 |
+
|
| 206 |
class TTSRequest(BaseModel):
|
| 207 |
text: str
|
| 208 |
language: str = "en"
|
|
|
|
| 211 |
@app.post("/generate-audio-from-text")
|
| 212 |
async def generate_audio_text(data: TTSRequest):
|
| 213 |
output_filename = "speech_output.wav"
|
|
|
|
| 214 |
try:
|
| 215 |
+
# 1. Look up the language configuration
|
| 216 |
+
# Defaults to English (British) if the requested language isn't found
|
| 217 |
+
lang_config = VOICE_MAP.get(data.language.lower(), VOICE_MAP["en"])
|
| 218 |
+
phoneme_code = lang_config["code"]
|
| 219 |
|
| 220 |
+
# 2. Select the global pipeline
|
| 221 |
+
active_pipeline = PIPELINES.get(phoneme_code, PIPELINES["b"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 222 |
|
| 223 |
+
# 3. Select the voice (Male is the base default)
|
| 224 |
+
voice_name = lang_config.get(data.gender.lower(), lang_config["male"])
|
| 225 |
+
|
| 226 |
+
# 4. Generate audio chunks
|
| 227 |
+
generator = active_pipeline(
|
| 228 |
+
data.text,
|
| 229 |
+
voice=voice_name,
|
| 230 |
+
speed=1.1
|
| 231 |
+
)
|
| 232 |
+
|
| 233 |
+
audio_chunks = [audio for _, _, audio in generator if audio is not None]
|
| 234 |
|
| 235 |
if not audio_chunks:
|
| 236 |
+
raise HTTPException(status_code=400, detail="TTS generation failed")
|
| 237 |
|
| 238 |
+
# 5. Concatenate and Save
|
| 239 |
final_audio = np.concatenate(audio_chunks)
|
| 240 |
sf.write(output_filename, final_audio, 24000)
|
| 241 |
|
|
|
|
| 246 |
)
|
| 247 |
|
| 248 |
except Exception as e:
|
| 249 |
+
print(f"Detailed Error: {e}")
|
| 250 |
+
raise HTTPException(status_code=500, detail=str(e))
@app.get("/health")
|
| 251 |
+
|
|
|
|
| 252 |
async def health():
|
| 253 |
return {
|
| 254 |
"status": "ok"
|