Spaces:

randusertry
/

pdf_analysis

Sleeping

App Files Files Community

randusertry committed 26 days ago

Commit

f67748b

verified ·

1 Parent(s): 6f11d60

Update app.py

Browse files

Files changed (1) hide show

app.py +39 -36

app.py CHANGED Viewed

@@ -10,13 +10,11 @@ from fastapi import FastAPI, UploadFile, File, HTTPException
 import soundfile as sf
 import numpy as np
 import os
-from kokoro import generate
 md = MarkItDown()
-print("Converter initialized successfully with EasyOCR!")
 app = FastAPI()
@@ -188,14 +186,23 @@ async def export_epub(file: UploadFile = File(...)):
                 os.remove(path)
 VOICE_MAP = {
-    # We point "en" directly to the British English (bm/bf) models
-    "en": {"male": "bm_lewis", "female": "bf_emma"},
-    "en-gb": {"male": "bm_lewis", "female": "bf_emma"},
-    "es": {"male": "em_alex", "female": "ef_dora"},
-    "fr": {"male": "fr_male", "female": "fr_female"},
-    "it": {"male": "im_nicola", "female": "if_sara"}
 }
 class TTSRequest(BaseModel):
     text: str
     language: str = "en"
@@ -204,34 +211,31 @@ class TTSRequest(BaseModel):
 @app.post("/generate-audio-from-text")
 async def generate_audio_text(data: TTSRequest):
     output_filename = "speech_output.wav"
     try:
-        lang_key = data.language.lower()
-        gender_key = data.gender.lower()
-        # 1. Select the specific voice model
-        # Default to British English ("en") if language is not in the map
-        lang_config = VOICE_MAP.get(lang_key, VOICE_MAP["en"])
-        voice_name = lang_config.get(gender_key, lang_config["male"])
-        # 2. Setup Phonemizer language code
-        # Kokoro expects 'b' for British English phonemes
-        # We ensure 'en' calls use the British phoneme engine
-        phoneme_lang = 'b' if lang_key.startswith('en') else lang_key[:2]
-        # 3. Split text and generate
-        paragraphs = [p.strip() for p in data.text.split("\n") if p.strip()]
-        audio_chunks = []
-        for p in paragraphs:
-            # Note: we use phoneme_lang ('b') here for the accent logic
-            audio, _ = generate(p, voice_name, lang=phoneme_lang, speed=1.1)
-            audio_chunks.append(audio)
         if not audio_chunks:
-            raise HTTPException(status_code=400, detail="Text was empty or unreadable")
-        # 4. Save and return
         final_audio = np.concatenate(audio_chunks)
         sf.write(output_filename, final_audio, 24000)
@@ -242,10 +246,9 @@ async def generate_audio_text(data: TTSRequest):
         )
     except Exception as e:
-        print(f"Error during TTS: {e}")
-        raise HTTPException(status_code=500, detail=str(e))
-@app.get("/health")
 async def health():
     return {
         "status": "ok"

 import soundfile as sf
 import numpy as np
 import os
+import torch
+from kokoro import KPipeline
 md = MarkItDown()
 app = FastAPI()
                 os.remove(path)
 VOICE_MAP = {
+    "en": {"male": "bm_lewis", "female": "bf_emma", "code": "b"},
+    "es": {"male": "em_alex", "female": "ef_dora", "code": "e"},
+    "fr": {"male": "fr_male", "female": "fr_female", "code": "f"},
+    "pt": {"male": "pm_santa", "female": "pf_dora", "code": "p"}, # Portuguese
+    "it": {"male": "im_nicola", "female": "if_sara", "code": "i"},
 }
+print("Loading TTS Pipelines... please wait.")
+PIPELINES = {
+    "b": KPipeline(lang_code='b'),  # British English
+    "e": KPipeline(lang_code='e'),  # Spanish
+    "f": KPipeline(lang_code='f'),  # French
+    "p": KPipeline(lang_code='p'),  # Portuguese
+    "i": KPipeline(lang_code='i'),  # Italian
+}
+print("All pipelines loaded and ready!")
 class TTSRequest(BaseModel):
     text: str
     language: str = "en"
 @app.post("/generate-audio-from-text")
 async def generate_audio_text(data: TTSRequest):
     output_filename = "speech_output.wav"
     try:
+        # 1. Look up the language configuration
+        # Defaults to English (British) if the requested language isn't found
+        lang_config = VOICE_MAP.get(data.language.lower(), VOICE_MAP["en"])
+        phoneme_code = lang_config["code"]
+        # 2. Select the global pipeline
+        active_pipeline = PIPELINES.get(phoneme_code, PIPELINES["b"])
+        # 3. Select the voice (Male is the base default)
+        voice_name = lang_config.get(data.gender.lower(), lang_config["male"])
+        # 4. Generate audio chunks
+        generator = active_pipeline(
+            data.text,
+            voice=voice_name,
+            speed=1.1
+        )
+        audio_chunks = [audio for _, _, audio in generator if audio is not None]
         if not audio_chunks:
+            raise HTTPException(status_code=400, detail="TTS generation failed")
+        # 5. Concatenate and Save
         final_audio = np.concatenate(audio_chunks)
         sf.write(output_filename, final_audio, 24000)
         )
     except Exception as e:
+        print(f"Detailed Error: {e}")
+        raise HTTPException(status_code=500, detail=str(e))@app.get("/health")
 async def health():
     return {
         "status": "ok"