vox-beta

Sleeping

App Files Files Community

EllenBeta committed on Nov 15, 2025

Commit

8fa01de

verified ·

1 Parent(s): e2d7c7f

Update app.py

Browse files

Files changed (1) hide show

app.py +84 -152

app.py CHANGED Viewed

@@ -2,13 +2,13 @@ from flask import Flask, request, jsonify, render_template
 from datetime import datetime
 from flask_cors import CORS
 from TTS.api import TTS
-from TTS.utils.manage import ModelManager
 import os
 import base64
-import shutil
-import wave
 import logging
 import threading
 from helper import (
     save_audio,
@@ -17,6 +17,7 @@ from helper import (
     video_to_audio,
     validate_audio_file,
     ensure_wav_format,
 )
 # ---------- Basic config ----------
@@ -28,138 +29,19 @@ CORS(app)
 os.environ["COQUI_TOS_AGREED"] = "1"
 device = "cpu"
-# ============================================================
-# MODEL STORAGE PATHS & NAMES
-# ============================================================
-DATASET_MODEL_DIR = "/datasets/EllenBeta/Xtts_2/model"  # dataset mount (destination)
-LOCAL_CACHE_DIR = os.path.expanduser("~/.local/share/tts/xtts_v2_cache")  # local cache
 MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"  # coqui model id
-# Maximum audio (MB)
 MAX_AUDIO_SIZE_MB = 15
-# ============================================================
-# Utilities for resolving model download path (defensive)
-# ============================================================
-def resolve_model_path(raw):
-    """
-    Given the return value from ModelManager.download_model(...) try to
-    return a filesystem path (string) pointing at the downloaded model folder.
-    Handles strings, tuples/lists, or dict-like returns.
-    """
-    # If already a path string
-    if isinstance(raw, str):
-        return raw
-    # If a list/tuple, try first string-like element
-    if isinstance(raw, (list, tuple)):
-        for element in raw:
-            if isinstance(element, str) and os.path.exists(element):
-                return element
-        # fallback: try to join tuple items into a path if meaningful
-        try:
-            cand = os.path.join(*[str(x) for x in raw])
-            if os.path.exists(cand):
-                return cand
-        except Exception:
-            pass
-    # If dict-like, try common keys
-    if isinstance(raw, dict):
-        for key in ("model_path", "path", "directory"):
-            val = raw.get(key)
-            if isinstance(val, str) and os.path.exists(val):
-                return val
-    # final fallback: try to find the typical download directory
-    fallback = os.path.expanduser("~/.local/share/tts")
-    if os.path.exists(fallback):
-        # find matching folder
-        for root, dirs, files in os.walk(fallback):
-            if MODEL_NAME.split("/")[-1] in root:
-                return root
-    # Nothing found
-    return None
-# ============================================================
-# Ensure model is present (download once and copy into dataset)
-# ============================================================
 tts = None
 try:
-    if os.path.exists(DATASET_MODEL_DIR) and os.listdir(DATASET_MODEL_DIR):
-        log.info("✅ Loading XTTS model directly from dataset mount: %s", DATASET_MODEL_DIR)
-        tts = TTS(model_path=DATASET_MODEL_DIR).to(device)
-    else:
-        log.info("⬇️ Dataset model not found — downloading XTTS model (first run)...")
-        manager = ModelManager()
-        raw_path = manager.download_model(MODEL_NAME)
-        model_path = resolve_model_path(raw_path)
-        if not model_path or not os.path.exists(model_path):
-            # As a robust fallback, call TTS() with model id then try to locate typical folder
-            log.warning("Could not resolve model path from ModelManager result; falling back to direct TTS init.")
-            tts_tmp = TTS(MODEL_NAME).to(device)
-            # try to locate in default coqui location
-            candidate = os.path.expanduser("~/.local/share/tts")
-            model_path = None
-            if os.path.exists(candidate):
-                # pick the directory that contains the xtts_v2 name
-                for root, dirs, files in os.walk(candidate):
-                    if "xtts_v2" in root or "xtts" in root:
-                        model_path = root
-                        break
-            # if still None, set model_path to candidate root
-            if not model_path:
-                model_path = candidate
-            # assign tts from tts_tmp
-            tts = tts_tmp
-        # Ensure model_path now points to a directory
-        if model_path and os.path.exists(model_path):
-            # create local cache dir and copy files (ensure string)
-            os.makedirs(LOCAL_CACHE_DIR, exist_ok=True)
-            try:
-                shutil.copytree(model_path, LOCAL_CACHE_DIR, dirs_exist_ok=True)
-            except Exception as e:
-                # if copytree fails (we still continue)
-                log.warning("Copy to LOCAL_CACHE_DIR failed: %s", e)
-            # Copy into dataset mount for persistence (if writable)
-            try:
-                os.makedirs(DATASET_MODEL_DIR, exist_ok=True)
-                for item in os.listdir(model_path):
-                    s = os.path.join(model_path, item)
-                    d = os.path.join(DATASET_MODEL_DIR, item)
-                    if os.path.isdir(s):
-                        shutil.copytree(s, d, dirs_exist_ok=True)
-                    else:
-                        shutil.copy2(s, d)
-                log.info("📦 Model copied into dataset mount: %s", DATASET_MODEL_DIR)
-            except Exception as e:
-                log.warning("Could not copy model into dataset mount (may be read-only or missing perms): %s", e)
-            # If tts not already set (from fallback), initialize from model_path or dataset mount
-            if tts is None:
-                # prefer dataset dir if copy succeeded, otherwise local cache
-                init_path = DATASET_MODEL_DIR if os.path.exists(DATASET_MODEL_DIR) and os.listdir(DATASET_MODEL_DIR) else LOCAL_CACHE_DIR
-                tts = TTS(model_path=init_path).to(device)
-        else:
-            # final fallback: initialize directly from model name (internet)
-            log.warning("Could not find downloaded model folder; initializing TTS from model id directly.")
-            tts = TTS(MODEL_NAME).to(device)
-    log.info("✅ TTS ready.")
 except Exception as exc:
-    log.exception("Failed to prepare TTS model: %s", exc)
-    # Try a minimal fallback to avoid crash - attempt to init directly.
-    try:
-        tts = TTS(MODEL_NAME).to(device)
-    except Exception as exc2:
-        log.exception("Fatal: TTS could not be initialized: %s", exc2)
-        # re-raise so app startup fails loudly (preferred)
-        raise
 # ============================================================
 # Application logic (routes & helpers)
@@ -213,7 +95,7 @@ def generate_voice():
             "created_at": datetime.now(),
         }
-        # Run processing (synchronous here - see note below about background processing)
         process_vox(user_id, text, video, audio_base64, task_id)
         return jsonify({"message": "Processing started", "task_id": task_id}), 202
@@ -224,7 +106,13 @@ def generate_voice():
 def process_vox(user_id, text, video, audio_base64, task_id):
     temp_audio_path = None
     try:
         # 1) Prepare input audio
         if audio_base64:
             if audio_base64.startswith("data:audio/"):
@@ -241,8 +129,8 @@ def process_vox(user_id, text, video, audio_base64, task_id):
         if not valid:
             raise Exception(f"Invalid audio file: {msg}")
-        # 3) Generate TTS (clone)
-        result_file = clone(text, temp_audio_path)
         # 4) Save output to user_audios
         out_dir = "user_audios"
@@ -250,16 +138,17 @@ def process_vox(user_id, text, video, audio_base64, task_id):
         file_name = generate_random_filename("mp3")
         file_path = os.path.join(out_dir, file_name)
-        with open(result_file, "rb") as src, open(file_path, "wb") as dst:
             dst.write(src.read())
         # 5) Gather metadata
         with wave.open(file_path, "rb") as wf:
             dura = wf.getnframes() / float(wf.getframerate())
             duration = f"{dura:.2f}"
             title = text[:20]
-        # 6) Upload and save
         audio_url = save_to_dataset_repo(file_path, f"user/data/audios/{file_name}", file_name)
         active_tasks[task_id].update(
             {
@@ -279,28 +168,70 @@ def process_vox(user_id, text, video, audio_base64, task_id):
         }
     finally:
-        # cleanup
-        try:
-            if temp_audio_path and os.path.exists(temp_audio_path):
-                os.remove(temp_audio_path)
-            task = active_tasks.get(task_id)
-            if task:
-                if task["status"]== "completed":
-                    remove_task_after_delay(task_id, delay_seconds=300)
-                elif task["status"] == "failed":
-                    del active_tasks[task_id]
-        except Exception:
-            # ignore cleanup issues
-            pass
 def clone(text, audio):
     """
-    Use the TTS instance to produce an output file. Returns the path to the output file.
     """
-    out_path = "./output.wav"
-    # use tts to write audio; let TTS manage model specifics
-    tts.tts_to_file(text=text, speaker_wav=audio, language="en", file_path=out_path)
     return out_path
@@ -342,4 +273,5 @@ def remove_task_after_delay(task_id, delay_seconds=300):
     timer.start()
-# Run only when invoked directly (Gunicorn will ignore this block)

 from datetime import datetime
 from flask_cors import CORS
 from TTS.api import TTS
 import os
 import base64
 import logging
 import threading
+import tempfile  # for better temp handling
+from pydub import AudioSegment  # for WAV concat (OOM fix)
+import psutil  # for RAM check
 from helper import (
     save_audio,
     video_to_audio,
     validate_audio_file,
     ensure_wav_format,
+    # Assume you add: create_connection (with retry below)
 )
 # ---------- Basic config ----------
 os.environ["COQUI_TOS_AGREED"] = "1"
 device = "cpu"
 MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"  # coqui model id
 MAX_AUDIO_SIZE_MB = 15
+MAX_TEXT_LEN = 250  # per chunk for OOM safety
+# Simplified TTS init: Direct from model name (handles download/config auto)
 tts = None
 try:
+    log.info(f"⬇️ Initializing XTTS from {MODEL_NAME}...")
+    tts = TTS(model_name=MODEL_NAME).to(device)  # Uses model_name kwarg for HF-style load
+    log.info("✅ TTS ready (direct init).")
 except Exception as exc:
+    log.exception("Fatal: TTS init failed: %s", exc)
+    raise
 # ============================================================
 # Application logic (routes & helpers)
             "created_at": datetime.now(),
         }
+        # Run processing (synchronous; consider Celery for prod scaling)
         process_vox(user_id, text, video, audio_base64, task_id)
         return jsonify({"message": "Processing started", "task_id": task_id}), 202
 def process_vox(user_id, text, video, audio_base64, task_id):
     temp_audio_path = None
+    temp_output_path = None
     try:
+        # RAM check (OOM guard)
+        ram_gb = psutil.virtual_memory().available / (1024 ** 3)
+        if ram_gb < 2:  # XTTS needs ~2GB free
+            raise Exception("Low RAM: Please try a shorter text.")
         # 1) Prepare input audio
         if audio_base64:
             if audio_base64.startswith("data:audio/"):
         if not valid:
             raise Exception(f"Invalid audio file: {msg}")
+        # 3) Generate TTS (clone) with chunking for long text
+        temp_output_path = clone(text, temp_audio_path)  # now returns possibly concatenated path
         # 4) Save output to user_audios
         out_dir = "user_audios"
         file_name = generate_random_filename("mp3")
         file_path = os.path.join(out_dir, file_name)
+        with open(temp_output_path, "rb") as src, open(file_path, "wb") as dst:
             dst.write(src.read())
         # 5) Gather metadata
+        import wave
         with wave.open(file_path, "rb") as wf:
             dura = wf.getnframes() / float(wf.getframerate())
             duration = f"{dura:.2f}"
             title = text[:20]
+        # 6) Upload and save (with DB retry in helper)
         audio_url = save_to_dataset_repo(file_path, f"user/data/audios/{file_name}", file_name)
         active_tasks[task_id].update(
             {
         }
     finally:
+        # Better cleanup with tempfile
+        for path in [temp_audio_path, temp_output_path]:
+            if path and os.path.exists(path):
+                try:
+                    os.remove(path)
+                except:
+                    pass
+        task = active_tasks.get(task_id)
+        if task and task["status"] == "completed":
+            remove_task_after_delay(task_id, delay_seconds=300)
+        elif task and task["status"] == "failed":
+            # Keep failed for 60s then del
+            threading.Timer(60, lambda: active_tasks.pop(task_id, None)).start()
 def clone(text, audio):
     """
     Generate cloned speech for ``text`` using ``audio`` as the speaker sample.

     The text is split on ". " and the sentences are packed into chunks shorter
     than MAX_TEXT_LEN characters; each chunk is synthesized separately with
     ``tts.tts_to_file`` and, when there is more than one chunk, the pieces are
     concatenated with pydub.

     Args:
         text: Text to synthesize.
         audio: Path to the reference speaker WAV passed as ``speaker_wav``.

     Returns:
         Path to a temporary WAV file holding the full utterance. The caller
         owns this file and is responsible for deleting it.

     Raises:
         ValueError: If ``text`` contains nothing to synthesize.
     """
     # Simple lang detect (improve with langdetect lib if needed)
     lang = "en"  # default
     if any(c in text for c in "अइउ"):
         lang = "hi"  # Hindi example
     elif any(c in text for c in "äöü"):
         lang = "de"  # German
     # Pack sentences into chunks. split(". ") consumes the period, so put one
     # back only when the sentence does not already end in terminal punctuation
     # (avoids feeding "Hello.. ." style artefacts to the model).
     # NOTE: a single sentence longer than MAX_TEXT_LEN is still sent as one chunk.
     chunks = []
     current_chunk = ""
     for sent in text.split(". "):
         sent = sent.strip()
         if not sent:
             continue
         if not sent.endswith((".", "!", "?", "।")):
             sent += "."
         candidate = f"{current_chunk} {sent}" if current_chunk else sent
         if len(candidate) < MAX_TEXT_LEN:
             current_chunk = candidate
         else:
             if current_chunk:
                 chunks.append(current_chunk)
             current_chunk = sent
     if current_chunk:
         chunks.append(current_chunk)
     if not chunks:
         raise ValueError("No text to synthesize.")
     # Every temp file created here is tracked; whatever is still tracked when
     # we leave (success or failure) gets deleted.
     temp_files = []
     def _new_temp_wav():
         # mkstemp creates the file atomically (mktemp is deprecated and racy).
         fd, path = tempfile.mkstemp(suffix=".wav")
         os.close(fd)
         temp_files.append(path)
         return path
     try:
         chunk_files = []
         for chunk in chunks:
             chunk_out = _new_temp_wav()
             tts.tts_to_file(
                 text=chunk,
                 speaker_wav=audio,
                 language=lang,
                 file_path=chunk_out,
                 split_sentences=False,  # Avoid double-split
             )
             chunk_files.append(chunk_out)
         if len(chunk_files) == 1:
             # The only chunk already is the final output; no move/copy needed.
             out_path = chunk_files[0]
         else:
             out_path = _new_temp_wav()
             combined = AudioSegment.empty()
             for chunk_file in chunk_files:
                 combined += AudioSegment.from_wav(chunk_file)
             combined.export(out_path, format="wav")
         # Hand ownership of the result to the caller so cleanup below skips it.
         temp_files.remove(out_path)
         return out_path
     finally:
         for leftover in temp_files:
             try:
                 os.remove(leftover)
             except OSError:
                 # Best-effort cleanup; a stale temp file must not mask the real error.
                 pass
     timer.start()
+if __name__ == "__main__":
+    app.run(debug=True, host="0.0.0.0", port=7860)