Spaces:

yukee1992
/

parler-tts-api

Paused

App Files Files Community

yukee1992 committed on Sep 14, 2025

Commit

125c3d1

verified ·

1 Parent(s): 48cbdbe

Update app.py

Browse files

Files changed (1) hide show

app.py +133 -252

app.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# app.py - Enhanced with multiple Parler-TTS loading strategies
 from fastapi import FastAPI, HTTPException
 from fastapi.responses import JSONResponse
 from fastapi.middleware.cors import CORSMiddleware
@@ -11,7 +11,7 @@ import os
 import torch
 import numpy as np
 import soundfile as sf
-import importlib
 # Set up logging
 logging.basicConfig(level=logging.INFO)
@@ -31,10 +31,7 @@ app.add_middleware(
 OCI_UPLOAD_API_URL = os.getenv("OCI_UPLOAD_API_URL", "https://yukee1992-oci-video-storage.hf.space")
 # Global variables
-parler_model = None
-parler_processor = None
-bark_model = None
-bark_processor = None
 model_loaded = False
 model_type = "none"
@@ -44,278 +41,177 @@ class VoiceoverRequest(BaseModel):
     voiceover_scenes: List[str]
     upload_to_oci: Optional[bool] = False
-class ModelRequest(BaseModel):
-    action: str  # "reload", "status", "switch"
 @app.on_event("startup")
 async def startup_event():
-    """Initialize the application with multiple loading strategies"""
-    global parler_model, parler_processor, bark_model, bark_processor, model_loaded, model_type
     logger.info("=== TTS API Starting ===")
-    logger.info("Attempting to load Parler-TTS with enhanced strategies...")
-    # Try multiple loading strategies for Parler-TTS
-    parler_loaded = await load_parler_tts_enhanced()
-    if parler_loaded:
         model_loaded = True
-        model_type = "parler-tts"
-        logger.info("✅ Parler-TTS loaded successfully!")
-    else:
-        # Fallback to Bark
-        logger.info("Parler-TTS failed, loading Bark as fallback...")
-        bark_loaded = await load_bark_model()
-        if bark_loaded:
-            model_loaded = True
-            model_type = "bark"
-            logger.info("✅ Bark model loaded as fallback!")
-        else:
-            logger.error("❌ All models failed to load")
-            model_loaded = False
-async def load_parler_tts_enhanced():
-    """Enhanced Parler-TTS loading with multiple strategies"""
-    strategies = [
-        try_strategy_1,  # Direct import with trust_remote_code
-        try_strategy_2,  # Force model download
-        try_strategy_3,  # Manual configuration
-        try_strategy_4   # Alternative import approach
-    ]
-    for i, strategy in enumerate(strategies, 1):
-        logger.info(f"Trying Parler-TTS loading strategy {i}...")
-        success = strategy()
-        if success:
-            return True
-        logger.warning(f"Strategy {i} failed")
-    return False
-def try_strategy_1():
-    """Strategy 1: Direct import with trust_remote_code"""
-    try:
-        from transformers import AutoProcessor, AutoModel
-        global parler_processor, parler_model
-        # Clear cache and force download
-        import transformers
-        transformers.utils.move_cache()
-        parler_processor = AutoProcessor.from_pretrained(
-            "parler-tts/parler-tts-mini-v1",
-            trust_remote_code=True,
-            force_download=True,
-            resume_download=False,
-            local_files_only=False
-        )
-        parler_model = AutoModel.from_pretrained(
-            "parler-tts/parler-tts-mini-v1",
-            trust_remote_code=True,
-            force_download=True,
-            resume_download=False,
-            local_files_only=False
-        )
-        # Test the model
-        test_inputs = parler_processor(
-            text="Test",
-            description="A test voice",
-            return_tensors="pt"
-        )
-        with torch.no_grad():
-            test_output = parler_model.generate(**test_inputs)
-        logger.info("✅ Strategy 1 successful!")
-        return True
-    except Exception as e:
-        logger.warning(f"Strategy 1 failed: {e}")
-        return False
-def try_strategy_2():
-    """Strategy 2: Manual model configuration"""
     try:
-        from transformers import AutoConfig
-        import torch
-        # First get the config to understand the model
-        config = AutoConfig.from_pretrained(
-            "parler-tts/parler-tts-mini-v1",
-            trust_remote_code=True
-        )
-        logger.info(f"Model config: {config.model_type}")
-        # Now try to load with explicit architecture
-        if hasattr(config, 'architectures') and config.architectures:
-            model_class = getattr(
-                importlib.import_module('transformers'),
-                config.architectures[0]
-            )
-            global parler_processor, parler_model
-            parler_processor = AutoProcessor.from_pretrained(
-                "parler-tts/parler-tts-mini-v1",
-                trust_remote_code=True
-            )
-            parler_model = model_class.from_pretrained(
-                "parler-tts/parler-tts-mini-v1",
-                trust_remote_code=True,
-                config=config
-            )
-            logger.info("✅ Strategy 2 successful!")
             return True
     except Exception as e:
-        logger.warning(f"Strategy 2 failed: {e}")
-        return False
     return False
-def try_strategy_3():
-    """Strategy 3: Use model-specific classes"""
-    try:
-        # Try to import ParlerTTS specific classes
-        try:
-            from transformers import ParlerTTSForConditionalGeneration, ParlerTTSProcessor
-            model_class = ParlerTTSForConditionalGeneration
-            processor_class = ParlerTTSProcessor
-        except ImportError:
-            # If specific classes don't exist, try to create them dynamically
-            from transformers import AutoModel, AutoProcessor
-            model_class = AutoModel
-            processor_class = AutoProcessor
-        global parler_processor, parler_model
-        parler_processor = processor_class.from_pretrained(
-            "parler-tts/parler-tts-mini-v1",
-            trust_remote_code=True
-        )
-        parler_model = model_class.from_pretrained(
-            "parler-tts/parler-tts-mini-v1",
-            trust_remote_code=True
-        )
-        logger.info("✅ Strategy 3 successful!")
-        return True
-    except Exception as e:
-        logger.warning(f"Strategy 3 failed: {e}")
-        return False
-def try_strategy_4():
-    """Strategy 4: Alternative approach with different parameters"""
-    try:
-        from transformers import AutoProcessor, AutoModel
-        global parler_processor, parler_model
-        # Try with different parameters
-        parler_processor = AutoProcessor.from_pretrained(
-            "parler-tts/parler-tts-mini-v1",
-            trust_remote_code=True,
-            use_fast=True,
-            revision="main"
-        )
-        parler_model = AutoModel.from_pretrained(
-            "parler-tts/parler-tts-mini-v1",
-            trust_remote_code=True,
-            low_cpu_mem_usage=True,
-            torch_dtype=torch.float32,
-            revision="main"
-        )
-        logger.info("✅ Strategy 4 successful!")
-        return True
-    except Exception as e:
-        logger.warning(f"Strategy 4 failed: {e}")
-        return False
 async def load_bark_model():
     """Load Bark model as fallback"""
     try:
         from transformers import AutoProcessor, AutoModel
-        global bark_processor, bark_model
-        bark_processor = AutoProcessor.from_pretrained("suno/bark-small")
-        bark_model = AutoModel.from_pretrained("suno/bark-small")
         return True
     except Exception as e:
         logger.error(f"Bark model loading failed: {e}")
         return False
-def generate_with_parler(text, description="A male speaker with a low-pitched voice"):
-    """Generate voiceover using Parler-TTS"""
     try:
-        inputs = parler_processor(
-            text=text,
-            description=description,
-            return_tensors="pt"
-        )
-        with torch.no_grad():
-            speech = parler_model.generate(**inputs)
-        # Save audio
-        speech = speech.cpu().numpy().squeeze()
-        temp_dir = tempfile.gettempdir()
-        temp_file = os.path.join(temp_dir, "parler_generated.wav")
-        sample_rate = getattr(parler_model.config, "sampling_rate", 16000)
-        sf.write(temp_file, speech, sample_rate)
-        return temp_file, None
     except Exception as e:
         return None, str(e)
-def generate_with_bark(text):
-    """Generate voiceover using Bark"""
     try:
-        inputs = bark_processor(text=[text], return_tensors="pt")
-        with torch.no_grad():
-            speech_values = bark_model.generate(**inputs, do_sample=True)
-        # Convert and save
-        audio_array = speech_values.cpu().numpy().squeeze()
-        temp_dir = tempfile.gettempdir()
-        temp_file = os.path.join(temp_dir, "bark_generated.wav")
-        sf.write(temp_file, audio_array, 24000)
-        return temp_file, None
     except Exception as e:
         return None, str(e)
 @app.get("/")
 async def root():
     return {
-        "message": "TTS API with Enhanced Parler-TTS Loading",
         "model_loaded": model_loaded,
         "model_type": model_type,
-        "strategies_tested": "4 enhanced strategies",
         "endpoints": {
             "health": "/health",
             "model_status": "/api/model-status",
-            "generate_voiceovers": "/api/generate-voiceovers",
-            "reload_model": "/api/reload-model"
         }
     }
@@ -325,8 +221,7 @@ async def health():
         "status": "healthy" if model_loaded else "degraded",
         "model_loaded": model_loaded,
         "model_type": model_type,
-        "parler_loaded": parler_model is not None,
-        "bark_loaded": bark_model is not None
     }
 @app.get("/api/model-status")
@@ -335,21 +230,11 @@ async def model_status():
     return {
         "model_loaded": model_loaded,
         "model_type": model_type,
-        "parler_model_available": parler_model is not None,
-        "bark_model_available": bark_model is not None,
-        "loading_strategies": "4 enhanced strategies implemented"
     }
-@app.post("/api/reload-model")
-async def reload_model(request: ModelRequest):
-    """Reload model with different strategy"""
-    if request.action == "reload":
-        # Re-initialize
-        await startup_event()
-        return {"status": "reloaded", "model_type": model_type}
-    else:
-        return {"status": "unknown_action"}
 @app.post("/api/generate-voiceovers")
 async def generate_voiceovers_endpoint(request: VoiceoverRequest):
     """Main API endpoint"""
@@ -364,13 +249,7 @@ async def generate_voiceovers_endpoint(request: VoiceoverRequest):
                 filename = f"voiceover_{i:02d}.wav"
                 logger.info(f"Generating voiceover {i} with {model_type}...")
-                if model_type == "parler-tts" and parler_model is not None:
-                    temp_file, error = generate_with_parler(scene_text)
-                elif model_type == "bark" and bark_model is not None:
-                    temp_file, error = generate_with_bark(scene_text)
-                else:
-                    error = "No valid model available"
                 if error:
                     results.append({
@@ -408,7 +287,8 @@ async def generate_voiceovers_endpoint(request: VoiceoverRequest):
                     "filename": filename,
                     "text_preview": scene_text[:100] + "..." if len(scene_text) > 100 else scene_text,
                     "uploaded_to_oci": bool(upload_result),
-                    "model": model_type
                 })
             except Exception as e:
@@ -426,6 +306,7 @@ async def generate_voiceovers_endpoint(request: VoiceoverRequest):
             "successful": len([r for r in results if r['status'] == 'success']),
             "failed": len([r for r in results if r['status'] != 'success']),
             "model_type": model_type,
             "results": results
         }

+# app.py - Using Coqui XTTS instead of Parler-TTS
 from fastapi import FastAPI, HTTPException
 from fastapi.responses import JSONResponse
 from fastapi.middleware.cors import CORSMiddleware
 import torch
 import numpy as np
 import soundfile as sf
+import io
 # Set up logging
 logging.basicConfig(level=logging.INFO)
 OCI_UPLOAD_API_URL = os.getenv("OCI_UPLOAD_API_URL", "https://yukee1992-oci-video-storage.hf.space")
 # Global variables
+tts_model = None
 model_loaded = False
 model_type = "none"
     voiceover_scenes: List[str]
     upload_to_oci: Optional[bool] = False
 @app.on_event("startup")
 async def startup_event():
+    """Startup hook: load a TTS backend, preferring Coqui XTTS over Bark.
+
+    Records the outcome in the module-level ``model_loaded`` and
+    ``model_type`` flags.
+    """
+    global tts_model, model_loaded, model_type
     logger.info("=== TTS API Starting ===")
+    # Backends in priority order: (loader, model_type label, success log line).
+    backends = (
+        (load_coqui_xtts, "coqui-xtts", "✅ Coqui XTTS loaded successfully!"),
+        (load_bark_model, "bark", "✅ Bark model loaded as fallback!"),
+    )
+    for loader, label, success_message in backends:
+        if not await loader():
+            continue
+        model_loaded = True
+        model_type = label
+        logger.info(success_message)
+        return
+    logger.error("❌ All models failed to load")
+    model_loaded = False
+async def load_coqui_xtts():
+    """Load Coqui XTTS model.
+
+    Tries the transformers loaders first and the Coqui ``TTS`` package
+    second. On success the loaded objects are stored in the module-level
+    ``tts_model`` dict (tagged with a ``"type"`` key) and True is returned.
+    Each failed attempt is logged as a warning; if both fail, False is
+    returned.
+    """
+    # Declare the global exactly once, before any assignment. A second
+    # ``global tts_model`` placed after the name has already been assigned in
+    # this function is a SyntaxError, which stops the whole module importing.
+    global tts_model
     try:
+        logger.info("Loading Coqui XTTS model...")
+        # Method 1: Try using transformers
+        try:
+            from transformers import AutoProcessor, AutoModel
+            processor = AutoProcessor.from_pretrained("coqui/XTTS-v2")
+            model = AutoModel.from_pretrained("coqui/XTTS-v2")
+            tts_model = {"processor": processor, "model": model, "type": "transformers"}
+            return True
+        except Exception as e:
+            logger.warning(f"Transformers XTTS failed: {e}")
+        # Method 2: Try using TTS package
+        try:
+            from TTS.api import TTS
+            tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
+            tts_model = {"tts": tts, "type": "coqui"}
             return True
+        except Exception as e:
+            logger.warning(f"Coqui TTS package failed: {e}")
     except Exception as e:
+        logger.error(f"Coqui XTTS loading failed: {e}")
     return False
 async def load_bark_model():
+    """Load the ``suno/bark-small`` processor and model as the fallback backend.
+
+    Stores them in the module-level ``tts_model`` dict tagged ``"bark"`` and
+    returns True; on any exception logs an error and returns False.
+    """
+    global tts_model
     try:
         from transformers import AutoProcessor, AutoModel
+        bark_checkpoint = "suno/bark-small"
+        # Build the whole entry first so tts_model is only replaced once both
+        # the processor and the model have loaded.
+        tts_model = {
+            "processor": AutoProcessor.from_pretrained(bark_checkpoint),
+            "model": AutoModel.from_pretrained(bark_checkpoint),
+            "type": "bark",
+        }
         return True
     except Exception as e:
         logger.error(f"Bark model loading failed: {e}")
         return False
+def generate_voiceover(text, speaker_wav=None):
+    """Synthesize ``text`` to a WAV file with whichever backend is loaded.
+
+    Returns a ``(path, error)`` pair: ``(wav_path, None)`` on success, or
+    ``(None, message)`` when no model is loaded, the backend type is not
+    recognised, or synthesis raises. ``speaker_wav`` is only forwarded to
+    the Coqui TTS package backend.
+    """
     try:
+        if tts_model is None:
+            return None, "No model loaded"
+        backend = tts_model["type"]
+        if backend == "coqui":
+            # The Coqui TTS package writes the output file itself.
+            engine = tts_model["tts"]
+            wav_path = os.path.join(tempfile.gettempdir(), "coqui_generated.wav")
+            engine.tts_to_file(
+                text=text,
+                speaker_wav=speaker_wav,
+                language="en",
+                file_path=wav_path
+            )
+            return wav_path, None
+        # The transformers XTTS and Bark paths differ only in the sampling
+        # flag passed to generate() and in the output filename.
+        if backend == "transformers":
+            generate_kwargs, wav_name = {}, "xtts_generated.wav"
+        elif backend == "bark":
+            generate_kwargs, wav_name = {"do_sample": True}, "bark_generated.wav"
+        else:
+            return None, "Unknown model type"
+        processor = tts_model["processor"]
+        model = tts_model["model"]
+        inputs = processor(text=[text], return_tensors="pt")
+        with torch.no_grad():
+            generated = model.generate(**inputs, **generate_kwargs)
+        wav_path = os.path.join(tempfile.gettempdir(), wav_name)
+        samples = generated.cpu().numpy().squeeze()
+        sf.write(wav_path, samples, 24000)
+        return wav_path, None
     except Exception as e:
         return None, str(e)
+def upload_to_oci(file_path, filename, project_id):
+    """POST a local file to the OCI upload API under the 'voiceover' subfolder.
+
+    Returns ``(response_json, None)`` on HTTP 200, otherwise
+    ``(None, error_message)``; exceptions are reported the same way.
+    """
     try:
+        endpoint = f"{OCI_UPLOAD_API_URL}/api/upload"
+        form_fields = {'project_id': project_id, 'subfolder': 'voiceover'}
+        with open(file_path, 'rb') as audio:
+            response = requests.post(
+                endpoint,
+                files={'file': (filename, audio)},
+                data=form_fields,
+                timeout=30
+            )
+        # Anything other than 200 is reported as a failed upload.
+        if response.status_code != 200:
+            return None, f"Upload failed: {response.status_code}"
+        return response.json(), None
     except Exception as e:
         return None, str(e)
 @app.get("/")
 async def root():
+    """Return a service summary: load state, model type and endpoint paths."""
+    endpoint_paths = {
+        "health": "/health",
+        "model_status": "/api/model-status",
+        "generate_voiceovers": "/api/generate-voiceovers"
+    }
+    return {
+        "message": "TTS API with High-Quality Voice Generation",
+        "model_loaded": model_loaded,
+        "model_type": model_type,
+        "supported_models": ["coqui-xtts", "bark"],
+        "endpoints": endpoint_paths
+    }
         "status": "healthy" if model_loaded else "degraded",
         "model_loaded": model_loaded,
         "model_type": model_type,
+        "quality": "high" if model_type == "coqui-xtts" else "good"
     }
 @app.get("/api/model-status")
     return {
         "model_loaded": model_loaded,
         "model_type": model_type,
+        "model_quality": "high" if model_type == "coqui-xtts" else "good",
+        "supported_models": ["Coqui XTTS (recommended)", "Bark (fallback)"],
+        "message": "Using Coqui XTTS for high-quality voice generation"
     }
 @app.post("/api/generate-voiceovers")
 async def generate_voiceovers_endpoint(request: VoiceoverRequest):
     """Main API endpoint"""
                 filename = f"voiceover_{i:02d}.wav"
                 logger.info(f"Generating voiceover {i} with {model_type}...")
+                temp_file, error = generate_voiceover(scene_text)
                 if error:
                     results.append({
                     "filename": filename,
                     "text_preview": scene_text[:100] + "..." if len(scene_text) > 100 else scene_text,
                     "uploaded_to_oci": bool(upload_result),
+                    "model": model_type,
+                    "quality": "high" if model_type == "coqui-xtts" else "good"
                 })
             except Exception as e:
             "successful": len([r for r in results if r['status'] == 'success']),
             "failed": len([r for r in results if r['status'] != 'success']),
             "model_type": model_type,
+            "voice_quality": "high" if model_type == "coqui-xtts" else "good",
             "results": results
         }