Spaces:

ashishkblink
/

awaz

Sleeping

App Files Community

Ashish Kumar committed on Jan 5

Commit

45bab5a

1 Parent(s): e903b9f

Fix: Remove TTS from requirements, load on-demand with graceful fallback

Browse files

Files changed (2) hide show

app.py +18 -38
requirements.txt +0 -1

app.py CHANGED Viewed

@@ -16,12 +16,10 @@ import os
 # Model configuration
 MODEL_ID = "ashishkblink/Aawaz"  # Your model repository
 FALLBACK_MODEL = "facebook/mms-tts-hin"  # Fallback if custom model fails
-VOICE_CLONE_MODEL = "coqui/XTTS-v2"  # Voice cloning model
 # Load models (will be loaded on first use)
 model = None
 tokenizer = None
-voice_clone_model = None
 device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -62,31 +60,6 @@ def load_model():
         raise
-@spaces.GPU
-def load_voice_clone_model():
-    """Load the voice cloning model (XTTS-v2)."""
-    global voice_clone_model
-    if voice_clone_model is not None:
-        return voice_clone_model
-    try:
-        from TTS.api import TTS
-        print(f"Loading voice cloning model: {VOICE_CLONE_MODEL}...")
-        # Use GPU if available
-        voice_clone_model = TTS(VOICE_CLONE_MODEL, gpu=(device == "cuda"))
-        print(f"✅ Loaded voice cloning model")
-        return voice_clone_model
-    except ImportError as e:
-        error_msg = f"TTS library not found: {e}. Install with: pip install TTS"
-        print(error_msg)
-        raise ImportError(error_msg)
-    except Exception as e:
-        error_msg = f"Error loading voice cloning model: {e}"
-        print(error_msg)
-        raise Exception(error_msg)
 @spaces.GPU
 def synthesize(text, speed=1.0):
     """
@@ -155,7 +128,7 @@ def synthesize(text, speed=1.0):
 @spaces.GPU
 def clone_voice(reference_audio, text, language="hi"):
     """
-    Clone voice from reference audio and synthesize speech.
     Args:
         reference_audio: Tuple of (sample_rate, audio_array) or file path
@@ -169,8 +142,20 @@ def clone_voice(reference_audio, text, language="hi"):
         return None, "Please record or upload a reference audio sample (3-10 seconds recommended)."
     try:
-        # Load voice cloning model
-        tts_model = load_voice_clone_model()
         # Handle audio input (Gradio returns tuple of (sample_rate, audio_array))
         if isinstance(reference_audio, tuple):
@@ -190,7 +175,6 @@ def clone_voice(reference_audio, text, language="hi"):
                 out_path = tmp_out.name
             # Synthesize with voice cloning
-            # XTTS-v2 supports Hindi (language code "hi")
             print(f"Synthesizing with voice cloning: {text[:50]}...")
             tts_model.tts_to_file(
                 text=text,
@@ -214,12 +198,6 @@ def clone_voice(reference_audio, text, language="hi"):
             except:
                 pass
-    except ImportError as e:
-        error_msg = (
-            f"Voice cloning requires TTS library. Error: {str(e)}\n"
-            "The model is loading for the first time - this may take a few minutes."
-        )
-        return None, error_msg
     except Exception as e:
         error_msg = f"Error during voice cloning: {str(e)}"
         print(error_msg)
@@ -305,6 +283,8 @@ def create_interface():
                 - Speak naturally at normal pace
                 - 5-8 seconds works best
                 - First-time model loading may take 2-3 minutes
                 """)
                 with gr.Row():
@@ -350,7 +330,7 @@ def create_interface():
         ---
         **Model Information:**
         - Standard TTS: `ashishkblink/Aawaz` (based on MMS TTS)
-        - Voice Cloning: XTTS-v2 (Coqui TTS)
         """)
     return demo

 # Model configuration
 MODEL_ID = "ashishkblink/Aawaz"  # Your model repository
 FALLBACK_MODEL = "facebook/mms-tts-hin"  # Fallback if custom model fails
 # Load models (will be loaded on first use)
 model = None
 tokenizer = None
 device = "cuda" if torch.cuda.is_available() else "cpu"
         raise
 @spaces.GPU
 def synthesize(text, speed=1.0):
     """
 @spaces.GPU
 def clone_voice(reference_audio, text, language="hi"):
     """
+    Clone voice from reference audio using TTS library (on-demand loading).
     Args:
         reference_audio: Tuple of (sample_rate, audio_array) or file path
         return None, "Please record or upload a reference audio sample (3-10 seconds recommended)."
     try:
+        # Try to import and use TTS library
+        try:
+            from TTS.api import TTS
+        except ImportError:
+            return None, (
+                "Voice cloning requires the TTS library which couldn't be installed in this Space. "
+                "Please use the 'Standard TTS' tab for text-to-speech synthesis. "
+                "For voice cloning, you can run this locally with: pip install TTS"
+            )
+        # Load model (on first use)
+        print("Loading XTTS-v2 voice cloning model...")
+        tts_model = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=(device == "cuda"))
+        print("✅ Voice cloning model loaded")
         # Handle audio input (Gradio returns tuple of (sample_rate, audio_array))
         if isinstance(reference_audio, tuple):
                 out_path = tmp_out.name
             # Synthesize with voice cloning
             print(f"Synthesizing with voice cloning: {text[:50]}...")
             tts_model.tts_to_file(
                 text=text,
             except:
                 pass
     except Exception as e:
         error_msg = f"Error during voice cloning: {str(e)}"
         print(error_msg)
                 - Speak naturally at normal pace
                 - 5-8 seconds works best
                 - First-time model loading may take 2-3 minutes
+                **Note:** Voice cloning uses XTTS-v2 model which will be downloaded on first use.
                 """)
                 with gr.Row():
         ---
         **Model Information:**
         - Standard TTS: `ashishkblink/Aawaz` (based on MMS TTS)
+        - Voice Cloning: XTTS-v2 (requires TTS library - will attempt to load on first use)
         """)
     return demo

requirements.txt CHANGED Viewed

@@ -4,4 +4,3 @@ soundfile>=0.12.1
 numpy>=1.24.0
 accelerate>=0.24.0
 librosa>=0.10.0
-TTS==0.22.0

 numpy>=1.24.0
 accelerate>=0.24.0
 librosa>=0.10.0