Spaces:
Sleeping
Sleeping
Ashish Kumar committed on
Commit ·
45bab5a
1
Parent(s): e903b9f
Fix: Remove TTS from requirements, load on-demand with graceful fallback
Browse files
- app.py +18 -38
- requirements.txt +0 -1
app.py
CHANGED
|
@@ -16,12 +16,10 @@ import os
|
|
| 16 |
# Model configuration
|
| 17 |
MODEL_ID = "ashishkblink/Aawaz" # Your model repository
|
| 18 |
FALLBACK_MODEL = "facebook/mms-tts-hin" # Fallback if custom model fails
|
| 19 |
-
VOICE_CLONE_MODEL = "coqui/XTTS-v2" # Voice cloning model
|
| 20 |
|
| 21 |
# Load models (will be loaded on first use)
|
| 22 |
model = None
|
| 23 |
tokenizer = None
|
| 24 |
-
voice_clone_model = None
|
| 25 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 26 |
|
| 27 |
|
|
@@ -62,31 +60,6 @@ def load_model():
|
|
| 62 |
raise
|
| 63 |
|
| 64 |
|
| 65 |
-
@spaces.GPU
|
| 66 |
-
def load_voice_clone_model():
|
| 67 |
-
"""Load the voice cloning model (XTTS-v2)."""
|
| 68 |
-
global voice_clone_model
|
| 69 |
-
|
| 70 |
-
if voice_clone_model is not None:
|
| 71 |
-
return voice_clone_model
|
| 72 |
-
|
| 73 |
-
try:
|
| 74 |
-
from TTS.api import TTS
|
| 75 |
-
print(f"Loading voice cloning model: {VOICE_CLONE_MODEL}...")
|
| 76 |
-
# Use GPU if available
|
| 77 |
-
voice_clone_model = TTS(VOICE_CLONE_MODEL, gpu=(device == "cuda"))
|
| 78 |
-
print(f"✅ Loaded voice cloning model")
|
| 79 |
-
return voice_clone_model
|
| 80 |
-
except ImportError as e:
|
| 81 |
-
error_msg = f"TTS library not found: {e}. Install with: pip install TTS"
|
| 82 |
-
print(error_msg)
|
| 83 |
-
raise ImportError(error_msg)
|
| 84 |
-
except Exception as e:
|
| 85 |
-
error_msg = f"Error loading voice cloning model: {e}"
|
| 86 |
-
print(error_msg)
|
| 87 |
-
raise Exception(error_msg)
|
| 88 |
-
|
| 89 |
-
|
| 90 |
@spaces.GPU
|
| 91 |
def synthesize(text, speed=1.0):
|
| 92 |
"""
|
|
@@ -155,7 +128,7 @@ def synthesize(text, speed=1.0):
|
|
| 155 |
@spaces.GPU
|
| 156 |
def clone_voice(reference_audio, text, language="hi"):
|
| 157 |
"""
|
| 158 |
-
Clone voice from reference audio
|
| 159 |
|
| 160 |
Args:
|
| 161 |
reference_audio: Tuple of (sample_rate, audio_array) or file path
|
|
@@ -169,8 +142,20 @@ def clone_voice(reference_audio, text, language="hi"):
|
|
| 169 |
return None, "Please record or upload a reference audio sample (3-10 seconds recommended)."
|
| 170 |
|
| 171 |
try:
|
| 172 |
-
#
|
| 173 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
|
| 175 |
# Handle audio input (Gradio returns tuple of (sample_rate, audio_array))
|
| 176 |
if isinstance(reference_audio, tuple):
|
|
@@ -190,7 +175,6 @@ def clone_voice(reference_audio, text, language="hi"):
|
|
| 190 |
out_path = tmp_out.name
|
| 191 |
|
| 192 |
# Synthesize with voice cloning
|
| 193 |
-
# XTTS-v2 supports Hindi (language code "hi")
|
| 194 |
print(f"Synthesizing with voice cloning: {text[:50]}...")
|
| 195 |
tts_model.tts_to_file(
|
| 196 |
text=text,
|
|
@@ -214,12 +198,6 @@ def clone_voice(reference_audio, text, language="hi"):
|
|
| 214 |
except:
|
| 215 |
pass
|
| 216 |
|
| 217 |
-
except ImportError as e:
|
| 218 |
-
error_msg = (
|
| 219 |
-
f"Voice cloning requires TTS library. Error: {str(e)}\n"
|
| 220 |
-
"The model is loading for the first time - this may take a few minutes."
|
| 221 |
-
)
|
| 222 |
-
return None, error_msg
|
| 223 |
except Exception as e:
|
| 224 |
error_msg = f"Error during voice cloning: {str(e)}"
|
| 225 |
print(error_msg)
|
|
@@ -305,6 +283,8 @@ def create_interface():
|
|
| 305 |
- Speak naturally at normal pace
|
| 306 |
- 5-8 seconds works best
|
| 307 |
- First-time model loading may take 2-3 minutes
|
|
|
|
|
|
|
| 308 |
""")
|
| 309 |
|
| 310 |
with gr.Row():
|
|
@@ -350,7 +330,7 @@ def create_interface():
|
|
| 350 |
---
|
| 351 |
**Model Information:**
|
| 352 |
- Standard TTS: `ashishkblink/Aawaz` (based on MMS TTS)
|
| 353 |
-
- Voice Cloning: XTTS-v2 (
|
| 354 |
""")
|
| 355 |
|
| 356 |
return demo
|
|
|
|
| 16 |
# Model configuration
|
| 17 |
MODEL_ID = "ashishkblink/Aawaz" # Your model repository
|
| 18 |
FALLBACK_MODEL = "facebook/mms-tts-hin" # Fallback if custom model fails
|
|
|
|
| 19 |
|
| 20 |
# Load models (will be loaded on first use)
|
| 21 |
model = None
|
| 22 |
tokenizer = None
|
|
|
|
| 23 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 24 |
|
| 25 |
|
|
|
|
| 60 |
raise
|
| 61 |
|
| 62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
@spaces.GPU
|
| 64 |
def synthesize(text, speed=1.0):
|
| 65 |
"""
|
|
|
|
| 128 |
@spaces.GPU
|
| 129 |
def clone_voice(reference_audio, text, language="hi"):
|
| 130 |
"""
|
| 131 |
+
Clone voice from reference audio using TTS library (on-demand loading).
|
| 132 |
|
| 133 |
Args:
|
| 134 |
reference_audio: Tuple of (sample_rate, audio_array) or file path
|
|
|
|
| 142 |
return None, "Please record or upload a reference audio sample (3-10 seconds recommended)."
|
| 143 |
|
| 144 |
try:
|
| 145 |
+
# Try to import and use TTS library
|
| 146 |
+
try:
|
| 147 |
+
from TTS.api import TTS
|
| 148 |
+
except ImportError:
|
| 149 |
+
return None, (
|
| 150 |
+
"Voice cloning requires the TTS library which couldn't be installed in this Space. "
|
| 151 |
+
"Please use the 'Standard TTS' tab for text-to-speech synthesis. "
|
| 152 |
+
"For voice cloning, you can run this locally with: pip install TTS"
|
| 153 |
+
)
|
| 154 |
+
|
| 155 |
+
# Load model (on first use)
|
| 156 |
+
print("Loading XTTS-v2 voice cloning model...")
|
| 157 |
+
tts_model = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=(device == "cuda"))
|
| 158 |
+
print("✅ Voice cloning model loaded")
|
| 159 |
|
| 160 |
# Handle audio input (Gradio returns tuple of (sample_rate, audio_array))
|
| 161 |
if isinstance(reference_audio, tuple):
|
|
|
|
| 175 |
out_path = tmp_out.name
|
| 176 |
|
| 177 |
# Synthesize with voice cloning
|
|
|
|
| 178 |
print(f"Synthesizing with voice cloning: {text[:50]}...")
|
| 179 |
tts_model.tts_to_file(
|
| 180 |
text=text,
|
|
|
|
| 198 |
except:
|
| 199 |
pass
|
| 200 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 201 |
except Exception as e:
|
| 202 |
error_msg = f"Error during voice cloning: {str(e)}"
|
| 203 |
print(error_msg)
|
|
|
|
| 283 |
- Speak naturally at normal pace
|
| 284 |
- 5-8 seconds works best
|
| 285 |
- First-time model loading may take 2-3 minutes
|
| 286 |
+
|
| 287 |
+
**Note:** Voice cloning uses XTTS-v2 model which will be downloaded on first use.
|
| 288 |
""")
|
| 289 |
|
| 290 |
with gr.Row():
|
|
|
|
| 330 |
---
|
| 331 |
**Model Information:**
|
| 332 |
- Standard TTS: `ashishkblink/Aawaz` (based on MMS TTS)
|
| 333 |
+
- Voice Cloning: XTTS-v2 (requires TTS library - will attempt to load on first use)
|
| 334 |
""")
|
| 335 |
|
| 336 |
return demo
|
requirements.txt
CHANGED
|
@@ -4,4 +4,3 @@ soundfile>=0.12.1
|
|
| 4 |
numpy>=1.24.0
|
| 5 |
accelerate>=0.24.0
|
| 6 |
librosa>=0.10.0
|
| 7 |
-
TTS==0.22.0
|
|
|
|
| 4 |
numpy>=1.24.0
|
| 5 |
accelerate>=0.24.0
|
| 6 |
librosa>=0.10.0
|
|
|