Ashish Kumar committed on
Commit
45bab5a
·
1 Parent(s): e903b9f

Fix: Remove TTS from requirements, load on-demand with graceful fallback

Browse files
Files changed (2) hide show
  1. app.py +18 -38
  2. requirements.txt +0 -1
app.py CHANGED
@@ -16,12 +16,10 @@ import os
16
  # Model configuration
17
  MODEL_ID = "ashishkblink/Aawaz" # Your model repository
18
  FALLBACK_MODEL = "facebook/mms-tts-hin" # Fallback if custom model fails
19
- VOICE_CLONE_MODEL = "coqui/XTTS-v2" # Voice cloning model
20
 
21
  # Load models (will be loaded on first use)
22
  model = None
23
  tokenizer = None
24
- voice_clone_model = None
25
  device = "cuda" if torch.cuda.is_available() else "cpu"
26
 
27
 
@@ -62,31 +60,6 @@ def load_model():
62
  raise
63
 
64
 
65
- @spaces.GPU
66
- def load_voice_clone_model():
67
- """Load the voice cloning model (XTTS-v2)."""
68
- global voice_clone_model
69
-
70
- if voice_clone_model is not None:
71
- return voice_clone_model
72
-
73
- try:
74
- from TTS.api import TTS
75
- print(f"Loading voice cloning model: {VOICE_CLONE_MODEL}...")
76
- # Use GPU if available
77
- voice_clone_model = TTS(VOICE_CLONE_MODEL, gpu=(device == "cuda"))
78
- print(f"✅ Loaded voice cloning model")
79
- return voice_clone_model
80
- except ImportError as e:
81
- error_msg = f"TTS library not found: {e}. Install with: pip install TTS"
82
- print(error_msg)
83
- raise ImportError(error_msg)
84
- except Exception as e:
85
- error_msg = f"Error loading voice cloning model: {e}"
86
- print(error_msg)
87
- raise Exception(error_msg)
88
-
89
-
90
  @spaces.GPU
91
  def synthesize(text, speed=1.0):
92
  """
@@ -155,7 +128,7 @@ def synthesize(text, speed=1.0):
155
  @spaces.GPU
156
  def clone_voice(reference_audio, text, language="hi"):
157
  """
158
- Clone voice from reference audio and synthesize speech.
159
 
160
  Args:
161
  reference_audio: Tuple of (sample_rate, audio_array) or file path
@@ -169,8 +142,20 @@ def clone_voice(reference_audio, text, language="hi"):
169
  return None, "Please record or upload a reference audio sample (3-10 seconds recommended)."
170
 
171
  try:
172
- # Load voice cloning model
173
- tts_model = load_voice_clone_model()
 
 
 
 
 
 
 
 
 
 
 
 
174
 
175
  # Handle audio input (Gradio returns tuple of (sample_rate, audio_array))
176
  if isinstance(reference_audio, tuple):
@@ -190,7 +175,6 @@ def clone_voice(reference_audio, text, language="hi"):
190
  out_path = tmp_out.name
191
 
192
  # Synthesize with voice cloning
193
- # XTTS-v2 supports Hindi (language code "hi")
194
  print(f"Synthesizing with voice cloning: {text[:50]}...")
195
  tts_model.tts_to_file(
196
  text=text,
@@ -214,12 +198,6 @@ def clone_voice(reference_audio, text, language="hi"):
214
  except:
215
  pass
216
 
217
- except ImportError as e:
218
- error_msg = (
219
- f"Voice cloning requires TTS library. Error: {str(e)}\n"
220
- "The model is loading for the first time - this may take a few minutes."
221
- )
222
- return None, error_msg
223
  except Exception as e:
224
  error_msg = f"Error during voice cloning: {str(e)}"
225
  print(error_msg)
@@ -305,6 +283,8 @@ def create_interface():
305
  - Speak naturally at normal pace
306
  - 5-8 seconds works best
307
  - First-time model loading may take 2-3 minutes
 
 
308
  """)
309
 
310
  with gr.Row():
@@ -350,7 +330,7 @@ def create_interface():
350
  ---
351
  **Model Information:**
352
  - Standard TTS: `ashishkblink/Aawaz` (based on MMS TTS)
353
- - Voice Cloning: XTTS-v2 (Coqui TTS)
354
  """)
355
 
356
  return demo
 
16
  # Model configuration
17
  MODEL_ID = "ashishkblink/Aawaz" # Your model repository
18
  FALLBACK_MODEL = "facebook/mms-tts-hin" # Fallback if custom model fails
 
19
 
20
  # Load models (will be loaded on first use)
21
  model = None
22
  tokenizer = None
 
23
  device = "cuda" if torch.cuda.is_available() else "cpu"
24
 
25
 
 
60
  raise
61
 
62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  @spaces.GPU
64
  def synthesize(text, speed=1.0):
65
  """
 
128
  @spaces.GPU
129
  def clone_voice(reference_audio, text, language="hi"):
130
  """
131
+ Clone voice from reference audio using TTS library (on-demand loading).
132
 
133
  Args:
134
  reference_audio: Tuple of (sample_rate, audio_array) or file path
 
142
  return None, "Please record or upload a reference audio sample (3-10 seconds recommended)."
143
 
144
  try:
145
+ # Try to import and use TTS library
146
+ try:
147
+ from TTS.api import TTS
148
+ except ImportError:
149
+ return None, (
150
+ "Voice cloning requires the TTS library which couldn't be installed in this Space. "
151
+ "Please use the 'Standard TTS' tab for text-to-speech synthesis. "
152
+ "For voice cloning, you can run this locally with: pip install TTS"
153
+ )
154
+
155
+ # Load model (on first use)
156
+ print("Loading XTTS-v2 voice cloning model...")
157
+ tts_model = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=(device == "cuda"))
158
+ print("✅ Voice cloning model loaded")
159
 
160
  # Handle audio input (Gradio returns tuple of (sample_rate, audio_array))
161
  if isinstance(reference_audio, tuple):
 
175
  out_path = tmp_out.name
176
 
177
  # Synthesize with voice cloning
 
178
  print(f"Synthesizing with voice cloning: {text[:50]}...")
179
  tts_model.tts_to_file(
180
  text=text,
 
198
  except:
199
  pass
200
 
 
 
 
 
 
 
201
  except Exception as e:
202
  error_msg = f"Error during voice cloning: {str(e)}"
203
  print(error_msg)
 
283
  - Speak naturally at normal pace
284
  - 5-8 seconds works best
285
  - First-time model loading may take 2-3 minutes
286
+
287
+ **Note:** Voice cloning uses XTTS-v2 model which will be downloaded on first use.
288
  """)
289
 
290
  with gr.Row():
 
330
  ---
331
  **Model Information:**
332
  - Standard TTS: `ashishkblink/Aawaz` (based on MMS TTS)
333
+ - Voice Cloning: XTTS-v2 (requires TTS library - will attempt to load on first use)
334
  """)
335
 
336
  return demo
requirements.txt CHANGED
@@ -4,4 +4,3 @@ soundfile>=0.12.1
4
  numpy>=1.24.0
5
  accelerate>=0.24.0
6
  librosa>=0.10.0
7
- TTS==0.22.0
 
4
  numpy>=1.24.0
5
  accelerate>=0.24.0
6
  librosa>=0.10.0